1 diff -urNp linux-2.6/arch/i386/boot-xen/Makefile new/arch/i386/boot-xen/Makefile
2 --- linux-2.6/arch/i386/boot-xen/Makefile       1970-01-01 01:00:00.000000000 +0100
3 +++ new/arch/i386/boot-xen/Makefile     2006-05-09 12:32:33.000000000 +0200
4 @@ -0,0 +1,21 @@
5 +
6 +OBJCOPYFLAGS := -g --strip-unneeded
7 +
8 +vmlinuz: vmlinux-stripped FORCE
9 +       $(call if_changed,gzip)
10 +
11 +vmlinux-stripped: vmlinux FORCE
12 +       $(call if_changed,objcopy)
13 +
14 +INSTALL_ROOT := $(patsubst %/boot,%,$(INSTALL_PATH))
15 +
16 +XINSTALL_NAME ?= $(KERNELRELEASE)
17 +install:
18 +       mkdir -p $(INSTALL_ROOT)/boot
19 +       ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
20 +       rm -f $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
21 +       install -m0644 vmlinuz $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
22 +       install -m0644 vmlinux $(INSTALL_ROOT)/boot/vmlinux-syms-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
23 +       install -m0664 .config $(INSTALL_ROOT)/boot/config-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
24 +       install -m0664 System.map $(INSTALL_ROOT)/boot/System.map-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
25 +       ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
26 diff -urNp linux-2.6/arch/i386/Kconfig new/arch/i386/Kconfig
27 --- linux-2.6/arch/i386/Kconfig 2006-07-03 14:14:14.000000000 +0200
28 +++ new/arch/i386/Kconfig       2006-05-23 18:37:09.000000000 +0200
29 @@ -91,6 +91,15 @@ config X86_PC
30         help
31           Choose this option if your computer is a standard PC or compatible.
32  
33 +config X86_XEN
34 +       bool "Xen-compatible"
35 +       select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST
36 +       select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST
37 +       select SWIOTLB
38 +       help
39 +         Choose this option if you plan to run this kernel on top of the
40 +         Xen Hypervisor.
41 +
42  config X86_ELAN
43         bool "AMD Elan"
44         help
45 @@ -193,6 +202,7 @@ source "arch/i386/Kconfig.cpu"
46  
47  config HPET_TIMER
48         bool "HPET Timer Support"
49 +       depends on !X86_XEN
50         help
51           This enables the use of the HPET for the kernel's internal timer.
52           HPET is the next generation timer replacing legacy 8254s.
53 @@ -223,7 +233,7 @@ config NR_CPUS
54  
55  config SCHED_SMT
56         bool "SMT (Hyperthreading) scheduler support"
57 -       depends on SMP
58 +       depends on SMP && !X86_XEN
59         default off
60         help
61           SMT scheduler support improves the CPU scheduler's decision making
62 @@ -233,7 +243,7 @@ config SCHED_SMT
63  
64  config SCHED_MC
65         bool "Multi-core scheduler support"
66 -       depends on SMP
67 +       depends on SMP && !X86_XEN
68         default y
69         help
70           Multi-core scheduler support improves the CPU scheduler's decision
71 @@ -244,7 +254,7 @@ source "kernel/Kconfig.preempt"
72  
73  config X86_UP_APIC
74         bool "Local APIC support on uniprocessors"
75 -       depends on !SMP && !(X86_VISWS || X86_VOYAGER)
76 +       depends on !SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)
77         help
78           A local APIC (Advanced Programmable Interrupt Controller) is an
79           integrated interrupt controller in the CPU. If you have a single-CPU
80 @@ -269,12 +279,12 @@ config X86_UP_IOAPIC
81  
82  config X86_LOCAL_APIC
83         bool
84 -       depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER)
85 +       depends on X86_UP_APIC || ((X86_VISWS || SMP) && !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST))
86         default y
87  
88  config X86_IO_APIC
89         bool
90 -       depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))
91 +       depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST))
92         default y
93  
94  config X86_VISWS_APIC
95 @@ -282,9 +292,14 @@ config X86_VISWS_APIC
96         depends on X86_VISWS
97         default y
98  
99 +config X86_TSC
100 +       bool
101 +       depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ && !X86_XEN
102 +       default y
103 +
104  config X86_MCE
105         bool "Machine Check Exception"
106 -       depends on !X86_VOYAGER
107 +       depends on !(X86_VOYAGER || X86_XEN)
108         ---help---
109           Machine Check Exception support allows the processor to notify the
110           kernel if it detects a problem (e.g. overheating, component failure).
111 @@ -374,6 +389,7 @@ config X86_REBOOTFIXUPS
112  
113  config MICROCODE
114         tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
115 +       depends on !XEN_UNPRIVILEGED_GUEST
116         ---help---
117           If you say Y here and also to "/dev file system support" in the
118           'File systems' section, you will be able to update the microcode on
119 @@ -391,6 +407,7 @@ config MICROCODE
120  
121  config X86_MSR
122         tristate "/dev/cpu/*/msr - Model-specific register support"
123 +       depends on !X86_XEN
124         help
125           This device gives privileged processes access to the x86
126           Model-Specific Registers (MSRs).  It is a character device with
127 @@ -406,6 +423,10 @@ config X86_CPUID
128           with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
129           /dev/cpu/31/cpuid.
130  
131 +config SWIOTLB
132 +       bool
133 +       default n
134 +
135  source "drivers/firmware/Kconfig"
136  
137  choice
138 @@ -578,7 +599,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
139  
140  config HIGHPTE
141         bool "Allocate 3rd-level pagetables from highmem"
142 -       depends on HIGHMEM4G || HIGHMEM64G
143 +       depends on (HIGHMEM4G || HIGHMEM64G) && !X86_XEN
144         help
145           The VM uses one page table entry for each page of physical memory.
146           For systems with a lot of RAM, this can be wasteful of precious
147 @@ -587,6 +608,7 @@ config HIGHPTE
148  
149  config MATH_EMULATION
150         bool "Math emulation"
151 +       depends on !X86_XEN
152         ---help---
153           Linux can emulate a math coprocessor (used for floating point
154           operations) if you don't have one. 486DX and Pentium processors have
155 @@ -612,6 +634,8 @@ config MATH_EMULATION
156  
157  config MTRR
158         bool "MTRR (Memory Type Range Register) support"
159 +       depends on !XEN_UNPRIVILEGED_GUEST
160 +       default y if X86_XEN
161         ---help---
162           On Intel P6 family processors (Pentium Pro, Pentium II and later)
163           the Memory Type Range Registers (MTRRs) may be used to control
164 @@ -646,7 +670,7 @@ config MTRR
165  
166  config EFI
167         bool "Boot from EFI support (EXPERIMENTAL)"
168 -       depends on ACPI
169 +       depends on ACPI && !X86_XEN
170         default n
171         ---help---
172         This enables the the kernel to boot on EFI platforms using
173 @@ -664,7 +688,7 @@ config EFI
174  
175  config IRQBALANCE
176         bool "Enable kernel irq balancing"
177 -       depends on SMP && X86_IO_APIC
178 +       depends on SMP && X86_IO_APIC && !X86_XEN
179         default y
180         help
181           The default yes will allow the kernel to do irq load balancing.
182 @@ -712,7 +736,7 @@ source kernel/Kconfig.hz
183  
184  config KEXEC
185         bool "kexec system call (EXPERIMENTAL)"
186 -       depends on EXPERIMENTAL
187 +       depends on EXPERIMENTAL && !X86_XEN
188         help
189           kexec is a system call that implements the ability to shutdown your
190           current kernel, and to start another kernel.  It is like a reboot
191 @@ -767,18 +791,20 @@ endmenu
192  
193  
194  menu "Power management options (ACPI, APM)"
195 -       depends on !X86_VOYAGER
196 +       depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)
197  
198 +if !X86_XEN
199  source kernel/power/Kconfig
200 +endif
201  
202  source "drivers/acpi/Kconfig"
203  
204  menu "APM (Advanced Power Management) BIOS Support"
205 -depends on PM && !X86_VISWS
206 +depends on PM && !(X86_VISWS || X86_XEN)
207  
208  config APM
209         tristate "APM (Advanced Power Management) BIOS support"
210 -       depends on PM
211 +       depends on PM && PM_LEGACY
212         ---help---
213           APM is a BIOS specification for saving power using several different
214           techniques. This is mostly useful for battery powered laptops with
215 @@ -963,6 +989,7 @@ choice
216  
217  config PCI_GOBIOS
218         bool "BIOS"
219 +       depends on !X86_XEN
220  
221  config PCI_GOMMCONFIG
222         bool "MMConfig"
223 @@ -970,6 +997,13 @@ config PCI_GOMMCONFIG
224  config PCI_GODIRECT
225         bool "Direct"
226  
227 +config PCI_GOXEN_FE
228 +       bool "Xen PCI Frontend"
229 +       depends on X86_XEN
230 +       help
231 +         The PCI device frontend driver allows the kernel to import arbitrary
232 +         PCI devices from a PCI backend to support PCI driver domains.
233 +
234  config PCI_GOANY
235         bool "Any"
236  
237 @@ -977,7 +1011,7 @@ endchoice
238  
239  config PCI_BIOS
240         bool
241 -       depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY)
242 +       depends on !(X86_VISWS || X86_XEN) && PCI && (PCI_GOBIOS || PCI_GOANY)
243         default y
244  
245  config PCI_DIRECT
246 @@ -990,6 +1024,18 @@ config PCI_MMCONFIG
247         depends on PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
248         default y
249  
250 +config XEN_PCIDEV_FRONTEND
251 +       bool
252 +       depends on PCI && X86_XEN && (PCI_GOXEN_FE || PCI_GOANY)
253 +       default y
254 +
255 +config XEN_PCIDEV_FE_DEBUG
256 +       bool "Xen PCI Frontend Debugging"
257 +       depends on XEN_PCIDEV_FRONTEND
258 +       default n
259 +       help
260 +         Enables some debug statements within the PCI Frontend.
261 +
262  source "drivers/pci/pcie/Kconfig"
263  
264  source "drivers/pci/Kconfig"
265 @@ -1000,7 +1046,7 @@ config ISA_DMA_API
266  
267  config ISA
268         bool "ISA support"
269 -       depends on !(X86_VOYAGER || X86_VISWS)
270 +       depends on !(X86_VOYAGER || X86_VISWS || X86_XEN)
271         help
272           Find out whether you have ISA slots on your motherboard.  ISA is the
273           name of a bus system, i.e. the way the CPU talks to the other stuff
274 @@ -1027,7 +1073,7 @@ config EISA
275  source "drivers/eisa/Kconfig"
276  
277  config MCA
278 -       bool "MCA support" if !(X86_VISWS || X86_VOYAGER)
279 +       bool "MCA support" if !(X86_VISWS || X86_VOYAGER || X86_XEN)
280         default y if X86_VOYAGER
281         help
282           MicroChannel Architecture is found in some IBM PS/2 machines and
283 @@ -1089,6 +1135,8 @@ source "security/Kconfig"
284  
285  source "crypto/Kconfig"
286  
287 +source "drivers/xen/Kconfig"
288 +
289  source "lib/Kconfig"
290  
291  #
292 @@ -1114,7 +1162,7 @@ config X86_SMP
293  
294  config X86_HT
295         bool
296 -       depends on SMP && !(X86_VISWS || X86_VOYAGER)
297 +       depends on SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN)
298         default y
299  
300  config X86_BIOS_REBOOT
301 @@ -1127,6 +1175,16 @@ config X86_TRAMPOLINE
302         depends on X86_SMP || (X86_VOYAGER && SMP)
303         default y
304  
305 +config X86_NO_TSS
306 +       bool
307 +       depends on X86_XEN
308 +       default y
309 +
310 +config X86_NO_IDT
311 +       bool
312 +       depends on X86_XEN
313 +       default y
314 +
315  config KTIME_SCALAR
316         bool
317         default y
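The Kconfig hunks above only introduce symbols; the behavioural changes come from C code elsewhere in the patch that tests the generated CONFIG_* macros (compare the #ifndef CONFIG_XEN blocks in apic-xen.c further below). A minimal, self-contained sketch of that pattern, illustrative only and not part of the patch, with a hypothetical helper name report_platform():

#include <stdio.h>

/* Hypothetical helper: shows how code keys off the symbols added above. */
static void report_platform(void)
{
#ifdef CONFIG_X86_XEN
	/* Xen guest build: options such as HPET_TIMER, X86_MSR and KEXEC
	 * were made unselectable by the "depends on !X86_XEN" lines above. */
	printf("built to run on top of the Xen hypervisor\n");
#else
	/* Native build: nothing changes relative to a stock kernel. */
	printf("built to run on bare metal\n");
#endif
}

int main(void)
{
	report_platform();
	return 0;
}

Compiling the sketch with -DCONFIG_X86_XEN takes the first branch, mirroring what a kernel configured with X86_XEN=y would do.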
318 diff -urNp linux-2.6/arch/i386/Kconfig.cpu new/arch/i386/Kconfig.cpu
319 --- linux-2.6/arch/i386/Kconfig.cpu     2006-07-03 14:14:14.000000000 +0200
320 +++ new/arch/i386/Kconfig.cpu   2006-05-09 12:32:33.000000000 +0200
321 @@ -251,7 +251,7 @@ config X86_PPRO_FENCE
322  
323  config X86_F00F_BUG
324         bool
325 -       depends on M586MMX || M586TSC || M586 || M486 || M386
326 +       depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT
327         default y
328  
329  config X86_WP_WORKS_OK
330 diff -urNp linux-2.6/arch/i386/Kconfig.debug new/arch/i386/Kconfig.debug
331 --- linux-2.6/arch/i386/Kconfig.debug   2006-07-03 14:14:14.000000000 +0200
332 +++ new/arch/i386/Kconfig.debug 2006-05-09 12:32:33.000000000 +0200
333 @@ -84,6 +84,7 @@ config X86_MPPARSE
334  config DOUBLEFAULT
335         default y
336         bool "Enable doublefault exception handler" if EMBEDDED
337 +       depends on !X86_NO_TSS
338         help
339            This option allows trapping of rare doublefault exceptions that
340            would otherwise cause a system to silently reboot. Disabling this
341 diff -urNp linux-2.6/arch/i386/kernel/acpi/boot-xen.c new/arch/i386/kernel/acpi/boot-xen.c
342 --- linux-2.6/arch/i386/kernel/acpi/boot-xen.c  1970-01-01 01:00:00.000000000 +0100
343 +++ new/arch/i386/kernel/acpi/boot-xen.c        2006-05-23 18:37:09.000000000 +0200
344 @@ -0,0 +1,1167 @@
345 +/*
346 + *  boot.c - Architecture-Specific Low-Level ACPI Boot Support
347 + *
348 + *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
349 + *  Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
350 + *
351 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
352 + *
353 + *  This program is free software; you can redistribute it and/or modify
354 + *  it under the terms of the GNU General Public License as published by
355 + *  the Free Software Foundation; either version 2 of the License, or
356 + *  (at your option) any later version.
357 + *
358 + *  This program is distributed in the hope that it will be useful,
359 + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
360 + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
361 + *  GNU General Public License for more details.
362 + *
363 + *  You should have received a copy of the GNU General Public License
364 + *  along with this program; if not, write to the Free Software
365 + *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
366 + *
367 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
368 + */
369 +
370 +#include <linux/init.h>
371 +#include <linux/config.h>
372 +#include <linux/acpi.h>
373 +#include <linux/efi.h>
374 +#include <linux/module.h>
375 +#include <linux/dmi.h>
376 +#include <linux/irq.h>
377 +
378 +#include <asm/pgtable.h>
379 +#include <asm/io_apic.h>
380 +#include <asm/apic.h>
381 +#include <asm/io.h>
382 +#include <asm/mpspec.h>
383 +
384 +#ifdef CONFIG_X86_64
385 +
386 +extern void __init clustered_apic_check(void);
387 +
388 +extern int gsi_irq_sharing(int gsi);
389 +#include <asm/proto.h>
390 +
391 +static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
392 +
393 +
394 +#else                          /* X86 */
395 +
396 +#ifdef CONFIG_X86_LOCAL_APIC
397 +#include <mach_apic.h>
398 +#include <mach_mpparse.h>
399 +#endif                         /* CONFIG_X86_LOCAL_APIC */
400 +
401 +static inline int gsi_irq_sharing(int gsi) { return gsi; }
402 +
403 +#endif                         /* X86 */
404 +
405 +#define BAD_MADT_ENTRY(entry, end) (                                       \
406 +               (!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \
407 +               ((acpi_table_entry_header *)entry)->length != sizeof(*entry))
408 +
409 +#define PREFIX                 "ACPI: "
410 +
411 +int acpi_noirq __initdata;     /* skip ACPI IRQ initialization */
412 +int acpi_pci_disabled __initdata;      /* skip ACPI PCI scan and IRQ initialization */
413 +int acpi_ht __initdata = 1;    /* enable HT */
414 +
415 +int acpi_lapic;
416 +int acpi_ioapic;
417 +int acpi_strict;
418 +EXPORT_SYMBOL(acpi_strict);
419 +
420 +acpi_interrupt_flags acpi_sci_flags __initdata;
421 +int acpi_sci_override_gsi __initdata;
422 +int acpi_skip_timer_override __initdata;
423 +
424 +#ifdef CONFIG_X86_LOCAL_APIC
425 +static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
426 +#endif
427 +
428 +#ifndef __HAVE_ARCH_CMPXCHG
429 +#warning ACPI uses CMPXCHG, i486 and later hardware
430 +#endif
431 +
432 +#define MAX_MADT_ENTRIES       256
433 +u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] =
434 +    {[0 ... MAX_MADT_ENTRIES - 1] = 0xff };
435 +EXPORT_SYMBOL(x86_acpiid_to_apicid);
436 +
437 +/* --------------------------------------------------------------------------
438 +                              Boot-time Configuration
439 +   -------------------------------------------------------------------------- */
440 +
441 +/*
442 + * The default interrupt routing model is PIC (8259).  This gets
443 + * overridden if IOAPICs are enumerated (below).
444 + */
445 +enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
446 +
447 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
448 +
449 +/* rely on all ACPI tables being in the direct mapping */
450 +char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
451 +{
452 +       if (!phys_addr || !size)
453 +               return NULL;
454 +
455 +       if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE)
456 +               return __va(phys_addr);
457 +
458 +       return NULL;
459 +}
460 +
461 +#else
462 +
463 +/*
464 + * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
465 + * to map the target physical address. The problem is that set_fixmap()
466 + * provides a single page, and it is possible that the page is not
467 + * sufficient.
468 + * By using this area, we can map up to MAX_IO_APICS pages temporarily,
469 + * i.e. until the next __va_range() call.
470 + *
471 + * Important Safety Note:  The fixed I/O APIC page numbers are *subtracted*
472 + * from the fixed base.  That's why we start at FIX_IO_APIC_BASE_END and
473 + * count idx down while incrementing the phys address.
474 + */
475 +char *__acpi_map_table(unsigned long phys, unsigned long size)
476 +{
477 +       unsigned long base, offset, mapped_size;
478 +       int idx;
479 +
480 +#ifndef CONFIG_XEN
481 +       if (phys + size < 8 * 1024 * 1024)
482 +               return __va(phys);
483 +#endif
484 +
485 +       offset = phys & (PAGE_SIZE - 1);
486 +       mapped_size = PAGE_SIZE - offset;
487 +       set_fixmap(FIX_ACPI_END, phys);
488 +       base = fix_to_virt(FIX_ACPI_END);
489 +
490 +       /*
491 +        * Most cases can be covered by the below.
492 +        */
493 +       idx = FIX_ACPI_END;
494 +       while (mapped_size < size) {
495 +               if (--idx < FIX_ACPI_BEGIN)
496 +                       return NULL;    /* cannot handle this */
497 +               phys += PAGE_SIZE;
498 +               set_fixmap(idx, phys);
499 +               mapped_size += PAGE_SIZE;
500 +       }
501 +
502 +       return ((unsigned char *)base + offset);
503 +}
504 +#endif
505 +
506 +#ifdef CONFIG_PCI_MMCONFIG
507 +/* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
508 +struct acpi_table_mcfg_config *pci_mmcfg_config;
509 +int pci_mmcfg_config_num;
510 +
511 +int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
512 +{
513 +       struct acpi_table_mcfg *mcfg;
514 +       unsigned long i;
515 +       int config_size;
516 +
517 +       if (!phys_addr || !size)
518 +               return -EINVAL;
519 +
520 +       mcfg = (struct acpi_table_mcfg *)__acpi_map_table(phys_addr, size);
521 +       if (!mcfg) {
522 +               printk(KERN_WARNING PREFIX "Unable to map MCFG\n");
523 +               return -ENODEV;
524 +       }
525 +
526 +       /* how many config structures do we have */
527 +       pci_mmcfg_config_num = 0;
528 +       i = size - sizeof(struct acpi_table_mcfg);
529 +       while (i >= sizeof(struct acpi_table_mcfg_config)) {
530 +               ++pci_mmcfg_config_num;
531 +               i -= sizeof(struct acpi_table_mcfg_config);
532 +       };
533 +       if (pci_mmcfg_config_num == 0) {
534 +               printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
535 +               return -ENODEV;
536 +       }
537 +
538 +       config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
539 +       pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
540 +       if (!pci_mmcfg_config) {
541 +               printk(KERN_WARNING PREFIX
542 +                      "No memory for MCFG config tables\n");
543 +               return -ENOMEM;
544 +       }
545 +
546 +       memcpy(pci_mmcfg_config, &mcfg->config, config_size);
547 +       for (i = 0; i < pci_mmcfg_config_num; ++i) {
548 +               if (mcfg->config[i].base_reserved) {
549 +                       printk(KERN_ERR PREFIX
550 +                              "MMCONFIG not in low 4GB of memory\n");
551 +                       return -ENODEV;
552 +               }
553 +       }
554 +
555 +       return 0;
556 +}
557 +#endif                         /* CONFIG_PCI_MMCONFIG */
558 +
559 +#ifdef CONFIG_X86_LOCAL_APIC
560 +static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
561 +{
562 +       struct acpi_table_madt *madt = NULL;
563 +
564 +       if (!phys_addr || !size)
565 +               return -EINVAL;
566 +
567 +       madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size);
568 +       if (!madt) {
569 +               printk(KERN_WARNING PREFIX "Unable to map MADT\n");
570 +               return -ENODEV;
571 +       }
572 +
573 +       if (madt->lapic_address) {
574 +               acpi_lapic_addr = (u64) madt->lapic_address;
575 +
576 +               printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
577 +                      madt->lapic_address);
578 +       }
579 +
580 +       acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
581 +
582 +       return 0;
583 +}
584 +
585 +static int __init
586 +acpi_parse_lapic(acpi_table_entry_header * header, const unsigned long end)
587 +{
588 +       struct acpi_table_lapic *processor = NULL;
589 +
590 +       processor = (struct acpi_table_lapic *)header;
591 +
592 +       if (BAD_MADT_ENTRY(processor, end))
593 +               return -EINVAL;
594 +
595 +       acpi_table_print_madt_entry(header);
596 +
597 +       /* Record local apic id only when enabled */
598 +       if (processor->flags.enabled)
599 +               x86_acpiid_to_apicid[processor->acpi_id] = processor->id;
600 +
601 +       /*
602 +        * We need to register disabled CPU as well to permit
603 +        * counting disabled CPUs. This allows us to size
604 +        * cpus_possible_map more accurately, so that we
605 +        * do not need to preallocate memory for all NR_CPUS
606 +        * when we use CPU hotplug.
607 +        */
608 +       mp_register_lapic(processor->id,        /* APIC ID */
609 +                         processor->flags.enabled);    /* Enabled? */
610 +
611 +       return 0;
612 +}
613 +
614 +static int __init
615 +acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header,
616 +                         const unsigned long end)
617 +{
618 +       struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL;
619 +
620 +       lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr *)header;
621 +
622 +       if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
623 +               return -EINVAL;
624 +
625 +       acpi_lapic_addr = lapic_addr_ovr->address;
626 +
627 +       return 0;
628 +}
629 +
630 +static int __init
631 +acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end)
632 +{
633 +       struct acpi_table_lapic_nmi *lapic_nmi = NULL;
634 +
635 +       lapic_nmi = (struct acpi_table_lapic_nmi *)header;
636 +
637 +       if (BAD_MADT_ENTRY(lapic_nmi, end))
638 +               return -EINVAL;
639 +
640 +       acpi_table_print_madt_entry(header);
641 +
642 +       if (lapic_nmi->lint != 1)
643 +               printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
644 +
645 +       return 0;
646 +}
647 +
648 +#endif                         /*CONFIG_X86_LOCAL_APIC */
649 +
650 +#ifdef CONFIG_X86_IO_APIC
651 +
652 +static int __init
653 +acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end)
654 +{
655 +       struct acpi_table_ioapic *ioapic = NULL;
656 +
657 +       ioapic = (struct acpi_table_ioapic *)header;
658 +
659 +       if (BAD_MADT_ENTRY(ioapic, end))
660 +               return -EINVAL;
661 +
662 +       acpi_table_print_madt_entry(header);
663 +
664 +       mp_register_ioapic(ioapic->id,
665 +                          ioapic->address, ioapic->global_irq_base);
666 +
667 +       return 0;
668 +}
669 +
670 +/*
671 + * Parse Interrupt Source Override for the ACPI SCI
672 + */
673 +static void acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
674 +{
675 +       if (trigger == 0)       /* compatible SCI trigger is level */
676 +               trigger = 3;
677 +
678 +       if (polarity == 0)      /* compatible SCI polarity is low */
679 +               polarity = 3;
680 +
681 +       /* Command-line over-ride via acpi_sci= */
682 +       if (acpi_sci_flags.trigger)
683 +               trigger = acpi_sci_flags.trigger;
684 +
685 +       if (acpi_sci_flags.polarity)
686 +               polarity = acpi_sci_flags.polarity;
687 +
688 +       /*
689 +        * mp_config_acpi_legacy_irqs() already setup IRQs < 16
690 +        * If GSI is < 16, this will update its flags,
691 +        * else it will create a new mp_irqs[] entry.
692 +        */
693 +       mp_override_legacy_irq(gsi, polarity, trigger, gsi);
694 +
695 +       /*
696 +        * stash over-ride to indicate we've been here
697 +        * and for later update of acpi_fadt
698 +        */
699 +       acpi_sci_override_gsi = gsi;
700 +       return;
701 +}
702 +
703 +static int __init
704 +acpi_parse_int_src_ovr(acpi_table_entry_header * header,
705 +                      const unsigned long end)
706 +{
707 +       struct acpi_table_int_src_ovr *intsrc = NULL;
708 +
709 +       intsrc = (struct acpi_table_int_src_ovr *)header;
710 +
711 +       if (BAD_MADT_ENTRY(intsrc, end))
712 +               return -EINVAL;
713 +
714 +       acpi_table_print_madt_entry(header);
715 +
716 +       if (intsrc->bus_irq == acpi_fadt.sci_int) {
717 +               acpi_sci_ioapic_setup(intsrc->global_irq,
718 +                                     intsrc->flags.polarity,
719 +                                     intsrc->flags.trigger);
720 +               return 0;
721 +       }
722 +
723 +       if (acpi_skip_timer_override &&
724 +           intsrc->bus_irq == 0 && intsrc->global_irq == 2) {
725 +               printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
726 +               return 0;
727 +       }
728 +
729 +       mp_override_legacy_irq(intsrc->bus_irq,
730 +                              intsrc->flags.polarity,
731 +                              intsrc->flags.trigger, intsrc->global_irq);
732 +
733 +       return 0;
734 +}
735 +
736 +static int __init
737 +acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end)
738 +{
739 +       struct acpi_table_nmi_src *nmi_src = NULL;
740 +
741 +       nmi_src = (struct acpi_table_nmi_src *)header;
742 +
743 +       if (BAD_MADT_ENTRY(nmi_src, end))
744 +               return -EINVAL;
745 +
746 +       acpi_table_print_madt_entry(header);
747 +
748 +       /* TBD: Support nimsrc entries? */
749 +
750 +       return 0;
751 +}
752 +
753 +#endif                         /* CONFIG_X86_IO_APIC */
754 +
755 +/*
756 + * acpi_pic_sci_set_trigger()
757 + * 
758 + * use ELCR to set PIC-mode trigger type for SCI
759 + *
760 + * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
761 + * it may require Edge Trigger -- use "acpi_sci=edge"
762 + *
763 + * Ports 0x4d0-0x4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
764 + * for the 8259 PIC.  bit[n] = 1 means irq[n] is Level, otherwise Edge.
765 + * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
766 + * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
767 + */
768 +
769 +void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
770 +{
771 +       unsigned int mask = 1 << irq;
772 +       unsigned int old, new;
773 +
774 +       /* Real old ELCR mask */
775 +       old = inb(0x4d0) | (inb(0x4d1) << 8);
776 +
777 +       /*
778 +        * If we use ACPI to set PCI irq's, then we should clear ELCR
779 +        * since we will set it correctly as we enable the PCI irq
780 +        * routing.
781 +        */
782 +       new = acpi_noirq ? old : 0;
783 +
784 +       /*
785 +        * Update SCI information in the ELCR, it isn't in the PCI
786 +        * routing tables..
787 +        */
788 +       switch (trigger) {
789 +       case 1:         /* Edge - clear */
790 +               new &= ~mask;
791 +               break;
792 +       case 3:         /* Level - set */
793 +               new |= mask;
794 +               break;
795 +       }
796 +
797 +       if (old == new)
798 +               return;
799 +
800 +       printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
801 +       outb(new, 0x4d0);
802 +       outb(new >> 8, 0x4d1);
803 +}
804 +
805 +int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
806 +{
807 +#ifdef CONFIG_X86_IO_APIC
808 +       if (use_pci_vector() && !platform_legacy_irq(gsi))
809 +               *irq = IO_APIC_VECTOR(gsi);
810 +       else
811 +#endif
812 +               *irq = gsi_irq_sharing(gsi);
813 +       return 0;
814 +}
815 +
816 +/*
817 + * success: return IRQ number (>=0)
818 + * failure: return < 0
819 + */
820 +int acpi_register_gsi(u32 gsi, int triggering, int polarity)
821 +{
822 +       unsigned int irq;
823 +       unsigned int plat_gsi = gsi;
824 +
825 +#ifdef CONFIG_PCI
826 +       /*
827 +        * Make sure all (legacy) PCI IRQs are set as level-triggered.
828 +        */
829 +       if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
830 +               extern void eisa_set_level_irq(unsigned int irq);
831 +
832 +               if (triggering == ACPI_LEVEL_SENSITIVE)
833 +                       eisa_set_level_irq(gsi);
834 +       }
835 +#endif
836 +
837 +#ifdef CONFIG_X86_IO_APIC
838 +       if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
839 +               plat_gsi = mp_register_gsi(gsi, triggering, polarity);
840 +       }
841 +#endif
842 +       acpi_gsi_to_irq(plat_gsi, &irq);
843 +       return irq;
844 +}
845 +
846 +EXPORT_SYMBOL(acpi_register_gsi);
847 +
848 +/*
849 + *  ACPI based hotplug support for CPU
850 + */
851 +#ifdef CONFIG_ACPI_HOTPLUG_CPU
852 +int acpi_map_lsapic(acpi_handle handle, int *pcpu)
853 +{
854 +       /* TBD */
855 +       return -EINVAL;
856 +}
857 +
858 +EXPORT_SYMBOL(acpi_map_lsapic);
859 +
860 +int acpi_unmap_lsapic(int cpu)
861 +{
862 +       /* TBD */
863 +       return -EINVAL;
864 +}
865 +
866 +EXPORT_SYMBOL(acpi_unmap_lsapic);
867 +#endif                         /* CONFIG_ACPI_HOTPLUG_CPU */
868 +
869 +int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
870 +{
871 +       /* TBD */
872 +       return -EINVAL;
873 +}
874 +
875 +EXPORT_SYMBOL(acpi_register_ioapic);
876 +
877 +int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
878 +{
879 +       /* TBD */
880 +       return -EINVAL;
881 +}
882 +
883 +EXPORT_SYMBOL(acpi_unregister_ioapic);
884 +
885 +static unsigned long __init
886 +acpi_scan_rsdp(unsigned long start, unsigned long length)
887 +{
888 +       unsigned long offset = 0;
889 +       unsigned long sig_len = sizeof("RSD PTR ") - 1;
890 +       unsigned long vstart = (unsigned long)isa_bus_to_virt(start);
891 +
892 +       /*
893 +        * Scan all 16-byte boundaries of the physical memory region for the
894 +        * RSDP signature.
895 +        */
896 +       for (offset = 0; offset < length; offset += 16) {
897 +               if (strncmp((char *)(vstart + offset), "RSD PTR ", sig_len))
898 +                       continue;
899 +               return (start + offset);
900 +       }
901 +
902 +       return 0;
903 +}
904 +
905 +static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
906 +{
907 +       struct acpi_table_sbf *sb;
908 +
909 +       if (!phys_addr || !size)
910 +               return -EINVAL;
911 +
912 +       sb = (struct acpi_table_sbf *)__acpi_map_table(phys_addr, size);
913 +       if (!sb) {
914 +               printk(KERN_WARNING PREFIX "Unable to map SBF\n");
915 +               return -ENODEV;
916 +       }
917 +
918 +       sbf_port = sb->sbf_cmos;        /* Save CMOS port */
919 +
920 +       return 0;
921 +}
922 +
923 +#ifdef CONFIG_HPET_TIMER
924 +
925 +static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
926 +{
927 +       struct acpi_table_hpet *hpet_tbl;
928 +
929 +       if (!phys || !size)
930 +               return -EINVAL;
931 +
932 +       hpet_tbl = (struct acpi_table_hpet *)__acpi_map_table(phys, size);
933 +       if (!hpet_tbl) {
934 +               printk(KERN_WARNING PREFIX "Unable to map HPET\n");
935 +               return -ENODEV;
936 +       }
937 +
938 +       if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) {
939 +               printk(KERN_WARNING PREFIX "HPET timers must be located in "
940 +                      "memory.\n");
941 +               return -1;
942 +       }
943 +#ifdef CONFIG_X86_64
944 +       vxtime.hpet_address = hpet_tbl->addr.addrl |
945 +           ((long)hpet_tbl->addr.addrh << 32);
946 +
947 +       printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
948 +              hpet_tbl->id, vxtime.hpet_address);
949 +#else                          /* X86 */
950 +       {
951 +               extern unsigned long hpet_address;
952 +
953 +               hpet_address = hpet_tbl->addr.addrl;
954 +               printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
955 +                      hpet_tbl->id, hpet_address);
956 +       }
957 +#endif                         /* X86 */
958 +
959 +       return 0;
960 +}
961 +#else
962 +#define        acpi_parse_hpet NULL
963 +#endif
964 +
965 +#ifdef CONFIG_X86_PM_TIMER
966 +extern u32 pmtmr_ioport;
967 +#endif
968 +
969 +static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
970 +{
971 +       struct fadt_descriptor_rev2 *fadt = NULL;
972 +
973 +       fadt = (struct fadt_descriptor_rev2 *)__acpi_map_table(phys, size);
974 +       if (!fadt) {
975 +               printk(KERN_WARNING PREFIX "Unable to map FADT\n");
976 +               return 0;
977 +       }
978 +       /* initialize sci_int early for INT_SRC_OVR MADT parsing */
979 +       acpi_fadt.sci_int = fadt->sci_int;
980 +
981 +       /* initialize rev and apic_phys_dest_mode for x86_64 genapic */
982 +       acpi_fadt.revision = fadt->revision;
983 +       acpi_fadt.force_apic_physical_destination_mode =
984 +           fadt->force_apic_physical_destination_mode;
985 +
986 +#if defined(CONFIG_X86_PM_TIMER) && !defined(CONFIG_XEN)
987 +       /* detect the location of the ACPI PM Timer */
988 +       if (fadt->revision >= FADT2_REVISION_ID) {
989 +               /* FADT rev. 2 */
990 +               if (fadt->xpm_tmr_blk.address_space_id !=
991 +                   ACPI_ADR_SPACE_SYSTEM_IO)
992 +                       return 0;
993 +
994 +               pmtmr_ioport = fadt->xpm_tmr_blk.address;
995 +               /*
996 +                * "X" fields are optional extensions to the original V1.0
997 +                * fields, so we must selectively expand V1.0 fields if the
998 +                * corresponding X field is zero.
999 +                */
1000 +               if (!pmtmr_ioport)
1001 +                       pmtmr_ioport = fadt->V1_pm_tmr_blk;
1002 +       } else {
1003 +               /* FADT rev. 1 */
1004 +               pmtmr_ioport = fadt->V1_pm_tmr_blk;
1005 +       }
1006 +       if (pmtmr_ioport)
1007 +               printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
1008 +                      pmtmr_ioport);
1009 +#endif
1010 +       return 0;
1011 +}
1012 +
1013 +unsigned long __init acpi_find_rsdp(void)
1014 +{
1015 +       unsigned long rsdp_phys = 0;
1016 +
1017 +       if (efi_enabled) {
1018 +               if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
1019 +                       return efi.acpi20;
1020 +               else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
1021 +                       return efi.acpi;
1022 +       }
1023 +       /*
1024 +        * Scan memory looking for the RSDP signature. First search EBDA (low
1025 +        * memory) paragraphs and then search upper memory (E0000-FFFFF).
1026 +        */
1027 +       rsdp_phys = acpi_scan_rsdp(0, 0x400);
1028 +       if (!rsdp_phys)
1029 +               rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000);
1030 +
1031 +       return rsdp_phys;
1032 +}
1033 +
1034 +#ifdef CONFIG_X86_LOCAL_APIC
1035 +/*
1036 + * Parse LAPIC entries in MADT
1037 + * returns 0 on success, < 0 on error
1038 + */
1039 +static int __init acpi_parse_madt_lapic_entries(void)
1040 +{
1041 +       int count;
1042 +
1043 +       if (!cpu_has_apic)
1044 +               return -ENODEV;
1045 +
1046 +       /* 
1047 +        * Note that the LAPIC address is obtained from the MADT (32-bit value)
1048 +        * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
1049 +        */
1050 +
1051 +       count =
1052 +           acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR,
1053 +                                 acpi_parse_lapic_addr_ovr, 0);
1054 +       if (count < 0) {
1055 +               printk(KERN_ERR PREFIX
1056 +                      "Error parsing LAPIC address override entry\n");
1057 +               return count;
1058 +       }
1059 +
1060 +       mp_register_lapic_address(acpi_lapic_addr);
1061 +
1062 +       count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic,
1063 +                                     MAX_APICS);
1064 +       if (!count) {
1065 +               printk(KERN_ERR PREFIX "No LAPIC entries present\n");
1066 +               /* TBD: Cleanup to allow fallback to MPS */
1067 +               return -ENODEV;
1068 +       } else if (count < 0) {
1069 +               printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
1070 +               /* TBD: Cleanup to allow fallback to MPS */
1071 +               return count;
1072 +       }
1073 +
1074 +       count =
1075 +           acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0);
1076 +       if (count < 0) {
1077 +               printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
1078 +               /* TBD: Cleanup to allow fallback to MPS */
1079 +               return count;
1080 +       }
1081 +       return 0;
1082 +}
1083 +#endif                         /* CONFIG_X86_LOCAL_APIC */
1084 +
1085 +#ifdef CONFIG_X86_IO_APIC
1086 +/*
1087 + * Parse IOAPIC related entries in MADT
1088 + * returns 0 on success, < 0 on error
1089 + */
1090 +static int __init acpi_parse_madt_ioapic_entries(void)
1091 +{
1092 +       int count;
1093 +
1094 +       /*
1095 +        * ACPI interpreter is required to complete interrupt setup,
1096 +        * so if it is off, don't enumerate the io-apics with ACPI.
1097 +        * If MPS is present, it will handle them,
1098 +        * otherwise the system will stay in PIC mode
1099 +        */
1100 +       if (acpi_disabled || acpi_noirq) {
1101 +               return -ENODEV;
1102 +       }
1103 +
1104 +       if (!cpu_has_apic)
1105 +               return -ENODEV;
1106 +
1107 +       /*
1108 +        * if "noapic" boot option, don't look for IO-APICs
1109 +        */
1110 +       if (skip_ioapic_setup) {
1111 +               printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
1112 +                      "due to 'noapic' option.\n");
1113 +               return -ENODEV;
1114 +       }
1115 +
1116 +       count =
1117 +           acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic,
1118 +                                 MAX_IO_APICS);
1119 +       if (!count) {
1120 +               printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
1121 +               return -ENODEV;
1122 +       } else if (count < 0) {
1123 +               printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
1124 +               return count;
1125 +       }
1126 +
1127 +       count =
1128 +           acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr,
1129 +                                 NR_IRQ_VECTORS);
1130 +       if (count < 0) {
1131 +               printk(KERN_ERR PREFIX
1132 +                      "Error parsing interrupt source overrides entry\n");
1133 +               /* TBD: Cleanup to allow fallback to MPS */
1134 +               return count;
1135 +       }
1136 +
1137 +       /*
1138 +        * If BIOS did not supply an INT_SRC_OVR for the SCI
1139 +        * pretend we got one so we can set the SCI flags.
1140 +        */
1141 +       if (!acpi_sci_override_gsi)
1142 +               acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
1143 +
1144 +       /* Fill in identity legacy mappings where no override */
1145 +       mp_config_acpi_legacy_irqs();
1146 +
1147 +       count =
1148 +           acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src,
1149 +                                 NR_IRQ_VECTORS);
1150 +       if (count < 0) {
1151 +               printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
1152 +               /* TBD: Cleanup to allow fallback to MPS */
1153 +               return count;
1154 +       }
1155 +
1156 +       return 0;
1157 +}
1158 +#else
1159 +static inline int acpi_parse_madt_ioapic_entries(void)
1160 +{
1161 +       return -1;
1162 +}
1163 +#endif /* !CONFIG_X86_IO_APIC */
1164 +
1165 +static void __init acpi_process_madt(void)
1166 +{
1167 +#ifdef CONFIG_X86_LOCAL_APIC
1168 +       int count, error;
1169 +
1170 +       count = acpi_table_parse(ACPI_APIC, acpi_parse_madt);
1171 +       if (count >= 1) {
1172 +
1173 +               /*
1174 +                * Parse MADT LAPIC entries
1175 +                */
1176 +               error = acpi_parse_madt_lapic_entries();
1177 +               if (!error) {
1178 +                       acpi_lapic = 1;
1179 +
1180 +#ifdef CONFIG_X86_GENERICARCH
1181 +                       generic_bigsmp_probe();
1182 +#endif
1183 +                       /*
1184 +                        * Parse MADT IO-APIC entries
1185 +                        */
1186 +                       error = acpi_parse_madt_ioapic_entries();
1187 +                       if (!error) {
1188 +                               acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
1189 +                               acpi_irq_balance_set(NULL);
1190 +                               acpi_ioapic = 1;
1191 +
1192 +                               smp_found_config = 1;
1193 +                               clustered_apic_check();
1194 +                       }
1195 +               }
1196 +               if (error == -EINVAL) {
1197 +                       /*
1198 +                        * Dell Precision Workstation 410, 610 come here.
1199 +                        */
1200 +                       printk(KERN_ERR PREFIX
1201 +                              "Invalid BIOS MADT, disabling ACPI\n");
1202 +                       disable_acpi();
1203 +               }
1204 +       }
1205 +#endif
1206 +       return;
1207 +}
1208 +
1209 +extern int acpi_force;
1210 +
1211 +#ifdef __i386__
1212 +
1213 +static int __init disable_acpi_irq(struct dmi_system_id *d)
1214 +{
1215 +       if (!acpi_force) {
1216 +               printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
1217 +                      d->ident);
1218 +               acpi_noirq_set();
1219 +       }
1220 +       return 0;
1221 +}
1222 +
1223 +static int __init disable_acpi_pci(struct dmi_system_id *d)
1224 +{
1225 +       if (!acpi_force) {
1226 +               printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
1227 +                      d->ident);
1228 +               acpi_disable_pci();
1229 +       }
1230 +       return 0;
1231 +}
1232 +
1233 +static int __init dmi_disable_acpi(struct dmi_system_id *d)
1234 +{
1235 +       if (!acpi_force) {
1236 +               printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
1237 +               disable_acpi();
1238 +       } else {
1239 +               printk(KERN_NOTICE
1240 +                      "Warning: DMI blacklist says broken, but acpi forced\n");
1241 +       }
1242 +       return 0;
1243 +}
1244 +
1245 +/*
1246 + * Limit ACPI to CPU enumeration for HT
1247 + */
1248 +static int __init force_acpi_ht(struct dmi_system_id *d)
1249 +{
1250 +       if (!acpi_force) {
1251 +               printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
1252 +                      d->ident);
1253 +               disable_acpi();
1254 +               acpi_ht = 1;
1255 +       } else {
1256 +               printk(KERN_NOTICE
1257 +                      "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
1258 +       }
1259 +       return 0;
1260 +}
1261 +
1262 +/*
1263 + * If your system is blacklisted here, but you find that acpi=force
1264 + * works for you, please contact acpi-devel@sourceforge.net
1265 + */
1266 +static struct dmi_system_id __initdata acpi_dmi_table[] = {
1267 +       /*
1268 +        * Boxes that need ACPI disabled
1269 +        */
1270 +       {
1271 +        .callback = dmi_disable_acpi,
1272 +        .ident = "IBM Thinkpad",
1273 +        .matches = {
1274 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1275 +                    DMI_MATCH(DMI_BOARD_NAME, "2629H1G"),
1276 +                    },
1277 +        },
1278 +
1279 +       /*
1280 +        * Boxes that need acpi=ht
1281 +        */
1282 +       {
1283 +        .callback = force_acpi_ht,
1284 +        .ident = "FSC Primergy T850",
1285 +        .matches = {
1286 +                    DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
1287 +                    DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
1288 +                    },
1289 +        },
1290 +       {
1291 +        .callback = force_acpi_ht,
1292 +        .ident = "DELL GX240",
1293 +        .matches = {
1294 +                    DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"),
1295 +                    DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"),
1296 +                    },
1297 +        },
1298 +       {
1299 +        .callback = force_acpi_ht,
1300 +        .ident = "HP VISUALIZE NT Workstation",
1301 +        .matches = {
1302 +                    DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
1303 +                    DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
1304 +                    },
1305 +        },
1306 +       {
1307 +        .callback = force_acpi_ht,
1308 +        .ident = "Compaq Workstation W8000",
1309 +        .matches = {
1310 +                    DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
1311 +                    DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
1312 +                    },
1313 +        },
1314 +       {
1315 +        .callback = force_acpi_ht,
1316 +        .ident = "ASUS P4B266",
1317 +        .matches = {
1318 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1319 +                    DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
1320 +                    },
1321 +        },
1322 +       {
1323 +        .callback = force_acpi_ht,
1324 +        .ident = "ASUS P2B-DS",
1325 +        .matches = {
1326 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1327 +                    DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1328 +                    },
1329 +        },
1330 +       {
1331 +        .callback = force_acpi_ht,
1332 +        .ident = "ASUS CUR-DLS",
1333 +        .matches = {
1334 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1335 +                    DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
1336 +                    },
1337 +        },
1338 +       {
1339 +        .callback = force_acpi_ht,
1340 +        .ident = "ABIT i440BX-W83977",
1341 +        .matches = {
1342 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
1343 +                    DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
1344 +                    },
1345 +        },
1346 +       {
1347 +        .callback = force_acpi_ht,
1348 +        .ident = "IBM Bladecenter",
1349 +        .matches = {
1350 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1351 +                    DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
1352 +                    },
1353 +        },
1354 +       {
1355 +        .callback = force_acpi_ht,
1356 +        .ident = "IBM eServer xSeries 360",
1357 +        .matches = {
1358 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1359 +                    DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
1360 +                    },
1361 +        },
1362 +       {
1363 +        .callback = force_acpi_ht,
1364 +        .ident = "IBM eserver xSeries 330",
1365 +        .matches = {
1366 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1367 +                    DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
1368 +                    },
1369 +        },
1370 +       {
1371 +        .callback = force_acpi_ht,
1372 +        .ident = "IBM eserver xSeries 440",
1373 +        .matches = {
1374 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1375 +                    DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
1376 +                    },
1377 +        },
1378 +
1379 +       /*
1380 +        * Boxes that need ACPI PCI IRQ routing disabled
1381 +        */
1382 +       {
1383 +        .callback = disable_acpi_irq,
1384 +        .ident = "ASUS A7V",
1385 +        .matches = {
1386 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
1387 +                    DMI_MATCH(DMI_BOARD_NAME, "<A7V>"),
1388 +                    /* newer BIOS, Revision 1011, does work */
1389 +                    DMI_MATCH(DMI_BIOS_VERSION,
1390 +                              "ASUS A7V ACPI BIOS Revision 1007"),
1391 +                    },
1392 +        },
1393 +
1394 +       /*
1395 +        * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
1396 +        */
1397 +       {                       /* _BBN 0 bug */
1398 +        .callback = disable_acpi_pci,
1399 +        .ident = "ASUS PR-DLS",
1400 +        .matches = {
1401 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1402 +                    DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"),
1403 +                    DMI_MATCH(DMI_BIOS_VERSION,
1404 +                              "ASUS PR-DLS ACPI BIOS Revision 1010"),
1405 +                    DMI_MATCH(DMI_BIOS_DATE, "03/21/2003")
1406 +                    },
1407 +        },
1408 +       {
1409 +        .callback = disable_acpi_pci,
1410 +        .ident = "Acer TravelMate 36x Laptop",
1411 +        .matches = {
1412 +                    DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
1413 +                    DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1414 +                    },
1415 +        },
1416 +       {}
1417 +};
1418 +
1419 +#endif                         /* __i386__ */
1420 +
1421 +/*
1422 + * acpi_boot_table_init() and acpi_boot_init()
1423 + *  called from setup_arch(), always.
1424 + *     1. checksums all tables
1425 + *     2. enumerates lapics
1426 + *     3. enumerates io-apics
1427 + *
1428 + * acpi_table_init() is separate to allow reading SRAT without
1429 + * other side effects.
1430 + *
1431 + * side effects of acpi_boot_init:
1432 + *     acpi_lapic = 1 if LAPIC found
1433 + *     acpi_ioapic = 1 if IOAPIC found
1434 + *     if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
1435 + *     if acpi_blacklisted() acpi_disabled = 1;
1436 + *     acpi_irq_model=...
1437 + *     ...
1438 + *
1439 + * return value: (currently ignored)
1440 + *     0: success
1441 + *     !0: failure
1442 + */
1443 +
1444 +int __init acpi_boot_table_init(void)
1445 +{
1446 +       int error;
1447 +
1448 +#ifdef __i386__
1449 +       dmi_check_system(acpi_dmi_table);
1450 +#endif
1451 +
1452 +       /*
1453 +        * If acpi_disabled, bail out
1454 +        * One exception: acpi=ht continues far enough to enumerate LAPICs
1455 +        */
1456 +       if (acpi_disabled && !acpi_ht)
1457 +               return 1;
1458 +
1459 +       /* 
1460 +        * Initialize the ACPI boot-time table parser.
1461 +        */
1462 +       error = acpi_table_init();
1463 +       if (error) {
1464 +               disable_acpi();
1465 +               return error;
1466 +       }
1467 +
1468 +       acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
1469 +
1470 +       /*
1471 +        * blacklist may disable ACPI entirely
1472 +        */
1473 +       error = acpi_blacklisted();
1474 +       if (error) {
1475 +               if (acpi_force) {
1476 +                       printk(KERN_WARNING PREFIX "acpi=force override\n");
1477 +               } else {
1478 +                       printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
1479 +                       disable_acpi();
1480 +                       return error;
1481 +               }
1482 +       }
1483 +
1484 +       return 0;
1485 +}
1486 +
1487 +int __init acpi_boot_init(void)
1488 +{
1489 +       /*
1490 +        * If acpi_disabled, bail out
1491 +        * One exception: acpi=ht continues far enough to enumerate LAPICs
1492 +        */
1493 +       if (acpi_disabled && !acpi_ht)
1494 +               return 1;
1495 +
1496 +       acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
1497 +
1498 +       /*
1499 +        * set sci_int and PM timer address
1500 +        */
1501 +       acpi_table_parse(ACPI_FADT, acpi_parse_fadt);
1502 +
1503 +       /*
1504 +        * Process the Multiple APIC Description Table (MADT), if present
1505 +        */
1506 +       acpi_process_madt();
1507 +
1508 +       acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
1509 +
1510 +       return 0;
1511 +}
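acpi_find_rsdp()/acpi_scan_rsdp() above locate the ACPI root pointer by probing every 16-byte boundary, first in a low-memory range (0-0x400, the EBDA search in the file's comment) and then in the BIOS window 0xE0000-0xFFFFF, for the literal signature "RSD PTR ". A standalone sketch of that scan over an ordinary buffer, illustrative only and not part of the patch:

#include <stdio.h>
#include <string.h>

/* Return the offset of "RSD PTR " within buf[0..len), probing only 16-byte
 * boundaries as acpi_scan_rsdp() does; (size_t)-1 means not found. */
static size_t scan_rsdp(const unsigned char *buf, size_t len)
{
	static const char sig[] = "RSD PTR ";
	size_t off;

	for (off = 0; off + sizeof(sig) - 1 <= len; off += 16)
		if (memcmp(buf + off, sig, sizeof(sig) - 1) == 0)
			return off;
	return (size_t)-1;
}

int main(void)
{
	unsigned char fake_bios[4096] = { 0 };

	memcpy(fake_bios + 0x60, "RSD PTR ", 8);	/* plant a signature */
	printf("found at offset %#zx\n", scan_rsdp(fake_bios, sizeof(fake_bios)));
	return 0;
}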
1512 diff -urNp linux-2.6/arch/i386/kernel/acpi/Makefile new/arch/i386/kernel/acpi/Makefile
1513 --- linux-2.6/arch/i386/kernel/acpi/Makefile    2006-07-03 14:14:14.000000000 +0200
1514 +++ new/arch/i386/kernel/acpi/Makefile  2006-05-09 12:32:33.000000000 +0200
1515 @@ -6,3 +6,7 @@ ifneq ($(CONFIG_ACPI_PROCESSOR),)
1516  obj-y                          += cstate.o processor.o
1517  endif
1518  
1519 +ifdef CONFIG_XEN
1520 +include $(srctree)/scripts/Makefile.xen
1521 +obj-y := $(call cherrypickxen, $(obj-y), $(src))
1522 +endif
1523 diff -urNp linux-2.6/arch/i386/kernel/apic-xen.c new/arch/i386/kernel/apic-xen.c
1524 --- linux-2.6/arch/i386/kernel/apic-xen.c       1970-01-01 01:00:00.000000000 +0100
1525 +++ new/arch/i386/kernel/apic-xen.c     2006-05-09 12:32:33.000000000 +0200
1526 @@ -0,0 +1,160 @@
1527 +/*
1528 + *     Local APIC handling, local APIC timers
1529 + *
1530 + *     (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
1531 + *
1532 + *     Fixes
1533 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
1534 + *                                     thanks to Eric Gilmore
1535 + *                                     and Rolf G. Tews
1536 + *                                     for testing these extensively.
1537 + *     Maciej W. Rozycki       :       Various updates and fixes.
1538 + *     Mikael Pettersson       :       Power Management for UP-APIC.
1539 + *     Pavel Machek and
1540 + *     Mikael Pettersson       :       PM converted to driver model.
1541 + */
1542 +
1543 +#include <linux/config.h>
1544 +#include <linux/init.h>
1545 +
1546 +#include <linux/mm.h>
1547 +#include <linux/delay.h>
1548 +#include <linux/bootmem.h>
1549 +#include <linux/smp_lock.h>
1550 +#include <linux/interrupt.h>
1551 +#include <linux/mc146818rtc.h>
1552 +#include <linux/kernel_stat.h>
1553 +#include <linux/sysdev.h>
1554 +#include <linux/cpu.h>
1555 +#include <linux/module.h>
1556 +
1557 +#include <asm/atomic.h>
1558 +#include <asm/smp.h>
1559 +#include <asm/mtrr.h>
1560 +#include <asm/mpspec.h>
1561 +#include <asm/desc.h>
1562 +#include <asm/arch_hooks.h>
1563 +#include <asm/hpet.h>
1564 +#include <asm/i8253.h>
1565 +
1566 +#include <mach_apic.h>
1567 +#include <mach_apicdef.h>
1568 +#include <mach_ipi.h>
1569 +
1570 +#include "io_ports.h"
1571 +
1572 +#ifndef CONFIG_XEN
1573 +/*
1574 + * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
1575 + * IPIs in place of local APIC timers
1576 + */
1577 +static cpumask_t timer_bcast_ipi;
1578 +#endif
1579 +
1580 +/*
1581 + * Knob to control our willingness to enable the local APIC.
1582 + */
1583 +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
1584 +
1585 +/*
1586 + * Debug level
1587 + */
1588 +int apic_verbosity;
1589 +
1590 +int modern_apic(void)
1591 +{
1592 +#ifndef CONFIG_XEN
1593 +       unsigned int lvr, version;
1594 +       /* AMD systems use old APIC versions, so check the CPU */
1595 +       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
1596 +               boot_cpu_data.x86 >= 0xf)
1597 +               return 1;
1598 +       lvr = apic_read(APIC_LVR);
1599 +       version = GET_APIC_VERSION(lvr);
1600 +       return version >= 0x14;
1601 +#else
1602 +       return 1;
1603 +#endif
1604 +}
1605 +
1606 +/*
1607 + * 'What should we do if we get a hw irq event on an illegal vector?'
1608 + * Each architecture has to answer this itself.
1609 + */
1610 +void ack_bad_irq(unsigned int irq)
1611 +{
1612 +       printk("unexpected IRQ trap at vector %02x\n", irq);
1613 +       /*
1614 +        * Currently unexpected vectors happen only on SMP and APIC.
1615 +        * We _must_ ack these because every local APIC has only N
1616 +        * irq slots per priority level, and a 'hanging, unacked' IRQ
1617 +        * holds up an irq slot - in excessive cases (when multiple
1618 +        * unexpected vectors occur) that might lock up the APIC
1619 +        * completely.
1620 +        * But only ack when the APIC is enabled -AK
1621 +        */
1622 +       if (cpu_has_apic)
1623 +               ack_APIC_irq();
1624 +}
1625 +
1626 +int get_physical_broadcast(void)
1627 +{
1628 +       if (modern_apic())
1629 +               return 0xff;
1630 +       else
1631 +               return 0xf;
1632 +}
1633 +
1634 +#ifndef CONFIG_XEN
1635 +#ifndef CONFIG_SMP
1636 +static void up_apic_timer_interrupt_call(struct pt_regs *regs)
1637 +{
1638 +       int cpu = smp_processor_id();
1639 +
1640 +       /*
1641 +        * the NMI deadlock-detector uses this.
1642 +        */
1643 +       per_cpu(irq_stat, cpu).apic_timer_irqs++;
1644 +
1645 +       smp_local_timer_interrupt(regs);
1646 +}
1647 +#endif
1648 +
1649 +void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
1650 +{
1651 +       cpumask_t mask;
1652 +
1653 +       cpus_and(mask, cpu_online_map, timer_bcast_ipi);
1654 +       if (!cpus_empty(mask)) {
1655 +#ifdef CONFIG_SMP
1656 +               send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
1657 +#else
1658 +               /*
1659 +                * We can call the apic timer interrupt handler directly
1660 +                * in the UP case, minus all the irq-related bookkeeping.
1661 +                */
1662 +               up_apic_timer_interrupt_call(regs);
1663 +#endif
1664 +       }
1665 +}
1666 +#endif
1667 +
1668 +int setup_profiling_timer(unsigned int multiplier)
1669 +{
1670 +       return -EINVAL;
1671 +}
1672 +
1673 +/*
1674 + * This initializes the IO-APIC and APIC hardware if this is
1675 + * a UP kernel.
1676 + */
1677 +int __init APIC_init_uniprocessor (void)
1678 +{
1679 +#ifdef CONFIG_X86_IO_APIC
1680 +       if (smp_found_config)
1681 +               if (!skip_ioapic_setup && nr_ioapics)
1682 +                       setup_IO_APIC();
1683 +#endif
1684 +
1685 +       return 0;
1686 +}
1687 diff -urNp linux-2.6/arch/i386/kernel/asm-offsets.c new/arch/i386/kernel/asm-offsets.c
1688 --- linux-2.6/arch/i386/kernel/asm-offsets.c    2006-07-03 14:14:14.000000000 +0200
1689 +++ new/arch/i386/kernel/asm-offsets.c  2006-05-09 12:32:34.000000000 +0200
1690 @@ -13,6 +13,7 @@
1691  #include <asm/fixmap.h>
1692  #include <asm/processor.h>
1693  #include <asm/thread_info.h>
1694 +#include <asm/elf.h>
1695  
1696  #define DEFINE(sym, val) \
1697          asm volatile("\n->" #sym " %0 " #val : : "i" (val))
1698 @@ -63,10 +64,15 @@ void foo(void)
1699         OFFSET(pbe_orig_address, pbe, orig_address);
1700         OFFSET(pbe_next, pbe, next);
1701  
1702 +#ifndef CONFIG_X86_NO_TSS
1703         /* Offset from the sysenter stack to tss.esp0 */
1704 -       DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) -
1705 +       DEFINE(SYSENTER_stack_esp0, offsetof(struct tss_struct, esp0) -
1706                  sizeof(struct tss_struct));
1707 +#else
1708 +       /* sysenter stack points directly to esp0 */
1709 +       DEFINE(SYSENTER_stack_esp0, 0);
1710 +#endif
1711  
1712         DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
1713 -       DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
1714 +       DEFINE(VSYSCALL_BASE, VSYSCALL_BASE);
1715  }
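
The asm-offsets.c hunk above renames TSS_sysenter_esp0 to SYSENTER_stack_esp0 and defines it as 0 when CONFIG_X86_NO_TSS is set: a Xen guest has no TSS of its own, so the sysenter stack points directly at esp0. A minimal sketch (not part of the patch) of how such a DEFINE() becomes visible to entry.S, assuming the standard kbuild asm-offsets mechanism; DEFINE() and foo() are the ones shown in the hunk context above:

/* Sketch only: DEFINE() plants a "->" marker line in the generated assembly,
 * which kbuild's sed rule rewrites as a plain #define in asm-offsets.h. */
#define DEFINE(sym, val) \
        asm volatile("\n->" #sym " %0 " #val : : "i" (val))

void foo(void)
{
        DEFINE(SYSENTER_stack_esp0, 0);
        /* emits a line like "->SYSENTER_stack_esp0 $0 0", which the build
         * turns into "#define SYSENTER_stack_esp0 0", so entry.S can use
         * "movl SYSENTER_stack_esp0(%esp),%esp" as an assemble-time constant. */
}
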
1716 diff -urNp linux-2.6/arch/i386/kernel/cpu/common-xen.c new/arch/i386/kernel/cpu/common-xen.c
1717 --- linux-2.6/arch/i386/kernel/cpu/common-xen.c 1970-01-01 01:00:00.000000000 +0100
1718 +++ new/arch/i386/kernel/cpu/common-xen.c       2006-05-23 18:37:09.000000000 +0200
1719 @@ -0,0 +1,732 @@
1720 +#include <linux/init.h>
1721 +#include <linux/string.h>
1722 +#include <linux/delay.h>
1723 +#include <linux/smp.h>
1724 +#include <linux/module.h>
1725 +#include <linux/percpu.h>
1726 +#include <linux/bootmem.h>
1727 +#include <asm/semaphore.h>
1728 +#include <asm/processor.h>
1729 +#include <asm/i387.h>
1730 +#include <asm/msr.h>
1731 +#include <asm/io.h>
1732 +#include <asm/mmu_context.h>
1733 +#ifdef CONFIG_X86_LOCAL_APIC
1734 +#include <asm/mpspec.h>
1735 +#include <asm/apic.h>
1736 +#include <mach_apic.h>
1737 +#endif
1738 +#include <asm/hypervisor.h>
1739 +
1740 +#include "cpu.h"
1741 +
1742 +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
1743 +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
1744 +
1745 +#ifndef CONFIG_XEN
1746 +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
1747 +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
1748 +#endif
1749 +
1750 +static int cachesize_override __cpuinitdata = -1;
1751 +static int disable_x86_fxsr __cpuinitdata;
1752 +static int disable_x86_serial_nr __cpuinitdata = 1;
1753 +static int disable_x86_sep __cpuinitdata;
1754 +
1755 +struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
1756 +
1757 +extern int disable_pse;
1758 +
1759 +static void default_init(struct cpuinfo_x86 * c)
1760 +{
1761 +       /* Not much we can do here... */
1762 +       /* Check if at least it has cpuid */
1763 +       if (c->cpuid_level == -1) {
1764 +               /* No cpuid. It must be an ancient CPU */
1765 +               if (c->x86 == 4)
1766 +                       strcpy(c->x86_model_id, "486");
1767 +               else if (c->x86 == 3)
1768 +                       strcpy(c->x86_model_id, "386");
1769 +       }
1770 +}
1771 +
1772 +static struct cpu_dev default_cpu = {
1773 +       .c_init = default_init,
1774 +       .c_vendor = "Unknown",
1775 +};
1776 +static struct cpu_dev * this_cpu = &default_cpu;
1777 +
1778 +static int __init cachesize_setup(char *str)
1779 +{
1780 +       get_option (&str, &cachesize_override);
1781 +       return 1;
1782 +}
1783 +__setup("cachesize=", cachesize_setup);
1784 +
1785 +int __cpuinit get_model_name(struct cpuinfo_x86 *c)
1786 +{
1787 +       unsigned int *v;
1788 +       char *p, *q;
1789 +
1790 +       if (cpuid_eax(0x80000000) < 0x80000004)
1791 +               return 0;
1792 +
1793 +       v = (unsigned int *) c->x86_model_id;
1794 +       cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
1795 +       cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
1796 +       cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
1797 +       c->x86_model_id[48] = 0;
1798 +
1799 +       /* Intel chips right-justify this string for some dumb reason;
1800 +          undo that brain damage */
1801 +       p = q = &c->x86_model_id[0];
1802 +       while ( *p == ' ' )
1803 +            p++;
1804 +       if ( p != q ) {
1805 +            while ( *p )
1806 +                 *q++ = *p++;
1807 +            while ( q <= &c->x86_model_id[48] )
1808 +                 *q++ = '\0';  /* Zero-pad the rest */
1809 +       }
1810 +
1811 +       return 1;
1812 +}
1813 +
1814 +
1815 +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
1816 +{
1817 +       unsigned int n, dummy, ecx, edx, l2size;
1818 +
1819 +       n = cpuid_eax(0x80000000);
1820 +
1821 +       if (n >= 0x80000005) {
1822 +               cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
1823 +               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
1824 +                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
1825 +               c->x86_cache_size=(ecx>>24)+(edx>>24);  
1826 +       }
1827 +
1828 +       if (n < 0x80000006)     /* Some chips just have a large L1. */
1829 +               return;
1830 +
1831 +       ecx = cpuid_ecx(0x80000006);
1832 +       l2size = ecx >> 16;
1833 +       
1834 +       /* do processor-specific cache resizing */
1835 +       if (this_cpu->c_size_cache)
1836 +               l2size = this_cpu->c_size_cache(c,l2size);
1837 +
1838 +       /* Allow user to override all this if necessary. */
1839 +       if (cachesize_override != -1)
1840 +               l2size = cachesize_override;
1841 +
1842 +       if ( l2size == 0 )
1843 +               return;         /* Again, no L2 cache is possible */
1844 +
1845 +       c->x86_cache_size = l2size;
1846 +
1847 +       printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
1848 +              l2size, ecx & 0xFF);
1849 +}
1850 +
1851 +/* Naming convention should be: <Name> [(<Codename>)] */
1852 +/* This table is only used if init_<vendor>() below doesn't set the model name; */
1853 +/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
1854 +
1855 +/* Look up CPU names by table lookup. */
1856 +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
1857 +{
1858 +       struct cpu_model_info *info;
1859 +
1860 +       if ( c->x86_model >= 16 )
1861 +               return NULL;    /* Range check */
1862 +
1863 +       if (!this_cpu)
1864 +               return NULL;
1865 +
1866 +       info = this_cpu->c_models;
1867 +
1868 +       while (info && info->family) {
1869 +               if (info->family == c->x86)
1870 +                       return info->model_names[c->x86_model];
1871 +               info++;
1872 +       }
1873 +       return NULL;            /* Not found */
1874 +}
1875 +
1876 +
1877 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
1878 +{
1879 +       char *v = c->x86_vendor_id;
1880 +       int i;
1881 +       static int printed;
1882 +
1883 +       for (i = 0; i < X86_VENDOR_NUM; i++) {
1884 +               if (cpu_devs[i]) {
1885 +                       if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
1886 +                           (cpu_devs[i]->c_ident[1] && 
1887 +                            !strcmp(v,cpu_devs[i]->c_ident[1]))) {
1888 +                               c->x86_vendor = i;
1889 +                               if (!early)
1890 +                                       this_cpu = cpu_devs[i];
1891 +                               return;
1892 +                       }
1893 +               }
1894 +       }
1895 +       if (!printed) {
1896 +               printed++;
1897 +               printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
1898 +               printk(KERN_ERR "CPU: Your system may be unstable.\n");
1899 +       }
1900 +       c->x86_vendor = X86_VENDOR_UNKNOWN;
1901 +       this_cpu = &default_cpu;
1902 +}
1903 +
1904 +
1905 +static int __init x86_fxsr_setup(char * s)
1906 +{
1907 +       disable_x86_fxsr = 1;
1908 +       return 1;
1909 +}
1910 +__setup("nofxsr", x86_fxsr_setup);
1911 +
1912 +
1913 +static int __init x86_sep_setup(char * s)
1914 +{
1915 +       disable_x86_sep = 1;
1916 +       return 1;
1917 +}
1918 +__setup("nosep", x86_sep_setup);
1919 +
1920 +
1921 +/* Standard macro to see if a specific flag is changeable */
1922 +static inline int flag_is_changeable_p(u32 flag)
1923 +{
1924 +       u32 f1, f2;
1925 +
1926 +       asm("pushfl\n\t"
1927 +           "pushfl\n\t"
1928 +           "popl %0\n\t"
1929 +           "movl %0,%1\n\t"
1930 +           "xorl %2,%0\n\t"
1931 +           "pushl %0\n\t"
1932 +           "popfl\n\t"
1933 +           "pushfl\n\t"
1934 +           "popl %0\n\t"
1935 +           "popfl\n\t"
1936 +           : "=&r" (f1), "=&r" (f2)
1937 +           : "ir" (flag));
1938 +
1939 +       return ((f1^f2) & flag) != 0;
1940 +}
1941 +
1942 +
1943 +/* Probe for the CPUID instruction */
1944 +static int __cpuinit have_cpuid_p(void)
1945 +{
1946 +       return flag_is_changeable_p(X86_EFLAGS_ID);
1947 +}
1948 +
1949 +/* Do minimum CPU detection early.
1950 +   Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
1951 +   The others are not touched to avoid unwanted side effects.
1952 +
1953 +   WARNING: this function is only called on the BP.  Don't add code here
1954 +   that is supposed to run on all CPUs. */
1955 +static void __init early_cpu_detect(void)
1956 +{
1957 +       struct cpuinfo_x86 *c = &boot_cpu_data;
1958 +
1959 +       c->x86_cache_alignment = 32;
1960 +
1961 +       if (!have_cpuid_p())
1962 +               return;
1963 +
1964 +       /* Get vendor name */
1965 +       cpuid(0x00000000, &c->cpuid_level,
1966 +             (int *)&c->x86_vendor_id[0],
1967 +             (int *)&c->x86_vendor_id[8],
1968 +             (int *)&c->x86_vendor_id[4]);
1969 +
1970 +       get_cpu_vendor(c, 1);
1971 +
1972 +       c->x86 = 4;
1973 +       if (c->cpuid_level >= 0x00000001) {
1974 +               u32 junk, tfms, cap0, misc;
1975 +               cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
1976 +               c->x86 = (tfms >> 8) & 15;
1977 +               c->x86_model = (tfms >> 4) & 15;
1978 +               if (c->x86 == 0xf)
1979 +                       c->x86 += (tfms >> 20) & 0xff;
1980 +               if (c->x86 >= 0x6)
1981 +                       c->x86_model += ((tfms >> 16) & 0xF) << 4;
1982 +               c->x86_mask = tfms & 15;
1983 +               if (cap0 & (1<<19))
1984 +                       c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
1985 +       }
1986 +}
1987 +
1988 +void __cpuinit generic_identify(struct cpuinfo_x86 * c)
1989 +{
1990 +       u32 tfms, xlvl;
1991 +       int ebx;
1992 +
1993 +       if (have_cpuid_p()) {
1994 +               /* Get vendor name */
1995 +               cpuid(0x00000000, &c->cpuid_level,
1996 +                     (int *)&c->x86_vendor_id[0],
1997 +                     (int *)&c->x86_vendor_id[8],
1998 +                     (int *)&c->x86_vendor_id[4]);
1999 +               
2000 +               get_cpu_vendor(c, 0);
2001 +               /* Initialize the standard set of capabilities */
2002 +               /* Note that the vendor-specific code below might override */
2003 +       
2004 +               /* Intel-defined flags: level 0x00000001 */
2005 +               if ( c->cpuid_level >= 0x00000001 ) {
2006 +                       u32 capability, excap;
2007 +                       cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
2008 +                       c->x86_capability[0] = capability;
2009 +                       c->x86_capability[4] = excap;
2010 +                       c->x86 = (tfms >> 8) & 15;
2011 +                       c->x86_model = (tfms >> 4) & 15;
2012 +                       if (c->x86 == 0xf)
2013 +                               c->x86 += (tfms >> 20) & 0xff;
2014 +                       if (c->x86 >= 0x6)
2015 +                               c->x86_model += ((tfms >> 16) & 0xF) << 4;
2016 +                       c->x86_mask = tfms & 15;
2017 +#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
2018 +                       c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
2019 +#else
2020 +                       c->apicid = (ebx >> 24) & 0xFF;
2021 +#endif
2022 +               } else {
2023 +                       /* Have CPUID level 0 only - unheard of */
2024 +                       c->x86 = 4;
2025 +               }
2026 +
2027 +               /* AMD-defined flags: level 0x80000001 */
2028 +               xlvl = cpuid_eax(0x80000000);
2029 +               if ( (xlvl & 0xffff0000) == 0x80000000 ) {
2030 +                       if ( xlvl >= 0x80000001 ) {
2031 +                               c->x86_capability[1] = cpuid_edx(0x80000001);
2032 +                               c->x86_capability[6] = cpuid_ecx(0x80000001);
2033 +                       }
2034 +                       if ( xlvl >= 0x80000004 )
2035 +                               get_model_name(c); /* Default name */
2036 +               }
2037 +       }
2038 +
2039 +       early_intel_workaround(c);
2040 +
2041 +#ifdef CONFIG_X86_HT
2042 +       phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
2043 +#endif
2044 +}
2045 +
2046 +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
2047 +{
2048 +       if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
2049 +               /* Disable processor serial number */
2050 +               unsigned long lo,hi;
2051 +               rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
2052 +               lo |= 0x200000;
2053 +               wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
2054 +               printk(KERN_NOTICE "CPU serial number disabled.\n");
2055 +               clear_bit(X86_FEATURE_PN, c->x86_capability);
2056 +
2057 +               /* Disabling the serial number may affect the cpuid level */
2058 +               c->cpuid_level = cpuid_eax(0);
2059 +       }
2060 +}
2061 +
2062 +static int __init x86_serial_nr_setup(char *s)
2063 +{
2064 +       disable_x86_serial_nr = 0;
2065 +       return 1;
2066 +}
2067 +__setup("serialnumber", x86_serial_nr_setup);
2068 +
2069 +
2070 +
2071 +/*
2072 + * This does the hard work of actually picking apart the CPU stuff...
2073 + */
2074 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
2075 +{
2076 +       int i;
2077 +
2078 +       c->loops_per_jiffy = loops_per_jiffy;
2079 +       c->x86_cache_size = -1;
2080 +       c->x86_vendor = X86_VENDOR_UNKNOWN;
2081 +       c->cpuid_level = -1;    /* CPUID not detected */
2082 +       c->x86_model = c->x86_mask = 0; /* So far unknown... */
2083 +       c->x86_vendor_id[0] = '\0'; /* Unset */
2084 +       c->x86_model_id[0] = '\0';  /* Unset */
2085 +       c->x86_max_cores = 1;
2086 +       memset(&c->x86_capability, 0, sizeof c->x86_capability);
2087 +
2088 +       if (!have_cpuid_p()) {
2089 +               /* First of all, decide if this is a 486 or higher */
2090 +               /* It's a 486 if we can modify the AC flag */
2091 +               if ( flag_is_changeable_p(X86_EFLAGS_AC) )
2092 +                       c->x86 = 4;
2093 +               else
2094 +                       c->x86 = 3;
2095 +       }
2096 +
2097 +       generic_identify(c);
2098 +
2099 +       printk(KERN_DEBUG "CPU: After generic identify, caps:");
2100 +       for (i = 0; i < NCAPINTS; i++)
2101 +               printk(" %08lx", c->x86_capability[i]);
2102 +       printk("\n");
2103 +
2104 +       if (this_cpu->c_identify) {
2105 +               this_cpu->c_identify(c);
2106 +
2107 +               printk(KERN_DEBUG "CPU: After vendor identify, caps:");
2108 +               for (i = 0; i < NCAPINTS; i++)
2109 +                       printk(" %08lx", c->x86_capability[i]);
2110 +               printk("\n");
2111 +       }
2112 +
2113 +       /*
2114 +        * Vendor-specific initialization.  In this section we
2115 +        * canonicalize the feature flags, meaning if there are
2116 +        * features a certain CPU supports which CPUID doesn't
2117 +        * tell us, CPUID claiming incorrect flags, or other bugs,
2118 +        * we handle them here.
2119 +        *
2120 +        * At the end of this section, c->x86_capability better
2121 +        * indicate the features this CPU genuinely supports!
2122 +        */
2123 +       if (this_cpu->c_init)
2124 +               this_cpu->c_init(c);
2125 +
2126 +       /* Disable the PN if appropriate */
2127 +       squash_the_stupid_serial_number(c);
2128 +
2129 +       /*
2130 +        * The vendor-specific functions might have changed features.  Now
2131 +        * we do "generic changes."
2132 +        */
2133 +
2134 +       /* TSC disabled? */
2135 +       if ( tsc_disable )
2136 +               clear_bit(X86_FEATURE_TSC, c->x86_capability);
2137 +
2138 +       /* FXSR disabled? */
2139 +       if (disable_x86_fxsr) {
2140 +               clear_bit(X86_FEATURE_FXSR, c->x86_capability);
2141 +               clear_bit(X86_FEATURE_XMM, c->x86_capability);
2142 +       }
2143 +
2144 +       /* SEP disabled? */
2145 +       if (disable_x86_sep)
2146 +               clear_bit(X86_FEATURE_SEP, c->x86_capability);
2147 +
2148 +       if (disable_pse)
2149 +               clear_bit(X86_FEATURE_PSE, c->x86_capability);
2150 +
2151 +       /* If the model name is still unset, do table lookup. */
2152 +       if ( !c->x86_model_id[0] ) {
2153 +               char *p;
2154 +               p = table_lookup_model(c);
2155 +               if ( p )
2156 +                       strcpy(c->x86_model_id, p);
2157 +               else
2158 +                       /* Last resort... */
2159 +                       sprintf(c->x86_model_id, "%02x/%02x",
2160 +                               c->x86, c->x86_model);
2161 +       }
2162 +
2163 +       /* Now the feature flags better reflect actual CPU features! */
2164 +
2165 +       printk(KERN_DEBUG "CPU: After all inits, caps:");
2166 +       for (i = 0; i < NCAPINTS; i++)
2167 +               printk(" %08lx", c->x86_capability[i]);
2168 +       printk("\n");
2169 +
2170 +       /*
2171 +        * On SMP, boot_cpu_data holds the common feature set between
2172 +        * all CPUs; so make sure that we indicate which features are
2173 +        * common between the CPUs.  The first time this routine gets
2174 +        * executed, c == &boot_cpu_data.
2175 +        */
2176 +       if ( c != &boot_cpu_data ) {
2177 +               /* AND the already accumulated flags with these */
2178 +               for ( i = 0 ; i < NCAPINTS ; i++ )
2179 +                       boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
2180 +       }
2181 +
2182 +       /* Init Machine Check Exception if available. */
2183 +       mcheck_init(c);
2184 +
2185 +       if (c == &boot_cpu_data)
2186 +               sysenter_setup();
2187 +       enable_sep_cpu();
2188 +
2189 +       if (c == &boot_cpu_data)
2190 +               mtrr_bp_init();
2191 +       else
2192 +               mtrr_ap_init();
2193 +}
2194 +
2195 +#ifdef CONFIG_X86_HT
2196 +void __cpuinit detect_ht(struct cpuinfo_x86 *c)
2197 +{
2198 +       u32     eax, ebx, ecx, edx;
2199 +       int     index_msb, core_bits;
2200 +       int     cpu = smp_processor_id();
2201 +
2202 +       cpuid(1, &eax, &ebx, &ecx, &edx);
2203 +
2204 +
2205 +       if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
2206 +               return;
2207 +
2208 +       smp_num_siblings = (ebx & 0xff0000) >> 16;
2209 +
2210 +       if (smp_num_siblings == 1) {
2211 +               printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
2212 +       } else if (smp_num_siblings > 1 ) {
2213 +
2214 +               if (smp_num_siblings > NR_CPUS) {
2215 +                       printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings);
2216 +                       smp_num_siblings = 1;
2217 +                       return;
2218 +               }
2219 +
2220 +               index_msb = get_count_order(smp_num_siblings);
2221 +               phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
2222 +
2223 +               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
2224 +                      phys_proc_id[cpu]);
2225 +
2226 +               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
2227 +
2228 +               index_msb = get_count_order(smp_num_siblings) ;
2229 +
2230 +               core_bits = get_count_order(c->x86_max_cores);
2231 +
2232 +               cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
2233 +                                              ((1 << core_bits) - 1);
2234 +
2235 +               if (c->x86_max_cores > 1)
2236 +                       printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
2237 +                              cpu_core_id[cpu]);
2238 +       }
2239 +}
2240 +#endif
2241 +
2242 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
2243 +{
2244 +       char *vendor = NULL;
2245 +
2246 +       if (c->x86_vendor < X86_VENDOR_NUM)
2247 +               vendor = this_cpu->c_vendor;
2248 +       else if (c->cpuid_level >= 0)
2249 +               vendor = c->x86_vendor_id;
2250 +
2251 +       if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
2252 +               printk("%s ", vendor);
2253 +
2254 +       if (!c->x86_model_id[0])
2255 +               printk("%d86", c->x86);
2256 +       else
2257 +               printk("%s", c->x86_model_id);
2258 +
2259 +       if (c->x86_mask || c->cpuid_level >= 0) 
2260 +               printk(" stepping %02x\n", c->x86_mask);
2261 +       else
2262 +               printk("\n");
2263 +}
2264 +
2265 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
2266 +
2267 +/* This is hacky. :)
2268 + * We're emulating future behavior.
2269 + * In the future, the cpu-specific init functions will be called implicitly
2270 + * via the magic of initcalls.
2271 + * They will insert themselves into the cpu_devs structure.
2272 + * Then, when cpu_init() is called, we can just iterate over that array.
2273 + */
2274 +
2275 +extern int intel_cpu_init(void);
2276 +extern int cyrix_init_cpu(void);
2277 +extern int nsc_init_cpu(void);
2278 +extern int amd_init_cpu(void);
2279 +extern int centaur_init_cpu(void);
2280 +extern int transmeta_init_cpu(void);
2281 +extern int rise_init_cpu(void);
2282 +extern int nexgen_init_cpu(void);
2283 +extern int umc_init_cpu(void);
2284 +
2285 +void __init early_cpu_init(void)
2286 +{
2287 +       intel_cpu_init();
2288 +       cyrix_init_cpu();
2289 +       nsc_init_cpu();
2290 +       amd_init_cpu();
2291 +       centaur_init_cpu();
2292 +       transmeta_init_cpu();
2293 +       rise_init_cpu();
2294 +       nexgen_init_cpu();
2295 +       umc_init_cpu();
2296 +       early_cpu_detect();
2297 +
2298 +#ifdef CONFIG_DEBUG_PAGEALLOC
2299 +       /* pse is not compatible with on-the-fly unmapping,
2300 +        * disable it even if the cpus claim to support it.
2301 +        */
2302 +       clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
2303 +       disable_pse = 1;
2304 +#endif
2305 +}
2306 +
2307 +void __cpuinit cpu_gdt_init(struct Xgt_desc_struct *gdt_descr)
2308 +{
2309 +       unsigned long frames[16];
2310 +       unsigned long va;
2311 +       int f;
2312 +
2313 +       for (va = gdt_descr->address, f = 0;
2314 +            va < gdt_descr->address + gdt_descr->size;
2315 +            va += PAGE_SIZE, f++) {
2316 +               frames[f] = virt_to_mfn(va);
2317 +               make_lowmem_page_readonly(
2318 +                       (void *)va, XENFEAT_writable_descriptor_tables);
2319 +       }
2320 +       if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))
2321 +               BUG();
2322 +}
2323 +
2324 +/*
2325 + * cpu_init() initializes state that is per-CPU. Some data is already
2326 + * initialized (naturally) in the bootstrap process, such as the GDT
2327 + * and IDT. We reload them nevertheless, this function acts as a
2328 + * 'CPU state barrier', nothing should get across.
2329 + */
2330 +void __cpuinit cpu_init(void)
2331 +{
2332 +       int cpu = smp_processor_id();
2333 +#ifndef CONFIG_X86_NO_TSS
2334 +       struct tss_struct * t = &per_cpu(init_tss, cpu);
2335 +#endif
2336 +       struct thread_struct *thread = &current->thread;
2337 +       struct desc_struct *gdt;
2338 +       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
2339 +
2340 +       if (cpu_test_and_set(cpu, cpu_initialized)) {
2341 +               printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
2342 +               for (;;) local_irq_enable();
2343 +       }
2344 +       printk(KERN_INFO "Initializing CPU#%d\n", cpu);
2345 +
2346 +       if (cpu_has_vme || cpu_has_de)
2347 +               clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
2348 +       if (tsc_disable && cpu_has_tsc) {
2349 +               printk(KERN_NOTICE "Disabling TSC...\n");
2350 +               /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
2351 +               clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
2352 +               set_in_cr4(X86_CR4_TSD);
2353 +       }
2354 +
2355 +#ifndef CONFIG_XEN
2356 +       /*
2357 +        * This is a horrible hack to allocate the GDT.  The problem
2358 +        * is that cpu_init() is called really early for the boot CPU
2359 +        * (and hence needs bootmem) but much later for the secondary
2360 +        * CPUs, when bootmem will have gone away
2361 +        */
2362 +       if (NODE_DATA(0)->bdata->node_bootmem_map) {
2363 +               gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
2364 +               /* alloc_bootmem_pages panics on failure, so no check */
2365 +               memset(gdt, 0, PAGE_SIZE);
2366 +       } else {
2367 +               gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
2368 +               if (unlikely(!gdt)) {
2369 +                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
2370 +                       for (;;)
2371 +                               local_irq_enable();
2372 +               }
2373 +       }
2374 +
2375 +       /*
2376 +        * Initialize the per-CPU GDT with the boot GDT,
2377 +        * and set up the GDT descriptor:
2378 +        */
2379 +       memcpy(gdt, cpu_gdt_table, GDT_SIZE);
2380 +
2381 +       /* Set up GDT entry for 16bit stack */
2382 +       *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
2383 +               ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
2384 +               ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
2385 +               (CPU_16BIT_STACK_SIZE - 1);
2386 +
2387 +       cpu_gdt_descr->size = GDT_SIZE - 1;
2388 +       cpu_gdt_descr->address = (unsigned long)gdt;
2389 +#else
2390 +       if (cpu == 0 && cpu_gdt_descr->address == 0) {
2391 +               gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
2392 +               /* alloc_bootmem_pages panics on failure, so no check */
2393 +               memset(gdt, 0, PAGE_SIZE);
2394 +
2395 +               memcpy(gdt, cpu_gdt_table, GDT_SIZE);
2396 +               
2397 +               cpu_gdt_descr->size = GDT_SIZE;
2398 +               cpu_gdt_descr->address = (unsigned long)gdt;
2399 +       }
2400 +#endif
2401 +
2402 +       cpu_gdt_init(cpu_gdt_descr);
2403 +
2404 +       /*
2405 +        * Set up and load the per-CPU TSS and LDT
2406 +        */
2407 +       atomic_inc(&init_mm.mm_count);
2408 +       current->active_mm = &init_mm;
2409 +       if (current->mm)
2410 +               BUG();
2411 +       enter_lazy_tlb(&init_mm, current);
2412 +
2413 +       load_esp0(t, thread);
2414 +
2415 +       load_LDT(&init_mm.context);
2416 +
2417 +#ifdef CONFIG_DOUBLEFAULT
2418 +       /* Set up doublefault TSS pointer in the GDT */
2419 +       __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
2420 +#endif
2421 +
2422 +       /* Clear %fs and %gs. */
2423 +       asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
2424 +
2425 +       /* Clear all 6 debug registers: */
2426 +       set_debugreg(0, 0);
2427 +       set_debugreg(0, 1);
2428 +       set_debugreg(0, 2);
2429 +       set_debugreg(0, 3);
2430 +       set_debugreg(0, 6);
2431 +       set_debugreg(0, 7);
2432 +
2433 +       /*
2434 +        * Force FPU initialization:
2435 +        */
2436 +       current_thread_info()->status = 0;
2437 +       clear_used_math();
2438 +       mxcsr_feature_mask_init();
2439 +}
2440 +
2441 +#ifdef CONFIG_HOTPLUG_CPU
2442 +void __cpuinit cpu_uninit(void)
2443 +{
2444 +       int cpu = raw_smp_processor_id();
2445 +       cpu_clear(cpu, cpu_initialized);
2446 +
2447 +       /* lazy TLB state */
2448 +       per_cpu(cpu_tlbstate, cpu).state = 0;
2449 +       per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
2450 +}
2451 +#endif
2452 diff -urNp linux-2.6/arch/i386/kernel/cpu/Makefile new/arch/i386/kernel/cpu/Makefile
2453 --- linux-2.6/arch/i386/kernel/cpu/Makefile     2006-07-03 14:14:14.000000000 +0200
2454 +++ new/arch/i386/kernel/cpu/Makefile   2006-05-09 12:32:34.000000000 +0200
2455 @@ -17,3 +17,8 @@ obj-$(CONFIG_X86_MCE) +=      mcheck/
2456  
2457  obj-$(CONFIG_MTRR)     +=      mtrr/
2458  obj-$(CONFIG_CPU_FREQ) +=      cpufreq/
2459 +
2460 +ifdef CONFIG_XEN
2461 +include $(srctree)/scripts/Makefile.xen
2462 +obj-y := $(call cherrypickxen, $(obj-y), $(src))
2463 +endif
2464 diff -urNp linux-2.6/arch/i386/kernel/cpu/mtrr/main-xen.c new/arch/i386/kernel/cpu/mtrr/main-xen.c
2465 --- linux-2.6/arch/i386/kernel/cpu/mtrr/main-xen.c      1970-01-01 01:00:00.000000000 +0100
2466 +++ new/arch/i386/kernel/cpu/mtrr/main-xen.c    2006-05-09 12:32:34.000000000 +0200
2467 @@ -0,0 +1,197 @@
2468 +#include <linux/init.h>
2469 +#include <linux/proc_fs.h>
2470 +#include <linux/ctype.h>
2471 +#include <linux/module.h>
2472 +#include <linux/seq_file.h>
2473 +#include <linux/mutex.h>
2474 +#include <asm/uaccess.h>
2475 +
2476 +#include <asm/mtrr.h>
2477 +#include "mtrr.h"
2478 +
2479 +static DEFINE_MUTEX(mtrr_mutex);
2480 +
2481 +void generic_get_mtrr(unsigned int reg, unsigned long *base,
2482 +                     unsigned int *size, mtrr_type * type)
2483 +{
2484 +       dom0_op_t op;
2485 +
2486 +       op.cmd = DOM0_READ_MEMTYPE;
2487 +       op.u.read_memtype.reg = reg;
2488 +       (void)HYPERVISOR_dom0_op(&op);
2489 +
2490 +       *size = op.u.read_memtype.nr_mfns;
2491 +       *base = op.u.read_memtype.mfn;
2492 +       *type = op.u.read_memtype.type;
2493 +}
2494 +
2495 +struct mtrr_ops generic_mtrr_ops = {
2496 +       .use_intel_if      = 1,
2497 +       .get               = generic_get_mtrr,
2498 +};
2499 +
2500 +struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
2501 +unsigned int num_var_ranges;
2502 +unsigned int *usage_table;
2503 +
2504 +static void __init set_num_var_ranges(void)
2505 +{
2506 +       dom0_op_t op;
2507 +
2508 +       for (num_var_ranges = 0; ; num_var_ranges++) {
2509 +               op.cmd = DOM0_READ_MEMTYPE;
2510 +               op.u.read_memtype.reg = num_var_ranges;
2511 +               if (HYPERVISOR_dom0_op(&op) != 0)
2512 +                       break;
2513 +       }
2514 +}
2515 +
2516 +static void __init init_table(void)
2517 +{
2518 +       int i, max;
2519 +
2520 +       max = num_var_ranges;
2521 +       if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
2522 +           == NULL) {
2523 +               printk(KERN_ERR "mtrr: could not allocate\n");
2524 +               return;
2525 +       }
2526 +       for (i = 0; i < max; i++)
2527 +               usage_table[i] = 0;
2528 +}
2529 +
2530 +int mtrr_add_page(unsigned long base, unsigned long size, 
2531 +                 unsigned int type, char increment)
2532 +{
2533 +       int error;
2534 +       dom0_op_t op;
2535 +
2536 +       mutex_lock(&mtrr_mutex);
2537 +
2538 +       op.cmd = DOM0_ADD_MEMTYPE;
2539 +       op.u.add_memtype.mfn     = base;
2540 +       op.u.add_memtype.nr_mfns = size;
2541 +       op.u.add_memtype.type    = type;
2542 +       error = HYPERVISOR_dom0_op(&op);
2543 +       if (error) {
2544 +               mutex_unlock(&mtrr_mutex);
2545 +               BUG_ON(error > 0);
2546 +               return error;
2547 +       }
2548 +
2549 +       if (increment)
2550 +               ++usage_table[op.u.add_memtype.reg];
2551 +
2552 +       mutex_unlock(&mtrr_mutex);
2553 +
2554 +       return op.u.add_memtype.reg;
2555 +}
2556 +
2557 +static int mtrr_check(unsigned long base, unsigned long size)
2558 +{
2559 +       if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
2560 +               printk(KERN_WARNING
2561 +                       "mtrr: size and base must be multiples of 4 kiB\n");
2562 +               printk(KERN_DEBUG
2563 +                       "mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
2564 +               dump_stack();
2565 +               return -1;
2566 +       }
2567 +       return 0;
2568 +}
2569 +
2570 +int
2571 +mtrr_add(unsigned long base, unsigned long size, unsigned int type,
2572 +        char increment)
2573 +{
2574 +       if (mtrr_check(base, size))
2575 +               return -EINVAL;
2576 +       return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
2577 +                            increment);
2578 +}
2579 +
2580 +int mtrr_del_page(int reg, unsigned long base, unsigned long size)
2581 +{
2582 +       unsigned i;
2583 +       mtrr_type ltype;
2584 +       unsigned long lbase;
2585 +       unsigned int lsize;
2586 +       int error = -EINVAL;
2587 +       dom0_op_t op;
2588 +
2589 +       mutex_lock(&mtrr_mutex);
2590 +
2591 +       if (reg < 0) {
2592 +               /*  Search for existing MTRR  */
2593 +               for (i = 0; i < num_var_ranges; ++i) {
2594 +                       mtrr_if->get(i, &lbase, &lsize, &ltype);
2595 +                       if (lbase == base && lsize == size) {
2596 +                               reg = i;
2597 +                               break;
2598 +                       }
2599 +               }
2600 +               if (reg < 0) {
2601 +                       printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
2602 +                              size);
2603 +                       goto out;
2604 +               }
2605 +       }
2606 +       if (usage_table[reg] < 1) {
2607 +               printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
2608 +               goto out;
2609 +       }
2610 +       if (--usage_table[reg] < 1) {
2611 +               op.cmd = DOM0_DEL_MEMTYPE;
2612 +               op.u.del_memtype.handle = 0;
2613 +               op.u.del_memtype.reg    = reg;
2614 +               error = HYPERVISOR_dom0_op(&op);
2615 +               if (error) {
2616 +                       BUG_ON(error > 0);
2617 +                       goto out;
2618 +               }
2619 +       }
2620 +       error = reg;
2621 + out:
2622 +       mutex_unlock(&mtrr_mutex);
2623 +       return error;
2624 +}
2625 +
2626 +int
2627 +mtrr_del(int reg, unsigned long base, unsigned long size)
2628 +{
2629 +       if (mtrr_check(base, size))
2630 +               return -EINVAL;
2631 +       return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
2632 +}
2633 +
2634 +EXPORT_SYMBOL(mtrr_add);
2635 +EXPORT_SYMBOL(mtrr_del);
2636 +
2637 +void __init mtrr_bp_init(void)
2638 +{
2639 +}
2640 +
2641 +void mtrr_ap_init(void)
2642 +{
2643 +}
2644 +
2645 +static int __init mtrr_init(void)
2646 +{
2647 +       struct cpuinfo_x86 *c = &boot_cpu_data;
2648 +
2649 +       if (!(xen_start_info->flags & SIF_PRIVILEGED))
2650 +               return -ENODEV;
2651 +
2652 +       if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
2653 +           (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
2654 +           (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
2655 +           (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
2656 +               return -ENODEV;
2657 +
2658 +       set_num_var_ranges();
2659 +       init_table();
2660 +
2661 +       return 0;
2662 +}
2663 +
2664 +subsys_initcall(mtrr_init);
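
The main-xen.c file above reimplements the i386 MTRR code on top of DOM0_READ_MEMTYPE / DOM0_ADD_MEMTYPE / DOM0_DEL_MEMTYPE hypercalls instead of direct MSR programming, and mtrr_init() refuses to initialise (-ENODEV) in an unprivileged guest. A hedged usage sketch of the unchanged mtrr_add()/mtrr_del() interface as a dom0 driver sees it; the function and addresses below are made up for illustration, only mtrr_add(), mtrr_del() and MTRR_TYPE_WRCOMB come from the kernel:

#include <asm/mtrr.h>

/* Illustrative only: mark a hypothetical framebuffer aperture
 * write-combining.  With this patch the request is routed through
 * mtrr_add_page() above, i.e. a DOM0_ADD_MEMTYPE hypercall. */
static int example_map_framebuffer(unsigned long fb_phys, unsigned long fb_len)
{
        int reg = mtrr_add(fb_phys, fb_len, MTRR_TYPE_WRCOMB, 1);

        if (reg < 0)
                return reg;     /* e.g. -EINVAL for a non-page-aligned range */

        /* ... use the write-combined mapping ... */

        mtrr_del(reg, fb_phys, fb_len);
        return 0;
}
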
2665 diff -urNp linux-2.6/arch/i386/kernel/cpu/mtrr/Makefile new/arch/i386/kernel/cpu/mtrr/Makefile
2666 --- linux-2.6/arch/i386/kernel/cpu/mtrr/Makefile        2006-07-03 14:14:14.000000000 +0200
2667 +++ new/arch/i386/kernel/cpu/mtrr/Makefile      2006-05-09 12:32:34.000000000 +0200
2668 @@ -3,3 +3,10 @@ obj-y          += amd.o
2669  obj-y          += cyrix.o
2670  obj-y          += centaur.o
2671  
2672 +ifdef CONFIG_XEN
2673 +include $(srctree)/scripts/Makefile.xen
2674 +n-obj-xen := generic.o state.o amd.o cyrix.o centaur.o
2675 +
2676 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
2677 +obj-y := $(call cherrypickxen, $(obj-y))
2678 +endif
2679 diff -urNp linux-2.6/arch/i386/kernel/early_printk-xen.c new/arch/i386/kernel/early_printk-xen.c
2680 --- linux-2.6/arch/i386/kernel/early_printk-xen.c       1970-01-01 01:00:00.000000000 +0100
2681 +++ new/arch/i386/kernel/early_printk-xen.c     2006-05-09 12:32:34.000000000 +0200
2682 @@ -0,0 +1,2 @@
2683 +
2684 +#include "../../x86_64/kernel/early_printk-xen.c"
2685 diff -urNp linux-2.6/arch/i386/kernel/entry.S new/arch/i386/kernel/entry.S
2686 --- linux-2.6/arch/i386/kernel/entry.S  2006-07-03 14:14:14.000000000 +0200
2687 +++ new/arch/i386/kernel/entry.S        2006-05-09 12:32:34.000000000 +0200
2688 @@ -177,7 +177,7 @@ need_resched:
2689  
2690         # sysenter call handler stub
2691  ENTRY(sysenter_entry)
2692 -       movl TSS_sysenter_esp0(%esp),%esp
2693 +       movl SYSENTER_stack_esp0(%esp),%esp
2694  sysenter_past_esp:
2695         sti
2696         pushl $(__USER_DS)
2697 @@ -410,7 +410,7 @@ vector=0
2698  ENTRY(irq_entries_start)
2699  .rept NR_IRQS
2700         ALIGN
2701 -1:     pushl $vector-256
2702 +1:     pushl $~(vector)
2703         jmp common_interrupt
2704  .data
2705         .long 1b
2706 @@ -427,7 +427,7 @@ common_interrupt:
2707  
2708  #define BUILD_INTERRUPT(name, nr)      \
2709  ENTRY(name)                            \
2710 -       pushl $nr-256;                  \
2711 +       pushl $~(nr);                   \
2712         SAVE_ALL                        \
2713         movl %esp,%eax;                 \
2714         call smp_/**/name;              \
2715 @@ -496,7 +496,7 @@ device_not_available_emulate:
2716   * that sets up the real kernel stack. Check here, since we can't
2717   * allow the wrong stack to be used.
2718   *
2719 - * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
2720 + * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
2721   * already pushed 3 words if it hits on the sysenter instruction:
2722   * eflags, cs and eip.
2723   *
2724 @@ -508,7 +508,7 @@ device_not_available_emulate:
2725         cmpw $__KERNEL_CS,4(%esp);              \
2726         jne ok;                                 \
2727  label:                                         \
2728 -       movl TSS_sysenter_esp0+offset(%esp),%esp;       \
2729 +       movl SYSENTER_stack_esp0+offset(%esp),%esp;     \
2730         pushfl;                                 \
2731         pushl $__KERNEL_CS;                     \
2732         pushl $sysenter_past_esp
2733 diff -urNp linux-2.6/arch/i386/kernel/entry-xen.S new/arch/i386/kernel/entry-xen.S
2734 --- linux-2.6/arch/i386/kernel/entry-xen.S      1970-01-01 01:00:00.000000000 +0100
2735 +++ new/arch/i386/kernel/entry-xen.S    2006-05-09 12:32:34.000000000 +0200
2736 @@ -0,0 +1,903 @@
2737 +/*
2738 + *  linux/arch/i386/entry.S
2739 + *
2740 + *  Copyright (C) 1991, 1992  Linus Torvalds
2741 + */
2742 +
2743 +/*
2744 + * entry.S contains the system-call and fault low-level handling routines.
2745 + * This also contains the timer-interrupt handler, as well as all interrupts
2746 + * and faults that can result in a task-switch.
2747 + *
2748 + * NOTE: This code handles signal-recognition, which happens every time
2749 + * after a timer-interrupt and after each system call.
2750 + *
2751 + * I changed all the .align's to 4 (16 byte alignment), as that's faster
2752 + * on a 486.
2753 + *
2754 + * Stack layout in 'ret_from_system_call':
2755 + *     ptrace needs to have all regs on the stack.
2756 + *     if the order here is changed, it needs to be
2757 + *     updated in fork.c:copy_process, signal.c:do_signal,
2758 + *     ptrace.c and ptrace.h
2759 + *
2760 + *      0(%esp) - %ebx
2761 + *      4(%esp) - %ecx
2762 + *      8(%esp) - %edx
2763 + *       C(%esp) - %esi
2764 + *     10(%esp) - %edi
2765 + *     14(%esp) - %ebp
2766 + *     18(%esp) - %eax
2767 + *     1C(%esp) - %ds
2768 + *     20(%esp) - %es
2769 + *     24(%esp) - orig_eax
2770 + *     28(%esp) - %eip
2771 + *     2C(%esp) - %cs
2772 + *     30(%esp) - %eflags
2773 + *     34(%esp) - %oldesp
2774 + *     38(%esp) - %oldss
2775 + *
2776 + * "current" is in register %ebx during any slow entries.
2777 + */
2778 +
2779 +#include <linux/config.h>
2780 +#include <linux/linkage.h>
2781 +#include <asm/thread_info.h>
2782 +#include <asm/errno.h>
2783 +#include <asm/segment.h>
2784 +#include <asm/smp.h>
2785 +#include <asm/page.h>
2786 +#include <asm/desc.h>
2787 +#include "irq_vectors.h"
2788 +#include <xen/interface/xen.h>
2789 +
2790 +#define nr_syscalls ((syscall_table_size)/4)
2791 +
2792 +EBX            = 0x00
2793 +ECX            = 0x04
2794 +EDX            = 0x08
2795 +ESI            = 0x0C
2796 +EDI            = 0x10
2797 +EBP            = 0x14
2798 +EAX            = 0x18
2799 +DS             = 0x1C
2800 +ES             = 0x20
2801 +ORIG_EAX       = 0x24
2802 +EIP            = 0x28
2803 +CS             = 0x2C
2804 +EFLAGS         = 0x30
2805 +OLDESP         = 0x34
2806 +OLDSS          = 0x38
2807 +
2808 +CF_MASK                = 0x00000001
2809 +TF_MASK                = 0x00000100
2810 +IF_MASK                = 0x00000200
2811 +DF_MASK                = 0x00000400 
2812 +NT_MASK                = 0x00004000
2813 +VM_MASK                = 0x00020000
2814 +/* Pseudo-eflags. */
2815 +NMI_MASK       = 0x80000000
2816 +
2817 +#ifndef CONFIG_XEN
2818 +#define DISABLE_INTERRUPTS     cli
2819 +#define ENABLE_INTERRUPTS      sti
2820 +#else
2821 +/* Offsets into shared_info_t. */
2822 +#define evtchn_upcall_pending          /* 0 */
2823 +#define evtchn_upcall_mask             1
2824 +
2825 +#define sizeof_vcpu_shift              6
2826 +
2827 +#ifdef CONFIG_SMP
2828 +#define GET_VCPU_INFO          movl TI_cpu(%ebp),%esi                  ; \
2829 +                               shl  $sizeof_vcpu_shift,%esi            ; \
2830 +                               addl HYPERVISOR_shared_info,%esi
2831 +#else
2832 +#define GET_VCPU_INFO          movl HYPERVISOR_shared_info,%esi
2833 +#endif
2834 +
2835 +#define __DISABLE_INTERRUPTS   movb $1,evtchn_upcall_mask(%esi)
2836 +#define __ENABLE_INTERRUPTS    movb $0,evtchn_upcall_mask(%esi)
2837 +#define DISABLE_INTERRUPTS     GET_VCPU_INFO                           ; \
2838 +                               __DISABLE_INTERRUPTS
2839 +#define ENABLE_INTERRUPTS      GET_VCPU_INFO                           ; \
2840 +                               __ENABLE_INTERRUPTS
2841 +#define __TEST_PENDING         testb $0xFF,evtchn_upcall_pending(%esi)
2842 +#endif
2843 +
2844 +#ifdef CONFIG_PREEMPT
2845 +#define preempt_stop           cli
2846 +#else
2847 +#define preempt_stop
2848 +#define resume_kernel          restore_nocheck
2849 +#endif
2850 +
2851 +#define SAVE_ALL \
2852 +       cld; \
2853 +       pushl %es; \
2854 +       pushl %ds; \
2855 +       pushl %eax; \
2856 +       pushl %ebp; \
2857 +       pushl %edi; \
2858 +       pushl %esi; \
2859 +       pushl %edx; \
2860 +       pushl %ecx; \
2861 +       pushl %ebx; \
2862 +       movl $(__USER_DS), %edx; \
2863 +       movl %edx, %ds; \
2864 +       movl %edx, %es;
2865 +
2866 +#define RESTORE_INT_REGS \
2867 +       popl %ebx;      \
2868 +       popl %ecx;      \
2869 +       popl %edx;      \
2870 +       popl %esi;      \
2871 +       popl %edi;      \
2872 +       popl %ebp;      \
2873 +       popl %eax
2874 +
2875 +#define RESTORE_REGS   \
2876 +       RESTORE_INT_REGS; \
2877 +1:     popl %ds;       \
2878 +2:     popl %es;       \
2879 +.section .fixup,"ax";  \
2880 +3:     movl $0,(%esp); \
2881 +       jmp 1b;         \
2882 +4:     movl $0,(%esp); \
2883 +       jmp 2b;         \
2884 +.previous;             \
2885 +.section __ex_table,"a";\
2886 +       .align 4;       \
2887 +       .long 1b,3b;    \
2888 +       .long 2b,4b;    \
2889 +.previous
2890 +
2891 +
2892 +ENTRY(ret_from_fork)
2893 +       pushl %eax
2894 +       call schedule_tail
2895 +       GET_THREAD_INFO(%ebp)
2896 +       popl %eax
2897 +       jmp syscall_exit
2898 +
2899 +/*
2900 + * Return to user mode is not as complex as all this looks,
2901 + * but we want the default path for a system call return to
2902 + * go as quickly as possible which is why some of this is
2903 + * less clear than it otherwise should be.
2904 + */
2905 +
2906 +       # userspace resumption stub bypassing syscall exit tracing
2907 +       ALIGN
2908 +ret_from_exception:
2909 +       preempt_stop
2910 +ret_from_intr:
2911 +       GET_THREAD_INFO(%ebp)
2912 +       movl EFLAGS(%esp), %eax         # mix EFLAGS and CS
2913 +       movb CS(%esp), %al
2914 +       testl $(VM_MASK | 2), %eax
2915 +       jz resume_kernel
2916 +ENTRY(resume_userspace)
2917 +       DISABLE_INTERRUPTS              # make sure we don't miss an interrupt
2918 +                                       # setting need_resched or sigpending
2919 +                                       # between sampling and the iret
2920 +       movl TI_flags(%ebp), %ecx
2921 +       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
2922 +                                       # int/exception return?
2923 +       jne work_pending
2924 +       jmp restore_all
2925 +
2926 +#ifdef CONFIG_PREEMPT
2927 +ENTRY(resume_kernel)
2928 +       cli
2929 +       cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
2930 +       jnz restore_nocheck
2931 +need_resched:
2932 +       movl TI_flags(%ebp), %ecx       # need_resched set ?
2933 +       testb $_TIF_NEED_RESCHED, %cl
2934 +       jz restore_all
2935 +       testl $IF_MASK,EFLAGS(%esp)     # interrupts off (exception path) ?
2936 +       jz restore_all
2937 +       call preempt_schedule_irq
2938 +       jmp need_resched
2939 +#endif
2940 +
2941 +/* SYSENTER_RETURN points to after the "sysenter" instruction in
2942 +   the vsyscall page.  See vsyscall-sysenter.S, which defines the symbol.  */
2943 +
2944 +       # sysenter call handler stub
2945 +ENTRY(sysenter_entry)
2946 +       movl SYSENTER_stack_esp0(%esp),%esp
2947 +sysenter_past_esp:
2948 +       sti
2949 +       pushl $(__USER_DS)
2950 +       pushl %ebp
2951 +       pushfl
2952 +       pushl $(__USER_CS)
2953 +       pushl $SYSENTER_RETURN
2954 +
2955 +/*
2956 + * Load the potential sixth argument from user stack.
2957 + * Careful about security.
2958 + */
2959 +       cmpl $__PAGE_OFFSET-3,%ebp
2960 +       jae syscall_fault
2961 +1:     movl (%ebp),%ebp
2962 +.section __ex_table,"a"
2963 +       .align 4
2964 +       .long 1b,syscall_fault
2965 +.previous
2966 +
2967 +       pushl %eax
2968 +       SAVE_ALL
2969 +       GET_THREAD_INFO(%ebp)
2970 +
2971 +       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
2972 +       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
2973 +       jnz syscall_trace_entry
2974 +       cmpl $(nr_syscalls), %eax
2975 +       jae syscall_badsys
2976 +       call *sys_call_table(,%eax,4)
2977 +       movl %eax,EAX(%esp)
2978 +       DISABLE_INTERRUPTS
2979 +       movl TI_flags(%ebp), %ecx
2980 +       testw $_TIF_ALLWORK_MASK, %cx
2981 +       jne syscall_exit_work
2982 +/* if something modifies registers it must also disable sysexit */
2983 +       movl EIP(%esp), %edx
2984 +       movl OLDESP(%esp), %ecx
2985 +       xorl %ebp,%ebp
2986 +#ifdef CONFIG_XEN
2987 +       __ENABLE_INTERRUPTS
2988 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/
2989 +       __TEST_PENDING
2990 +       jnz  14f                        # process more events if necessary...
2991 +       movl ESI(%esp), %esi
2992 +       sysexit
2993 +14:    __DISABLE_INTERRUPTS
2994 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/
2995 +       push %esp
2996 +       call evtchn_do_upcall
2997 +       add  $4,%esp
2998 +       jmp  ret_from_intr
2999 +#else
3000 +       sti
3001 +       sysexit
3002 +#endif /* !CONFIG_XEN */
3003 +
3004 +
3005 +       # system call handler stub
3006 +ENTRY(system_call)
3007 +       pushl %eax                      # save orig_eax
3008 +       SAVE_ALL
3009 +       GET_THREAD_INFO(%ebp)
3010 +       testl $TF_MASK,EFLAGS(%esp)
3011 +       jz no_singlestep
3012 +       orl $_TIF_SINGLESTEP,TI_flags(%ebp)
3013 +no_singlestep:
3014 +                                       # system call tracing in operation / emulation
3015 +       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
3016 +       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
3017 +       jnz syscall_trace_entry
3018 +       cmpl $(nr_syscalls), %eax
3019 +       jae syscall_badsys
3020 +syscall_call:
3021 +       call *sys_call_table(,%eax,4)
3022 +       movl %eax,EAX(%esp)             # store the return value
3023 +syscall_exit:
3024 +       DISABLE_INTERRUPTS              # make sure we don't miss an interrupt
3025 +                                       # setting need_resched or sigpending
3026 +                                       # between sampling and the iret
3027 +       movl TI_flags(%ebp), %ecx
3028 +       testw $_TIF_ALLWORK_MASK, %cx   # current->work
3029 +       jne syscall_exit_work
3030 +
3031 +restore_all:
3032 +#ifndef CONFIG_XEN
3033 +       movl EFLAGS(%esp), %eax         # mix EFLAGS, SS and CS
3034 +       # Warning: OLDSS(%esp) contains the wrong/random values if we
3035 +       # are returning to the kernel.
3036 +       # See comments in process.c:copy_thread() for details.
3037 +       movb OLDSS(%esp), %ah
3038 +       movb CS(%esp), %al
3039 +       andl $(VM_MASK | (4 << 8) | 3), %eax
3040 +       cmpl $((4 << 8) | 3), %eax
3041 +       je ldt_ss                       # returning to user-space with LDT SS
3042 +restore_nocheck:
3043 +#else
3044 +restore_nocheck:
3045 +       movl EFLAGS(%esp), %eax
3046 +       testl $(VM_MASK|NMI_MASK), %eax
3047 +       jnz hypervisor_iret
3048 +       shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
3049 +       GET_VCPU_INFO
3050 +       andb evtchn_upcall_mask(%esi),%al
3051 +       andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
3052 +       jnz restore_all_enable_events   #        != 0 => enable event delivery
3053 +#endif
3054 +       RESTORE_REGS
3055 +       addl $4, %esp
3056 +1:     iret
3057 +.section .fixup,"ax"
3058 +iret_exc:
3059 +#ifndef CONFIG_XEN
3060 +       sti
3061 +#endif
3062 +       pushl $0                        # no error code
3063 +       pushl $do_iret_error
3064 +       jmp error_code
3065 +.previous
3066 +.section __ex_table,"a"
3067 +       .align 4
3068 +       .long 1b,iret_exc
3069 +.previous
3070 +
3071 +#ifndef CONFIG_XEN
3072 +ldt_ss:
3073 +       larl OLDSS(%esp), %eax
3074 +       jnz restore_nocheck
3075 +       testl $0x00400000, %eax         # returning to 32bit stack?
3076 +       jnz restore_nocheck             # all right, normal return
3077 +       /* If returning to userspace with 16bit stack,
3078 +        * try to fix the higher word of ESP, as the CPU
3079 +        * won't restore it.
3080 +        * This is an "official" bug of all the x86-compatible
3081 +        * CPUs, which we can try to work around to make
3082 +        * dosemu and wine happy. */
3083 +       subl $8, %esp           # reserve space for switch16 pointer
3084 +       cli
3085 +       movl %esp, %eax
3086 +       /* Set up the 16bit stack frame with switch32 pointer on top,
3087 +        * and a switch16 pointer on top of the current frame. */
3088 +       call setup_x86_bogus_stack
3089 +       RESTORE_REGS
3090 +       lss 20+4(%esp), %esp    # switch to 16bit stack
3091 +1:     iret
3092 +.section __ex_table,"a"
3093 +       .align 4
3094 +       .long 1b,iret_exc
3095 +.previous
3096 +#else
3097 +hypervisor_iret:
3098 +       andl $~NMI_MASK, EFLAGS(%esp)
3099 +       RESTORE_REGS
3100 +       addl $4, %esp
3101 +       jmp  hypercall_page + (__HYPERVISOR_iret * 32)
3102 +#endif
3103 +
3104 +       # perform work that needs to be done immediately before resumption
3105 +       ALIGN
3106 +work_pending:
3107 +       testb $_TIF_NEED_RESCHED, %cl
3108 +       jz work_notifysig
3109 +work_resched:
3110 +       call schedule
3111 +       DISABLE_INTERRUPTS              # make sure we don't miss an interrupt
3112 +                                       # setting need_resched or sigpending
3113 +                                       # between sampling and the iret
3114 +       movl TI_flags(%ebp), %ecx
3115 +       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done other
3116 +                                       # than syscall tracing?
3117 +       jz restore_all
3118 +       testb $_TIF_NEED_RESCHED, %cl
3119 +       jnz work_resched
3120 +
3121 +work_notifysig:                                # deal with pending signals and
3122 +                                       # notify-resume requests
3123 +       testl $VM_MASK, EFLAGS(%esp)
3124 +       movl %esp, %eax
3125 +       jne work_notifysig_v86          # returning to kernel-space or
3126 +                                       # vm86-space
3127 +       xorl %edx, %edx
3128 +       call do_notify_resume
3129 +       jmp resume_userspace
3130 +
3131 +       ALIGN
3132 +work_notifysig_v86:
3133 +#ifdef CONFIG_VM86
3134 +       pushl %ecx                      # save ti_flags for do_notify_resume
3135 +       call save_v86_state             # %eax contains pt_regs pointer
3136 +       popl %ecx
3137 +       movl %eax, %esp
3138 +       xorl %edx, %edx
3139 +       call do_notify_resume
3140 +       jmp resume_userspace
3141 +#endif
3142 +
3143 +       # perform syscall exit tracing
3144 +       ALIGN
3145 +syscall_trace_entry:
3146 +       movl $-ENOSYS,EAX(%esp)
3147 +       movl %esp, %eax
3148 +       xorl %edx,%edx
3149 +       call do_syscall_trace
3150 +       cmpl $0, %eax
3151 +       jne resume_userspace            # ret != 0 -> running under PTRACE_SYSEMU,
3152 +                                       # so must skip actual syscall
3153 +       movl ORIG_EAX(%esp), %eax
3154 +       cmpl $(nr_syscalls), %eax
3155 +       jnae syscall_call
3156 +       jmp syscall_exit
3157 +
3158 +       # perform syscall exit tracing
3159 +       ALIGN
3160 +syscall_exit_work:
3161 +       testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
3162 +       jz work_pending
3163 +       ENABLE_INTERRUPTS               # could let do_syscall_trace() call
3164 +                                       # schedule() instead
3165 +       movl %esp, %eax
3166 +       movl $1, %edx
3167 +       call do_syscall_trace
3168 +       jmp resume_userspace
3169 +
3170 +       ALIGN
3171 +syscall_fault:
3172 +       pushl %eax                      # save orig_eax
3173 +       SAVE_ALL
3174 +       GET_THREAD_INFO(%ebp)
3175 +       movl $-EFAULT,EAX(%esp)
3176 +       jmp resume_userspace
3177 +
3178 +       ALIGN
3179 +syscall_badsys:
3180 +       movl $-ENOSYS,EAX(%esp)
3181 +       jmp resume_userspace
3182 +
3183 +#ifndef CONFIG_XEN
3184 +#define FIXUP_ESPFIX_STACK \
3185 +       movl %esp, %eax; \
3186 +       /* switch to 32bit stack using the pointer on top of 16bit stack */ \
3187 +       lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
3188 +       /* copy data from 16bit stack to 32bit stack */ \
3189 +       call fixup_x86_bogus_stack; \
3190 +       /* put ESP to the proper location */ \
3191 +       movl %eax, %esp;
3192 +#define UNWIND_ESPFIX_STACK \
3193 +       pushl %eax; \
3194 +       movl %ss, %eax; \
3195 +       /* see if on 16bit stack */ \
3196 +       cmpw $__ESPFIX_SS, %ax; \
3197 +       jne 28f; \
3198 +       movl $__KERNEL_DS, %edx; \
3199 +       movl %edx, %ds; \
3200 +       movl %edx, %es; \
3201 +       /* switch to 32bit stack */ \
3202 +       FIXUP_ESPFIX_STACK \
3203 +28:    popl %eax;
3204 +
3205 +/*
3206 + * Build the entry stubs and pointer table with
3207 + * some assembler magic.
3208 + */
3209 +.data
3210 +ENTRY(interrupt)
3211 +.text
3212 +
3213 +vector=0
3214 +ENTRY(irq_entries_start)
3215 +.rept NR_IRQS
3216 +       ALIGN
3217 +1:     pushl $~(vector)
3218 +       jmp common_interrupt
3219 +.data
3220 +       .long 1b
3221 +.text
3222 +vector=vector+1
3223 +.endr
3224 +
3225 +       ALIGN
3226 +common_interrupt:
3227 +       SAVE_ALL
3228 +       movl %esp,%eax
3229 +       call do_IRQ
3230 +       jmp ret_from_intr
3231 +
3232 +#define BUILD_INTERRUPT(name, nr)      \
3233 +ENTRY(name)                            \
3234 +       pushl $~(nr);                   \
3235 +       SAVE_ALL                        \
3236 +       movl %esp,%eax;                 \
3237 +       call smp_/**/name;              \
3238 +       jmp ret_from_intr;
3239 +
3240 +/* The include is where all of the SMP etc. interrupts come from */
3241 +#include "entry_arch.h"
3242 +#else
3243 +#define UNWIND_ESPFIX_STACK
3244 +#endif
3245 +
3246 +ENTRY(divide_error)
3247 +       pushl $0                        # no error code
3248 +       pushl $do_divide_error
3249 +       ALIGN
3250 +error_code:
3251 +       pushl %ds
3252 +       pushl %eax
3253 +       xorl %eax, %eax
3254 +       pushl %ebp
3255 +       pushl %edi
3256 +       pushl %esi
3257 +       pushl %edx
3258 +       decl %eax                       # eax = -1
3259 +       pushl %ecx
3260 +       pushl %ebx
3261 +       cld
3262 +       pushl %es
3263 +       UNWIND_ESPFIX_STACK
3264 +       popl %ecx
3265 +       movl ES(%esp), %edi             # get the function address
3266 +       movl ORIG_EAX(%esp), %edx       # get the error code
3267 +       movl %eax, ORIG_EAX(%esp)
3268 +       movl %ecx, ES(%esp)
3269 +       movl $(__USER_DS), %ecx
3270 +       movl %ecx, %ds
3271 +       movl %ecx, %es
3272 +       movl %esp,%eax                  # pt_regs pointer
3273 +       call *%edi
3274 +       jmp ret_from_exception
3275 +
3276 +#ifdef CONFIG_XEN
3277 +# A note on the "critical region" in our callback handler.
3278 +# We want to avoid stacking callback handlers due to events occurring
3279 +# during handling of the last event. To do this, we keep events disabled
3280 +# until we've done all processing. HOWEVER, we must enable events before
3281 +# popping the stack frame (can't be done atomically) and so it would still
3282 +# be possible to get enough handler activations to overflow the stack.
3283 +# Although unlikely, bugs of that kind are hard to track down, so we'd
3284 +# like to avoid the possibility.
3285 +# So, on entry to the handler we detect whether we interrupted an
3286 +# existing activation in its critical region -- if so, we pop the current
3287 +# activation and restart the handler using the previous one.
3288 +#
3289 +# The sysexit critical region is slightly different. sysexit
3290 +# atomically removes the entire stack frame. If we interrupt in the
3291 +# critical region we know that the entire frame is present and correct
3292 +# so we can simply throw away the new one.
3293 +ENTRY(hypervisor_callback)
3294 +       pushl %eax
3295 +       SAVE_ALL
3296 +       movl EIP(%esp),%eax
3297 +       cmpl $scrit,%eax
3298 +       jb   11f
3299 +       cmpl $ecrit,%eax
3300 +       jb   critical_region_fixup
3301 +       cmpl $sysexit_scrit,%eax
3302 +       jb   11f
3303 +       cmpl $sysexit_ecrit,%eax
3304 +       ja   11f
3305 +       addl $0x34,%esp                 # Remove cs...ebx from stack frame.
3306 +11:    push %esp
3307 +       call evtchn_do_upcall
3308 +       add  $4,%esp
3309 +       jmp  ret_from_intr
3310 +
3311 +        ALIGN
3312 +restore_all_enable_events:
3313 +       __ENABLE_INTERRUPTS
3314 +scrit: /**** START OF CRITICAL REGION ****/
3315 +       __TEST_PENDING
3316 +       jnz  14f                        # process more events if necessary...
3317 +       RESTORE_REGS
3318 +       addl $4, %esp
3319 +1:     iret
3320 +.section __ex_table,"a"
3321 +       .align 4
3322 +       .long 1b,iret_exc
3323 +.previous
3324 +14:    __DISABLE_INTERRUPTS
3325 +       jmp  11b
3326 +ecrit:  /**** END OF CRITICAL REGION ****/
3327 +# [How we do the fixup]. We want to merge the current stack frame with the
3328 +# just-interrupted frame. How we do this depends on where in the critical
3329 +# region the interrupted handler was executing, and so how many saved
3330 +# registers are in each frame. We do this quickly using the lookup table
3331 +# 'critical_fixup_table'. For each byte offset in the critical region, it
3332 +# provides the number of bytes which have already been popped from the
3333 +# interrupted stack frame.
3334 +critical_region_fixup:
3335 +       addl $critical_fixup_table-scrit,%eax
3336 +       movzbl (%eax),%eax              # %eax contains num bytes popped
3337 +       cmpb $0xff,%al                  # 0xff => vcpu_info critical region
3338 +       jne  15f
3339 +       GET_THREAD_INFO(%ebp)
3340 +        xorl %eax,%eax
3341 +15:    mov  %esp,%esi
3342 +       add  %eax,%esi                  # %esi points at end of src region
3343 +       mov  %esp,%edi
3344 +       add  $0x34,%edi                 # %edi points at end of dst region
3345 +       mov  %eax,%ecx
3346 +       shr  $2,%ecx                    # convert bytes to words
3347 +       je   17f                        # skip loop if nothing to copy
3348 +16:    subl $4,%esi                    # pre-decrementing copy loop
3349 +       subl $4,%edi
3350 +       movl (%esi),%eax
3351 +       movl %eax,(%edi)
3352 +       loop 16b
3353 +17:    movl %edi,%esp                  # final %edi is top of merged stack
3354 +       jmp  11b
3355 +
3356 +critical_fixup_table:
3357 +       .byte 0xff,0xff,0xff            # testb $0xff,(%esi) = __TEST_PENDING
3358 +       .byte 0xff,0xff                 # jnz  14f
3359 +       .byte 0x00                      # pop  %ebx
3360 +       .byte 0x04                      # pop  %ecx
3361 +       .byte 0x08                      # pop  %edx
3362 +       .byte 0x0c                      # pop  %esi
3363 +       .byte 0x10                      # pop  %edi
3364 +       .byte 0x14                      # pop  %ebp
3365 +       .byte 0x18                      # pop  %eax
3366 +       .byte 0x1c                      # pop  %ds
3367 +       .byte 0x20                      # pop  %es
3368 +       .byte 0x24,0x24,0x24            # add  $4,%esp
3369 +       .byte 0x28                      # iret
3370 +       .byte 0xff,0xff,0xff,0xff       # movb $1,1(%esi)
3371 +       .byte 0x00,0x00                 # jmp  11b
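
The table above is indexed by the byte offset (within the critical region) at which the upcall interrupted us, and yields how many bytes the interrupted code had already popped from its frame. The merge at labels 15-17 then copies those already-popped words from the freshly saved frame so that a single well-formed frame is left; the end of the copy becomes the new %esp. A rough C rendering of that pre-decrementing copy, assuming the 0x34-byte frame used by the assembly (the function itself is invented for illustration):

      /* Sketch of critical_region_fixup's copy loop (labels 15/16/17 above). */
      static unsigned long *merge_frames(unsigned long *esp, unsigned int bytes_popped)
      {
              unsigned char *src = (unsigned char *)esp + bytes_popped; /* end of source */
              unsigned char *dst = (unsigned char *)esp + 0x34;         /* end of destination */
              unsigned int words = bytes_popped >> 2;

              while (words--) {
                      src -= 4;
                      dst -= 4;
                      *(unsigned long *)dst = *(unsigned long *)src;
              }
              return (unsigned long *)dst;    /* top of the merged stack, the new %esp */
      }
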
3372 +
3373 +# Hypervisor uses this for application faults while it executes.
3374 +# We get here for two reasons:
3375 +#  1. Fault while reloading DS, ES, FS or GS
3376 +#  2. Fault while executing IRET
3377 +# Category 1 we fix up by reattempting the load, and zeroing the segment
3378 +# register if the load fails.
3379 +# Category 2 we fix up by jumping to do_iret_error. We cannot use the
3380 +# normal Linux return path in this case because if we use the IRET hypercall
3381 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
3382 +# We distinguish between categories by maintaining a status value in EAX.
3383 +ENTRY(failsafe_callback)
3384 +       pushl %eax
3385 +       movl $1,%eax
3386 +1:     mov 4(%esp),%ds
3387 +2:     mov 8(%esp),%es
3388 +3:     mov 12(%esp),%fs
3389 +4:     mov 16(%esp),%gs
3390 +       testl %eax,%eax
3391 +       popl %eax
3392 +       jz 5f
3393 +       addl $16,%esp           # EAX != 0 => Category 2 (Bad IRET)
3394 +       jmp iret_exc
3395 +5:     addl $16,%esp           # EAX == 0 => Category 1 (Bad segment)
3396 +       pushl $0
3397 +       SAVE_ALL
3398 +       jmp ret_from_exception
3399 +.section .fixup,"ax";          \
3400 +6:     xorl %eax,%eax;         \
3401 +       movl %eax,4(%esp);      \
3402 +       jmp 1b;                 \
3403 +7:     xorl %eax,%eax;         \
3404 +       movl %eax,8(%esp);      \
3405 +       jmp 2b;                 \
3406 +8:     xorl %eax,%eax;         \
3407 +       movl %eax,12(%esp);     \
3408 +       jmp 3b;                 \
3409 +9:     xorl %eax,%eax;         \
3410 +       movl %eax,16(%esp);     \
3411 +       jmp 4b;                 \
3412 +.previous;                     \
3413 +.section __ex_table,"a";       \
3414 +       .align 4;               \
3415 +       .long 1b,6b;            \
3416 +       .long 2b,7b;            \
3417 +       .long 3b,8b;            \
3418 +       .long 4b,9b;            \
3419 +.previous
3420 +#endif
3421 +
3422 +ENTRY(coprocessor_error)
3423 +       pushl $0
3424 +       pushl $do_coprocessor_error
3425 +       jmp error_code
3426 +
3427 +ENTRY(simd_coprocessor_error)
3428 +       pushl $0
3429 +       pushl $do_simd_coprocessor_error
3430 +       jmp error_code
3431 +
3432 +ENTRY(device_not_available)
3433 +       pushl $-1                       # mark this as an int
3434 +       SAVE_ALL
3435 +#ifndef CONFIG_XEN
3436 +       movl %cr0, %eax
3437 +       testl $0x4, %eax                # EM (math emulation bit)
3438 +       je device_available_emulate
3439 +       pushl $0                        # temporary storage for ORIG_EIP
3440 +       call math_emulate
3441 +       addl $4, %esp
3442 +       jmp ret_from_exception
3443 +device_available_emulate:
3444 +#endif
3445 +       preempt_stop
3446 +       call math_state_restore
3447 +       jmp ret_from_exception
3448 +
3449 +#ifndef CONFIG_XEN
3450 +/*
3451 + * Debug traps and NMI can happen at the one SYSENTER instruction
3452 + * that sets up the real kernel stack. Check here, since we can't
3453 + * allow the wrong stack to be used.
3454 + *
3455 + * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have
3456 + * already pushed 3 words if it hits on the sysenter instruction:
3457 + * eflags, cs and eip.
3458 + *
3459 + * We just load the right stack, and push the three (known) values
3460 + * by hand onto the new stack - while updating the return eip past
3461 + * the instruction that would have done it for sysenter.
3462 + */
3463 +#define FIX_STACK(offset, ok, label)           \
3464 +       cmpw $__KERNEL_CS,4(%esp);              \
3465 +       jne ok;                                 \
3466 +label:                                         \
3467 +       movl SYSENTER_stack_esp0+offset(%esp),%esp;     \
3468 +       pushfl;                                 \
3469 +       pushl $__KERNEL_CS;                     \
3470 +       pushl $sysenter_past_esp
3471 +#endif /* CONFIG_XEN */
3472 +
3473 +KPROBE_ENTRY(debug)
3474 +#ifndef CONFIG_XEN
3475 +       cmpl $sysenter_entry,(%esp)
3476 +       jne debug_stack_correct
3477 +       FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
3478 +debug_stack_correct:
3479 +#endif /* !CONFIG_XEN */
3480 +       pushl $-1                       # mark this as an int
3481 +       SAVE_ALL
3482 +       xorl %edx,%edx                  # error code 0
3483 +       movl %esp,%eax                  # pt_regs pointer
3484 +       call do_debug
3485 +       jmp ret_from_exception
3486 +       .previous .text
3487 +
3488 +#ifndef CONFIG_XEN
3489 +/*
3490 + * NMI is doubly nasty. It can happen _while_ we're handling
3491 + * a debug fault, and the debug fault hasn't yet been able to
3492 + * clear up the stack. So we first check whether we got an
3493 + * NMI on the sysenter entry path, but after that we need to
3494 + * check whether we got an NMI on the debug path where the debug
3495 + * fault happened on the sysenter path.
3496 + */
3497 +ENTRY(nmi)
3498 +       pushl %eax
3499 +       movl %ss, %eax
3500 +       cmpw $__ESPFIX_SS, %ax
3501 +       popl %eax
3502 +       je nmi_16bit_stack
3503 +       cmpl $sysenter_entry,(%esp)
3504 +       je nmi_stack_fixup
3505 +       pushl %eax
3506 +       movl %esp,%eax
3507 +       /* Do not access memory above the end of our stack page,
3508 +        * it might not exist.
3509 +        */
3510 +       andl $(THREAD_SIZE-1),%eax
3511 +       cmpl $(THREAD_SIZE-20),%eax
3512 +       popl %eax
3513 +       jae nmi_stack_correct
3514 +       cmpl $sysenter_entry,12(%esp)
3515 +       je nmi_debug_stack_check
3516 +nmi_stack_correct:
3517 +       pushl %eax
3518 +       SAVE_ALL
3519 +       xorl %edx,%edx          # zero error code
3520 +       movl %esp,%eax          # pt_regs pointer
3521 +       call do_nmi
3522 +       jmp restore_all
3523 +
3524 +nmi_stack_fixup:
3525 +       FIX_STACK(12,nmi_stack_correct, 1)
3526 +       jmp nmi_stack_correct
3527 +nmi_debug_stack_check:
3528 +       cmpw $__KERNEL_CS,16(%esp)
3529 +       jne nmi_stack_correct
3530 +       cmpl $debug,(%esp)
3531 +       jb nmi_stack_correct
3532 +       cmpl $debug_esp_fix_insn,(%esp)
3533 +       ja nmi_stack_correct
3534 +       FIX_STACK(24,nmi_stack_correct, 1)
3535 +       jmp nmi_stack_correct
3536 +
3537 +nmi_16bit_stack:
3538 +       /* create the pointer to lss back */
3539 +       pushl %ss
3540 +       pushl %esp
3541 +       movzwl %sp, %esp
3542 +       addw $4, (%esp)
3543 +       /* copy the iret frame of 12 bytes */
3544 +       .rept 3
3545 +       pushl 16(%esp)
3546 +       .endr
3547 +       pushl %eax
3548 +       SAVE_ALL
3549 +       FIXUP_ESPFIX_STACK              # %eax == %esp
3550 +       xorl %edx,%edx                  # zero error code
3551 +       call do_nmi
3552 +       RESTORE_REGS
3553 +       lss 12+4(%esp), %esp            # back to 16bit stack
3554 +1:     iret
3555 +.section __ex_table,"a"
3556 +       .align 4
3557 +       .long 1b,iret_exc
3558 +.previous
3559 +#else
3560 +ENTRY(nmi)
3561 +       pushl %eax
3562 +       SAVE_ALL
3563 +       xorl %edx,%edx          # zero error code
3564 +       movl %esp,%eax          # pt_regs pointer
3565 +       call do_nmi
3566 +       orl  $NMI_MASK, EFLAGS(%esp)
3567 +       jmp restore_all
3568 +#endif
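
Note how the Xen NMI handler ties back to the Xen return path earlier in this file: OR-ing NMI_MASK into the saved EFLAGS makes restore_nocheck's VM_MASK|NMI_MASK test branch to hypervisor_iret, which strips the bit again and returns through the __HYPERVISOR_iret hypercall instead of a plain iret.
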
3569 +
3570 +KPROBE_ENTRY(int3)
3571 +       pushl $-1                       # mark this as an int
3572 +       SAVE_ALL
3573 +       xorl %edx,%edx          # zero error code
3574 +       movl %esp,%eax          # pt_regs pointer
3575 +       call do_int3
3576 +       jmp ret_from_exception
3577 +       .previous .text
3578 +
3579 +ENTRY(overflow)
3580 +       pushl $0
3581 +       pushl $do_overflow
3582 +       jmp error_code
3583 +
3584 +ENTRY(bounds)
3585 +       pushl $0
3586 +       pushl $do_bounds
3587 +       jmp error_code
3588 +
3589 +ENTRY(invalid_op)
3590 +       pushl $0
3591 +       pushl $do_invalid_op
3592 +       jmp error_code
3593 +
3594 +ENTRY(coprocessor_segment_overrun)
3595 +       pushl $0
3596 +       pushl $do_coprocessor_segment_overrun
3597 +       jmp error_code
3598 +
3599 +ENTRY(invalid_TSS)
3600 +       pushl $do_invalid_TSS
3601 +       jmp error_code
3602 +
3603 +ENTRY(segment_not_present)
3604 +       pushl $do_segment_not_present
3605 +       jmp error_code
3606 +
3607 +ENTRY(stack_segment)
3608 +       pushl $do_stack_segment
3609 +       jmp error_code
3610 +
3611 +KPROBE_ENTRY(general_protection)
3612 +       pushl $do_general_protection
3613 +       jmp error_code
3614 +       .previous .text
3615 +
3616 +ENTRY(alignment_check)
3617 +       pushl $do_alignment_check
3618 +       jmp error_code
3619 +
3620 +KPROBE_ENTRY(page_fault)
3621 +       pushl $do_page_fault
3622 +       jmp error_code
3623 +       .previous .text
3624 +
3625 +#ifdef CONFIG_X86_MCE
3626 +ENTRY(machine_check)
3627 +       pushl $0
3628 +       pushl machine_check_vector
3629 +       jmp error_code
3630 +#endif
3631 +
3632 +ENTRY(fixup_4gb_segment)
3633 +       pushl $do_fixup_4gb_segment
3634 +       jmp error_code
3635 +
3636 +.section .rodata,"a"
3637 +#include "syscall_table.S"
3638 +
3639 +syscall_table_size=(.-sys_call_table)
3640 diff -urNp linux-2.6/arch/i386/kernel/fixup.c new/arch/i386/kernel/fixup.c
3641 --- linux-2.6/arch/i386/kernel/fixup.c  1970-01-01 01:00:00.000000000 +0100
3642 +++ new/arch/i386/kernel/fixup.c        2006-05-23 18:42:17.000000000 +0200
3643 @@ -0,0 +1,86 @@
3644 +/******************************************************************************
3645 + * fixup.c
3646 + * 
3647 + * Binary-rewriting of certain IA32 instructions, on notification by Xen.
3648 + * Used to avoid repeated slow emulation of common instructions used by the
3649 + * user-space TLS (Thread-Local Storage) libraries.
3650 + * 
3651 + * **** NOTE ****
3652 + *  Issues with the binary rewriting have caused it to be removed. Instead
3653 + *  we rely on Xen's emulator to boot the kernel, and then print a banner
3654 + *  message recommending that the user disable /lib/tls.
3655 + * 
3656 + * Copyright (c) 2004, K A Fraser
3657 + * 
3658 + * This program is free software; you can redistribute it and/or modify
3659 + * it under the terms of the GNU General Public License as published by
3660 + * the Free Software Foundation; either version 2 of the License, or
3661 + * (at your option) any later version.
3662 + * 
3663 + * This program is distributed in the hope that it will be useful,
3664 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
3665 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
3666 + * GNU General Public License for more details.
3667 + * 
3668 + * You should have received a copy of the GNU General Public License
3669 + * along with this program; if not, write to the Free Software
3670 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
3671 + */
3672 +
3673 +#include <linux/config.h>
3674 +#include <linux/init.h>
3675 +#include <linux/sched.h>
3676 +#include <linux/slab.h>
3677 +#include <linux/kernel.h>
3678 +#include <linux/delay.h>
3679 +#include <linux/version.h>
3680 +
3681 +#define DP(_f, _args...) printk(KERN_ALERT "  " _f "\n" , ## _args )
3682 +
3683 +fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
3684 +{
3685 +       static unsigned long printed = 0;
3686 +       char info[100];
3687 +       int i;
3688 +
3689 +       if (test_and_set_bit(0, &printed))
3690 +               return;
3691 +
3692 +       HYPERVISOR_vm_assist(
3693 +               VMASST_CMD_disable, VMASST_TYPE_4gb_segments_notify);
3694 +
3695 +       sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
3696 +
3697 +
3698 +       DP("");
3699 +       DP("***************************************************************");
3700 +       DP("***************************************************************");
3701 +       DP("** WARNING: Currently emulating unsupported memory accesses  **");
3702 +       DP("**          in /lib/tls glibc libraries. The emulation is    **");
3703 +       DP("**          slow. To ensure full performance you should      **");
3704 +       DP("**          install a 'xen-friendly' (nosegneg) version of   **");
3705 +       DP("**          the library, or disable tls support by executing **");
3706 +       DP("**          the following as root:                           **");
3707 +       DP("**          mv /lib/tls /lib/tls.disabled                    **");
3708 +       DP("** Offending process: %-38.38s **", info);
3709 +       DP("***************************************************************");
3710 +       DP("***************************************************************");
3711 +       DP("");
3712 +
3713 +       for (i = 5; i > 0; i--) {
3714 +               touch_softlockup_watchdog();
3715 +               printk("Pausing... %d", i);
3716 +               mdelay(1000);
3717 +               printk("\b\b\b\b\b\b\b\b\b\b\b\b");
3718 +       }
3719 +
3720 +       printk("Continuing...\n\n");
3721 +}
3722 +
3723 +static int __init fixup_init(void)
3724 +{
3725 +       HYPERVISOR_vm_assist(
3726 +               VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify);
3727 +       return 0;
3728 +}
3729 +__initcall(fixup_init);
3730 diff -urNp linux-2.6/arch/i386/kernel/head-xen.S new/arch/i386/kernel/head-xen.S
3731 --- linux-2.6/arch/i386/kernel/head-xen.S       1970-01-01 01:00:00.000000000 +0100
3732 +++ new/arch/i386/kernel/head-xen.S     2006-06-07 13:15:16.000000000 +0200
3733 @@ -0,0 +1,181 @@
3734 +
3735 +
3736 +.text
3737 +#include <linux/config.h>
3738 +#include <linux/threads.h>
3739 +#include <linux/linkage.h>
3740 +#include <asm/segment.h>
3741 +#include <asm/page.h>
3742 +#include <asm/thread_info.h>
3743 +#include <asm/asm-offsets.h>
3744 +#include <xen/interface/arch-x86_32.h>
3745 +
3746 +/*
3747 + * References to members of the new_cpu_data structure.
3748 + */
3749 +
3750 +#define X86            new_cpu_data+CPUINFO_x86
3751 +#define X86_VENDOR     new_cpu_data+CPUINFO_x86_vendor
3752 +#define X86_MODEL      new_cpu_data+CPUINFO_x86_model
3753 +#define X86_MASK       new_cpu_data+CPUINFO_x86_mask
3754 +#define X86_HARD_MATH  new_cpu_data+CPUINFO_hard_math
3755 +#define X86_CPUID      new_cpu_data+CPUINFO_cpuid_level
3756 +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
3757 +#define X86_VENDOR_ID  new_cpu_data+CPUINFO_x86_vendor_id
3758 +
3759 +#define VIRT_ENTRY_OFFSET 0x0
3760 +.org VIRT_ENTRY_OFFSET
3761 +ENTRY(startup_32)
3762 +       movl %esi,xen_start_info
3763 +       cld
3764 +
3765 +       /* Set up the stack pointer */
3766 +       movl $(init_thread_union+THREAD_SIZE),%esp
3767 +
3768 +       /* get vendor info */
3769 +       xorl %eax,%eax                  # call CPUID with 0 -> return vendor ID
3770 +       XEN_CPUID
3771 +       movl %eax,X86_CPUID             # save CPUID level
3772 +       movl %ebx,X86_VENDOR_ID         # lo 4 chars
3773 +       movl %edx,X86_VENDOR_ID+4       # next 4 chars
3774 +       movl %ecx,X86_VENDOR_ID+8       # last 4 chars
3775 +
3776 +       movl $1,%eax            # Use the CPUID instruction to get CPU type
3777 +       XEN_CPUID
3778 +       movb %al,%cl            # save reg for future use
3779 +       andb $0x0f,%ah          # mask processor family
3780 +       movb %ah,X86
3781 +       andb $0xf0,%al          # mask model
3782 +       shrb $4,%al
3783 +       movb %al,X86_MODEL
3784 +       andb $0x0f,%cl          # mask mask revision
3785 +       movb %cl,X86_MASK
3786 +       movl %edx,X86_CAPABILITY
3787 +
3788 +       movb $1,X86_HARD_MATH
3789 +
3790 +       xorl %eax,%eax                  # Clear FS/GS and LDT
3791 +       movl %eax,%fs
3792 +       movl %eax,%gs
3793 +       cld                     # gcc2 wants the direction flag cleared at all times
3794 +
3795 +       call start_kernel
3796 +L6:
3797 +       jmp L6                  # main should never return here, but
3798 +                               # just in case, we know what happens.
3799 +
3800 +#define HYPERCALL_PAGE_OFFSET 0x1000
3801 +.org HYPERCALL_PAGE_OFFSET
3802 +ENTRY(hypercall_page)
3803 +.skip 0x1000
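
(The page reserved here is filled in by the hypervisor with one 32-byte call stub per hypercall number, which is what lets the earlier hypervisor_iret code jump directly to hypercall_page + __HYPERVISOR_iret * 32.)
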
3804 +
3805 +/*
3806 + * Real beginning of normal "text" segment
3807 + */
3808 +ENTRY(stext)
3809 +ENTRY(_stext)
3810 +
3811 +/*
3812 + * BSS section
3813 + */
3814 +.section ".bss.page_aligned","w"
3815 +ENTRY(empty_zero_page)
3816 +       .fill 4096,1,0
3817 +
3818 +/*
3819 + * This starts the data section.
3820 + */
3821 +.data
3822 +
3823 +/*
3824 + * The Global Descriptor Table contains 32 quadwords, per-CPU.
3825 + */
3826 +ENTRY(cpu_gdt_table)
3827 +       .quad 0x0000000000000000        /* NULL descriptor */
3828 +       .quad 0x0000000000000000        /* 0x0b reserved */
3829 +       .quad 0x0000000000000000        /* 0x13 reserved */
3830 +       .quad 0x0000000000000000        /* 0x1b reserved */
3831 +       .quad 0x0000000000000000        /* 0x20 unused */
3832 +       .quad 0x0000000000000000        /* 0x28 unused */
3833 +       .quad 0x0000000000000000        /* 0x33 TLS entry 1 */
3834 +       .quad 0x0000000000000000        /* 0x3b TLS entry 2 */
3835 +       .quad 0x0000000000000000        /* 0x43 TLS entry 3 */
3836 +       .quad 0x0000000000000000        /* 0x4b reserved */
3837 +       .quad 0x0000000000000000        /* 0x53 reserved */
3838 +       .quad 0x0000000000000000        /* 0x5b reserved */
3839 +
3840 +       .quad 0x00cf9a000000ffff        /* 0x60 kernel 4GB code at 0x00000000 */
3841 +       .quad 0x00cf92000000ffff        /* 0x68 kernel 4GB data at 0x00000000 */
3842 +       .quad 0x00cffa000000ffff        /* 0x73 user 4GB code at 0x00000000 */
3843 +       .quad 0x00cff2000000ffff        /* 0x7b user 4GB data at 0x00000000 */
3844 +
3845 +       .quad 0x0000000000000000        /* 0x80 TSS descriptor */
3846 +       .quad 0x0000000000000000        /* 0x88 LDT descriptor */
3847 +
3848 +       /*
3849 +        * Segments used for calling PnP BIOS have byte granularity.
3850 +        * The code segments and data segments have fixed 64k limits,
3851 +        * the transfer segment sizes are set at run time.
3852 +        */
3853 +       .quad 0x0000000000000000        /* 0x90 32-bit code */
3854 +       .quad 0x0000000000000000        /* 0x98 16-bit code */
3855 +       .quad 0x0000000000000000        /* 0xa0 16-bit data */
3856 +       .quad 0x0000000000000000        /* 0xa8 16-bit data */
3857 +       .quad 0x0000000000000000        /* 0xb0 16-bit data */
3858 +
3859 +       /*
3860 +        * The APM segments have byte granularity and their bases
3861 +        * are set at run time.  All have 64k limits.
3862 +        */
3863 +       .quad 0x0000000000000000        /* 0xb8 APM CS    code */
3864 +       .quad 0x0000000000000000        /* 0xc0 APM CS 16 code (16 bit) */
3865 +       .quad 0x0000000000000000        /* 0xc8 APM DS    data */
3866 +
3867 +       .quad 0x0000000000000000        /* 0xd0 - ESPFIX 16-bit SS */
3868 +       .quad 0x0000000000000000        /* 0xd8 - unused */
3869 +       .quad 0x0000000000000000        /* 0xe0 - unused */
3870 +       .quad 0x0000000000000000        /* 0xe8 - unused */
3871 +       .quad 0x0000000000000000        /* 0xf0 - unused */
3872 +       .quad 0x0000000000000000        /* 0xf8 - GDT entry 31: double-fault TSS */
3873 +
3874 +/*
3875 + * __xen_guest information
3876 + */
3877 +.macro utoa value
3878 + .if (\value) < 0 || (\value) >= 0x10
3879 +       utoa (((\value)>>4)&0x0fffffff)
3880 + .endif
3881 + .if ((\value) & 0xf) < 10
3882 +  .byte '0' + ((\value) & 0xf)
3883 + .else
3884 +  .byte 'A' + ((\value) & 0xf) - 10
3885 + .endif
3886 +.endm
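
The recursive .macro above converts a value to upper-case ASCII hex digits at assembly time, most significant digit first. The same recursion in C, purely for illustration (the function and its emit callback are invented):

      /* Emit 'value' in upper-case hex, most significant digit first. */
      static void utoa_hex(unsigned long value, void (*emit)(char c))
      {
              if (value >= 0x10)
                      utoa_hex(value >> 4, emit);     /* high digits first */
              emit("0123456789ABCDEF"[value & 0xf]);
      }
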
3887 +
3888 +.section __xen_guest
3889 +       .ascii  "GUEST_OS=linux,GUEST_VER=2.6"
3890 +       .ascii  ",XEN_VER=xen-3.0"
3891 +       .ascii  ",VIRT_BASE=0x"
3892 +               utoa __PAGE_OFFSET
3893 +#ifdef CONFIG_XEN_COMPAT_030002
3894 +       .ascii  ",ELF_PADDR_OFFSET=0x"
3895 +               utoa __PAGE_OFFSET
3896 +#else
3897 +       .ascii  ",ELF_PADDR_OFFSET=0x0"
3898 +#endif /* !CONFIG_XEN_COMPAT_030002 */
3899 +       .ascii  ",VIRT_ENTRY=0x"
3900 +               utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
3901 +       .ascii  ",HYPERCALL_PAGE=0x"
3902 +               utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
3903 +       .ascii  ",FEATURES=writable_page_tables"
3904 +       .ascii           "|writable_descriptor_tables"
3905 +       .ascii           "|auto_translated_physmap"
3906 +       .ascii           "|pae_pgdir_above_4gb"
3907 +       .ascii           "|supervisor_mode_kernel"
3908 +#ifdef CONFIG_X86_PAE
3909 +       .ascii  ",PAE=yes[extended-cr3]"
3910 +#else
3911 +       .ascii  ",PAE=no"
3912 +#endif
3913 +       .ascii  ",LOADER=generic"
3914 +       .byte   0
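
For orientation: on a non-PAE build with CONFIG_XEN_COMPAT_030002 set, and assuming the conventional i386 values __PAGE_OFFSET = 0xC0000000 and __PHYSICAL_START = 0x100000 (assumptions, not taken from this patch), the section above assembles to a NUL-terminated string roughly like

      GUEST_OS=linux,GUEST_VER=2.6,XEN_VER=xen-3.0,VIRT_BASE=0xC0000000,ELF_PADDR_OFFSET=0xC0000000,VIRT_ENTRY=0xC0100000,HYPERCALL_PAGE=0x101,FEATURES=writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel,PAE=no,LOADER=generic

which the Xen domain builder reads to decide how to map and enter the guest kernel.
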
3915 diff -urNp linux-2.6/arch/i386/kernel/init_task-xen.c new/arch/i386/kernel/init_task-xen.c
3916 --- linux-2.6/arch/i386/kernel/init_task-xen.c  1970-01-01 01:00:00.000000000 +0100
3917 +++ new/arch/i386/kernel/init_task-xen.c        2006-05-09 12:32:35.000000000 +0200
3918 @@ -0,0 +1,51 @@
3919 +#include <linux/mm.h>
3920 +#include <linux/module.h>
3921 +#include <linux/sched.h>
3922 +#include <linux/init.h>
3923 +#include <linux/init_task.h>
3924 +#include <linux/fs.h>
3925 +#include <linux/mqueue.h>
3926 +
3927 +#include <asm/uaccess.h>
3928 +#include <asm/pgtable.h>
3929 +#include <asm/desc.h>
3930 +
3931 +static struct fs_struct init_fs = INIT_FS;
3932 +static struct files_struct init_files = INIT_FILES;
3933 +static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
3934 +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
3935 +
3936 +#define swapper_pg_dir ((pgd_t *)NULL)
3937 +struct mm_struct init_mm = INIT_MM(init_mm);
3938 +#undef swapper_pg_dir
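
The #define/#undef pair around INIT_MM is a small trick: INIT_MM() normally initializes .pgd to swapper_pg_dir, so temporarily defining that name to NULL leaves init_mm.pgd empty at compile time instead of pointing at a statically allocated native page directory.
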
3939 +
3940 +EXPORT_SYMBOL(init_mm);
3941 +
3942 +/*
3943 + * Initial thread structure.
3944 + *
3945 + * We need to make sure that this is THREAD_SIZE aligned due to the
3946 + * way process stacks are handled. This is done by having a special
3947 + * "init_task" linker map entry..
3948 + */
3949 +union thread_union init_thread_union 
3950 +       __attribute__((__section__(".data.init_task"))) =
3951 +               { INIT_THREAD_INFO(init_task) };
3952 +
3953 +/*
3954 + * Initial task structure.
3955 + *
3956 + * All other task structs will be allocated on slabs in fork.c
3957 + */
3958 +struct task_struct init_task = INIT_TASK(init_task);
3959 +
3960 +EXPORT_SYMBOL(init_task);
3961 +
3962 +#ifndef CONFIG_X86_NO_TSS
3963 +/*
3964 + * per-CPU TSS segments. Threads are completely 'soft' on Linux,
3965 + * no more per-task TSS's.
3966 + */ 
3967 +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
3968 +#endif
3969 +
3970 diff -urNp linux-2.6/arch/i386/kernel/io_apic-xen.c new/arch/i386/kernel/io_apic-xen.c
3971 --- linux-2.6/arch/i386/kernel/io_apic-xen.c    1970-01-01 01:00:00.000000000 +0100
3972 +++ new/arch/i386/kernel/io_apic-xen.c  2006-05-23 18:37:09.000000000 +0200
3973 @@ -0,0 +1,2751 @@
3974 +/*
3975 + *     Intel IO-APIC support for multi-Pentium hosts.
3976 + *
3977 + *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
3978 + *
3979 + *     Many thanks to Stig Venaas for trying out countless experimental
3980 + *     patches and reporting/debugging problems patiently!
3981 + *
3982 + *     (c) 1999, Multiple IO-APIC support, developed by
3983 + *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
3984 + *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
3985 + *     further tested and cleaned up by Zach Brown <zab@redhat.com>
3986 + *     and Ingo Molnar <mingo@redhat.com>
3987 + *
3988 + *     Fixes
3989 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
3990 + *                                     thanks to Eric Gilmore
3991 + *                                     and Rolf G. Tews
3992 + *                                     for testing these extensively
3993 + *     Paul Diefenbaugh        :       Added full ACPI support
3994 + */
3995 +
3996 +#include <linux/mm.h>
3997 +#include <linux/interrupt.h>
3998 +#include <linux/init.h>
3999 +#include <linux/delay.h>
4000 +#include <linux/sched.h>
4001 +#include <linux/config.h>
4002 +#include <linux/smp_lock.h>
4003 +#include <linux/mc146818rtc.h>
4004 +#include <linux/compiler.h>
4005 +#include <linux/acpi.h>
4006 +#include <linux/module.h>
4007 +#include <linux/sysdev.h>
4008 +
4009 +#include <asm/io.h>
4010 +#include <asm/smp.h>
4011 +#include <asm/desc.h>
4012 +#include <asm/timer.h>
4013 +#include <asm/i8259.h>
4014 +
4015 +#include <mach_apic.h>
4016 +
4017 +#include "io_ports.h"
4018 +
4019 +#ifdef CONFIG_XEN
4020 +
4021 +#include <xen/interface/xen.h>
4022 +#include <xen/interface/physdev.h>
4023 +
4024 +/* Fake i8259 */
4025 +#define make_8259A_irq(_irq)     (io_apic_irqs &= ~(1UL<<(_irq)))
4026 +#define disable_8259A_irq(_irq)  ((void)0)
4027 +#define i8259A_irq_pending(_irq) (0)
4028 +
4029 +unsigned long io_apic_irqs;
4030 +
4031 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
4032 +{
4033 +       struct physdev_apic apic_op;
4034 +       int ret;
4035 +
4036 +       apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
4037 +       apic_op.reg = reg;
4038 +       ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
4039 +       if (ret)
4040 +               return ret;
4041 +       return apic_op.value;
4042 +}
4043 +
4044 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
4045 +{
4046 +       struct physdev_apic apic_op;
4047 +
4048 +       apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
4049 +       apic_op.reg = reg;
4050 +       apic_op.value = value;
4051 +       HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
4052 +}
4053 +
4054 +#define io_apic_read(a,r)    xen_io_apic_read(a,r)
4055 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
4056 +
4057 +#endif /* CONFIG_XEN */
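
With the two wrappers and the io_apic_read/io_apic_write redefinitions above, the rest of this file keeps calling the generic accessors while every access is actually routed through a PHYSDEVOP hypercall. A small, hypothetical caller, just to show the shape of such a read (register 0x01 is the standard IO-APIC version register):

      /* Illustration only: read the IO-APIC version field via the wrappers above. */
      static unsigned int example_ioapic_version(unsigned int apic)
      {
              return io_apic_read(apic, 0x01) & 0xff; /* low byte holds the version */
      }
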
4058 +
4059 +int (*ioapic_renumber_irq)(int ioapic, int irq);
4060 +atomic_t irq_mis_count;
4061 +
4062 +/* Where if anywhere is the i8259 connect in external int mode */
4063 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
4064 +
4065 +static DEFINE_SPINLOCK(ioapic_lock);
4066 +
4067 +int timer_over_8254 __initdata = 1;
4068 +
4069 +/*
4070 + *     Is the SiS APIC rmw bug present ?
4071 + *     -1 = don't know, 0 = no, 1 = yes
4072 + */
4073 +int sis_apic_bug = -1;
4074 +
4075 +/*
4076 + * # of IRQ routing registers
4077 + */
4078 +int nr_ioapic_registers[MAX_IO_APICS];
4079 +
4080 +int disable_timer_pin_1 __initdata;
4081 +
4082 +/*
4083 + * Rough estimation of how many shared IRQs there are, can
4084 + * be changed anytime.
4085 + */
4086 +#define MAX_PLUS_SHARED_IRQS NR_IRQS
4087 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
4088 +
4089 +/*
4090 + * This is performance-critical, we want to do it O(1)
4091 + *
4092 + * the indexing order of this array favors 1:1 mappings
4093 + * between pins and IRQs.
4094 + */
4095 +
4096 +static struct irq_pin_list {
4097 +       int apic, pin, next;
4098 +} irq_2_pin[PIN_MAP_SIZE];
4099 +
4100 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
4101 +#ifdef CONFIG_PCI_MSI
4102 +#define vector_to_irq(vector)  \
4103 +       (platform_legacy_irq(vector) ? vector : vector_irq[vector])
4104 +#else
4105 +#define vector_to_irq(vector)  (vector)
4106 +#endif
4107 +
4108 +/*
4109 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
4110 + * shared ISA-space IRQs, so we have to support them. We are super
4111 + * fast in the common case, and fast for shared ISA-space IRQs.
4112 + */
4113 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
4114 +{
4115 +       static int first_free_entry = NR_IRQS;
4116 +       struct irq_pin_list *entry = irq_2_pin + irq;
4117 +
4118 +       while (entry->next)
4119 +               entry = irq_2_pin + entry->next;
4120 +
4121 +       if (entry->pin != -1) {
4122 +               entry->next = first_free_entry;
4123 +               entry = irq_2_pin + entry->next;
4124 +               if (++first_free_entry >= PIN_MAP_SIZE)
4125 +                       panic("io_apic.c: whoops");
4126 +       }
4127 +       entry->apic = apic;
4128 +       entry->pin = pin;
4129 +}
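
irq_2_pin[] is an array-embedded singly linked list: the first NR_IRQS slots are per-IRQ list heads, and slots handed out from first_free_entry upward hold overflow entries for IRQs wired to more than one pin. For example (pin numbers invented), if IRQ 9 already records (apic 0, pin 9) and a second pin (apic 0, pin 11) is added, add_pin_to_irq() links slot 9's next field to the next free overflow slot and stores the new (apic, pin) pair there.
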
4130 +
4131 +#ifdef CONFIG_XEN
4132 +#define clear_IO_APIC() ((void)0)
4133 +#else
4134 +/*
4135 + * Reroute an IRQ to a different pin.
4136 + */
4137 +static void __init replace_pin_at_irq(unsigned int irq,
4138 +                                     int oldapic, int oldpin,
4139 +                                     int newapic, int newpin)
4140 +{
4141 +       struct irq_pin_list *entry = irq_2_pin + irq;
4142 +
4143 +       while (1) {
4144 +               if (entry->apic == oldapic && entry->pin == oldpin) {
4145 +                       entry->apic = newapic;
4146 +                       entry->pin = newpin;
4147 +               }
4148 +               if (!entry->next)
4149 +                       break;
4150 +               entry = irq_2_pin + entry->next;
4151 +       }
4152 +}
4153 +
4154 +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
4155 +{
4156 +       struct irq_pin_list *entry = irq_2_pin + irq;
4157 +       unsigned int pin, reg;
4158 +
4159 +       for (;;) {
4160 +               pin = entry->pin;
4161 +               if (pin == -1)
4162 +                       break;
4163 +               reg = io_apic_read(entry->apic, 0x10 + pin*2);
4164 +               reg &= ~disable;
4165 +               reg |= enable;
4166 +               io_apic_modify(entry->apic, 0x10 + pin*2, reg);
4167 +               if (!entry->next)
4168 +                       break;
4169 +               entry = irq_2_pin + entry->next;
4170 +       }
4171 +}
4172 +
4173 +/* mask = 1 */
4174 +static void __mask_IO_APIC_irq (unsigned int irq)
4175 +{
4176 +       __modify_IO_APIC_irq(irq, 0x00010000, 0);
4177 +}
4178 +
4179 +/* mask = 0 */
4180 +static void __unmask_IO_APIC_irq (unsigned int irq)
4181 +{
4182 +       __modify_IO_APIC_irq(irq, 0, 0x00010000);
4183 +}
4184 +
4185 +/* mask = 1, trigger = 0 */
4186 +static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
4187 +{
4188 +       __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
4189 +}
4190 +
4191 +/* mask = 0, trigger = 1 */
4192 +static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
4193 +{
4194 +       __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
4195 +}
4196 +
4197 +static void mask_IO_APIC_irq (unsigned int irq)
4198 +{
4199 +       unsigned long flags;
4200 +
4201 +       spin_lock_irqsave(&ioapic_lock, flags);
4202 +       __mask_IO_APIC_irq(irq);
4203 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4204 +}
4205 +
4206 +static void unmask_IO_APIC_irq (unsigned int irq)
4207 +{
4208 +       unsigned long flags;
4209 +
4210 +       spin_lock_irqsave(&ioapic_lock, flags);
4211 +       __unmask_IO_APIC_irq(irq);
4212 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4213 +}
4214 +
4215 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
4216 +{
4217 +       struct IO_APIC_route_entry entry;
4218 +       unsigned long flags;
4219 +       
4220 +       /* Check delivery_mode to be sure we're not clearing an SMI pin */
4221 +       spin_lock_irqsave(&ioapic_lock, flags);
4222 +       *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
4223 +       *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
4224 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4225 +       if (entry.delivery_mode == dest_SMI)
4226 +               return;
4227 +
4228 +       /*
4229 +        * Disable it in the IO-APIC irq-routing table:
4230 +        */
4231 +       memset(&entry, 0, sizeof(entry));
4232 +       entry.mask = 1;
4233 +       spin_lock_irqsave(&ioapic_lock, flags);
4234 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
4235 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
4236 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4237 +}
4238 +
4239 +static void clear_IO_APIC (void)
4240 +{
4241 +       int apic, pin;
4242 +
4243 +       for (apic = 0; apic < nr_ioapics; apic++)
4244 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
4245 +                       clear_IO_APIC_pin(apic, pin);
4246 +}
4247 +
4248 +#ifdef CONFIG_SMP
4249 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
4250 +{
4251 +       unsigned long flags;
4252 +       int pin;
4253 +       struct irq_pin_list *entry = irq_2_pin + irq;
4254 +       unsigned int apicid_value;
4255 +       cpumask_t tmp;
4256 +       
4257 +       cpus_and(tmp, cpumask, cpu_online_map);
4258 +       if (cpus_empty(tmp))
4259 +               tmp = TARGET_CPUS;
4260 +
4261 +       cpus_and(cpumask, tmp, CPU_MASK_ALL);
4262 +
4263 +       apicid_value = cpu_mask_to_apicid(cpumask);
4264 +       /* Prepare to do the io_apic_write */
4265 +       apicid_value = apicid_value << 24;
4266 +       spin_lock_irqsave(&ioapic_lock, flags);
4267 +       for (;;) {
4268 +               pin = entry->pin;
4269 +               if (pin == -1)
4270 +                       break;
4271 +               io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
4272 +               if (!entry->next)
4273 +                       break;
4274 +               entry = irq_2_pin + entry->next;
4275 +       }
4276 +       set_irq_info(irq, cpumask);
4277 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4278 +}
4279 +
4280 +#if defined(CONFIG_IRQBALANCE)
4281 +# include <asm/processor.h>    /* kernel_thread() */
4282 +# include <linux/kernel_stat.h>        /* kstat */
4283 +# include <linux/slab.h>               /* kmalloc() */
4284 +# include <linux/timer.h>      /* time_after() */
4285 +
4286 +# ifdef CONFIG_BALANCED_IRQ_DEBUG
4287 +#  define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
4288 +#  define Dprintk(x...) do { TDprintk(x); } while (0)
4289 +# else
4290 +#  define TDprintk(x...) 
4291 +#  define Dprintk(x...) 
4292 +# endif
4293 +
4294 +
4295 +#define IRQBALANCE_CHECK_ARCH -999
4296 +static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
4297 +static int physical_balance = 0;
4298 +
4299 +static struct irq_cpu_info {
4300 +       unsigned long * last_irq;
4301 +       unsigned long * irq_delta;
4302 +       unsigned long irq;
4303 +} irq_cpu_data[NR_CPUS];
4304 +
4305 +#define CPU_IRQ(cpu)           (irq_cpu_data[cpu].irq)
4306 +#define LAST_CPU_IRQ(cpu,irq)   (irq_cpu_data[cpu].last_irq[irq])
4307 +#define IRQ_DELTA(cpu,irq)     (irq_cpu_data[cpu].irq_delta[irq])
4308 +
4309 +#define IDLE_ENOUGH(cpu,now) \
4310 +       (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
4311 +
4312 +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
4313 +
4314 +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
4315 +
4316 +#define MAX_BALANCED_IRQ_INTERVAL      (5*HZ)
4317 +#define MIN_BALANCED_IRQ_INTERVAL      (HZ/2)
4318 +#define BALANCED_IRQ_MORE_DELTA                (HZ/10)
4319 +#define BALANCED_IRQ_LESS_DELTA                (HZ)
4320 +
4321 +static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
4322 +
4323 +static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
4324 +                       unsigned long now, int direction)
4325 +{
4326 +       int search_idle = 1;
4327 +       int cpu = curr_cpu;
4328 +
4329 +       goto inside;
4330 +
4331 +       do {
4332 +               if (unlikely(cpu == curr_cpu))
4333 +                       search_idle = 0;
4334 +inside:
4335 +               if (direction == 1) {
4336 +                       cpu++;
4337 +                       if (cpu >= NR_CPUS)
4338 +                               cpu = 0;
4339 +               } else {
4340 +                       cpu--;
4341 +                       if (cpu == -1)
4342 +                               cpu = NR_CPUS-1;
4343 +               }
4344 +       } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
4345 +                       (search_idle && !IDLE_ENOUGH(cpu,now)));
4346 +
4347 +       return cpu;
4348 +}
4349 +
4350 +static inline void balance_irq(int cpu, int irq)
4351 +{
4352 +       unsigned long now = jiffies;
4353 +       cpumask_t allowed_mask;
4354 +       unsigned int new_cpu;
4355 +               
4356 +       if (irqbalance_disabled)
4357 +               return; 
4358 +
4359 +       cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
4360 +       new_cpu = move(cpu, allowed_mask, now, 1);
4361 +       if (cpu != new_cpu) {
4362 +               set_pending_irq(irq, cpumask_of_cpu(new_cpu));
4363 +       }
4364 +}
4365 +
4366 +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
4367 +{
4368 +       int i, j;
4369 +       Dprintk("Rotating IRQs among CPUs.\n");
4370 +       for_each_online_cpu(i) {
4371 +               for (j = 0; j < NR_IRQS; j++) {
4372 +                       if (!irq_desc[j].action)
4373 +                               continue;
4374 +                       /* Is it a significant load ?  */
4375 +                       if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
4376 +                                               useful_load_threshold)
4377 +                               continue;
4378 +                       balance_irq(i, j);
4379 +               }
4380 +       }
4381 +       balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
4382 +               balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
4383 +       return;
4384 +}
4385 +
4386 +static void do_irq_balance(void)
4387 +{
4388 +       int i, j;
4389 +       unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
4390 +       unsigned long move_this_load = 0;
4391 +       int max_loaded = 0, min_loaded = 0;
4392 +       int load;
4393 +       unsigned long useful_load_threshold = balanced_irq_interval + 10;
4394 +       int selected_irq;
4395 +       int tmp_loaded, first_attempt = 1;
4396 +       unsigned long tmp_cpu_irq;
4397 +       unsigned long imbalance = 0;
4398 +       cpumask_t allowed_mask, target_cpu_mask, tmp;
4399 +
4400 +       for_each_possible_cpu(i) {
4401 +               int package_index;
4402 +               CPU_IRQ(i) = 0;
4403 +               if (!cpu_online(i))
4404 +                       continue;
4405 +               package_index = CPU_TO_PACKAGEINDEX(i);
4406 +               for (j = 0; j < NR_IRQS; j++) {
4407 +                       unsigned long value_now, delta;
4408 +                       /* Is this an active IRQ? */
4409 +                       if (!irq_desc[j].action)
4410 +                               continue;
4411 +                       if ( package_index == i )
4412 +                               IRQ_DELTA(package_index,j) = 0;
4413 +                       /* Determine the total count per processor per IRQ */
4414 +                       value_now = (unsigned long) kstat_cpu(i).irqs[j];
4415 +
4416 +                       /* Determine the activity per processor per IRQ */
4417 +                       delta = value_now - LAST_CPU_IRQ(i,j);
4418 +
4419 +                       /* Update last_cpu_irq[][] for the next time */
4420 +                       LAST_CPU_IRQ(i,j) = value_now;
4421 +
4422 +                       /* Ignore IRQs whose rate is less than the clock */
4423 +                       if (delta < useful_load_threshold)
4424 +                               continue;
4425 +                       /* update the load for the processor or package total */
4426 +                       IRQ_DELTA(package_index,j) += delta;
4427 +
4428 +                       /* Keep track of the higher numbered sibling as well */
4429 +                       if (i != package_index)
4430 +                               CPU_IRQ(i) += delta;
4431 +                       /*
4432 +                        * We have sibling A and sibling B in the package
4433 +                        *
4434 +                        * cpu_irq[A] = load for cpu A + load for cpu B
4435 +                        * cpu_irq[B] = load for cpu B
4436 +                        */
4437 +                       CPU_IRQ(package_index) += delta;
4438 +               }
4439 +       }
4440 +       /* Find the least loaded processor package */
4441 +       for_each_online_cpu(i) {
4442 +               if (i != CPU_TO_PACKAGEINDEX(i))
4443 +                       continue;
4444 +               if (min_cpu_irq > CPU_IRQ(i)) {
4445 +                       min_cpu_irq = CPU_IRQ(i);
4446 +                       min_loaded = i;
4447 +               }
4448 +       }
4449 +       max_cpu_irq = ULONG_MAX;
4450 +
4451 +tryanothercpu:
4452 +       /* Look for heaviest loaded processor.
4453 +        * We may come back to get the next heaviest loaded processor.
4454 +        * Skip processors with trivial loads.
4455 +        */
4456 +       tmp_cpu_irq = 0;
4457 +       tmp_loaded = -1;
4458 +       for_each_online_cpu(i) {
4459 +               if (i != CPU_TO_PACKAGEINDEX(i))
4460 +                       continue;
4461 +               if (max_cpu_irq <= CPU_IRQ(i)) 
4462 +                       continue;
4463 +               if (tmp_cpu_irq < CPU_IRQ(i)) {
4464 +                       tmp_cpu_irq = CPU_IRQ(i);
4465 +                       tmp_loaded = i;
4466 +               }
4467 +       }
4468 +
4469 +       if (tmp_loaded == -1) {
4470 +        /* In the case of a small number of heavy interrupt sources,
4471 +         * some of the cpus can end up loaded too much. We use Ingo's original
4472 +         * approach to rotate them around.
4473 +         */
4474 +               if (!first_attempt && imbalance >= useful_load_threshold) {
4475 +                       rotate_irqs_among_cpus(useful_load_threshold);
4476 +                       return;
4477 +               }
4478 +               goto not_worth_the_effort;
4479 +       }
4480 +       
4481 +       first_attempt = 0;              /* heaviest search */
4482 +       max_cpu_irq = tmp_cpu_irq;      /* load */
4483 +       max_loaded = tmp_loaded;        /* processor */
4484 +       imbalance = (max_cpu_irq - min_cpu_irq) / 2;
4485 +       
4486 +       Dprintk("max_loaded cpu = %d\n", max_loaded);
4487 +       Dprintk("min_loaded cpu = %d\n", min_loaded);
4488 +       Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
4489 +       Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
4490 +       Dprintk("load imbalance = %lu\n", imbalance);
4491 +
4492 +       /* If the imbalance is less than approximately 10% of the max load,
4493 +        * we are in the region of diminishing returns, so quit.
4494 +        */
4495 +       if (imbalance < (max_cpu_irq >> 3)) {
4496 +               Dprintk("Imbalance too trivial\n");
4497 +               goto not_worth_the_effort;
4498 +       }
4499 +
4500 +tryanotherirq:
4501 +       /* if we select an IRQ to move that can't go where we want, then
4502 +        * see if there is another one to try.
4503 +        */
4504 +       move_this_load = 0;
4505 +       selected_irq = -1;
4506 +       for (j = 0; j < NR_IRQS; j++) {
4507 +               /* Is this an active IRQ? */
4508 +               if (!irq_desc[j].action)
4509 +                       continue;
4510 +               if (imbalance <= IRQ_DELTA(max_loaded,j))
4511 +                       continue;
4512 +               /* Try to find the IRQ that is closest to the imbalance
4513 +                * without going over.
4514 +                */
4515 +               if (move_this_load < IRQ_DELTA(max_loaded,j)) {
4516 +                       move_this_load = IRQ_DELTA(max_loaded,j);
4517 +                       selected_irq = j;
4518 +               }
4519 +       }
4520 +       if (selected_irq == -1) {
4521 +               goto tryanothercpu;
4522 +       }
4523 +
4524 +       imbalance = move_this_load;
4525 +       
4526 +       /* For the physical_balance case, we accumulated both load
4527 +        * values in one of the siblings' cpu_irq[],
4528 +        * to use the same code for physical and logical processors
4529 +        * as much as possible. 
4530 +        *
4531 +        * NOTE: the cpu_irq[] array holds the sum of the load for
4532 +        * sibling A and sibling B in the slot for the lowest numbered
4533 +        * sibling (A), _AND_ the load for sibling B in the slot for
4534 +        * the higher numbered sibling.
4535 +        *
4536 +        * We seek the least loaded sibling by making the comparison
4537 +        * (A+B)/2 vs B
4538 +        */
4539 +       load = CPU_IRQ(min_loaded) >> 1;
4540 +       for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
4541 +               if (load > CPU_IRQ(j)) {
4542 +                       /* This won't change cpu_sibling_map[min_loaded] */
4543 +                       load = CPU_IRQ(j);
4544 +                       min_loaded = j;
4545 +               }
4546 +       }
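
A worked example of the (A+B)/2 versus B comparison above: if sibling A handled 600 interrupts over the interval and sibling B handled 400, the slot for A holds 1000 and the slot for B holds 400; since 1000/2 = 500 > 400, sibling B is chosen as min_loaded, which indeed matches the smaller per-CPU load (400 < 600).
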
4547 +
4548 +       cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
4549 +       target_cpu_mask = cpumask_of_cpu(min_loaded);
4550 +       cpus_and(tmp, target_cpu_mask, allowed_mask);
4551 +
4552 +       if (!cpus_empty(tmp)) {
4553 +
4554 +               Dprintk("irq = %d moved to cpu = %d\n",
4555 +                               selected_irq, min_loaded);
4556 +               /* mark for change destination */
4557 +               set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
4558 +
4559 +               /* Since we made a change, come back sooner to 
4560 +                * check for more variation.
4561 +                */
4562 +               balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
4563 +                       balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
4564 +               return;
4565 +       }
4566 +       goto tryanotherirq;
4567 +
4568 +not_worth_the_effort:
4569 +       /*
4570 +        * if we did not find an IRQ to move, then adjust the time interval
4571 +        * upward
4572 +        */
4573 +       balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
4574 +               balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);       
4575 +       Dprintk("IRQ worth rotating not found\n");
4576 +       return;
4577 +}
4578 +
4579 +static int balanced_irq(void *unused)
4580 +{
4581 +       int i;
4582 +       unsigned long prev_balance_time = jiffies;
4583 +       long time_remaining = balanced_irq_interval;
4584 +
4585 +       daemonize("kirqd");
4586 +       
4587 +       /* push everything to CPU 0 to give us a starting point.  */
4588 +       for (i = 0 ; i < NR_IRQS ; i++) {
4589 +               pending_irq_cpumask[i] = cpumask_of_cpu(0);
4590 +               set_pending_irq(i, cpumask_of_cpu(0));
4591 +       }
4592 +
4593 +       for ( ; ; ) {
4594 +               time_remaining = schedule_timeout_interruptible(time_remaining);
4595 +               try_to_freeze();
4596 +               if (time_after(jiffies,
4597 +                               prev_balance_time+balanced_irq_interval)) {
4598 +                       preempt_disable();
4599 +                       do_irq_balance();
4600 +                       prev_balance_time = jiffies;
4601 +                       time_remaining = balanced_irq_interval;
4602 +                       preempt_enable();
4603 +               }
4604 +       }
4605 +       return 0;
4606 +}
4607 +
4608 +static int __init balanced_irq_init(void)
4609 +{
4610 +       int i;
4611 +       struct cpuinfo_x86 *c;
4612 +       cpumask_t tmp;
4613 +
4614 +       cpus_shift_right(tmp, cpu_online_map, 2);
4615 +        c = &boot_cpu_data;
4616 +       /* When not overridden on the command line, ask the subarchitecture. */
4617 +       if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
4618 +               irqbalance_disabled = NO_BALANCE_IRQ;
4619 +       if (irqbalance_disabled)
4620 +               return 0;
4621 +       
4622 +        /* disable irqbalance completely if there is only one processor online */
4623 +       if (num_online_cpus() < 2) {
4624 +               irqbalance_disabled = 1;
4625 +               return 0;
4626 +       }
4627 +       /*
4628 +        * Enable physical balance only if more than 1 physical processor
4629 +        * is present
4630 +        */
4631 +       if (smp_num_siblings > 1 && !cpus_empty(tmp))
4632 +               physical_balance = 1;
4633 +
4634 +       for_each_online_cpu(i) {
4635 +               irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
4636 +               irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
4637 +               if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
4638 +                       printk(KERN_ERR "balanced_irq_init: out of memory");
4639 +                       goto failed;
4640 +               }
4641 +               memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
4642 +               memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
4643 +       }
4644 +       
4645 +       printk(KERN_INFO "Starting balanced_irq\n");
4646 +       if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) 
4647 +               return 0;
4648 +       else 
4649 +               printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
4650 +failed:
4651 +       for_each_possible_cpu(i) {
4652 +               kfree(irq_cpu_data[i].irq_delta);
4653 +               irq_cpu_data[i].irq_delta = NULL;
4654 +               kfree(irq_cpu_data[i].last_irq);
4655 +               irq_cpu_data[i].last_irq = NULL;
4656 +       }
4657 +       return 0;
4658 +}
4659 +
4660 +int __init irqbalance_disable(char *str)
4661 +{
4662 +       irqbalance_disabled = 1;
4663 +       return 1;
4664 +}
4665 +
4666 +__setup("noirqbalance", irqbalance_disable);
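+
+/*
+ * Editorial note (not part of the original patch): passing "noirqbalance"
+ * on the kernel command line invokes the handler above and disables the
+ * in-kernel IRQ balancing daemon before balanced_irq_init() runs.
+ */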
4667 +
4668 +late_initcall(balanced_irq_init);
4669 +#endif /* CONFIG_IRQBALANCE */
4670 +#endif /* CONFIG_SMP */
4671 +#endif
4672 +
4673 +#ifndef CONFIG_SMP
4674 +void fastcall send_IPI_self(int vector)
4675 +{
4676 +#ifndef CONFIG_XEN
4677 +       unsigned int cfg;
4678 +
4679 +       /*
4680 +        * Wait for idle.
4681 +        */
4682 +       apic_wait_icr_idle();
4683 +       cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
4684 +       /*
4685 +        * Send the IPI. The write to APIC_ICR fires this off.
4686 +        */
4687 +       apic_write_around(APIC_ICR, cfg);
4688 +#endif
4689 +}
4690 +#endif /* !CONFIG_SMP */
4691 +
4692 +
4693 +/*
4694 + * Support for broken MP BIOSes: enables hand-redirection of PIRQ0-7 to
4695 + * specific CPU-side IRQs.
4696 + */
4697 +
4698 +#define MAX_PIRQS 8
4699 +static int pirq_entries [MAX_PIRQS];
4700 +static int pirqs_enabled;
4701 +int skip_ioapic_setup;
4702 +
4703 +static int __init ioapic_setup(char *str)
4704 +{
4705 +       skip_ioapic_setup = 1;
4706 +       return 1;
4707 +}
4708 +
4709 +__setup("noapic", ioapic_setup);
4710 +
4711 +static int __init ioapic_pirq_setup(char *str)
4712 +{
4713 +       int i, max;
4714 +       int ints[MAX_PIRQS+1];
4715 +
4716 +       get_options(str, ARRAY_SIZE(ints), ints);
4717 +
4718 +       for (i = 0; i < MAX_PIRQS; i++)
4719 +               pirq_entries[i] = -1;
4720 +
4721 +       pirqs_enabled = 1;
4722 +       apic_printk(APIC_VERBOSE, KERN_INFO
4723 +                       "PIRQ redirection, working around broken MP-BIOS.\n");
4724 +       max = MAX_PIRQS;
4725 +       if (ints[0] < MAX_PIRQS)
4726 +               max = ints[0];
4727 +
4728 +       for (i = 0; i < max; i++) {
4729 +               apic_printk(APIC_VERBOSE, KERN_DEBUG
4730 +                               "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
4731 +               /*
4732 +                * PIRQs are mapped upside down, usually.
4733 +                */
4734 +               pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
4735 +       }
4736 +       return 1;
4737 +}
4738 +
4739 +__setup("pirq=", ioapic_pirq_setup);
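+
+/*
+ * Illustrative usage (editorial note, not part of the original patch):
+ * booting with "pirq=5,11" stores 5 in pirq_entries[MAX_PIRQS-1] and 11 in
+ * pirq_entries[MAX_PIRQS-2]; pin_2_irq() below then redirects IO-APIC pins
+ * 23 and 22 to IRQs 5 and 11 respectively (the "upside down" mapping).
+ */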
4740 +
4741 +/*
4742 + * Find the IRQ entry number of a certain pin.
4743 + */
4744 +static int find_irq_entry(int apic, int pin, int type)
4745 +{
4746 +       int i;
4747 +
4748 +       for (i = 0; i < mp_irq_entries; i++)
4749 +               if (mp_irqs[i].mpc_irqtype == type &&
4750 +                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
4751 +                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
4752 +                   mp_irqs[i].mpc_dstirq == pin)
4753 +                       return i;
4754 +
4755 +       return -1;
4756 +}
4757 +
4758 +/*
4759 + * Find the pin to which IRQ[irq] (ISA) is connected
4760 + */
4761 +static int __init find_isa_irq_pin(int irq, int type)
4762 +{
4763 +       int i;
4764 +
4765 +       for (i = 0; i < mp_irq_entries; i++) {
4766 +               int lbus = mp_irqs[i].mpc_srcbus;
4767 +
4768 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
4769 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
4770 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
4771 +                    mp_bus_id_to_type[lbus] == MP_BUS_NEC98
4772 +                   ) &&
4773 +                   (mp_irqs[i].mpc_irqtype == type) &&
4774 +                   (mp_irqs[i].mpc_srcbusirq == irq))
4775 +
4776 +                       return mp_irqs[i].mpc_dstirq;
4777 +       }
4778 +       return -1;
4779 +}
4780 +
4781 +static int __init find_isa_irq_apic(int irq, int type)
4782 +{
4783 +       int i;
4784 +
4785 +       for (i = 0; i < mp_irq_entries; i++) {
4786 +               int lbus = mp_irqs[i].mpc_srcbus;
4787 +
4788 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
4789 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
4790 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
4791 +                    mp_bus_id_to_type[lbus] == MP_BUS_NEC98
4792 +                   ) &&
4793 +                   (mp_irqs[i].mpc_irqtype == type) &&
4794 +                   (mp_irqs[i].mpc_srcbusirq == irq))
4795 +                       break;
4796 +       }
4797 +       if (i < mp_irq_entries) {
4798 +               int apic;
4799 +               for(apic = 0; apic < nr_ioapics; apic++) {
4800 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
4801 +                               return apic;
4802 +               }
4803 +       }
4804 +
4805 +       return -1;
4806 +}
4807 +
4808 +/*
4809 + * Find a specific PCI IRQ entry.
4810 + * Not an __init, possibly needed by modules
4811 + */
4812 +static int pin_2_irq(int idx, int apic, int pin);
4813 +
4814 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
4815 +{
4816 +       int apic, i, best_guess = -1;
4817 +
4818 +       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
4819 +               "slot:%d, pin:%d.\n", bus, slot, pin);
4820 +       if (mp_bus_id_to_pci_bus[bus] == -1) {
4821 +               printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
4822 +               return -1;
4823 +       }
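+       /*
+        * Editorial note (assumption based on the MP specification, not part
+        * of the original patch): for PCI source buses, mpc_srcbusirq packs
+        * the device number in bits 2..6 and the INT pin (A..D) in bits 0..1,
+        * which is why the slot and pin are extracted with ">> 2 & 0x1f" and
+        * "& 3" below.
+        */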
4824 +       for (i = 0; i < mp_irq_entries; i++) {
4825 +               int lbus = mp_irqs[i].mpc_srcbus;
4826 +
4827 +               for (apic = 0; apic < nr_ioapics; apic++)
4828 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
4829 +                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
4830 +                               break;
4831 +
4832 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
4833 +                   !mp_irqs[i].mpc_irqtype &&
4834 +                   (bus == lbus) &&
4835 +                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
4836 +                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
4837 +
4838 +                       if (!(apic || IO_APIC_IRQ(irq)))
4839 +                               continue;
4840 +
4841 +                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
4842 +                               return irq;
4843 +                       /*
4844 +                        * Use the first all-but-pin matching entry as a
4845 +                        * best-guess fuzzy result for broken mptables.
4846 +                        */
4847 +                       if (best_guess < 0)
4848 +                               best_guess = irq;
4849 +               }
4850 +       }
4851 +       return best_guess;
4852 +}
4853 +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
4854 +
4855 +/*
4856 + * This function is currently only a helper for the i386 SMP boot process,
4857 + * where we need to reprogram the ioredtbls to cater for the CPUs which have
4858 + * come online, so the mask in all cases should simply be TARGET_CPUS.
4859 + */
4860 +#ifdef CONFIG_SMP
4861 +#ifndef CONFIG_XEN
4862 +void __init setup_ioapic_dest(void)
4863 +{
4864 +       int pin, ioapic, irq, irq_entry;
4865 +
4866 +       if (skip_ioapic_setup == 1)
4867 +               return;
4868 +
4869 +       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
4870 +               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4871 +                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4872 +                       if (irq_entry == -1)
4873 +                               continue;
4874 +                       irq = pin_2_irq(irq_entry, ioapic, pin);
4875 +                       set_ioapic_affinity_irq(irq, TARGET_CPUS);
4876 +               }
4877 +
4878 +       }
4879 +}
4880 +#endif /* !CONFIG_XEN */
4881 +#endif
4882 +
4883 +/*
4884 + * EISA Edge/Level control register, ELCR
4885 + */
4886 +static int EISA_ELCR(unsigned int irq)
4887 +{
4888 +       if (irq < 16) {
4889 +               unsigned int port = 0x4d0 + (irq >> 3);
4890 +               return (inb(port) >> (irq & 7)) & 1;
4891 +       }
4892 +       apic_printk(APIC_VERBOSE, KERN_INFO
4893 +                       "Broken MPtable reports ISA irq %d\n", irq);
4894 +       return 0;
4895 +}
4896 +
4897 +/* EISA interrupts are always polarity zero and can be edge- or level-
4898 + * triggered depending on the ELCR value.  If an interrupt is listed as
4899 + * EISA conforming in the MP table, that means its trigger type must
4900 + * be read in from the ELCR */
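+/*
+ * Worked example (editorial note, not part of the original patch): for
+ * irq 10, EISA_ELCR() above reads port 0x4d0 + (10 >> 3) = 0x4d1 and tests
+ * bit 10 & 7 = 2, i.e. bit 2 of the second ELCR register.
+ */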
4901 +
4902 +#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
4903 +#define default_EISA_polarity(idx)     (0)
4904 +
4905 +/* ISA interrupts are always polarity zero edge triggered,
4906 + * when listed as conforming in the MP table. */
4907 +
4908 +#define default_ISA_trigger(idx)       (0)
4909 +#define default_ISA_polarity(idx)      (0)
4910 +
4911 +/* PCI interrupts are always polarity one level triggered,
4912 + * when listed as conforming in the MP table. */
4913 +
4914 +#define default_PCI_trigger(idx)       (1)
4915 +#define default_PCI_polarity(idx)      (1)
4916 +
4917 +/* MCA interrupts are always polarity zero level triggered,
4918 + * when listed as conforming in the MP table. */
4919 +
4920 +#define default_MCA_trigger(idx)       (1)
4921 +#define default_MCA_polarity(idx)      (0)
4922 +
4923 +/* NEC98 interrupts are always polarity zero edge triggered,
4924 + * when listed as conforming in the MP table. */
4925 +
4926 +#define default_NEC98_trigger(idx)     (0)
4927 +#define default_NEC98_polarity(idx)    (0)
4928 +
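+/*
+ * Editorial note (assumption based on the MP specification, not part of the
+ * original patch): mpc_irqflag encodes the polarity in bits 0..1 and the
+ * trigger mode in bits 2..3, which is what the two decoders below mask out.
+ */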
4929 +static int __init MPBIOS_polarity(int idx)
4930 +{
4931 +       int bus = mp_irqs[idx].mpc_srcbus;
4932 +       int polarity;
4933 +
4934 +       /*
4935 +        * Determine IRQ line polarity (high active or low active):
4936 +        */
4937 +       switch (mp_irqs[idx].mpc_irqflag & 3)
4938 +       {
4939 +               case 0: /* conforms, ie. bus-type dependent polarity */
4940 +               {
4941 +                       switch (mp_bus_id_to_type[bus])
4942 +                       {
4943 +                               case MP_BUS_ISA: /* ISA pin */
4944 +                               {
4945 +                                       polarity = default_ISA_polarity(idx);
4946 +                                       break;
4947 +                               }
4948 +                               case MP_BUS_EISA: /* EISA pin */
4949 +                               {
4950 +                                       polarity = default_EISA_polarity(idx);
4951 +                                       break;
4952 +                               }
4953 +                               case MP_BUS_PCI: /* PCI pin */
4954 +                               {
4955 +                                       polarity = default_PCI_polarity(idx);
4956 +                                       break;
4957 +                               }
4958 +                               case MP_BUS_MCA: /* MCA pin */
4959 +                               {
4960 +                                       polarity = default_MCA_polarity(idx);
4961 +                                       break;
4962 +                               }
4963 +                               case MP_BUS_NEC98: /* NEC 98 pin */
4964 +                               {
4965 +                                       polarity = default_NEC98_polarity(idx);
4966 +                                       break;
4967 +                               }
4968 +                               default:
4969 +                               {
4970 +                                       printk(KERN_WARNING "broken BIOS!!\n");
4971 +                                       polarity = 1;
4972 +                                       break;
4973 +                               }
4974 +                       }
4975 +                       break;
4976 +               }
4977 +               case 1: /* high active */
4978 +               {
4979 +                       polarity = 0;
4980 +                       break;
4981 +               }
4982 +               case 2: /* reserved */
4983 +               {
4984 +                       printk(KERN_WARNING "broken BIOS!!\n");
4985 +                       polarity = 1;
4986 +                       break;
4987 +               }
4988 +               case 3: /* low active */
4989 +               {
4990 +                       polarity = 1;
4991 +                       break;
4992 +               }
4993 +               default: /* invalid */
4994 +               {
4995 +                       printk(KERN_WARNING "broken BIOS!!\n");
4996 +                       polarity = 1;
4997 +                       break;
4998 +               }
4999 +       }
5000 +       return polarity;
5001 +}
5002 +
5003 +static int MPBIOS_trigger(int idx)
5004 +{
5005 +       int bus = mp_irqs[idx].mpc_srcbus;
5006 +       int trigger;
5007 +
5008 +       /*
5009 +        * Determine IRQ trigger mode (edge or level sensitive):
5010 +        */
5011 +       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
5012 +       {
5013 +               case 0: /* conforms, ie. bus-type dependent */
5014 +               {
5015 +                       switch (mp_bus_id_to_type[bus])
5016 +                       {
5017 +                               case MP_BUS_ISA: /* ISA pin */
5018 +                               {
5019 +                                       trigger = default_ISA_trigger(idx);
5020 +                                       break;
5021 +                               }
5022 +                               case MP_BUS_EISA: /* EISA pin */
5023 +                               {
5024 +                                       trigger = default_EISA_trigger(idx);
5025 +                                       break;
5026 +                               }
5027 +                               case MP_BUS_PCI: /* PCI pin */
5028 +                               {
5029 +                                       trigger = default_PCI_trigger(idx);
5030 +                                       break;
5031 +                               }
5032 +                               case MP_BUS_MCA: /* MCA pin */
5033 +                               {
5034 +                                       trigger = default_MCA_trigger(idx);
5035 +                                       break;
5036 +                               }
5037 +                               case MP_BUS_NEC98: /* NEC 98 pin */
5038 +                               {
5039 +                                       trigger = default_NEC98_trigger(idx);
5040 +                                       break;
5041 +                               }
5042 +                               default:
5043 +                               {
5044 +                                       printk(KERN_WARNING "broken BIOS!!\n");
5045 +                                       trigger = 1;
5046 +                                       break;
5047 +                               }
5048 +                       }
5049 +                       break;
5050 +               }
5051 +               case 1: /* edge */
5052 +               {
5053 +                       trigger = 0;
5054 +                       break;
5055 +               }
5056 +               case 2: /* reserved */
5057 +               {
5058 +                       printk(KERN_WARNING "broken BIOS!!\n");
5059 +                       trigger = 1;
5060 +                       break;
5061 +               }
5062 +               case 3: /* level */
5063 +               {
5064 +                       trigger = 1;
5065 +                       break;
5066 +               }
5067 +               default: /* invalid */
5068 +               {
5069 +                       printk(KERN_WARNING "broken BIOS!!\n");
5070 +                       trigger = 0;
5071 +                       break;
5072 +               }
5073 +       }
5074 +       return trigger;
5075 +}
5076 +
5077 +static inline int irq_polarity(int idx)
5078 +{
5079 +       return MPBIOS_polarity(idx);
5080 +}
5081 +
5082 +static inline int irq_trigger(int idx)
5083 +{
5084 +       return MPBIOS_trigger(idx);
5085 +}
5086 +
5087 +static int pin_2_irq(int idx, int apic, int pin)
5088 +{
5089 +       int irq, i;
5090 +       int bus = mp_irqs[idx].mpc_srcbus;
5091 +
5092 +       /*
5093 +        * Debugging check, we are in big trouble if this message pops up!
5094 +        */
5095 +       if (mp_irqs[idx].mpc_dstirq != pin)
5096 +               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
5097 +
5098 +       switch (mp_bus_id_to_type[bus])
5099 +       {
5100 +               case MP_BUS_ISA: /* ISA pin */
5101 +               case MP_BUS_EISA:
5102 +               case MP_BUS_MCA:
5103 +               case MP_BUS_NEC98:
5104 +               {
5105 +                       irq = mp_irqs[idx].mpc_srcbusirq;
5106 +                       break;
5107 +               }
5108 +               case MP_BUS_PCI: /* PCI pin */
5109 +               {
5110 +                       /*
5111 +                        * PCI IRQs are mapped in order
5112 +                        */
5113 +                       i = irq = 0;
5114 +                       while (i < apic)
5115 +                               irq += nr_ioapic_registers[i++];
5116 +                       irq += pin;
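+                       /*
+                        * Illustrative example (editorial note, not part of
+                        * the original patch): with two 24-pin IO-APICs,
+                        * apic 1 / pin 3 maps to irq 24 + 3 = 27, since
+                        * pins are numbered sequentially across IO-APICs.
+                        */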
5117 +
5118 +                       /*
5119 +                        * For MPS mode, so far only needed by ES7000 platform
5120 +                        */
5121 +                       if (ioapic_renumber_irq)
5122 +                               irq = ioapic_renumber_irq(apic, irq);
5123 +
5124 +                       break;
5125 +               }
5126 +               default:
5127 +               {
5128 +                       printk(KERN_ERR "unknown bus type %d.\n",bus); 
5129 +                       irq = 0;
5130 +                       break;
5131 +               }
5132 +       }
5133 +
5134 +       /*
5135 +        * PCI IRQ command line redirection. Yes, limits are hardcoded.
5136 +        */
5137 +       if ((pin >= 16) && (pin <= 23)) {
5138 +               if (pirq_entries[pin-16] != -1) {
5139 +                       if (!pirq_entries[pin-16]) {
5140 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG
5141 +                                               "disabling PIRQ%d\n", pin-16);
5142 +                       } else {
5143 +                               irq = pirq_entries[pin-16];
5144 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG
5145 +                                               "using PIRQ%d -> IRQ %d\n",
5146 +                                               pin-16, irq);
5147 +                       }
5148 +               }
5149 +       }
5150 +       return irq;
5151 +}
5152 +
5153 +static inline int IO_APIC_irq_trigger(int irq)
5154 +{
5155 +       int apic, idx, pin;
5156 +
5157 +       for (apic = 0; apic < nr_ioapics; apic++) {
5158 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5159 +                       idx = find_irq_entry(apic,pin,mp_INT);
5160 +                       if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
5161 +                               return irq_trigger(idx);
5162 +               }
5163 +       }
5164 +       /*
5165 +        * nonexistent IRQs are edge default
5166 +        */
5167 +       return 0;
5168 +}
5169 +
5170 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
5171 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
5172 +
5173 +int assign_irq_vector(int irq)
5174 +{
5175 +       struct physdev_irq irq_op;
5176 +
5177 +       BUG_ON(irq >= NR_IRQ_VECTORS);
5178 +       if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
5179 +               return IO_APIC_VECTOR(irq);
5180 +
5181 +       irq_op.irq = irq;
5182 +       if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
5183 +               return -ENOSPC;
5184 +
5185 +       vector_irq[irq_op.vector] = irq;
5186 +       if (irq != AUTO_ASSIGN)
5187 +               IO_APIC_VECTOR(irq) = irq_op.vector;
5188 +
5189 +       return irq_op.vector;
5190 +}
5191 +
5192 +#ifndef CONFIG_XEN
5193 +static struct hw_interrupt_type ioapic_level_type;
5194 +static struct hw_interrupt_type ioapic_edge_type;
5195 +
5196 +#define IOAPIC_AUTO    -1
5197 +#define IOAPIC_EDGE    0
5198 +#define IOAPIC_LEVEL   1
5199 +
5200 +static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
5201 +{
5202 +       if (use_pci_vector() && !platform_legacy_irq(irq)) {
5203 +               if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
5204 +                               trigger == IOAPIC_LEVEL)
5205 +                       irq_desc[vector].handler = &ioapic_level_type;
5206 +               else
5207 +                       irq_desc[vector].handler = &ioapic_edge_type;
5208 +               set_intr_gate(vector, interrupt[vector]);
5209 +       } else  {
5210 +               if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
5211 +                               trigger == IOAPIC_LEVEL)
5212 +                       irq_desc[irq].handler = &ioapic_level_type;
5213 +               else
5214 +                       irq_desc[irq].handler = &ioapic_edge_type;
5215 +               set_intr_gate(vector, interrupt[irq]);
5216 +       }
5217 +}
5218 +#else
5219 +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
5220 +#endif
5221 +
5222 +static void __init setup_IO_APIC_irqs(void)
5223 +{
5224 +       struct IO_APIC_route_entry entry;
5225 +       int apic, pin, idx, irq, first_notcon = 1, vector;
5226 +       unsigned long flags;
5227 +
5228 +       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
5229 +
5230 +       for (apic = 0; apic < nr_ioapics; apic++) {
5231 +       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5232 +
5233 +               /*
5234 +                * add it to the IO-APIC irq-routing table:
5235 +                */
5236 +               memset(&entry,0,sizeof(entry));
5237 +
5238 +               entry.delivery_mode = INT_DELIVERY_MODE;
5239 +               entry.dest_mode = INT_DEST_MODE;
5240 +               entry.mask = 0;                         /* enable IRQ */
5241 +               entry.dest.logical.logical_dest = 
5242 +                                       cpu_mask_to_apicid(TARGET_CPUS);
5243 +
5244 +               idx = find_irq_entry(apic,pin,mp_INT);
5245 +               if (idx == -1) {
5246 +                       if (first_notcon) {
5247 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG
5248 +                                               " IO-APIC (apicid-pin) %d-%d",
5249 +                                               mp_ioapics[apic].mpc_apicid,
5250 +                                               pin);
5251 +                               first_notcon = 0;
5252 +                       } else
5253 +                               apic_printk(APIC_VERBOSE, ", %d-%d",
5254 +                                       mp_ioapics[apic].mpc_apicid, pin);
5255 +                       continue;
5256 +               }
5257 +
5258 +               entry.trigger = irq_trigger(idx);
5259 +               entry.polarity = irq_polarity(idx);
5260 +
5261 +               if (irq_trigger(idx)) {
5262 +                       entry.trigger = 1;
5263 +                       entry.mask = 1;
5264 +               }
5265 +
5266 +               irq = pin_2_irq(idx, apic, pin);
5267 +               /*
5268 +                * skip adding the timer int on secondary nodes, which causes
5269 +                * a small but painful rift in the time-space continuum
5270 +                */
5271 +               if (multi_timer_check(apic, irq))
5272 +                       continue;
5273 +               else
5274 +                       add_pin_to_irq(irq, apic, pin);
5275 +
5276 +               if (/*!apic &&*/ !IO_APIC_IRQ(irq))
5277 +                       continue;
5278 +
5279 +               if (IO_APIC_IRQ(irq)) {
5280 +                       vector = assign_irq_vector(irq);
5281 +                       entry.vector = vector;
5282 +                       ioapic_register_intr(irq, vector, IOAPIC_AUTO);
5283 +               
5284 +                       if (!apic && (irq < 16))
5285 +                               disable_8259A_irq(irq);
5286 +               }
5287 +               spin_lock_irqsave(&ioapic_lock, flags);
5288 +               io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
5289 +               io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
5290 +               set_native_irq_info(irq, TARGET_CPUS);
5291 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5292 +       }
5293 +       }
5294 +
5295 +       if (!first_notcon)
5296 +               apic_printk(APIC_VERBOSE, " not connected.\n");
5297 +}
5298 +
5299 +/*
5300 + * Set up the 8259A-master output pin:
5301 + */
5302 +#ifndef CONFIG_XEN
5303 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
5304 +{
5305 +       struct IO_APIC_route_entry entry;
5306 +       unsigned long flags;
5307 +
5308 +       memset(&entry,0,sizeof(entry));
5309 +
5310 +       disable_8259A_irq(0);
5311 +
5312 +       /* mask LVT0 */
5313 +       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
5314 +
5315 +       /*
5316 +        * We use logical delivery to get the timer IRQ
5317 +        * to the first CPU.
5318 +        */
5319 +       entry.dest_mode = INT_DEST_MODE;
5320 +       entry.mask = 0;                                 /* unmask IRQ now */
5321 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
5322 +       entry.delivery_mode = INT_DELIVERY_MODE;
5323 +       entry.polarity = 0;
5324 +       entry.trigger = 0;
5325 +       entry.vector = vector;
5326 +
5327 +       /*
5328 +        * The timer IRQ doesn't have to know that behind the
5329 +        * scenes we have an 8259A master in AEOI mode ...
5330 +        */
5331 +       irq_desc[0].handler = &ioapic_edge_type;
5332 +
5333 +       /*
5334 +        * Add it to the IO-APIC irq-routing table:
5335 +        */
5336 +       spin_lock_irqsave(&ioapic_lock, flags);
5337 +       io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
5338 +       io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
5339 +       spin_unlock_irqrestore(&ioapic_lock, flags);
5340 +
5341 +       enable_8259A_irq(0);
5342 +}
5343 +
5344 +static inline void UNEXPECTED_IO_APIC(void)
5345 +{
5346 +}
5347 +
5348 +void __init print_IO_APIC(void)
5349 +{
5350 +       int apic, i;
5351 +       union IO_APIC_reg_00 reg_00;
5352 +       union IO_APIC_reg_01 reg_01;
5353 +       union IO_APIC_reg_02 reg_02;
5354 +       union IO_APIC_reg_03 reg_03;
5355 +       unsigned long flags;
5356 +
5357 +       if (apic_verbosity == APIC_QUIET)
5358 +               return;
5359 +
5360 +       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
5361 +       for (i = 0; i < nr_ioapics; i++)
5362 +               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
5363 +                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
5364 +
5365 +       /*
5366 +        * We are a bit conservative about what we expect.  We have to
5367 +        * know about every hardware change ASAP.
5368 +        */
5369 +       printk(KERN_INFO "testing the IO APIC.......................\n");
5370 +
5371 +       for (apic = 0; apic < nr_ioapics; apic++) {
5372 +
5373 +       spin_lock_irqsave(&ioapic_lock, flags);
5374 +       reg_00.raw = io_apic_read(apic, 0);
5375 +       reg_01.raw = io_apic_read(apic, 1);
5376 +       if (reg_01.bits.version >= 0x10)
5377 +               reg_02.raw = io_apic_read(apic, 2);
5378 +       if (reg_01.bits.version >= 0x20)
5379 +               reg_03.raw = io_apic_read(apic, 3);
5380 +       spin_unlock_irqrestore(&ioapic_lock, flags);
5381 +
5382 +       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
5383 +       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
5384 +       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
5385 +       printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
5386 +       printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
5387 +       if (reg_00.bits.ID >= get_physical_broadcast())
5388 +               UNEXPECTED_IO_APIC();
5389 +       if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
5390 +               UNEXPECTED_IO_APIC();
5391 +
5392 +       printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
5393 +       printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
5394 +       if (    (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
5395 +               (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
5396 +               (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
5397 +               (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
5398 +               (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
5399 +               (reg_01.bits.entries != 0x2E) &&
5400 +               (reg_01.bits.entries != 0x3F)
5401 +       )
5402 +               UNEXPECTED_IO_APIC();
5403 +
5404 +       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
5405 +       printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
5406 +       if (    (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
5407 +               (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
5408 +               (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
5409 +               (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
5410 +               (reg_01.bits.version != 0x20)    /* Intel P64H (82806 AA) */
5411 +       )
5412 +               UNEXPECTED_IO_APIC();
5413 +       if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
5414 +               UNEXPECTED_IO_APIC();
5415 +
5416 +       /*
5417 +        * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
5418 +        * but the value of reg_02 is read as the previous read register
5419 +        * value, so ignore it if reg_02 == reg_01.
5420 +        */
5421 +       if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
5422 +               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
5423 +               printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
5424 +               if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
5425 +                       UNEXPECTED_IO_APIC();
5426 +       }
5427 +
5428 +       /*
5429 +        * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
5430 +        * or reg_03, but the value of reg_0[23] is read as the previous read
5431 +        * register value, so ignore it if reg_03 == reg_0[12].
5432 +        */
5433 +       if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
5434 +           reg_03.raw != reg_01.raw) {
5435 +               printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
5436 +               printk(KERN_DEBUG ".......     : Boot DT    : %X\n", reg_03.bits.boot_DT);
5437 +               if (reg_03.bits.__reserved_1)
5438 +                       UNEXPECTED_IO_APIC();
5439 +       }
5440 +
5441 +       printk(KERN_DEBUG ".... IRQ redirection table:\n");
5442 +
5443 +       printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
5444 +                         " Stat Dest Deli Vect:   \n");
5445 +
5446 +       for (i = 0; i <= reg_01.bits.entries; i++) {
5447 +               struct IO_APIC_route_entry entry;
5448 +
5449 +               spin_lock_irqsave(&ioapic_lock, flags);
5450 +               *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
5451 +               *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
5452 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5453 +
5454 +               printk(KERN_DEBUG " %02x %03X %02X  ",
5455 +                       i,
5456 +                       entry.dest.logical.logical_dest,
5457 +                       entry.dest.physical.physical_dest
5458 +               );
5459 +
5460 +               printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
5461 +                       entry.mask,
5462 +                       entry.trigger,
5463 +                       entry.irr,
5464 +                       entry.polarity,
5465 +                       entry.delivery_status,
5466 +                       entry.dest_mode,
5467 +                       entry.delivery_mode,
5468 +                       entry.vector
5469 +               );
5470 +       }
5471 +       }
5472 +       if (use_pci_vector())
5473 +               printk(KERN_INFO "Using vector-based indexing\n");
5474 +       printk(KERN_DEBUG "IRQ to pin mappings:\n");
5475 +       for (i = 0; i < NR_IRQS; i++) {
5476 +               struct irq_pin_list *entry = irq_2_pin + i;
5477 +               if (entry->pin < 0)
5478 +                       continue;
5479 +               if (use_pci_vector() && !platform_legacy_irq(i))
5480 +                       printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
5481 +               else
5482 +                       printk(KERN_DEBUG "IRQ%d ", i);
5483 +               for (;;) {
5484 +                       printk("-> %d:%d", entry->apic, entry->pin);
5485 +                       if (!entry->next)
5486 +                               break;
5487 +                       entry = irq_2_pin + entry->next;
5488 +               }
5489 +               printk("\n");
5490 +       }
5491 +
5492 +       printk(KERN_INFO ".................................... done.\n");
5493 +
5494 +       return;
5495 +}
5496 +
5497 +#if 0
5498 +
5499 +static void print_APIC_bitfield (int base)
5500 +{
5501 +       unsigned int v;
5502 +       int i, j;
5503 +
5504 +       if (apic_verbosity == APIC_QUIET)
5505 +               return;
5506 +
5507 +       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
5508 +       for (i = 0; i < 8; i++) {
5509 +               v = apic_read(base + i*0x10);
5510 +               for (j = 0; j < 32; j++) {
5511 +                       if (v & (1<<j))
5512 +                               printk("1");
5513 +                       else
5514 +                               printk("0");
5515 +               }
5516 +               printk("\n");
5517 +       }
5518 +}
5519 +
5520 +void /*__init*/ print_local_APIC(void * dummy)
5521 +{
5522 +       unsigned int v, ver, maxlvt;
5523 +
5524 +       if (apic_verbosity == APIC_QUIET)
5525 +               return;
5526 +
5527 +       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
5528 +               smp_processor_id(), hard_smp_processor_id());
5529 +       v = apic_read(APIC_ID);
5530 +       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
5531 +       v = apic_read(APIC_LVR);
5532 +       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
5533 +       ver = GET_APIC_VERSION(v);
5534 +       maxlvt = get_maxlvt();
5535 +
5536 +       v = apic_read(APIC_TASKPRI);
5537 +       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
5538 +
5539 +       if (APIC_INTEGRATED(ver)) {                     /* !82489DX */
5540 +               v = apic_read(APIC_ARBPRI);
5541 +               printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
5542 +                       v & APIC_ARBPRI_MASK);
5543 +               v = apic_read(APIC_PROCPRI);
5544 +               printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
5545 +       }
5546 +
5547 +       v = apic_read(APIC_EOI);
5548 +       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
5549 +       v = apic_read(APIC_RRR);
5550 +       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
5551 +       v = apic_read(APIC_LDR);
5552 +       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
5553 +       v = apic_read(APIC_DFR);
5554 +       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
5555 +       v = apic_read(APIC_SPIV);
5556 +       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
5557 +
5558 +       printk(KERN_DEBUG "... APIC ISR field:\n");
5559 +       print_APIC_bitfield(APIC_ISR);
5560 +       printk(KERN_DEBUG "... APIC TMR field:\n");
5561 +       print_APIC_bitfield(APIC_TMR);
5562 +       printk(KERN_DEBUG "... APIC IRR field:\n");
5563 +       print_APIC_bitfield(APIC_IRR);
5564 +
5565 +       if (APIC_INTEGRATED(ver)) {             /* !82489DX */
5566 +               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
5567 +                       apic_write(APIC_ESR, 0);
5568 +               v = apic_read(APIC_ESR);
5569 +               printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
5570 +       }
5571 +
5572 +       v = apic_read(APIC_ICR);
5573 +       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
5574 +       v = apic_read(APIC_ICR2);
5575 +       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
5576 +
5577 +       v = apic_read(APIC_LVTT);
5578 +       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
5579 +
5580 +       if (maxlvt > 3) {                       /* PC is LVT#4. */
5581 +               v = apic_read(APIC_LVTPC);
5582 +               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
5583 +       }
5584 +       v = apic_read(APIC_LVT0);
5585 +       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
5586 +       v = apic_read(APIC_LVT1);
5587 +       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
5588 +
5589 +       if (maxlvt > 2) {                       /* ERR is LVT#3. */
5590 +               v = apic_read(APIC_LVTERR);
5591 +               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
5592 +       }
5593 +
5594 +       v = apic_read(APIC_TMICT);
5595 +       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
5596 +       v = apic_read(APIC_TMCCT);
5597 +       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
5598 +       v = apic_read(APIC_TDCR);
5599 +       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
5600 +       printk("\n");
5601 +}
5602 +
5603 +void print_all_local_APICs (void)
5604 +{
5605 +       on_each_cpu(print_local_APIC, NULL, 1, 1);
5606 +}
5607 +
5608 +void /*__init*/ print_PIC(void)
5609 +{
5610 +       unsigned int v;
5611 +       unsigned long flags;
5612 +
5613 +       if (apic_verbosity == APIC_QUIET)
5614 +               return;
5615 +
5616 +       printk(KERN_DEBUG "\nprinting PIC contents\n");
5617 +
5618 +       spin_lock_irqsave(&i8259A_lock, flags);
5619 +
5620 +       v = inb(0xa1) << 8 | inb(0x21);
5621 +       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
5622 +
5623 +       v = inb(0xa0) << 8 | inb(0x20);
5624 +       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
5625 +
5626 +       outb(0x0b,0xa0);
5627 +       outb(0x0b,0x20);
5628 +       v = inb(0xa0) << 8 | inb(0x20);
5629 +       outb(0x0a,0xa0);
5630 +       outb(0x0a,0x20);
5631 +
5632 +       spin_unlock_irqrestore(&i8259A_lock, flags);
5633 +
5634 +       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
5635 +
5636 +       v = inb(0x4d1) << 8 | inb(0x4d0);
5637 +       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
5638 +}
5639 +
5640 +#endif  /*  0  */
5641 +
5642 +#else
5643 +void __init print_IO_APIC(void) { }
5644 +#endif /* !CONFIG_XEN */
5645 +
5646 +static void __init enable_IO_APIC(void)
5647 +{
5648 +       union IO_APIC_reg_01 reg_01;
5649 +       int i8259_apic, i8259_pin;
5650 +       int i, apic;
5651 +       unsigned long flags;
5652 +
5653 +       for (i = 0; i < PIN_MAP_SIZE; i++) {
5654 +               irq_2_pin[i].pin = -1;
5655 +               irq_2_pin[i].next = 0;
5656 +       }
5657 +       if (!pirqs_enabled)
5658 +               for (i = 0; i < MAX_PIRQS; i++)
5659 +                       pirq_entries[i] = -1;
5660 +
5661 +       /*
5662 +        * The number of IO-APIC IRQ registers (== #pins):
5663 +        */
5664 +       for (apic = 0; apic < nr_ioapics; apic++) {
5665 +               spin_lock_irqsave(&ioapic_lock, flags);
5666 +               reg_01.raw = io_apic_read(apic, 1);
5667 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5668 +               nr_ioapic_registers[apic] = reg_01.bits.entries+1;
5669 +       }
5670 +       for(apic = 0; apic < nr_ioapics; apic++) {
5671 +               int pin;
5672 +               /* See if any of the pins is in ExtINT mode */
5673 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5674 +                       struct IO_APIC_route_entry entry;
5675 +                       spin_lock_irqsave(&ioapic_lock, flags);
5676 +                       *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
5677 +                       *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
5678 +                       spin_unlock_irqrestore(&ioapic_lock, flags);
5679 +
5680 +
5681 +                       /* If the interrupt line is enabled and in ExtInt mode,
5682 +                        * we have found the pin where the i8259 is connected.
5683 +                        */
5684 +                       if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
5685 +                               ioapic_i8259.apic = apic;
5686 +                               ioapic_i8259.pin  = pin;
5687 +                               goto found_i8259;
5688 +                       }
5689 +               }
5690 +       }
5691 + found_i8259:
5692 +       /* Look to see if the MP table has reported the ExtINT */
5693 +       /* If we could not find the appropriate pin by looking at the ioapic,
5694 +        * the i8259 is probably not connected to the ioapic, but give the
5695 +        * mptable a chance anyway.
5696 +        */
5697 +       i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
5698 +       i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
5699 +       /* Trust the MP table if nothing is setup in the hardware */
5700 +       if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
5701 +               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
5702 +               ioapic_i8259.pin  = i8259_pin;
5703 +               ioapic_i8259.apic = i8259_apic;
5704 +       }
5705 +       /* Complain if the MP table and the hardware disagree */
5706 +       if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
5707 +               (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
5708 +       {
5709 +               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
5710 +       }
5711 +
5712 +       /*
5713 +        * Do not trust the IO-APIC being empty at bootup
5714 +        */
5715 +       clear_IO_APIC();
5716 +}
5717 +
5718 +/*
5719 + * Not an __init, needed by the reboot code
5720 + */
5721 +void disable_IO_APIC(void)
5722 +{
5723 +       /*
5724 +        * Clear the IO-APIC before rebooting:
5725 +        */
5726 +       clear_IO_APIC();
5727 +
5728 +#ifndef CONFIG_XEN
5729 +       /*
5730 +        * If the i8259 is routed through an IOAPIC,
5731 +        * put that IOAPIC in virtual wire mode
5732 +        * so that legacy interrupts can be delivered.
5733 +        */
5734 +       if (ioapic_i8259.pin != -1) {
5735 +               struct IO_APIC_route_entry entry;
5736 +               unsigned long flags;
5737 +
5738 +               memset(&entry, 0, sizeof(entry));
5739 +               entry.mask            = 0; /* Enabled */
5740 +               entry.trigger         = 0; /* Edge */
5741 +               entry.irr             = 0;
5742 +               entry.polarity        = 0; /* High */
5743 +               entry.delivery_status = 0;
5744 +               entry.dest_mode       = 0; /* Physical */
5745 +               entry.delivery_mode   = dest_ExtINT; /* ExtInt */
5746 +               entry.vector          = 0;
5747 +               entry.dest.physical.physical_dest =
5748 +                                       GET_APIC_ID(apic_read(APIC_ID));
5749 +
5750 +               /*
5751 +                * Add it to the IO-APIC irq-routing table:
5752 +                */
5753 +               spin_lock_irqsave(&ioapic_lock, flags);
5754 +               io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
5755 +                       *(((int *)&entry)+1));
5756 +               io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
5757 +                       *(((int *)&entry)+0));
5758 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5759 +       }
5760 +       disconnect_bsp_APIC(ioapic_i8259.pin != -1);
5761 +#endif
5762 +}
5763 +
5764 +/*
5765 + * function to set the IO-APIC physical IDs based on the
5766 + * values stored in the MPC table.
5767 + *
5768 + * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
5769 + */
5770 +
5771 +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
5772 +static void __init setup_ioapic_ids_from_mpc(void)
5773 +{
5774 +       union IO_APIC_reg_00 reg_00;
5775 +       physid_mask_t phys_id_present_map;
5776 +       int apic;
5777 +       int i;
5778 +       unsigned char old_id;
5779 +       unsigned long flags;
5780 +
5781 +       /*
5782 +        * Don't check I/O APIC IDs for xAPIC systems.  They have
5783 +        * no meaning without the serial APIC bus.
5784 +        */
5785 +       if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
5786 +               || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
5787 +               return;
5788 +       /*
5789 +        * This is broken; anything with a real cpu count has to
5790 +        * circumvent this idiocy regardless.
5791 +        */
5792 +       phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
5793 +
5794 +       /*
5795 +        * Set the IOAPIC ID to the value stored in the MPC table.
5796 +        */
5797 +       for (apic = 0; apic < nr_ioapics; apic++) {
5798 +
5799 +               /* Read the register 0 value */
5800 +               spin_lock_irqsave(&ioapic_lock, flags);
5801 +               reg_00.raw = io_apic_read(apic, 0);
5802 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5803 +               
5804 +               old_id = mp_ioapics[apic].mpc_apicid;
5805 +
5806 +               if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
5807 +                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
5808 +                               apic, mp_ioapics[apic].mpc_apicid);
5809 +                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5810 +                               reg_00.bits.ID);
5811 +                       mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
5812 +               }
5813 +
5814 +               /*
5815 +                * Sanity check, is the ID really free? Every APIC in a
5816 +                * system must have a unique ID or we get lots of nice
5817 +                * 'stuck on smp_invalidate_needed IPI wait' messages.
5818 +                */
5819 +               if (check_apicid_used(phys_id_present_map,
5820 +                                       mp_ioapics[apic].mpc_apicid)) {
5821 +                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
5822 +                               apic, mp_ioapics[apic].mpc_apicid);
5823 +                       for (i = 0; i < get_physical_broadcast(); i++)
5824 +                               if (!physid_isset(i, phys_id_present_map))
5825 +                                       break;
5826 +                       if (i >= get_physical_broadcast())
5827 +                               panic("Max APIC ID exceeded!\n");
5828 +                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5829 +                               i);
5830 +                       physid_set(i, phys_id_present_map);
5831 +                       mp_ioapics[apic].mpc_apicid = i;
5832 +               } else {
5833 +                       physid_mask_t tmp;
5834 +                       tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
5835 +                       apic_printk(APIC_VERBOSE, "Setting %d in the "
5836 +                                       "phys_id_present_map\n",
5837 +                                       mp_ioapics[apic].mpc_apicid);
5838 +                       physids_or(phys_id_present_map, phys_id_present_map, tmp);
5839 +               }
5840 +
5841 +
5842 +               /*
5843 +                * We need to adjust the IRQ routing table
5844 +                * if the ID changed.
5845 +                */
5846 +               if (old_id != mp_ioapics[apic].mpc_apicid)
5847 +                       for (i = 0; i < mp_irq_entries; i++)
5848 +                               if (mp_irqs[i].mpc_dstapic == old_id)
5849 +                                       mp_irqs[i].mpc_dstapic
5850 +                                               = mp_ioapics[apic].mpc_apicid;
5851 +
5852 +               /*
5853 +                * Read the right value from the MPC table and
5854 +                * write it into the ID register.
5855 +                */
5856 +               apic_printk(APIC_VERBOSE, KERN_INFO
5857 +                       "...changing IO-APIC physical APIC ID to %d ...",
5858 +                       mp_ioapics[apic].mpc_apicid);
5859 +
5860 +               reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
5861 +               spin_lock_irqsave(&ioapic_lock, flags);
5862 +               io_apic_write(apic, 0, reg_00.raw);
5863 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5864 +
5865 +               /*
5866 +                * Sanity check
5867 +                */
5868 +               spin_lock_irqsave(&ioapic_lock, flags);
5869 +               reg_00.raw = io_apic_read(apic, 0);
5870 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5871 +               if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
5872 +                       printk("could not set ID!\n");
5873 +               else
5874 +                       apic_printk(APIC_VERBOSE, " ok.\n");
5875 +       }
5876 +}
5877 +#else
5878 +static void __init setup_ioapic_ids_from_mpc(void) { }
5879 +#endif
5880 +
5881 +#ifndef CONFIG_XEN
5882 +/*
5883 + * There is a nasty bug in some older SMP boards: their mptable lies
5884 + * about the timer IRQ. We do the following to work around the situation:
5885 + *
5886 + *     - timer IRQ defaults to IO-APIC IRQ
5887 + *     - if this function detects that timer IRQs are defunct, then we fall
5888 + *       back to ISA timer IRQs
5889 + */
5890 +static int __init timer_irq_works(void)
5891 +{
5892 +       unsigned long t1 = jiffies;
5893 +
5894 +       local_irq_enable();
5895 +       /* Let ten ticks pass... */
5896 +       mdelay((10 * 1000) / HZ);
5897 +
5898 +       /*
5899 +        * Expect a few ticks at least, to be sure some possible
5900 +        * glue logic does not lock up after one or two first
5901 +        * ticks in a non-ExtINT mode.  Also the local APIC
5902 +        * might have cached one ExtINT interrupt.  Finally, at
5903 +        * least one tick may be lost due to delays.
5904 +        */
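+       /*
+        * Editorial note (illustrative, assuming HZ=250, not part of the
+        * original patch): mdelay((10 * 1000) / HZ) above busy-waits 40 ms,
+        * i.e. roughly ten ticks, and the check below accepts the timer only
+        * if more than four jiffies actually elapsed in that window.
+        */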
5905 +       if (jiffies - t1 > 4)
5906 +               return 1;
5907 +
5908 +       return 0;
5909 +}
5910 +
5911 +/*
5912 + * In the SMP+IOAPIC case it might happen that there is an unspecified
5913 + * number of pending IRQ events left unhandled. These cases are very rare,
5914 + * so we 'resend' these IRQs via IPIs to the same CPU. It's much
5915 + * better to do it this way, since we then do not have to be aware of
5916 + * 'pending' interrupts in the IRQ path, except at this point.
5917 + */
5918 +/*
5919 + * Edge-triggered handling needs to resend any interrupt
5920 + * that was delayed, but this is now handled in the
5921 + * device-independent code.
5922 + */
5923 +
5924 +/*
5925 + * Starting up an edge-triggered IO-APIC interrupt is
5926 + * nasty - we need to make sure that we get the edge.
5927 + * If it is already asserted for some reason, we need to
5928 + * return 1 to indicate that it was pending.
5929 + *
5930 + * This is not complete - we should be able to fake
5931 + * an edge even if it isn't on the 8259A...
5932 + */
5933 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
5934 +{
5935 +       int was_pending = 0;
5936 +       unsigned long flags;
5937 +
5938 +       spin_lock_irqsave(&ioapic_lock, flags);
5939 +       if (irq < 16) {
5940 +               disable_8259A_irq(irq);
5941 +               if (i8259A_irq_pending(irq))
5942 +                       was_pending = 1;
5943 +       }
5944 +       __unmask_IO_APIC_irq(irq);
5945 +       spin_unlock_irqrestore(&ioapic_lock, flags);
5946 +
5947 +       return was_pending;
5948 +}
5949 +
5950 +/*
5951 + * Once we have recorded IRQ_PENDING already, we can mask the
5952 + * interrupt for real. This prevents IRQ storms from unhandled
5953 + * devices.
5954 + */
5955 +static void ack_edge_ioapic_irq(unsigned int irq)
5956 +{
5957 +       move_irq(irq);
5958 +       if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
5959 +                                       == (IRQ_PENDING | IRQ_DISABLED))
5960 +               mask_IO_APIC_irq(irq);
5961 +       ack_APIC_irq();
5962 +}
5963 +
5964 +/*
5965 + * Level triggered interrupts can just be masked,
5966 + * and shutting down and starting up the interrupt
5967 + * is the same as enabling and disabling them -- except
5968 + * with a startup need to return a "was pending" value.
5969 + *
5970 + * Level triggered interrupts are special because we
5971 + * do not touch any IO-APIC register while handling
5972 + * them. We ack the APIC in the end-IRQ handler, not
5973 + * in the start-IRQ-handler. Protection against reentrance
5974 + * from the same interrupt is still provided, both by the
5975 + * generic IRQ layer and by the fact that an unacked local
5976 + * APIC does not accept IRQs.
5977 + */
5978 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
5979 +{
5980 +       unmask_IO_APIC_irq(irq);
5981 +
5982 +       return 0; /* don't check for pending */
5983 +}
5984 +
5985 +static void end_level_ioapic_irq (unsigned int irq)
5986 +{
5987 +       unsigned long v;
5988 +       int i;
5989 +
5990 +       move_irq(irq);
5991 +/*
5992 + * It appears there is an erratum which affects at least version 0x11
5993 + * of I/O APIC (that's the 82093AA and cores integrated into various
5994 + * chipsets).  Under certain conditions a level-triggered interrupt is
5995 + * erroneously delivered as an edge-triggered one, but the respective IRR
5996 + * bit gets set nevertheless.  As a result the I/O unit expects an EOI
5997 + * message but it will never arrive and further interrupts are blocked
5998 + * from the source.  The exact reason is so far unknown, but the
5999 + * phenomenon was observed when two consecutive interrupt requests
6000 + * from a given source get delivered to the same CPU and the source is
6001 + * temporarily disabled in between.
6002 + *
6003 + * A workaround is to simulate an EOI message manually.  We achieve it
6004 + * by setting the trigger mode to edge and then to level when the edge
6005 + * trigger mode gets detected in the TMR of a local APIC for a
6006 + * level-triggered interrupt.  We mask the source for the time of the
6007 + * operation to prevent an edge-triggered interrupt escaping meanwhile.
6008 + * The idea is from Manfred Spraul.  --macro
6009 + */
6010 +       i = IO_APIC_VECTOR(irq);
6011 +
6012 +       v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
6013 +
6014 +       ack_APIC_irq();
6015 +
6016 +       if (!(v & (1 << (i & 0x1f)))) {
6017 +               atomic_inc(&irq_mis_count);
6018 +               spin_lock(&ioapic_lock);
6019 +               __mask_and_edge_IO_APIC_irq(irq);
6020 +               __unmask_and_level_IO_APIC_irq(irq);
6021 +               spin_unlock(&ioapic_lock);
6022 +       }
6023 +}
6024 +
6025 +#ifdef CONFIG_PCI_MSI
6026 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
6027 +{
6028 +       int irq = vector_to_irq(vector);
6029 +
6030 +       return startup_edge_ioapic_irq(irq);
6031 +}
6032 +
6033 +static void ack_edge_ioapic_vector(unsigned int vector)
6034 +{
6035 +       int irq = vector_to_irq(vector);
6036 +
6037 +       move_native_irq(vector);
6038 +       ack_edge_ioapic_irq(irq);
6039 +}
6040 +
6041 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
6042 +{
6043 +       int irq = vector_to_irq(vector);
6044 +
6045 +       return startup_level_ioapic_irq (irq);
6046 +}
6047 +
6048 +static void end_level_ioapic_vector (unsigned int vector)
6049 +{
6050 +       int irq = vector_to_irq(vector);
6051 +
6052 +       move_native_irq(vector);
6053 +       end_level_ioapic_irq(irq);
6054 +}
6055 +
6056 +static void mask_IO_APIC_vector (unsigned int vector)
6057 +{
6058 +       int irq = vector_to_irq(vector);
6059 +
6060 +       mask_IO_APIC_irq(irq);
6061 +}
6062 +
6063 +static void unmask_IO_APIC_vector (unsigned int vector)
6064 +{
6065 +       int irq = vector_to_irq(vector);
6066 +
6067 +       unmask_IO_APIC_irq(irq);
6068 +}
6069 +
6070 +#ifdef CONFIG_SMP
6071 +static void set_ioapic_affinity_vector (unsigned int vector,
6072 +                                       cpumask_t cpu_mask)
6073 +{
6074 +       int irq = vector_to_irq(vector);
6075 +
6076 +       set_native_irq_info(vector, cpu_mask);
6077 +       set_ioapic_affinity_irq(irq, cpu_mask);
6078 +}
6079 +#endif
6080 +#endif
6081 +
6082 +/*
6083 + * Level and edge triggered IO-APIC interrupts need different handling,
6084 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
6085 + * handled with the level-triggered descriptor, but that one has slightly
6086 + * more overhead. Level-triggered interrupts cannot be handled with the
6087 + * edge-triggered handler, without risking IRQ storms and other ugly
6088 + * races.
6089 + */
6090 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
6091 +       .typename       = "IO-APIC-edge",
6092 +       .startup        = startup_edge_ioapic,
6093 +       .shutdown       = shutdown_edge_ioapic,
6094 +       .enable         = enable_edge_ioapic,
6095 +       .disable        = disable_edge_ioapic,
6096 +       .ack            = ack_edge_ioapic,
6097 +       .end            = end_edge_ioapic,
6098 +#ifdef CONFIG_SMP
6099 +       .set_affinity   = set_ioapic_affinity,
6100 +#endif
6101 +};
6102 +
6103 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
6104 +       .typename       = "IO-APIC-level",
6105 +       .startup        = startup_level_ioapic,
6106 +       .shutdown       = shutdown_level_ioapic,
6107 +       .enable         = enable_level_ioapic,
6108 +       .disable        = disable_level_ioapic,
6109 +       .ack            = mask_and_ack_level_ioapic,
6110 +       .end            = end_level_ioapic,
6111 +#ifdef CONFIG_SMP
6112 +       .set_affinity   = set_ioapic_affinity,
6113 +#endif
6114 +};
6115 +#endif /* !CONFIG_XEN */
6116 +
6117 +static inline void init_IO_APIC_traps(void)
6118 +{
6119 +       int irq;
6120 +
6121 +       /*
6122 +        * NOTE! The local APIC isn't very good at handling
6123 +        * multiple interrupts at the same interrupt level.
6124 +        * As the interrupt level is determined by taking the
6125 +        * vector number and shifting that right by 4, we
6126 +        * want to spread these out a bit so that they don't
6127 +        * all fall in the same interrupt level.
6128 +        *
6129 +        * Also, we've got to be careful not to trash gate
6130 +        * 0x80, because int 0x80 is hm, kind of importantish. ;)
6131 +        */
6132 +       for (irq = 0; irq < NR_IRQS ; irq++) {
6133 +               int tmp = irq;
6134 +               if (use_pci_vector()) {
6135 +                       if (!platform_legacy_irq(tmp))
6136 +                               if ((tmp = vector_to_irq(tmp)) == -1)
6137 +                                       continue;
6138 +               }
6139 +               if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
6140 +                       /*
6141 +                        * Hmm.. We don't have an entry for this,
6142 +                        * so default to an old-fashioned 8259
6143 +                        * interrupt if we can..
6144 +                        */
6145 +                       if (irq < 16)
6146 +                               make_8259A_irq(irq);
6147 +#ifndef CONFIG_XEN
6148 +                       else
6149 +                               /* Strange. Oh, well.. */
6150 +                               irq_desc[irq].handler = &no_irq_type;
6151 +#endif
6152 +               }
6153 +       }
6154 +}
6155 +
6156 +#ifndef CONFIG_XEN
6157 +static void enable_lapic_irq (unsigned int irq)
6158 +{
6159 +       unsigned long v;
6160 +
6161 +       v = apic_read(APIC_LVT0);
6162 +       apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
6163 +}
6164 +
6165 +static void disable_lapic_irq (unsigned int irq)
6166 +{
6167 +       unsigned long v;
6168 +
6169 +       v = apic_read(APIC_LVT0);
6170 +       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
6171 +}
6172 +
6173 +static void ack_lapic_irq (unsigned int irq)
6174 +{
6175 +       ack_APIC_irq();
6176 +}
6177 +
6178 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
6179 +
6180 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
6181 +       .typename       = "local-APIC-edge",
6182 +       .startup        = NULL, /* startup_irq() not used for IRQ0 */
6183 +       .shutdown       = NULL, /* shutdown_irq() not used for IRQ0 */
6184 +       .enable         = enable_lapic_irq,
6185 +       .disable        = disable_lapic_irq,
6186 +       .ack            = ack_lapic_irq,
6187 +       .end            = end_lapic_irq
6188 +};
6189 +
6190 +static void setup_nmi (void)
6191 +{
6192 +       /*
6193 +        * Dirty trick to enable the NMI watchdog ...
6194 +        * We put the 8259A master into AEOI mode and
6195 +        * unmask LVT0 as NMI on all local APICs.
6196 +        *
6197 +        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
6198 +        * is from Maciej W. Rozycki - so we do not have to EOI from
6199 +        * the NMI handler or the timer interrupt.
6200 +        */ 
6201 +       apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
6202 +
6203 +       on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
6204 +
6205 +       apic_printk(APIC_VERBOSE, " done.\n");
6206 +}
6207 +
6208 +/*
6209 + * This looks a bit hackish but it's about the only way of sending
6210 + * a few INTA cycles to 8259As and any associated glue logic.  ICR does
6211 + * not support the ExtINT mode, unfortunately.  We need to send these
6212 + * cycles as some i82489DX-based boards have glue logic that keeps the
6213 + * 8259A interrupt line asserted until INTA.  --macro
6214 + */
6215 +static inline void unlock_ExtINT_logic(void)
6216 +{
6217 +       int apic, pin, i;
6218 +       struct IO_APIC_route_entry entry0, entry1;
6219 +       unsigned char save_control, save_freq_select;
6220 +       unsigned long flags;
6221 +
6222 +       pin  = find_isa_irq_pin(8, mp_INT);
6223 +       apic = find_isa_irq_apic(8, mp_INT);
6224 +       if (pin == -1)
6225 +               return;
6226 +
6227 +       spin_lock_irqsave(&ioapic_lock, flags);
6228 +       *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
6229 +       *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
6230 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6231 +       clear_IO_APIC_pin(apic, pin);
6232 +
6233 +       memset(&entry1, 0, sizeof(entry1));
6234 +
6235 +       entry1.dest_mode = 0;                   /* physical delivery */
6236 +       entry1.mask = 0;                        /* unmask IRQ now */
6237 +       entry1.dest.physical.physical_dest = hard_smp_processor_id();
6238 +       entry1.delivery_mode = dest_ExtINT;
6239 +       entry1.polarity = entry0.polarity;
6240 +       entry1.trigger = 0;
6241 +       entry1.vector = 0;
6242 +
6243 +       spin_lock_irqsave(&ioapic_lock, flags);
6244 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
6245 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
6246 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6247 +
6248 +       save_control = CMOS_READ(RTC_CONTROL);
6249 +       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
6250 +       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
6251 +                  RTC_FREQ_SELECT);
6252 +       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
6253 +
6254 +       i = 100;
6255 +       while (i-- > 0) {
6256 +               mdelay(10);
6257 +               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
6258 +                       i -= 10;
6259 +       }
6260 +
6261 +       CMOS_WRITE(save_control, RTC_CONTROL);
6262 +       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
6263 +       clear_IO_APIC_pin(apic, pin);
6264 +
6265 +       spin_lock_irqsave(&ioapic_lock, flags);
6266 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
6267 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
6268 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6269 +}
6270 +
6271 +int timer_uses_ioapic_pin_0;
6272 +
6273 +/*
6274 + * This code may look a bit paranoid, but it's supposed to cooperate with
6275 + * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
6276 + * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
6277 + * fanatically on his truly buggy board.
6278 + */
6279 +static inline void check_timer(void)
6280 +{
6281 +       int apic1, pin1, apic2, pin2;
6282 +       int vector;
6283 +
6284 +       /*
6285 +        * get/set the timer IRQ vector:
6286 +        */
6287 +       disable_8259A_irq(0);
6288 +       vector = assign_irq_vector(0);
6289 +       set_intr_gate(vector, interrupt[0]);
6290 +
6291 +       /*
6292 +        * Subtle: code in do_timer_interrupt() expects an AEOI
6293 +        * mode for the 8259A whenever interrupts are routed
6294 +        * through I/O APICs.  Also IRQ0 has to be enabled in
6295 +        * the 8259A which implies the virtual wire has to be
6296 +        * disabled in the local APIC.
6297 +        */
6298 +       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6299 +       init_8259A(1);
6300 +       timer_ack = 1;
6301 +       if (timer_over_8254 > 0)
6302 +               enable_8259A_irq(0);
6303 +
6304 +       pin1  = find_isa_irq_pin(0, mp_INT);
6305 +       apic1 = find_isa_irq_apic(0, mp_INT);
6306 +       pin2  = ioapic_i8259.pin;
6307 +       apic2 = ioapic_i8259.apic;
6308 +
6309 +       if (pin1 == 0)
6310 +               timer_uses_ioapic_pin_0 = 1;
6311 +
6312 +       printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
6313 +               vector, apic1, pin1, apic2, pin2);
6314 +
6315 +       if (pin1 != -1) {
6316 +               /*
6317 +                * Ok, does IRQ0 through the IOAPIC work?
6318 +                */
6319 +               unmask_IO_APIC_irq(0);
6320 +               if (timer_irq_works()) {
6321 +                       if (nmi_watchdog == NMI_IO_APIC) {
6322 +                               disable_8259A_irq(0);
6323 +                               setup_nmi();
6324 +                               enable_8259A_irq(0);
6325 +                       }
6326 +                       if (disable_timer_pin_1 > 0)
6327 +                               clear_IO_APIC_pin(0, pin1);
6328 +                       return;
6329 +               }
6330 +               clear_IO_APIC_pin(apic1, pin1);
6331 +               printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
6332 +                               "IO-APIC\n");
6333 +       }
6334 +
6335 +       printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
6336 +       if (pin2 != -1) {
6337 +               printk("\n..... (found pin %d) ...", pin2);
6338 +               /*
6339 +                * legacy devices should be connected to IO APIC #0
6340 +                */
6341 +               setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
6342 +               if (timer_irq_works()) {
6343 +                       printk("works.\n");
6344 +                       if (pin1 != -1)
6345 +                               replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
6346 +                       else
6347 +                               add_pin_to_irq(0, apic2, pin2);
6348 +                       if (nmi_watchdog == NMI_IO_APIC) {
6349 +                               setup_nmi();
6350 +                       }
6351 +                       return;
6352 +               }
6353 +               /*
6354 +                * Cleanup, just in case ...
6355 +                */
6356 +               clear_IO_APIC_pin(apic2, pin2);
6357 +       }
6358 +       printk(" failed.\n");
6359 +
6360 +       if (nmi_watchdog == NMI_IO_APIC) {
6361 +               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
6362 +               nmi_watchdog = 0;
6363 +       }
6364 +
6365 +       printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
6366 +
6367 +       disable_8259A_irq(0);
6368 +       irq_desc[0].handler = &lapic_irq_type;
6369 +       apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);   /* Fixed mode */
6370 +       enable_8259A_irq(0);
6371 +
6372 +       if (timer_irq_works()) {
6373 +               printk(" works.\n");
6374 +               return;
6375 +       }
6376 +       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
6377 +       printk(" failed.\n");
6378 +
6379 +       printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
6380 +
6381 +       timer_ack = 0;
6382 +       init_8259A(0);
6383 +       make_8259A_irq(0);
6384 +       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
6385 +
6386 +       unlock_ExtINT_logic();
6387 +
6388 +       if (timer_irq_works()) {
6389 +               printk(" works.\n");
6390 +               return;
6391 +       }
6392 +       printk(" failed :(.\n");
6393 +       panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
6394 +               "report.  Then try booting with the 'noapic' option");
6395 +}
6396 +#else
6397 +int timer_uses_ioapic_pin_0;
6398 +#define check_timer() ((void)0)
6399 +#endif
6400 +
6401 +/*
6402 + *
6403 + * IRQs that are handled by the PIC in the MPS IOAPIC case.
6404 + * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
6405 + *   Linux doesn't really care, as it's not actually used
6406 + *   for any interrupt handling anyway.
6407 + */
6408 +#define PIC_IRQS       (1 << PIC_CASCADE_IR)
6409 +
6410 +void __init setup_IO_APIC(void)
6411 +{
6412 +       enable_IO_APIC();
6413 +
6414 +       if (acpi_ioapic)
6415 +               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
6416 +       else
6417 +               io_apic_irqs = ~PIC_IRQS;
6418 +
6419 +       printk("ENABLING IO-APIC IRQs\n");
6420 +
6421 +       /*
6422 +        * Set up IO-APIC IRQ routing.
6423 +        */
6424 +       if (!acpi_ioapic)
6425 +               setup_ioapic_ids_from_mpc();
6426 +#ifndef CONFIG_XEN
6427 +       sync_Arb_IDs();
6428 +#endif
6429 +       setup_IO_APIC_irqs();
6430 +       init_IO_APIC_traps();
6431 +       check_timer();
6432 +       if (!acpi_ioapic)
6433 +               print_IO_APIC();
6434 +}
6435 +
6436 +static int __init setup_disable_8254_timer(char *s)
6437 +{
6438 +       timer_over_8254 = -1;
6439 +       return 1;
6440 +}
6441 +static int __init setup_enable_8254_timer(char *s)
6442 +{
6443 +       timer_over_8254 = 2;
6444 +       return 1;
6445 +}
6446 +
6447 +__setup("disable_8254_timer", setup_disable_8254_timer);
6448 +__setup("enable_8254_timer", setup_enable_8254_timer);
6449 +
6450 +/*
6451 + *     Called after all the initialization is done. If we didn't find any
6452 + *     APIC bugs then we can allow the modify fast path.
6453 + */
6454 +
6455 +static int __init io_apic_bug_finalize(void)
6456 +{
6457 +       if (sis_apic_bug == -1)
6458 +               sis_apic_bug = 0;
6459 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
6460 +               dom0_op_t op = { .cmd = DOM0_PLATFORM_QUIRK };
6461 +               op.u.platform_quirk.quirk_id = sis_apic_bug ?
6462 +                       QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
6463 +               HYPERVISOR_dom0_op(&op);
6464 +       }
6465 +       return 0;
6466 +}
6467 +
6468 +late_initcall(io_apic_bug_finalize);
6469 +
6470 +struct sysfs_ioapic_data {
6471 +       struct sys_device dev;
6472 +       struct IO_APIC_route_entry entry[0];
6473 +};
6474 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
6475 +
6476 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
6477 +{
6478 +       struct IO_APIC_route_entry *entry;
6479 +       struct sysfs_ioapic_data *data;
6480 +       unsigned long flags;
6481 +       int i;
6482 +       
6483 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
6484 +       entry = data->entry;
6485 +       spin_lock_irqsave(&ioapic_lock, flags);
6486 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
6487 +               *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
6488 +               *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
6489 +       }
6490 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6491 +
6492 +       return 0;
6493 +}
6494 +
6495 +static int ioapic_resume(struct sys_device *dev)
6496 +{
6497 +       struct IO_APIC_route_entry *entry;
6498 +       struct sysfs_ioapic_data *data;
6499 +       unsigned long flags;
6500 +       union IO_APIC_reg_00 reg_00;
6501 +       int i;
6502 +       
6503 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
6504 +       entry = data->entry;
6505 +
6506 +       spin_lock_irqsave(&ioapic_lock, flags);
6507 +       reg_00.raw = io_apic_read(dev->id, 0);
6508 +       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
6509 +               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
6510 +               io_apic_write(dev->id, 0, reg_00.raw);
6511 +       }
6512 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
6513 +               io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
6514 +               io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
6515 +       }
6516 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6517 +
6518 +       return 0;
6519 +}
6520 +
6521 +static struct sysdev_class ioapic_sysdev_class = {
6522 +       set_kset_name("ioapic"),
6523 +       .suspend = ioapic_suspend,
6524 +       .resume = ioapic_resume,
6525 +};
6526 +
6527 +static int __init ioapic_init_sysfs(void)
6528 +{
6529 +       struct sys_device * dev;
6530 +       int i, size, error = 0;
6531 +
6532 +       error = sysdev_class_register(&ioapic_sysdev_class);
6533 +       if (error)
6534 +               return error;
6535 +
6536 +       for (i = 0; i < nr_ioapics; i++ ) {
6537 +               size = sizeof(struct sys_device) + nr_ioapic_registers[i] 
6538 +                       * sizeof(struct IO_APIC_route_entry);
6539 +               mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
6540 +               if (!mp_ioapic_data[i]) {
6541 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
6542 +                       continue;
6543 +               }
6544 +               memset(mp_ioapic_data[i], 0, size);
6545 +               dev = &mp_ioapic_data[i]->dev;
6546 +               dev->id = i; 
6547 +               dev->cls = &ioapic_sysdev_class;
6548 +               error = sysdev_register(dev);
6549 +               if (error) {
6550 +                       kfree(mp_ioapic_data[i]);
6551 +                       mp_ioapic_data[i] = NULL;
6552 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
6553 +                       continue;
6554 +               }
6555 +       }
6556 +
6557 +       return 0;
6558 +}
6559 +
6560 +device_initcall(ioapic_init_sysfs);
6561 +
6562 +/* --------------------------------------------------------------------------
6563 +                          ACPI-based IOAPIC Configuration
6564 +   -------------------------------------------------------------------------- */
6565 +
6566 +#ifdef CONFIG_ACPI
6567 +
6568 +int __init io_apic_get_unique_id (int ioapic, int apic_id)
6569 +{
6570 +#ifndef CONFIG_XEN
6571 +       union IO_APIC_reg_00 reg_00;
6572 +       static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
6573 +       physid_mask_t tmp;
6574 +       unsigned long flags;
6575 +       int i = 0;
6576 +
6577 +       /*
6578 +        * The P4 platform supports up to 256 APIC IDs on two separate APIC 
6579 +        * buses (one for LAPICs, one for IOAPICs), where predecessors only 
6580 +        * support up to 16 on one shared APIC bus.
6581 +        * 
6582 +        * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
6583 +        *      advantage of new APIC bus architecture.
6584 +        */
6585 +
6586 +       if (physids_empty(apic_id_map))
6587 +               apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
6588 +
6589 +       spin_lock_irqsave(&ioapic_lock, flags);
6590 +       reg_00.raw = io_apic_read(ioapic, 0);
6591 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6592 +
6593 +       if (apic_id >= get_physical_broadcast()) {
6594 +               printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
6595 +                       "%d\n", ioapic, apic_id, reg_00.bits.ID);
6596 +               apic_id = reg_00.bits.ID;
6597 +       }
6598 +
6599 +       /*
6600 +        * Every APIC in a system must have a unique ID or we get lots of nice 
6601 +        * 'stuck on smp_invalidate_needed IPI wait' messages.
6602 +        */
6603 +       if (check_apicid_used(apic_id_map, apic_id)) {
6604 +
6605 +               for (i = 0; i < get_physical_broadcast(); i++) {
6606 +                       if (!check_apicid_used(apic_id_map, i))
6607 +                               break;
6608 +               }
6609 +
6610 +               if (i == get_physical_broadcast())
6611 +                       panic("Max apic_id exceeded!\n");
6612 +
6613 +               printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
6614 +                       "trying %d\n", ioapic, apic_id, i);
6615 +
6616 +               apic_id = i;
6617 +       } 
6618 +
6619 +       tmp = apicid_to_cpu_present(apic_id);
6620 +       physids_or(apic_id_map, apic_id_map, tmp);
6621 +
6622 +       if (reg_00.bits.ID != apic_id) {
6623 +               reg_00.bits.ID = apic_id;
6624 +
6625 +               spin_lock_irqsave(&ioapic_lock, flags);
6626 +               io_apic_write(ioapic, 0, reg_00.raw);
6627 +               reg_00.raw = io_apic_read(ioapic, 0);
6628 +               spin_unlock_irqrestore(&ioapic_lock, flags);
6629 +
6630 +               /* Sanity check */
6631 +               if (reg_00.bits.ID != apic_id) {
6632 +                       printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
6633 +                       return -1;
6634 +               }
6635 +       }
6636 +
6637 +       apic_printk(APIC_VERBOSE, KERN_INFO
6638 +                       "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
6639 +#endif /* !CONFIG_XEN */
6640 +
6641 +       return apic_id;
6642 +}
6643 +
6644 +
6645 +int __init io_apic_get_version (int ioapic)
6646 +{
6647 +       union IO_APIC_reg_01    reg_01;
6648 +       unsigned long flags;
6649 +
6650 +       spin_lock_irqsave(&ioapic_lock, flags);
6651 +       reg_01.raw = io_apic_read(ioapic, 1);
6652 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6653 +
6654 +       return reg_01.bits.version;
6655 +}
6656 +
6657 +
6658 +int __init io_apic_get_redir_entries (int ioapic)
6659 +{
6660 +       union IO_APIC_reg_01    reg_01;
6661 +       unsigned long flags;
6662 +
6663 +       spin_lock_irqsave(&ioapic_lock, flags);
6664 +       reg_01.raw = io_apic_read(ioapic, 1);
6665 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6666 +
6667 +       return reg_01.bits.entries;
6668 +}
6669 +
6670 +
6671 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
6672 +{
6673 +       struct IO_APIC_route_entry entry;
6674 +       unsigned long flags;
6675 +
6676 +       if (!IO_APIC_IRQ(irq)) {
6677 +               printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
6678 +                       ioapic);
6679 +               return -EINVAL;
6680 +       }
6681 +
6682 +       /*
6683 +        * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
6684 +        * Note that we mask (disable) IRQs now -- these get enabled when the
6685 +        * corresponding device driver registers for this IRQ.
6686 +        */
6687 +
6688 +       memset(&entry,0,sizeof(entry));
6689 +
6690 +       entry.delivery_mode = INT_DELIVERY_MODE;
6691 +       entry.dest_mode = INT_DEST_MODE;
6692 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6693 +       entry.trigger = edge_level;
6694 +       entry.polarity = active_high_low;
6695 +       entry.mask  = 1;
6696 +
6697 +       /*
6698 +        * IRQs < 16 are already in the irq_2_pin[] map
6699 +        */
6700 +       if (irq >= 16)
6701 +               add_pin_to_irq(irq, ioapic, pin);
6702 +
6703 +       entry.vector = assign_irq_vector(irq);
6704 +
6705 +       apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
6706 +               "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
6707 +               mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
6708 +               edge_level, active_high_low);
6709 +
6710 +       ioapic_register_intr(irq, entry.vector, edge_level);
6711 +
6712 +       if (!ioapic && (irq < 16))
6713 +               disable_8259A_irq(irq);
6714 +
6715 +       spin_lock_irqsave(&ioapic_lock, flags);
6716 +       io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
6717 +       io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
6718 +       set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
6719 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6720 +
6721 +       return 0;
6722 +}
6723 +
6724 +#endif /* CONFIG_ACPI */
6725 diff -urNp linux-2.6/arch/i386/kernel/ioport-xen.c new/arch/i386/kernel/ioport-xen.c
6726 --- linux-2.6/arch/i386/kernel/ioport-xen.c     1970-01-01 01:00:00.000000000 +0100
6727 +++ new/arch/i386/kernel/ioport-xen.c   2006-05-09 12:32:35.000000000 +0200
6728 @@ -0,0 +1,121 @@
6729 +/*
6730 + *     linux/arch/i386/kernel/ioport.c
6731 + *
6732 + * This contains the io-permission bitmap code - written by obz, with changes
6733 + * by Linus.
6734 + */
6735 +
6736 +#include <linux/sched.h>
6737 +#include <linux/kernel.h>
6738 +#include <linux/capability.h>
6739 +#include <linux/errno.h>
6740 +#include <linux/types.h>
6741 +#include <linux/ioport.h>
6742 +#include <linux/smp.h>
6743 +#include <linux/smp_lock.h>
6744 +#include <linux/stddef.h>
6745 +#include <linux/slab.h>
6746 +#include <linux/thread_info.h>
6747 +#include <xen/interface/physdev.h>
6748 +
6749 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
6750 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
6751 +{
6752 +       unsigned long mask;
6753 +       unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
6754 +       unsigned int low_index = base & (BITS_PER_LONG-1);
6755 +       int length = low_index + extent;
6756 +
6757 +       if (low_index != 0) {
6758 +               mask = (~0UL << low_index);
6759 +               if (length < BITS_PER_LONG)
6760 +                       mask &= ~(~0UL << length);
6761 +               if (new_value)
6762 +                       *bitmap_base++ |= mask;
6763 +               else
6764 +                       *bitmap_base++ &= ~mask;
6765 +               length -= BITS_PER_LONG;
6766 +       }
6767 +
6768 +       mask = (new_value ? ~0UL : 0UL);
6769 +       while (length >= BITS_PER_LONG) {
6770 +               *bitmap_base++ = mask;
6771 +               length -= BITS_PER_LONG;
6772 +       }
6773 +
6774 +       if (length > 0) {
6775 +               mask = ~(~0UL << length);
6776 +               if (new_value)
6777 +                       *bitmap_base++ |= mask;
6778 +               else
6779 +                       *bitmap_base++ &= ~mask;
6780 +       }
6781 +}
6782 +
6783 +
6784 +/*
6785 + * this changes the io permissions bitmap in the current task.
6786 + */
6787 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
6788 +{
6789 +       struct thread_struct * t = &current->thread;
6790 +       unsigned long *bitmap;
6791 +       struct physdev_set_iobitmap set_iobitmap;
6792 +
6793 +       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
6794 +               return -EINVAL;
6795 +       if (turn_on && !capable(CAP_SYS_RAWIO))
6796 +               return -EPERM;
6797 +
6798 +       /*
6799 +        * If it's the first ioperm() call in this thread's lifetime, set the
6800 +        * IO bitmap up. ioperm() is much less timing critical than clone(),
6801 +        * which is why we delay this operation until now:
6802 +        */
6803 +       if (!t->io_bitmap_ptr) {
6804 +               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
6805 +               if (!bitmap)
6806 +                       return -ENOMEM;
6807 +
6808 +               memset(bitmap, 0xff, IO_BITMAP_BYTES);
6809 +               t->io_bitmap_ptr = bitmap;
6810 +
6811 +               set_iobitmap.bitmap   = (char *)bitmap;
6812 +               set_iobitmap.nr_ports = IO_BITMAP_BITS;
6813 +               HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
6814 +       }
6815 +
6816 +       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
6817 +
6818 +       return 0;
6819 +}
6820 +
6821 +/*
6822 + * sys_iopl has to be used when you want to access the IO ports
6823 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
6824 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
6825 + *
6826 + * Here we just change the eflags value on the stack: we allow
6827 + * only the super-user to do it. This depends on the stack-layout
6828 + * on system-call entry - see also fork() and the signal handling
6829 + * code.
6830 + */
6831 +
6832 +asmlinkage long sys_iopl(unsigned long unused)
6833 +{
6834 +       volatile struct pt_regs * regs = (struct pt_regs *) &unused;
6835 +       unsigned int level = regs->ebx;
6836 +       struct thread_struct *t = &current->thread;
6837 +       unsigned int old = (t->iopl >> 12) & 3;
6838 +
6839 +       if (level > 3)
6840 +               return -EINVAL;
6841 +       /* Trying to gain more privileges? */
6842 +       if (level > old) {
6843 +               if (!capable(CAP_SYS_RAWIO))
6844 +                       return -EPERM;
6845 +       }
6846 +       t->iopl = level << 12;
6847 +       set_iopl_mask(t->iopl);
6848 +       return 0;
6849 +}
6850 diff -urNp linux-2.6/arch/i386/kernel/irq.c new/arch/i386/kernel/irq.c
6851 --- linux-2.6/arch/i386/kernel/irq.c    2006-07-03 14:14:14.000000000 +0200
6852 +++ new/arch/i386/kernel/irq.c  2006-05-09 12:32:35.000000000 +0200
6853 @@ -53,8 +53,8 @@ static union irq_ctx *softirq_ctx[NR_CPU
6854   */
6855  fastcall unsigned int do_IRQ(struct pt_regs *regs)
6856  {      
6857 -       /* high bits used in ret_from_ code */
6858 -       int irq = regs->orig_eax & 0xff;
6859 +       /* high bit used in ret_from_ code */
6860 +       int irq = ~regs->orig_eax;
6861  #ifdef CONFIG_4KSTACKS
6862         union irq_ctx *curctx, *irqctx;
6863         u32 *isp;
6864 diff -urNp linux-2.6/arch/i386/kernel/irq-xen.c new/arch/i386/kernel/irq-xen.c
6865 --- linux-2.6/arch/i386/kernel/irq-xen.c        1970-01-01 01:00:00.000000000 +0100
6866 +++ new/arch/i386/kernel/irq-xen.c      2006-05-09 12:32:35.000000000 +0200
6867 @@ -0,0 +1,306 @@
6868 +/*
6869 + *     linux/arch/i386/kernel/irq.c
6870 + *
6871 + *     Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
6872 + *
6873 + * This file contains the lowest level x86-specific interrupt
6874 + * entry, irq-stacks and irq statistics code. All the remaining
6875 + * irq logic is done by the generic kernel/irq/ code and
6876 + * by the x86-specific irq controller code. (e.g. i8259.c and
6877 + * io_apic.c.)
6878 + */
6879 +
6880 +#include <asm/uaccess.h>
6881 +#include <linux/module.h>
6882 +#include <linux/seq_file.h>
6883 +#include <linux/interrupt.h>
6884 +#include <linux/kernel_stat.h>
6885 +#include <linux/notifier.h>
6886 +#include <linux/cpu.h>
6887 +#include <linux/delay.h>
6888 +
6889 +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
6890 +EXPORT_PER_CPU_SYMBOL(irq_stat);
6891 +
6892 +#ifndef CONFIG_X86_LOCAL_APIC
6893 +/*
6894 + * 'what should we do if we get a hw irq event on an illegal vector'.
6895 + * Each architecture has to answer this itself.
6896 + */
6897 +void ack_bad_irq(unsigned int irq)
6898 +{
6899 +       printk("unexpected IRQ trap at vector %02x\n", irq);
6900 +}
6901 +#endif
6902 +
6903 +#ifdef CONFIG_4KSTACKS
6904 +/*
6905 + * per-CPU IRQ handling contexts (thread information and stack)
6906 + */
6907 +union irq_ctx {
6908 +       struct thread_info      tinfo;
6909 +       u32                     stack[THREAD_SIZE/sizeof(u32)];
6910 +};
6911 +
6912 +static union irq_ctx *hardirq_ctx[NR_CPUS];
6913 +static union irq_ctx *softirq_ctx[NR_CPUS];
6914 +#endif
6915 +
6916 +/*
6917 + * do_IRQ handles all normal device IRQ's (the special
6918 + * SMP cross-CPU interrupts have their own specific
6919 + * handlers).
6920 + */
6921 +fastcall unsigned int do_IRQ(struct pt_regs *regs)
6922 +{      
6923 +       /* high bit used in ret_from_ code */
6924 +       int irq = ~regs->orig_eax;
6925 +#ifdef CONFIG_4KSTACKS
6926 +       union irq_ctx *curctx, *irqctx;
6927 +       u32 *isp;
6928 +#endif
6929 +
6930 +       irq_enter();
6931 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
6932 +       /* Debugging check for stack overflow: is there less than 1KB free? */
6933 +       {
6934 +               long esp;
6935 +
6936 +               __asm__ __volatile__("andl %%esp,%0" :
6937 +                                       "=r" (esp) : "0" (THREAD_SIZE - 1));
6938 +               if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
6939 +                       printk("do_IRQ: stack overflow: %ld\n",
6940 +                               esp - sizeof(struct thread_info));
6941 +                       dump_stack();
6942 +               }
6943 +       }
6944 +#endif
6945 +
6946 +#ifdef CONFIG_4KSTACKS
6947 +
6948 +       curctx = (union irq_ctx *) current_thread_info();
6949 +       irqctx = hardirq_ctx[smp_processor_id()];
6950 +
6951 +       /*
6952 +        * this is where we switch to the IRQ stack. However, if we are
6953 +        * already using the IRQ stack (because we interrupted a hardirq
6954 +        * handler) we can't do that and just have to keep using the
6955 +        * current stack (which is the irq stack already after all)
6956 +        */
6957 +       if (curctx != irqctx) {
6958 +               int arg1, arg2, ebx;
6959 +
6960 +               /* build the stack frame on the IRQ stack */
6961 +               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
6962 +               irqctx->tinfo.task = curctx->tinfo.task;
6963 +               irqctx->tinfo.previous_esp = current_stack_pointer;
6964 +
6965 +               asm volatile(
6966 +                       "       xchgl   %%ebx,%%esp      \n"
6967 +                       "       call    __do_IRQ         \n"
6968 +                       "       movl   %%ebx,%%esp      \n"
6969 +                       : "=a" (arg1), "=d" (arg2), "=b" (ebx)
6970 +                       :  "0" (irq),   "1" (regs),  "2" (isp)
6971 +                       : "memory", "cc", "ecx"
6972 +               );
6973 +       } else
6974 +#endif
6975 +               __do_IRQ(irq, regs);
6976 +
6977 +       irq_exit();
6978 +
6979 +       return 1;
6980 +}
6981 +
6982 +#ifdef CONFIG_4KSTACKS
6983 +
6984 +/*
6985 + * These should really be __section__(".bss.page_aligned") as well, but
6986 + * gcc 3.0 and earlier don't handle that correctly.
6987 + */
6988 +static char softirq_stack[NR_CPUS * THREAD_SIZE]
6989 +               __attribute__((__aligned__(THREAD_SIZE)));
6990 +
6991 +static char hardirq_stack[NR_CPUS * THREAD_SIZE]
6992 +               __attribute__((__aligned__(THREAD_SIZE)));
6993 +
6994 +/*
6995 + * allocate per-cpu stacks for hardirq and for softirq processing
6996 + */
6997 +void irq_ctx_init(int cpu)
6998 +{
6999 +       union irq_ctx *irqctx;
7000 +
7001 +       if (hardirq_ctx[cpu])
7002 +               return;
7003 +
7004 +       irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
7005 +       irqctx->tinfo.task              = NULL;
7006 +       irqctx->tinfo.exec_domain       = NULL;
7007 +       irqctx->tinfo.cpu               = cpu;
7008 +       irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
7009 +       irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
7010 +
7011 +       hardirq_ctx[cpu] = irqctx;
7012 +
7013 +       irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
7014 +       irqctx->tinfo.task              = NULL;
7015 +       irqctx->tinfo.exec_domain       = NULL;
7016 +       irqctx->tinfo.cpu               = cpu;
7017 +       irqctx->tinfo.preempt_count     = SOFTIRQ_OFFSET;
7018 +       irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
7019 +
7020 +       softirq_ctx[cpu] = irqctx;
7021 +
7022 +       printk("CPU %u irqstacks, hard=%p soft=%p\n",
7023 +               cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
7024 +}
7025 +
7026 +void irq_ctx_exit(int cpu)
7027 +{
7028 +       hardirq_ctx[cpu] = NULL;
7029 +}
7030 +
7031 +extern asmlinkage void __do_softirq(void);
7032 +
7033 +asmlinkage void do_softirq(void)
7034 +{
7035 +       unsigned long flags;
7036 +       struct thread_info *curctx;
7037 +       union irq_ctx *irqctx;
7038 +       u32 *isp;
7039 +
7040 +       if (in_interrupt())
7041 +               return;
7042 +
7043 +       local_irq_save(flags);
7044 +
7045 +       if (local_softirq_pending()) {
7046 +               curctx = current_thread_info();
7047 +               irqctx = softirq_ctx[smp_processor_id()];
7048 +               irqctx->tinfo.task = curctx->task;
7049 +               irqctx->tinfo.previous_esp = current_stack_pointer;
7050 +
7051 +               /* build the stack frame on the softirq stack */
7052 +               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
7053 +
7054 +               asm volatile(
7055 +                       "       xchgl   %%ebx,%%esp     \n"
7056 +                       "       call    __do_softirq    \n"
7057 +                       "       movl    %%ebx,%%esp     \n"
7058 +                       : "=b"(isp)
7059 +                       : "0"(isp)
7060 +                       : "memory", "cc", "edx", "ecx", "eax"
7061 +               );
7062 +       }
7063 +
7064 +       local_irq_restore(flags);
7065 +}
7066 +
7067 +EXPORT_SYMBOL(do_softirq);
7068 +#endif
7069 +
7070 +/*
7071 + * Interrupt statistics:
7072 + */
7073 +
7074 +atomic_t irq_err_count;
7075 +
7076 +/*
7077 + * /proc/interrupts printing:
7078 + */
7079 +
7080 +int show_interrupts(struct seq_file *p, void *v)
7081 +{
7082 +       int i = *(loff_t *) v, j;
7083 +       struct irqaction * action;
7084 +       unsigned long flags;
7085 +
7086 +       if (i == 0) {
7087 +               seq_printf(p, "           ");
7088 +               for_each_online_cpu(j)
7089 +                       seq_printf(p, "CPU%d       ",j);
7090 +               seq_putc(p, '\n');
7091 +       }
7092 +
7093 +       if (i < NR_IRQS) {
7094 +               spin_lock_irqsave(&irq_desc[i].lock, flags);
7095 +               action = irq_desc[i].action;
7096 +               if (!action)
7097 +                       goto skip;
7098 +               seq_printf(p, "%3d: ",i);
7099 +#ifndef CONFIG_SMP
7100 +               seq_printf(p, "%10u ", kstat_irqs(i));
7101 +#else
7102 +               for_each_online_cpu(j)
7103 +                       seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
7104 +#endif
7105 +               seq_printf(p, " %14s", irq_desc[i].handler->typename);
7106 +               seq_printf(p, "  %s", action->name);
7107 +
7108 +               for (action=action->next; action; action = action->next)
7109 +                       seq_printf(p, ", %s", action->name);
7110 +
7111 +               seq_putc(p, '\n');
7112 +skip:
7113 +               spin_unlock_irqrestore(&irq_desc[i].lock, flags);
7114 +       } else if (i == NR_IRQS) {
7115 +               seq_printf(p, "NMI: ");
7116 +               for_each_online_cpu(j)
7117 +                       seq_printf(p, "%10u ", nmi_count(j));
7118 +               seq_putc(p, '\n');
7119 +#ifdef CONFIG_X86_LOCAL_APIC
7120 +               seq_printf(p, "LOC: ");
7121 +               for_each_online_cpu(j)
7122 +                       seq_printf(p, "%10u ",
7123 +                               per_cpu(irq_stat,j).apic_timer_irqs);
7124 +               seq_putc(p, '\n');
7125 +#endif
7126 +               seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
7127 +#if defined(CONFIG_X86_IO_APIC)
7128 +               seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
7129 +#endif
7130 +       }
7131 +       return 0;
7132 +}
7133 +
7134 +#ifdef CONFIG_HOTPLUG_CPU
7135 +
7136 +void fixup_irqs(cpumask_t map)
7137 +{
7138 +       unsigned int irq;
7139 +       static int warned;
7140 +
7141 +       for (irq = 0; irq < NR_IRQS; irq++) {
7142 +               cpumask_t mask;
7143 +               if (irq == 2)
7144 +                       continue;
7145 +
7146 +               cpus_and(mask, irq_affinity[irq], map);
7147 +               if (any_online_cpu(mask) == NR_CPUS) {
7148 +                       /*printk("Breaking affinity for irq %i\n", irq);*/
7149 +                       mask = map;
7150 +               }
7151 +               if (irq_desc[irq].handler->set_affinity)
7152 +                       irq_desc[irq].handler->set_affinity(irq, mask);
7153 +               else if (irq_desc[irq].action && !(warned++))
7154 +                       printk("Cannot set affinity for irq %i\n", irq);
7155 +       }
7156 +
7157 +#if 0
7158 +       barrier();
7159 +       /* Ingo Molnar says: "after the IO-APIC masks have been redirected
7160 +          [note the nop - the interrupt-enable boundary on x86 is two
7161 +          instructions from sti] - to flush out pending hardirqs and
7162 +          IPIs. After this point nothing is supposed to reach this CPU." */
7163 +       __asm__ __volatile__("sti; nop; cli");
7164 +       barrier();
7165 +#else
7166 +       /* That doesn't seem sufficient.  Give it 1ms. */
7167 +       local_irq_enable();
7168 +       mdelay(1);
7169 +       local_irq_disable();
7170 +#endif
7171 +}
7172 +#endif
7173 +
7174 diff -urNp linux-2.6/arch/i386/kernel/ldt-xen.c new/arch/i386/kernel/ldt-xen.c
7175 --- linux-2.6/arch/i386/kernel/ldt-xen.c        1970-01-01 01:00:00.000000000 +0100
7176 +++ new/arch/i386/kernel/ldt-xen.c      2006-05-09 12:32:35.000000000 +0200
7177 @@ -0,0 +1,269 @@
7178 +/*
7179 + * linux/kernel/ldt.c
7180 + *
7181 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
7182 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
7183 + */
7184 +
7185 +#include <linux/errno.h>
7186 +#include <linux/sched.h>
7187 +#include <linux/string.h>
7188 +#include <linux/mm.h>
7189 +#include <linux/smp.h>
7190 +#include <linux/smp_lock.h>
7191 +#include <linux/vmalloc.h>
7192 +#include <linux/slab.h>
7193 +
7194 +#include <asm/uaccess.h>
7195 +#include <asm/system.h>
7196 +#include <asm/ldt.h>
7197 +#include <asm/desc.h>
7198 +#include <asm/mmu_context.h>
7199 +
7200 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
7201 +static void flush_ldt(void *null)
7202 +{
7203 +       if (current->active_mm)
7204 +               load_LDT(&current->active_mm->context);
7205 +}
7206 +#endif
7207 +
7208 +static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
7209 +{
7210 +       void *oldldt;
7211 +       void *newldt;
7212 +       int oldsize;
7213 +
7214 +       if (mincount <= pc->size)
7215 +               return 0;
7216 +       oldsize = pc->size;
7217 +       mincount = (mincount+511)&(~511);
7218 +       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
7219 +               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
7220 +       else
7221 +               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
7222 +
7223 +       if (!newldt)
7224 +               return -ENOMEM;
7225 +
7226 +       if (oldsize)
7227 +               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
7228 +       oldldt = pc->ldt;
7229 +       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
7230 +       pc->ldt = newldt;
7231 +       wmb();
7232 +       pc->size = mincount;
7233 +       wmb();
7234 +
7235 +       if (reload) {
7236 +#ifdef CONFIG_SMP
7237 +               cpumask_t mask;
7238 +               preempt_disable();
7239 +#endif
7240 +               make_pages_readonly(
7241 +                       pc->ldt,
7242 +                       (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7243 +                       XENFEAT_writable_descriptor_tables);
7244 +               load_LDT(pc);
7245 +#ifdef CONFIG_SMP
7246 +               mask = cpumask_of_cpu(smp_processor_id());
7247 +               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
7248 +                       smp_call_function(flush_ldt, NULL, 1, 1);
7249 +               preempt_enable();
7250 +#endif
7251 +       }
7252 +       if (oldsize) {
7253 +               make_pages_writable(
7254 +                       oldldt,
7255 +                       (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
7256 +                       XENFEAT_writable_descriptor_tables);
7257 +               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
7258 +                       vfree(oldldt);
7259 +               else
7260 +                       kfree(oldldt);
7261 +       }
7262 +       return 0;
7263 +}
7264 +
7265 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
7266 +{
7267 +       int err = alloc_ldt(new, old->size, 0);
7268 +       if (err < 0)
7269 +               return err;
7270 +       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
7271 +       make_pages_readonly(
7272 +               new->ldt,
7273 +               (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7274 +               XENFEAT_writable_descriptor_tables);
7275 +       return 0;
7276 +}
7277 +
7278 +/*
7279 + * we do not have to muck with descriptors here; that is
7280 + * done in switch_mm() as needed.
7281 + */
7282 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
7283 +{
7284 +       struct mm_struct * old_mm;
7285 +       int retval = 0;
7286 +
7287 +       init_MUTEX(&mm->context.sem);
7288 +       mm->context.size = 0;
7289 +       old_mm = current->mm;
7290 +       if (old_mm && old_mm->context.size > 0) {
7291 +               down(&old_mm->context.sem);
7292 +               retval = copy_ldt(&mm->context, &old_mm->context);
7293 +               up(&old_mm->context.sem);
7294 +       }
7295 +       return retval;
7296 +}
7297 +
7298 +/*
7299 + * No need to lock the MM as we are the last user
7300 + */
7301 +void destroy_context(struct mm_struct *mm)
7302 +{
7303 +       if (mm->context.size) {
7304 +               if (mm == current->active_mm)
7305 +                       clear_LDT();
7306 +               make_pages_writable(
7307 +                       mm->context.ldt,
7308 +                       (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7309 +                       XENFEAT_writable_descriptor_tables);
7310 +               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
7311 +                       vfree(mm->context.ldt);
7312 +               else
7313 +                       kfree(mm->context.ldt);
7314 +               mm->context.size = 0;
7315 +       }
7316 +}
7317 +
7318 +static int read_ldt(void __user * ptr, unsigned long bytecount)
7319 +{
7320 +       int err;
7321 +       unsigned long size;
7322 +       struct mm_struct * mm = current->mm;
7323 +
7324 +       if (!mm->context.size)
7325 +               return 0;
7326 +       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
7327 +               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
7328 +
7329 +       down(&mm->context.sem);
7330 +       size = mm->context.size*LDT_ENTRY_SIZE;
7331 +       if (size > bytecount)
7332 +               size = bytecount;
7333 +
7334 +       err = 0;
7335 +       if (copy_to_user(ptr, mm->context.ldt, size))
7336 +               err = -EFAULT;
7337 +       up(&mm->context.sem);
7338 +       if (err < 0)
7339 +               goto error_return;
7340 +       if (size != bytecount) {
7341 +               /* zero-fill the rest */
7342 +               if (clear_user(ptr+size, bytecount-size) != 0) {
7343 +                       err = -EFAULT;
7344 +                       goto error_return;
7345 +               }
7346 +       }
7347 +       return bytecount;
7348 +error_return:
7349 +       return err;
7350 +}
7351 +
7352 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
7353 +{
7354 +       int err;
7355 +       unsigned long size;
7356 +       void *address;
7357 +
7358 +       err = 0;
7359 +       address = &default_ldt[0];
7360 +       size = 5*sizeof(struct desc_struct);
7361 +       if (size > bytecount)
7362 +               size = bytecount;
7363 +
7364 +       err = size;
7365 +       if (copy_to_user(ptr, address, size))
7366 +               err = -EFAULT;
7367 +
7368 +       return err;
7369 +}
7370 +
7371 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
7372 +{
7373 +       struct mm_struct * mm = current->mm;
7374 +       __u32 entry_1, entry_2;
7375 +       int error;
7376 +       struct user_desc ldt_info;
7377 +
7378 +       error = -EINVAL;
7379 +       if (bytecount != sizeof(ldt_info))
7380 +               goto out;
7381 +       error = -EFAULT;        
7382 +       if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
7383 +               goto out;
7384 +
7385 +       error = -EINVAL;
7386 +       if (ldt_info.entry_number >= LDT_ENTRIES)
7387 +               goto out;
7388 +       if (ldt_info.contents == 3) {
7389 +               if (oldmode)
7390 +                       goto out;
7391 +               if (ldt_info.seg_not_present == 0)
7392 +                       goto out;
7393 +       }
7394 +
7395 +       down(&mm->context.sem);
7396 +       if (ldt_info.entry_number >= mm->context.size) {
7397 +               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
7398 +               if (error < 0)
7399 +                       goto out_unlock;
7400 +       }
7401 +
7402 +       /* Allow LDTs to be cleared by the user. */
7403 +       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
7404 +               if (oldmode || LDT_empty(&ldt_info)) {
7405 +                       entry_1 = 0;
7406 +                       entry_2 = 0;
7407 +                       goto install;
7408 +               }
7409 +       }
7410 +
7411 +       entry_1 = LDT_entry_a(&ldt_info);
7412 +       entry_2 = LDT_entry_b(&ldt_info);
7413 +       if (oldmode)
7414 +               entry_2 &= ~(1 << 20);
7415 +
7416 +       /* Install the new entry ...  */
7417 +install:
7418 +       error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
7419 +                               entry_1, entry_2);
7420 +
7421 +out_unlock:
7422 +       up(&mm->context.sem);
7423 +out:
7424 +       return error;
7425 +}
7426 +
7427 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
7428 +{
7429 +       int ret = -ENOSYS;
7430 +
7431 +       switch (func) {
7432 +       case 0:
7433 +               ret = read_ldt(ptr, bytecount);
7434 +               break;
7435 +       case 1:
7436 +               ret = write_ldt(ptr, bytecount, 1);
7437 +               break;
7438 +       case 2:
7439 +               ret = read_default_ldt(ptr, bytecount);
7440 +               break;
7441 +       case 0x11:
7442 +               ret = write_ldt(ptr, bytecount, 0);
7443 +               break;
7444 +       }
7445 +       return ret;
7446 +}
7447 diff -urNp linux-2.6/arch/i386/kernel/Makefile new/arch/i386/kernel/Makefile
7448 --- linux-2.6/arch/i386/kernel/Makefile 2006-07-03 14:14:14.000000000 +0200
7449 +++ new/arch/i386/kernel/Makefile       2006-05-09 12:32:33.000000000 +0200
7450 @@ -42,6 +42,12 @@ EXTRA_AFLAGS   := -traditional
7451  
7452  obj-$(CONFIG_SCx200)           += scx200.o
7453  
7454 +ifdef CONFIG_XEN
7455 +vsyscall_note := vsyscall-note-xen.o
7456 +else
7457 +vsyscall_note := vsyscall-note.o
7458 +endif
7459 +
7460  # vsyscall.o contains the vsyscall DSO images as __initdata.
7461  # We must build both images before we can assemble it.
7462  # Note: kbuild does not track this dependency due to usage of .incbin
7463 @@ -62,7 +68,7 @@ SYSCFLAGS_vsyscall-int80.so   = $(vsyscall
7464  
7465  $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \
7466  $(obj)/vsyscall-%.so: $(src)/vsyscall.lds \
7467 -                     $(obj)/vsyscall-%.o $(obj)/vsyscall-note.o FORCE
7468 +                     $(obj)/vsyscall-%.o $(obj)/$(vsyscall_note) FORCE
7469         $(call if_changed,syscall)
7470  
7471  # We also create a special relocatable object that should mirror the symbol
7472 @@ -74,5 +80,17 @@ $(obj)/built-in.o: ld_flags += -R $(obj)
7473  
7474  SYSCFLAGS_vsyscall-syms.o = -r
7475  $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
7476 -                       $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE
7477 +                       $(obj)/vsyscall-sysenter.o $(obj)/$(vsyscall_note) FORCE
7478         $(call if_changed,syscall)
7479 +
7480 +ifdef CONFIG_XEN
7481 +include $(srctree)/scripts/Makefile.xen
7482 +
7483 +obj-y += fixup.o
7484 +microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
7485 +n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
7486 +
7487 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
7488 +obj-y := $(call cherrypickxen, $(obj-y))
7489 +extra-y := $(call cherrypickxen, $(extra-y))
7490 +endif
7491 diff -urNp linux-2.6/arch/i386/kernel/microcode-xen.c new/arch/i386/kernel/microcode-xen.c
7492 --- linux-2.6/arch/i386/kernel/microcode-xen.c  1970-01-01 01:00:00.000000000 +0100
7493 +++ new/arch/i386/kernel/microcode-xen.c        2006-05-23 18:37:09.000000000 +0200
7494 @@ -0,0 +1,148 @@
7495 +/*
7496 + *     Intel CPU Microcode Update Driver for Linux
7497 + *
7498 + *     Copyright (C) 2000-2004 Tigran Aivazian
7499 + *
7500 + *     This driver allows upgrading microcode on Intel processors
7501 + *     belonging to the IA-32 family - PentiumPro, Pentium II,
7502 + *     Pentium III, Xeon, Pentium 4, etc.
7503 + *
7504 + *     Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, 
7505 + *     Order Number 245472 or free download from:
7506 + *             
7507 + *     http://developer.intel.com/design/pentium4/manuals/245472.htm
7508 + *
7509 + *     For more information, go to http://www.urbanmyth.org/microcode
7510 + *
7511 + *     This program is free software; you can redistribute it and/or
7512 + *     modify it under the terms of the GNU General Public License
7513 + *     as published by the Free Software Foundation; either version
7514 + *     2 of the License, or (at your option) any later version.
7515 + */
7516 +
7517 +//#define DEBUG /* pr_debug */
7518 +#include <linux/capability.h>
7519 +#include <linux/kernel.h>
7520 +#include <linux/init.h>
7521 +#include <linux/sched.h>
7522 +#include <linux/cpumask.h>
7523 +#include <linux/module.h>
7524 +#include <linux/slab.h>
7525 +#include <linux/vmalloc.h>
7526 +#include <linux/miscdevice.h>
7527 +#include <linux/spinlock.h>
7528 +#include <linux/mm.h>
7529 +#include <linux/mutex.h>
7530 +#include <linux/syscalls.h>
7531 +
7532 +#include <asm/msr.h>
7533 +#include <asm/uaccess.h>
7534 +#include <asm/processor.h>
7535 +
7536 +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
7537 +MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
7538 +MODULE_LICENSE("GPL");
7539 +
7540 +#define MICROCODE_VERSION      "1.14-xen"
7541 +
7542 +#define DEFAULT_UCODE_DATASIZE         (2000)    /* 2000 bytes */
7543 +#define MC_HEADER_SIZE         (sizeof (microcode_header_t))     /* 48 bytes */
7544 +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
7545 +
7546 +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
7547 +static DEFINE_MUTEX(microcode_mutex);
7548 +
7549 +static void __user *user_buffer;       /* user area microcode data buffer */
7550 +static unsigned int user_buffer_size;  /* its size */
7551 +                               
7552 +static int microcode_open (struct inode *unused1, struct file *unused2)
7553 +{
7554 +       return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
7555 +}
7556 +
7557 +
7558 +static int do_microcode_update (void)
7559 +{
7560 +       int err;
7561 +       dom0_op_t op;
7562 +
7563 +       err = sys_mlock((unsigned long)user_buffer, user_buffer_size);
7564 +       if (err != 0)
7565 +               return err;
7566 +
7567 +       op.cmd = DOM0_MICROCODE;
7568 +       set_xen_guest_handle(op.u.microcode.data, user_buffer);
7569 +       op.u.microcode.length = user_buffer_size;
7570 +       err = HYPERVISOR_dom0_op(&op);
7571 +
7572 +       (void)sys_munlock((unsigned long)user_buffer, user_buffer_size);
7573 +
7574 +       return err;
7575 +}
7576 +
7577 +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
7578 +{
7579 +       ssize_t ret;
7580 +
7581 +       if (len < DEFAULT_UCODE_TOTALSIZE) {
7582 +               printk(KERN_ERR "microcode: not enough data\n"); 
7583 +               return -EINVAL;
7584 +       }
7585 +
7586 +       if ((len >> PAGE_SHIFT) > num_physpages) {
7587 +               printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
7588 +               return -EINVAL;
7589 +       }
7590 +
7591 +       mutex_lock(&microcode_mutex);
7592 +
7593 +       user_buffer = (void __user *) buf;
7594 +       user_buffer_size = (int) len;
7595 +
7596 +       ret = do_microcode_update();
7597 +       if (!ret)
7598 +               ret = (ssize_t)len;
7599 +
7600 +       mutex_unlock(&microcode_mutex);
7601 +
7602 +       return ret;
7603 +}
7604 +
7605 +static struct file_operations microcode_fops = {
7606 +       .owner          = THIS_MODULE,
7607 +       .write          = microcode_write,
7608 +       .open           = microcode_open,
7609 +};
7610 +
7611 +static struct miscdevice microcode_dev = {
7612 +       .minor          = MICROCODE_MINOR,
7613 +       .name           = "microcode",
7614 +       .devfs_name     = "cpu/microcode",
7615 +       .fops           = &microcode_fops,
7616 +};
7617 +
7618 +static int __init microcode_init (void)
7619 +{
7620 +       int error;
7621 +
7622 +       error = misc_register(&microcode_dev);
7623 +       if (error) {
7624 +               printk(KERN_ERR
7625 +                       "microcode: can't misc_register on minor=%d\n",
7626 +                       MICROCODE_MINOR);
7627 +               return error;
7628 +       }
7629 +
7630 +       printk(KERN_INFO 
7631 +               "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
7632 +       return 0;
7633 +}
7634 +
7635 +static void __exit microcode_exit (void)
7636 +{
7637 +       misc_deregister(&microcode_dev);
7638 +}
7639 +
7640 +module_init(microcode_init)
7641 +module_exit(microcode_exit)
7642 +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
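
Taken together, the driver above turns a microcode update into a single write() on the /dev/cpu/microcode misc device: microcode_open() requires CAP_SYS_RAWIO, microcode_write() rejects images smaller than DEFAULT_UCODE_TOTALSIZE (2048 bytes), and do_microcode_update() mlock()s the user buffer and hands it to the hypervisor via a DOM0_MICROCODE dom0_op. A minimal userspace sketch of that interface follows, assuming a dom0 context with an existing /dev/cpu/microcode node; the image path is hypothetical.

/* Illustrative only: push a microcode image into the Xen-aware driver above.
 * Assumes /dev/cpu/microcode exists and the caller holds CAP_SYS_RAWIO;
 * the default image path is hypothetical. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : "/tmp/microcode.bin";
        FILE *f = fopen(path, "rb");
        char *buf;
        long len;
        int dev;

        if (!f) { perror(path); return 1; }
        fseek(f, 0, SEEK_END);
        len = ftell(f);
        rewind(f);
        buf = malloc(len);
        if (!buf || fread(buf, 1, len, f) != (size_t)len) { perror("read"); return 1; }
        fclose(f);

        /* One write() of the whole image; the driver mlock()s the buffer and
         * issues the DOM0_MICROCODE hypercall before returning. */
        dev = open("/dev/cpu/microcode", O_WRONLY);
        if (dev < 0) { perror("/dev/cpu/microcode"); return 1; }
        if (write(dev, buf, len) != len) { perror("write"); return 1; }
        close(dev);
        free(buf);
        return 0;
}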
7643 diff -urNp linux-2.6/arch/i386/kernel/mpparse-xen.c new/arch/i386/kernel/mpparse-xen.c
7644 --- linux-2.6/arch/i386/kernel/mpparse-xen.c    1970-01-01 01:00:00.000000000 +0100
7645 +++ new/arch/i386/kernel/mpparse-xen.c  2006-05-23 18:37:09.000000000 +0200
7646 @@ -0,0 +1,1186 @@
7647 +/*
7648 + *     Intel Multiprocessor Specification 1.1 and 1.4
7649 + *     compliant MP-table parsing routines.
7650 + *
7651 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
7652 + *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7653 + *
7654 + *     Fixes
7655 + *             Erich Boleyn    :       MP v1.4 and additional changes.
7656 + *             Alan Cox        :       Added EBDA scanning
7657 + *             Ingo Molnar     :       various cleanups and rewrites
7658 + *             Maciej W. Rozycki:      Bits for default MP configurations
7659 + *             Paul Diefenbaugh:       Added full ACPI support
7660 + */
7661 +
7662 +#include <linux/mm.h>
7663 +#include <linux/init.h>
7664 +#include <linux/acpi.h>
7665 +#include <linux/delay.h>
7666 +#include <linux/config.h>
7667 +#include <linux/bootmem.h>
7668 +#include <linux/smp_lock.h>
7669 +#include <linux/kernel_stat.h>
7670 +#include <linux/mc146818rtc.h>
7671 +#include <linux/bitops.h>
7672 +
7673 +#include <asm/smp.h>
7674 +#include <asm/acpi.h>
7675 +#include <asm/mtrr.h>
7676 +#include <asm/mpspec.h>
7677 +#include <asm/io_apic.h>
7678 +
7679 +#include <mach_apic.h>
7680 +#include <mach_mpparse.h>
7681 +#include <bios_ebda.h>
7682 +
7683 +/* Have we found an MP table */
7684 +int smp_found_config;
7685 +unsigned int __initdata maxcpus = NR_CPUS;
7686 +
7687 +/*
7688 + * Various Linux-internal data structures created from the
7689 + * MP-table.
7690 + */
7691 +int apic_version [MAX_APICS];
7692 +int mp_bus_id_to_type [MAX_MP_BUSSES];
7693 +int mp_bus_id_to_node [MAX_MP_BUSSES];
7694 +int mp_bus_id_to_local [MAX_MP_BUSSES];
7695 +int quad_local_to_mp_bus_id [NR_CPUS/4][4];
7696 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
7697 +static int mp_current_pci_id;
7698 +
7699 +/* I/O APIC entries */
7700 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
7701 +
7702 +/* # of MP IRQ source entries */
7703 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7704 +
7705 +/* MP IRQ source entries */
7706 +int mp_irq_entries;
7707 +
7708 +int nr_ioapics;
7709 +
7710 +int pic_mode;
7711 +unsigned long mp_lapic_addr;
7712 +
7713 +unsigned int def_to_bigsmp = 0;
7714 +
7715 +/* Processor that is doing the boot up */
7716 +unsigned int boot_cpu_physical_apicid = -1U;
7717 +/* Internal processor count */
7718 +static unsigned int __devinitdata num_processors;
7719 +
7720 +/* Bitmask of physically existing CPUs */
7721 +physid_mask_t phys_cpu_present_map;
7722 +
7723 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
7724 +
7725 +/*
7726 + * Intel MP BIOS table parsing routines:
7727 + */
7728 +
7729 +
7730 +/*
7731 + * Checksum an MP configuration block.
7732 + */
7733 +
7734 +static int __init mpf_checksum(unsigned char *mp, int len)
7735 +{
7736 +       int sum = 0;
7737 +
7738 +       while (len--)
7739 +               sum += *mp++;
7740 +
7741 +       return sum & 0xFF;
7742 +}
7743 +
7744 +/*
7745 + * Have to match translation table entries to main table entries by counter
7746 + * hence the mpc_record variable .... can't see a less disgusting way of
7747 + * doing this ....
7748 + */
7749 +
7750 +static int mpc_record; 
7751 +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
7752 +
7753 +#ifndef CONFIG_XEN
7754 +static void __devinit MP_processor_info (struct mpc_config_processor *m)
7755 +{
7756 +       int ver, apicid;
7757 +       physid_mask_t phys_cpu;
7758 +       
7759 +       if (!(m->mpc_cpuflag & CPU_ENABLED))
7760 +               return;
7761 +
7762 +       apicid = mpc_apic_id(m, translation_table[mpc_record]);
7763 +
7764 +       if (m->mpc_featureflag&(1<<0))
7765 +               Dprintk("    Floating point unit present.\n");
7766 +       if (m->mpc_featureflag&(1<<7))
7767 +               Dprintk("    Machine Exception supported.\n");
7768 +       if (m->mpc_featureflag&(1<<8))
7769 +               Dprintk("    64 bit compare & exchange supported.\n");
7770 +       if (m->mpc_featureflag&(1<<9))
7771 +               Dprintk("    Internal APIC present.\n");
7772 +       if (m->mpc_featureflag&(1<<11))
7773 +               Dprintk("    SEP present.\n");
7774 +       if (m->mpc_featureflag&(1<<12))
7775 +               Dprintk("    MTRR  present.\n");
7776 +       if (m->mpc_featureflag&(1<<13))
7777 +               Dprintk("    PGE  present.\n");
7778 +       if (m->mpc_featureflag&(1<<14))
7779 +               Dprintk("    MCA  present.\n");
7780 +       if (m->mpc_featureflag&(1<<15))
7781 +               Dprintk("    CMOV  present.\n");
7782 +       if (m->mpc_featureflag&(1<<16))
7783 +               Dprintk("    PAT  present.\n");
7784 +       if (m->mpc_featureflag&(1<<17))
7785 +               Dprintk("    PSE  present.\n");
7786 +       if (m->mpc_featureflag&(1<<18))
7787 +               Dprintk("    PSN  present.\n");
7788 +       if (m->mpc_featureflag&(1<<19))
7789 +               Dprintk("    Cache Line Flush Instruction present.\n");
7790 +       /* 20 Reserved */
7791 +       if (m->mpc_featureflag&(1<<21))
7792 +               Dprintk("    Debug Trace and EMON Store present.\n");
7793 +       if (m->mpc_featureflag&(1<<22))
7794 +               Dprintk("    ACPI Thermal Throttle Registers  present.\n");
7795 +       if (m->mpc_featureflag&(1<<23))
7796 +               Dprintk("    MMX  present.\n");
7797 +       if (m->mpc_featureflag&(1<<24))
7798 +               Dprintk("    FXSR  present.\n");
7799 +       if (m->mpc_featureflag&(1<<25))
7800 +               Dprintk("    XMM  present.\n");
7801 +       if (m->mpc_featureflag&(1<<26))
7802 +               Dprintk("    Willamette New Instructions  present.\n");
7803 +       if (m->mpc_featureflag&(1<<27))
7804 +               Dprintk("    Self Snoop  present.\n");
7805 +       if (m->mpc_featureflag&(1<<28))
7806 +               Dprintk("    HT  present.\n");
7807 +       if (m->mpc_featureflag&(1<<29))
7808 +               Dprintk("    Thermal Monitor present.\n");
7809 +       /* 30, 31 Reserved */
7810 +
7811 +
7812 +       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
7813 +               Dprintk("    Bootup CPU\n");
7814 +               boot_cpu_physical_apicid = m->mpc_apicid;
7815 +       }
7816 +
7817 +       ver = m->mpc_apicver;
7818 +
7819 +       /*
7820 +        * Validate version
7821 +        */
7822 +       if (ver == 0x0) {
7823 +               printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
7824 +                               "fixing up to 0x10. (tell your hw vendor)\n",
7825 +                               m->mpc_apicid);
7826 +               ver = 0x10;
7827 +       }
7828 +       apic_version[m->mpc_apicid] = ver;
7829 +
7830 +       phys_cpu = apicid_to_cpu_present(apicid);
7831 +       physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
7832 +
7833 +       if (num_processors >= NR_CPUS) {
7834 +               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
7835 +                       "  Processor ignored.\n", NR_CPUS);
7836 +               return;
7837 +       }
7838 +
7839 +       if (num_processors >= maxcpus) {
7840 +               printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
7841 +                       " Processor ignored.\n", maxcpus);
7842 +               return;
7843 +       }
7844 +
7845 +       cpu_set(num_processors, cpu_possible_map);
7846 +       num_processors++;
7847 +
7848 +       /*
7849 +        * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
7850 +        * but we need to work other dependencies like SMP_SUSPEND etc
7851 +        * before this can be done without some confusion.
7852 +        * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
7853 +        *       - Ashok Raj <ashok.raj@intel.com>
7854 +        */
7855 +       if (num_processors > 8) {
7856 +               switch (boot_cpu_data.x86_vendor) {
7857 +               case X86_VENDOR_INTEL:
7858 +                       if (!APIC_XAPIC(ver)) {
7859 +                               def_to_bigsmp = 0;
7860 +                               break;
7861 +                       }
7862 +                       /* If P4 and above fall through */
7863 +               case X86_VENDOR_AMD:
7864 +                       def_to_bigsmp = 1;
7865 +               }
7866 +       }
7867 +       bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
7868 +}
7869 +#else
7870 +void __init MP_processor_info (struct mpc_config_processor *m)
7871 +{
7872 +       num_processors++;
7873 +}
7874 +#endif /* CONFIG_XEN */
7875 +
7876 +static void __init MP_bus_info (struct mpc_config_bus *m)
7877 +{
7878 +       char str[7];
7879 +
7880 +       memcpy(str, m->mpc_bustype, 6);
7881 +       str[6] = 0;
7882 +
7883 +       mpc_oem_bus_info(m, str, translation_table[mpc_record]);
7884 +
7885 +       if (m->mpc_busid >= MAX_MP_BUSSES) {
7886 +               printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
7887 +                       " is too large, max. supported is %d\n",
7888 +                       m->mpc_busid, str, MAX_MP_BUSSES - 1);
7889 +               return;
7890 +       }
7891 +
7892 +       if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
7893 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
7894 +       } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
7895 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
7896 +       } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
7897 +               mpc_oem_pci_bus(m, translation_table[mpc_record]);
7898 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
7899 +               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
7900 +               mp_current_pci_id++;
7901 +       } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
7902 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
7903 +       } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
7904 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
7905 +       } else {
7906 +               printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
7907 +       }
7908 +}
7909 +
7910 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
7911 +{
7912 +       if (!(m->mpc_flags & MPC_APIC_USABLE))
7913 +               return;
7914 +
7915 +       printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
7916 +               m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
7917 +       if (nr_ioapics >= MAX_IO_APICS) {
7918 +               printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
7919 +                       MAX_IO_APICS, nr_ioapics);
7920 +               panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
7921 +       }
7922 +       if (!m->mpc_apicaddr) {
7923 +               printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
7924 +                       " found in MP table, skipping!\n");
7925 +               return;
7926 +       }
7927 +       mp_ioapics[nr_ioapics] = *m;
7928 +       nr_ioapics++;
7929 +}
7930 +
7931 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
7932 +{
7933 +       mp_irqs [mp_irq_entries] = *m;
7934 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
7935 +               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
7936 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
7937 +                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
7938 +                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
7939 +       if (++mp_irq_entries == MAX_IRQ_SOURCES)
7940 +               panic("Max # of irq sources exceeded!!\n");
7941 +}
7942 +
7943 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
7944 +{
7945 +       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
7946 +               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
7947 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
7948 +                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
7949 +                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
7950 +       /*
7951 +        * Well it seems all SMP boards in existence
7952 +        * use ExtINT/LVT1 == LINT0 and
7953 +        * NMI/LVT2 == LINT1 - the following check
7954 +        * will show us if this assumption is false.
7955 +        * Until then we do not have to add baggage.
7956 +        */
7957 +       if ((m->mpc_irqtype == mp_ExtINT) &&
7958 +               (m->mpc_destapiclint != 0))
7959 +                       BUG();
7960 +       if ((m->mpc_irqtype == mp_NMI) &&
7961 +               (m->mpc_destapiclint != 1))
7962 +                       BUG();
7963 +}
7964 +
7965 +#ifdef CONFIG_X86_NUMAQ
7966 +static void __init MP_translation_info (struct mpc_config_translation *m)
7967 +{
7968 +       printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
7969 +
7970 +       if (mpc_record >= MAX_MPC_ENTRY) 
7971 +               printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
7972 +       else
7973 +               translation_table[mpc_record] = m; /* stash this for later */
7974 +       if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
7975 +               node_set_online(m->trans_quad);
7976 +}
7977 +
7978 +/*
7979 + * Read/parse the MPC oem tables
7980 + */
7981 +
7982 +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
7983 +       unsigned short oemsize)
7984 +{
7985 +       int count = sizeof (*oemtable); /* the header size */
7986 +       unsigned char *oemptr = ((unsigned char *)oemtable)+count;
7987 +       
7988 +       mpc_record = 0;
7989 +       printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
7990 +       if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
7991 +       {
7992 +               printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
7993 +                       oemtable->oem_signature[0],
7994 +                       oemtable->oem_signature[1],
7995 +                       oemtable->oem_signature[2],
7996 +                       oemtable->oem_signature[3]);
7997 +               return;
7998 +       }
7999 +       if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
8000 +       {
8001 +               printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
8002 +               return;
8003 +       }
8004 +       while (count < oemtable->oem_length) {
8005 +               switch (*oemptr) {
8006 +                       case MP_TRANSLATION:
8007 +                       {
8008 +                               struct mpc_config_translation *m=
8009 +                                       (struct mpc_config_translation *)oemptr;
8010 +                               MP_translation_info(m);
8011 +                               oemptr += sizeof(*m);
8012 +                               count += sizeof(*m);
8013 +                               ++mpc_record;
8014 +                               break;
8015 +                       }
8016 +                       default:
8017 +                       {
8018 +                               printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
8019 +                               return;
8020 +                       }
8021 +               }
8022 +       }
8023 +}
8024 +
8025 +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
8026 +               char *productid)
8027 +{
8028 +       if (strncmp(oem, "IBM NUMA", 8))
8029 +               printk("Warning!  May not be a NUMA-Q system!\n");
8030 +       if (mpc->mpc_oemptr)
8031 +               smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
8032 +                               mpc->mpc_oemsize);
8033 +}
8034 +#endif /* CONFIG_X86_NUMAQ */
8035 +
8036 +/*
8037 + * Read/parse the MPC
8038 + */
8039 +
8040 +static int __init smp_read_mpc(struct mp_config_table *mpc)
8041 +{
8042 +       char str[16];
8043 +       char oem[10];
8044 +       int count=sizeof(*mpc);
8045 +       unsigned char *mpt=((unsigned char *)mpc)+count;
8046 +
8047 +       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
8048 +               printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
8049 +                       *(u32 *)mpc->mpc_signature);
8050 +               return 0;
8051 +       }
8052 +       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
8053 +               printk(KERN_ERR "SMP mptable: checksum error!\n");
8054 +               return 0;
8055 +       }
8056 +       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
8057 +               printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
8058 +                       mpc->mpc_spec);
8059 +               return 0;
8060 +       }
8061 +       if (!mpc->mpc_lapic) {
8062 +               printk(KERN_ERR "SMP mptable: null local APIC address!\n");
8063 +               return 0;
8064 +       }
8065 +       memcpy(oem,mpc->mpc_oem,8);
8066 +       oem[8]=0;
8067 +       printk(KERN_INFO "OEM ID: %s ",oem);
8068 +
8069 +       memcpy(str,mpc->mpc_productid,12);
8070 +       str[12]=0;
8071 +       printk("Product ID: %s ",str);
8072 +
8073 +       mps_oem_check(mpc, oem, str);
8074 +
8075 +       printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
8076 +
8077 +       /* 
8078 +        * Save the local APIC address (it might be non-default) -- but only
8079 +        * if we're not using ACPI.
8080 +        */
8081 +       if (!acpi_lapic)
8082 +               mp_lapic_addr = mpc->mpc_lapic;
8083 +
8084 +       /*
8085 +        *      Now process the configuration blocks.
8086 +        */
8087 +       mpc_record = 0;
8088 +       while (count < mpc->mpc_length) {
8089 +               switch(*mpt) {
8090 +                       case MP_PROCESSOR:
8091 +                       {
8092 +                               struct mpc_config_processor *m=
8093 +                                       (struct mpc_config_processor *)mpt;
8094 +                               /* ACPI may have already provided this data */
8095 +                               if (!acpi_lapic)
8096 +                                       MP_processor_info(m);
8097 +                               mpt += sizeof(*m);
8098 +                               count += sizeof(*m);
8099 +                               break;
8100 +                       }
8101 +                       case MP_BUS:
8102 +                       {
8103 +                               struct mpc_config_bus *m=
8104 +                                       (struct mpc_config_bus *)mpt;
8105 +                               MP_bus_info(m);
8106 +                               mpt += sizeof(*m);
8107 +                               count += sizeof(*m);
8108 +                               break;
8109 +                       }
8110 +                       case MP_IOAPIC:
8111 +                       {
8112 +                               struct mpc_config_ioapic *m=
8113 +                                       (struct mpc_config_ioapic *)mpt;
8114 +                               MP_ioapic_info(m);
8115 +                               mpt+=sizeof(*m);
8116 +                               count+=sizeof(*m);
8117 +                               break;
8118 +                       }
8119 +                       case MP_INTSRC:
8120 +                       {
8121 +                               struct mpc_config_intsrc *m=
8122 +                                       (struct mpc_config_intsrc *)mpt;
8123 +
8124 +                               MP_intsrc_info(m);
8125 +                               mpt+=sizeof(*m);
8126 +                               count+=sizeof(*m);
8127 +                               break;
8128 +                       }
8129 +                       case MP_LINTSRC:
8130 +                       {
8131 +                               struct mpc_config_lintsrc *m=
8132 +                                       (struct mpc_config_lintsrc *)mpt;
8133 +                               MP_lintsrc_info(m);
8134 +                               mpt+=sizeof(*m);
8135 +                               count+=sizeof(*m);
8136 +                               break;
8137 +                       }
8138 +                       default:
8139 +                       {
8140 +                               count = mpc->mpc_length;
8141 +                               break;
8142 +                       }
8143 +               }
8144 +               ++mpc_record;
8145 +       }
8146 +       clustered_apic_check();
8147 +       if (!num_processors)
8148 +               printk(KERN_ERR "SMP mptable: no processors registered!\n");
8149 +       return num_processors;
8150 +}
8151 +
8152 +static int __init ELCR_trigger(unsigned int irq)
8153 +{
8154 +       unsigned int port;
8155 +
8156 +       port = 0x4d0 + (irq >> 3);
8157 +       return (inb(port) >> (irq & 7)) & 1;
8158 +}
8159 +
8160 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
8161 +{
8162 +       struct mpc_config_intsrc intsrc;
8163 +       int i;
8164 +       int ELCR_fallback = 0;
8165 +
8166 +       intsrc.mpc_type = MP_INTSRC;
8167 +       intsrc.mpc_irqflag = 0;                 /* conforming */
8168 +       intsrc.mpc_srcbus = 0;
8169 +       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
8170 +
8171 +       intsrc.mpc_irqtype = mp_INT;
8172 +
8173 +       /*
8174 +        *  If true, we have an ISA/PCI system with no IRQ entries
8175 +        *  in the MP table. To prevent the PCI interrupts from being set up
8176 +        *  incorrectly, we try to use the ELCR. The sanity check to see if
8177 +        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
8178 +        *  never be level sensitive, so we simply see if the ELCR agrees.
8179 +        *  If it does, we assume it's valid.
8180 +        */
8181 +       if (mpc_default_type == 5) {
8182 +               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
8183 +
8184 +               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
8185 +                       printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
8186 +               else {
8187 +                       printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
8188 +                       ELCR_fallback = 1;
8189 +               }
8190 +       }
8191 +
8192 +       for (i = 0; i < 16; i++) {
8193 +               switch (mpc_default_type) {
8194 +               case 2:
8195 +                       if (i == 0 || i == 13)
8196 +                               continue;       /* IRQ0 & IRQ13 not connected */
8197 +                       /* fall through */
8198 +               default:
8199 +                       if (i == 2)
8200 +                               continue;       /* IRQ2 is never connected */
8201 +               }
8202 +
8203 +               if (ELCR_fallback) {
8204 +                       /*
8205 +                        *  If the ELCR indicates a level-sensitive interrupt, we
8206 +                        *  copy that information over to the MP table in the
8207 +                        *  irqflag field (level sensitive, active high polarity).
8208 +                        */
8209 +                       if (ELCR_trigger(i))
8210 +                               intsrc.mpc_irqflag = 13;
8211 +                       else
8212 +                               intsrc.mpc_irqflag = 0;
8213 +               }
8214 +
8215 +               intsrc.mpc_srcbusirq = i;
8216 +               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
8217 +               MP_intsrc_info(&intsrc);
8218 +       }
8219 +
8220 +       intsrc.mpc_irqtype = mp_ExtINT;
8221 +       intsrc.mpc_srcbusirq = 0;
8222 +       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
8223 +       MP_intsrc_info(&intsrc);
8224 +}
8225 +
8226 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
8227 +{
8228 +       struct mpc_config_processor processor;
8229 +       struct mpc_config_bus bus;
8230 +       struct mpc_config_ioapic ioapic;
8231 +       struct mpc_config_lintsrc lintsrc;
8232 +       int linttypes[2] = { mp_ExtINT, mp_NMI };
8233 +       int i;
8234 +
8235 +       /*
8236 +        * local APIC has default address
8237 +        */
8238 +       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
8239 +
8240 +       /*
8241 +        * 2 CPUs, numbered 0 & 1.
8242 +        */
8243 +       processor.mpc_type = MP_PROCESSOR;
8244 +       /* Either an integrated APIC or a discrete 82489DX. */
8245 +       processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
8246 +       processor.mpc_cpuflag = CPU_ENABLED;
8247 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
8248 +                                  (boot_cpu_data.x86_model << 4) |
8249 +                                  boot_cpu_data.x86_mask;
8250 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
8251 +       processor.mpc_reserved[0] = 0;
8252 +       processor.mpc_reserved[1] = 0;
8253 +       for (i = 0; i < 2; i++) {
8254 +               processor.mpc_apicid = i;
8255 +               MP_processor_info(&processor);
8256 +       }
8257 +
8258 +       bus.mpc_type = MP_BUS;
8259 +       bus.mpc_busid = 0;
8260 +       switch (mpc_default_type) {
8261 +               default:
8262 +                       printk("???\n");
8263 +                       printk(KERN_ERR "Unknown standard configuration %d\n",
8264 +                               mpc_default_type);
8265 +                       /* fall through */
8266 +               case 1:
8267 +               case 5:
8268 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
8269 +                       break;
8270 +               case 2:
8271 +               case 6:
8272 +               case 3:
8273 +                       memcpy(bus.mpc_bustype, "EISA  ", 6);
8274 +                       break;
8275 +               case 4:
8276 +               case 7:
8277 +                       memcpy(bus.mpc_bustype, "MCA   ", 6);
8278 +       }
8279 +       MP_bus_info(&bus);
8280 +       if (mpc_default_type > 4) {
8281 +               bus.mpc_busid = 1;
8282 +               memcpy(bus.mpc_bustype, "PCI   ", 6);
8283 +               MP_bus_info(&bus);
8284 +       }
8285 +
8286 +       ioapic.mpc_type = MP_IOAPIC;
8287 +       ioapic.mpc_apicid = 2;
8288 +       ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
8289 +       ioapic.mpc_flags = MPC_APIC_USABLE;
8290 +       ioapic.mpc_apicaddr = 0xFEC00000;
8291 +       MP_ioapic_info(&ioapic);
8292 +
8293 +       /*
8294 +        * We set up most of the low 16 IO-APIC pins according to MPS rules.
8295 +        */
8296 +       construct_default_ioirq_mptable(mpc_default_type);
8297 +
8298 +       lintsrc.mpc_type = MP_LINTSRC;
8299 +       lintsrc.mpc_irqflag = 0;                /* conforming */
8300 +       lintsrc.mpc_srcbusid = 0;
8301 +       lintsrc.mpc_srcbusirq = 0;
8302 +       lintsrc.mpc_destapic = MP_APIC_ALL;
8303 +       for (i = 0; i < 2; i++) {
8304 +               lintsrc.mpc_irqtype = linttypes[i];
8305 +               lintsrc.mpc_destapiclint = i;
8306 +               MP_lintsrc_info(&lintsrc);
8307 +       }
8308 +}
8309 +
8310 +static struct intel_mp_floating *mpf_found;
8311 +
8312 +/*
8313 + * Scan the memory blocks for an SMP configuration block.
8314 + */
8315 +void __init get_smp_config (void)
8316 +{
8317 +       struct intel_mp_floating *mpf = mpf_found;
8318 +
8319 +       /*
8320 +        * ACPI supports both logical (e.g. Hyper-Threading) and physical 
8321 +        * processors, where MPS only supports physical.
8322 +        */
8323 +       if (acpi_lapic && acpi_ioapic) {
8324 +               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
8325 +               return;
8326 +       }
8327 +       else if (acpi_lapic)
8328 +               printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
8329 +
8330 +       printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
8331 +       if (mpf->mpf_feature2 & (1<<7)) {
8332 +               printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
8333 +               pic_mode = 1;
8334 +       } else {
8335 +               printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
8336 +               pic_mode = 0;
8337 +       }
8338 +
8339 +       /*
8340 +        * Now see if we need to read further.
8341 +        */
8342 +       if (mpf->mpf_feature1 != 0) {
8343 +
8344 +               printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
8345 +               construct_default_ISA_mptable(mpf->mpf_feature1);
8346 +
8347 +       } else if (mpf->mpf_physptr) {
8348 +
8349 +               /*
8350 +                * Read the physical hardware table.  Anything here will
8351 +                * override the defaults.
8352 +                */
8353 +               if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
8354 +                       smp_found_config = 0;
8355 +                       printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
8356 +                       printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
8357 +                       return;
8358 +               }
8359 +               /*
8360 +                * If there are no explicit MP IRQ entries, then we are
8361 +                * broken.  We set up most of the low 16 IO-APIC pins to
8362 +                * ISA defaults and hope it will work.
8363 +                */
8364 +               if (!mp_irq_entries) {
8365 +                       struct mpc_config_bus bus;
8366 +
8367 +                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
8368 +
8369 +                       bus.mpc_type = MP_BUS;
8370 +                       bus.mpc_busid = 0;
8371 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
8372 +                       MP_bus_info(&bus);
8373 +
8374 +                       construct_default_ioirq_mptable(0);
8375 +               }
8376 +
8377 +       } else
8378 +               BUG();
8379 +
8380 +       printk(KERN_INFO "Processors: %d\n", num_processors);
8381 +       /*
8382 +        * Only use the first configuration found.
8383 +        */
8384 +}
8385 +
8386 +static int __init smp_scan_config (unsigned long base, unsigned long length)
8387 +{
8388 +       unsigned long *bp = isa_bus_to_virt(base);
8389 +       struct intel_mp_floating *mpf;
8390 +
8391 +       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
8392 +       if (sizeof(*mpf) != 16)
8393 +               printk("Error: MPF size\n");
8394 +
8395 +       while (length > 0) {
8396 +               mpf = (struct intel_mp_floating *)bp;
8397 +               if ((*bp == SMP_MAGIC_IDENT) &&
8398 +                       (mpf->mpf_length == 1) &&
8399 +                       !mpf_checksum((unsigned char *)bp, 16) &&
8400 +                       ((mpf->mpf_specification == 1)
8401 +                               || (mpf->mpf_specification == 4)) ) {
8402 +
8403 +                       smp_found_config = 1;
8404 +#ifndef CONFIG_XEN
8405 +                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
8406 +                                               virt_to_phys(mpf));
8407 +                       reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
8408 +                       if (mpf->mpf_physptr) {
8409 +                               /*
8410 +                                * We cannot access to MPC table to compute
8411 +                                * We cannot access the MPC table to compute
8412 +                                * its size yet, as only a few megabytes from
8413 +                                * the bottom are mapped now.
8414 +                                * The PC-9800's MPC table sits at the very end
8415 +                                * of physical memory, so simply reserving
8416 +                                * PAGE_SIZE from mpf->mpf_physptr yields BUG()
8417 +                                * in reserve_bootmem.
8418 +                               unsigned long size = PAGE_SIZE;
8419 +                               unsigned long end = max_low_pfn * PAGE_SIZE;
8420 +                               if (mpf->mpf_physptr + size > end)
8421 +                                       size = end - mpf->mpf_physptr;
8422 +                               reserve_bootmem(mpf->mpf_physptr, size);
8423 +                       }
8424 +#else
8425 +                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
8426 +                               ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
8427 +#endif
8428 +
8429 +                       mpf_found = mpf;
8430 +                       return 1;
8431 +               }
8432 +               bp += 4;
8433 +               length -= 16;
8434 +       }
8435 +       return 0;
8436 +}
8437 +
8438 +void __init find_smp_config (void)
8439 +{
8440 +#ifndef CONFIG_XEN
8441 +       unsigned int address;
8442 +#endif
8443 +
8444 +       /*
8445 +        * FIXME: Linux assumes you have 640K of base ram..
8446 +        * this continues the error...
8447 +        *
8448 +        * 1) Scan the bottom 1K for a signature
8449 +        * 2) Scan the top 1K of base RAM
8450 +        * 3) Scan the 64K of bios
8451 +        */
8452 +       if (smp_scan_config(0x0,0x400) ||
8453 +               smp_scan_config(639*0x400,0x400) ||
8454 +                       smp_scan_config(0xF0000,0x10000))
8455 +               return;
8456 +       /*
8457 +        * If it is an SMP machine we should know now, unless the
8458 +        * configuration is in an EISA/MCA bus machine with an
8459 +        * extended bios data area.
8460 +        *
8461 +        * there is a real-mode segmented pointer pointing to the
8462 +        * 4K EBDA area at 0x40E, calculate and scan it here.
8463 +        *
8464 +        * NOTE! There are Linux loaders that will corrupt the EBDA
8465 +        * area, and as such this kind of SMP config may be less
8466 +        * trustworthy, simply because the SMP table may have been
8467 +        * stomped on during early boot. These loaders are buggy and
8468 +        * should be fixed.
8469 +        *
8470 +        * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
8471 +        */
8472 +
8473 +#ifndef CONFIG_XEN
8474 +       address = get_bios_ebda();
8475 +       if (address)
8476 +               smp_scan_config(address, 0x400);
8477 +#endif
8478 +}
8479 +
8480 +int es7000_plat;
8481 +
8482 +/* --------------------------------------------------------------------------
8483 +                            ACPI-based MP Configuration
8484 +   -------------------------------------------------------------------------- */
8485 +
8486 +#ifdef CONFIG_ACPI
8487 +
8488 +void __init mp_register_lapic_address (
8489 +       u64                     address)
8490 +{
8491 +#ifndef CONFIG_XEN
8492 +       mp_lapic_addr = (unsigned long) address;
8493 +
8494 +       set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
8495 +
8496 +       if (boot_cpu_physical_apicid == -1U)
8497 +               boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
8498 +
8499 +       Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
8500 +#endif
8501 +}
8502 +
8503 +
8504 +void __devinit mp_register_lapic (
8505 +       u8                      id, 
8506 +       u8                      enabled)
8507 +{
8508 +       struct mpc_config_processor processor;
8509 +       int                     boot_cpu = 0;
8510 +       
8511 +       if (MAX_APICS - id <= 0) {
8512 +               printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
8513 +                       id, MAX_APICS);
8514 +               return;
8515 +       }
8516 +
8517 +       if (id == boot_cpu_physical_apicid)
8518 +               boot_cpu = 1;
8519 +
8520 +#ifndef CONFIG_XEN
8521 +       processor.mpc_type = MP_PROCESSOR;
8522 +       processor.mpc_apicid = id;
8523 +       processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
8524 +       processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
8525 +       processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
8526 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 
8527 +               (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
8528 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
8529 +       processor.mpc_reserved[0] = 0;
8530 +       processor.mpc_reserved[1] = 0;
8531 +#endif
8532 +
8533 +       MP_processor_info(&processor);
8534 +}
8535 +
8536 +#ifdef CONFIG_X86_IO_APIC
8537 +
8538 +#define MP_ISA_BUS             0
8539 +#define MP_MAX_IOAPIC_PIN      127
8540 +
8541 +static struct mp_ioapic_routing {
8542 +       int                     apic_id;
8543 +       int                     gsi_base;
8544 +       int                     gsi_end;
8545 +       u32                     pin_programmed[4];
8546 +} mp_ioapic_routing[MAX_IO_APICS];
8547 +
8548 +
8549 +static int mp_find_ioapic (
8550 +       int                     gsi)
8551 +{
8552 +       int                     i = 0;
8553 +
8554 +       /* Find the IOAPIC that manages this GSI. */
8555 +       for (i = 0; i < nr_ioapics; i++) {
8556 +               if ((gsi >= mp_ioapic_routing[i].gsi_base)
8557 +                       && (gsi <= mp_ioapic_routing[i].gsi_end))
8558 +                       return i;
8559 +       }
8560 +
8561 +       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
8562 +
8563 +       return -1;
8564 +}
8565 +       
8566 +
8567 +void __init mp_register_ioapic (
8568 +       u8                      id, 
8569 +       u32                     address,
8570 +       u32                     gsi_base)
8571 +{
8572 +       int                     idx = 0;
8573 +       int                     tmpid;
8574 +
8575 +       if (nr_ioapics >= MAX_IO_APICS) {
8576 +               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
8577 +                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
8578 +               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
8579 +       }
8580 +       if (!address) {
8581 +               printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
8582 +                       " found in MADT table, skipping!\n");
8583 +               return;
8584 +       }
8585 +
8586 +       idx = nr_ioapics++;
8587 +
8588 +       mp_ioapics[idx].mpc_type = MP_IOAPIC;
8589 +       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
8590 +       mp_ioapics[idx].mpc_apicaddr = address;
8591 +
8592 +#ifndef CONFIG_XEN
8593 +       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
8594 +#endif
8595 +       if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
8596 +               && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
8597 +               tmpid = io_apic_get_unique_id(idx, id);
8598 +       else
8599 +               tmpid = id;
8600 +       if (tmpid == -1) {
8601 +               nr_ioapics--;
8602 +               return;
8603 +       }
8604 +       mp_ioapics[idx].mpc_apicid = tmpid;
8605 +       mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
8606 +       
8607 +       /* 
8608 +        * Build basic GSI lookup table to facilitate gsi->io_apic lookups
8609 +        * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
8610 +        */
8611 +       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
8612 +       mp_ioapic_routing[idx].gsi_base = gsi_base;
8613 +       mp_ioapic_routing[idx].gsi_end = gsi_base + 
8614 +               io_apic_get_redir_entries(idx);
8615 +
8616 +       printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
8617 +               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
8618 +               mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
8619 +               mp_ioapic_routing[idx].gsi_base,
8620 +               mp_ioapic_routing[idx].gsi_end);
8621 +
8622 +       return;
8623 +}
8624 +
8625 +
8626 +void __init mp_override_legacy_irq (
8627 +       u8                      bus_irq,
8628 +       u8                      polarity, 
8629 +       u8                      trigger, 
8630 +       u32                     gsi)
8631 +{
8632 +       struct mpc_config_intsrc intsrc;
8633 +       int                     ioapic = -1;
8634 +       int                     pin = -1;
8635 +
8636 +       /* 
8637 +        * Convert 'gsi' to 'ioapic.pin'.
8638 +        */
8639 +       ioapic = mp_find_ioapic(gsi);
8640 +       if (ioapic < 0)
8641 +               return;
8642 +       pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
8643 +
8644 +       /*
8645 +        * TBD: This check is for faulty timer entries, where the override
8646 +        *      erroneously sets the trigger to level, resulting in a HUGE 
8647 +        *      increase of timer interrupts!
8648 +        */
8649 +       if ((bus_irq == 0) && (trigger == 3))
8650 +               trigger = 1;
8651 +
8652 +       intsrc.mpc_type = MP_INTSRC;
8653 +       intsrc.mpc_irqtype = mp_INT;
8654 +       intsrc.mpc_irqflag = (trigger << 2) | polarity;
8655 +       intsrc.mpc_srcbus = MP_ISA_BUS;
8656 +       intsrc.mpc_srcbusirq = bus_irq;                                /* IRQ */
8657 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;        /* APIC ID */
8658 +       intsrc.mpc_dstirq = pin;                                    /* INTIN# */
8659 +
8660 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
8661 +               intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
8662 +               (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
8663 +               intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
8664 +
8665 +       mp_irqs[mp_irq_entries] = intsrc;
8666 +       if (++mp_irq_entries == MAX_IRQ_SOURCES)
8667 +               panic("Max # of irq sources exceeded!\n");
8668 +
8669 +       return;
8670 +}
8671 +
8672 +void __init mp_config_acpi_legacy_irqs (void)
8673 +{
8674 +       struct mpc_config_intsrc intsrc;
8675 +       int                     i = 0;
8676 +       int                     ioapic = -1;
8677 +
8678 +       /* 
8679 +        * Fabricate the legacy ISA bus (bus #31).
8680 +        */
8681 +       mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
8682 +       Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
8683 +
8684 +       /*
8685 +        * Older generations of ES7000 have no legacy identity mappings
8686 +        */
8687 +       if (es7000_plat == 1)
8688 +               return;
8689 +
8690 +       /* 
8691 +        * Locate the IOAPIC that manages the ISA IRQs (0-15). 
8692 +        */
8693 +       ioapic = mp_find_ioapic(0);
8694 +       if (ioapic < 0)
8695 +               return;
8696 +
8697 +       intsrc.mpc_type = MP_INTSRC;
8698 +       intsrc.mpc_irqflag = 0;                                 /* Conforming */
8699 +       intsrc.mpc_srcbus = MP_ISA_BUS;
8700 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
8701 +
8702 +       /* 
8703 +        * Use the default configuration for the IRQs 0-15.  Unless
8704 +        * Use the default configuration for IRQs 0-15, unless
8705 +        * overridden by (MADT) interrupt source override entries.
8706 +       for (i = 0; i < 16; i++) {
8707 +               int idx;
8708 +
8709 +               for (idx = 0; idx < mp_irq_entries; idx++) {
8710 +                       struct mpc_config_intsrc *irq = mp_irqs + idx;
8711 +
8712 +                       /* Do we already have a mapping for this ISA IRQ? */
8713 +                       if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
8714 +                               break;
8715 +
8716 +                       /* Do we already have a mapping for this IOAPIC pin */
8717 +                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
8718 +                               (irq->mpc_dstirq == i))
8719 +                               break;
8720 +               }
8721 +
8722 +               if (idx != mp_irq_entries) {
8723 +                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
8724 +                       continue;                       /* IRQ already used */
8725 +               }
8726 +
8727 +               intsrc.mpc_irqtype = mp_INT;
8728 +               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
8729 +               intsrc.mpc_dstirq = i;
8730 +
8731 +               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
8732 +                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
8733 +                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
8734 +                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
8735 +                       intsrc.mpc_dstirq);
8736 +
8737 +               mp_irqs[mp_irq_entries] = intsrc;
8738 +               if (++mp_irq_entries == MAX_IRQ_SOURCES)
8739 +                       panic("Max # of irq sources exceeded!\n");
8740 +       }
8741 +}
8742 +
8743 +#define MAX_GSI_NUM    4096
8744 +
8745 +int mp_register_gsi (u32 gsi, int triggering, int polarity)
8746 +{
8747 +       int                     ioapic = -1;
8748 +       int                     ioapic_pin = 0;
8749 +       int                     idx, bit = 0;
8750 +       static int              pci_irq = 16;
8751 +       /*
8752 +        * Mapping between Global System Interrupts, which
8753 +        * represent all possible interrupts, and IRQs
8754 +        * assigned to actual devices.
8755 +        */
8756 +       static int              gsi_to_irq[MAX_GSI_NUM];
8757 +
8758 +       /* Don't set up the ACPI SCI because it's already set up */
8759 +       if (acpi_fadt.sci_int == gsi)
8760 +               return gsi;
8761 +
8762 +       ioapic = mp_find_ioapic(gsi);
8763 +       if (ioapic < 0) {
8764 +               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
8765 +               return gsi;
8766 +       }
8767 +
8768 +       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
8769 +
8770 +       if (ioapic_renumber_irq)
8771 +               gsi = ioapic_renumber_irq(ioapic, gsi);
8772 +
8773 +       /* 
8774 +        * Avoid pin reprogramming.  PRTs typically include entries  
8775 +        * with redundant pin->gsi mappings (but unique PCI devices);
8776 +        * we only program the IOAPIC on the first.
8777 +        */
8778 +       bit = ioapic_pin % 32;
8779 +       idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
8780 +       if (idx > 3) {
8781 +               printk(KERN_ERR "Invalid reference to IOAPIC pin "
8782 +                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 
8783 +                       ioapic_pin);
8784 +               return gsi;
8785 +       }
8786 +       if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
8787 +               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
8788 +                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
8789 +               return gsi_to_irq[gsi];
8790 +       }
8791 +
8792 +       mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
8793 +
8794 +       if (triggering == ACPI_LEVEL_SENSITIVE) {
8795 +               /*
8796 +                * For PCI devices assign IRQs in order, avoiding gaps
8797 +                * due to unused I/O APIC pins.
8798 +                */
8799 +               int irq = gsi;
8800 +               if (gsi < MAX_GSI_NUM) {
8801 +                       /*
8802 +                        * Retain the VIA chipset work-around (gsi > 15), but
8803 +                        * avoid a problem where the 8254 timer (IRQ0) is setup
8804 +                        * via an override (so it's not on pin 0 of the ioapic),
8805 +                        * and at the same time, the pin 0 interrupt is a PCI
8806 +                        * type.  The gsi > 15 test could cause these two pins
8807 +                        * to be shared as IRQ0, and they are not shareable.
8808 +                        * So test for this condition, and if necessary, avoid
8809 +                        * the pin collision.
8810 +                        */
8811 +                       if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
8812 +                               gsi = pci_irq++;
8813 +                       /*
8814 +                        * Don't assign IRQ used by ACPI SCI
8815 +                        */
8816 +                       if (gsi == acpi_fadt.sci_int)
8817 +                               gsi = pci_irq++;
8818 +                       gsi_to_irq[irq] = gsi;
8819 +               } else {
8820 +                       printk(KERN_ERR "GSI %u is too high\n", gsi);
8821 +                       return gsi;
8822 +               }
8823 +       }
8824 +
8825 +       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
8826 +                   triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
8827 +                   polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
8828 +       return gsi;
8829 +}
8830 +
8831 +#endif /* CONFIG_X86_IO_APIC */
8832 +#endif /* CONFIG_ACPI */
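
For reference, smp_scan_config() above treats a 16-byte candidate as a valid MP floating pointer only when four checks pass: the signature equals SMP_MAGIC_IDENT ("_MP_"), mpf_length is 1 (one 16-byte paragraph), all 16 bytes sum to zero modulo 256 (mpf_checksum), and the specification revision is 1 or 4 (MP spec 1.1 or 1.4). A self-contained sketch of that validation is given below; the struct mirrors the MP specification layout rather than the kernel's intel_mp_floating definition.

/* Illustrative sketch of the MP floating pointer checks performed by
 * smp_scan_config(); the struct layout follows the MP spec, not the
 * kernel header. */
#include <stdint.h>
#include <string.h>

struct mpf_intel {
        char     signature[4];   /* "_MP_" */
        uint32_t physptr;        /* physical address of the MP config table */
        uint8_t  length;         /* size in 16-byte paragraphs, must be 1 */
        uint8_t  specification;  /* 1 = MP 1.1, 4 = MP 1.4 */
        uint8_t  checksum;       /* all 16 bytes must sum to 0 (mod 256) */
        uint8_t  feature1;
        uint8_t  feature2;
        uint8_t  reserved[3];
};

static int mpf_checksum(const unsigned char *mp, int len)
{
        int sum = 0;

        while (len--)
                sum += *mp++;
        return sum & 0xFF;
}

static int mpf_is_valid(const struct mpf_intel *mpf)
{
        return memcmp(mpf->signature, "_MP_", 4) == 0 &&
               mpf->length == 1 &&
               mpf_checksum((const unsigned char *)mpf, 16) == 0 &&
               (mpf->specification == 1 || mpf->specification == 4);
}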
8833 diff -urNp linux-2.6/arch/i386/kernel/pci-dma-xen.c new/arch/i386/kernel/pci-dma-xen.c
8834 --- linux-2.6/arch/i386/kernel/pci-dma-xen.c    1970-01-01 01:00:00.000000000 +0100
8835 +++ new/arch/i386/kernel/pci-dma-xen.c  2006-05-23 18:42:17.000000000 +0200
8836 @@ -0,0 +1,344 @@
8837 +/*
8838 + * Dynamic DMA mapping support.
8839 + *
8840 + * On i386 there is no hardware dynamic DMA address translation,
8841 + * so consistent alloc/free are merely page allocation/freeing.
8842 + * The rest of the dynamic DMA mapping interface is implemented
8843 + * in asm/pci.h.
8844 + */
8845 +
8846 +#include <linux/types.h>
8847 +#include <linux/mm.h>
8848 +#include <linux/string.h>
8849 +#include <linux/pci.h>
8850 +#include <linux/module.h>
8851 +#include <linux/version.h>
8852 +#include <asm/io.h>
8853 +#include <xen/balloon.h>
8854 +#include <asm/tlbflush.h>
8855 +#include <asm-i386/mach-xen/asm/swiotlb.h>
8856 +#include <asm/bug.h>
8857 +
8858 +#ifdef __x86_64__
8859 +int iommu_merge __read_mostly = 0;
8860 +EXPORT_SYMBOL(iommu_merge);
8861 +
8862 +dma_addr_t bad_dma_address __read_mostly;
8863 +EXPORT_SYMBOL(bad_dma_address);
8864 +
8865 +/* This tells the BIO block layer to assume merging. Default to off
8866 +   because we cannot guarantee merging later. */
8867 +int iommu_bio_merge __read_mostly = 0;
8868 +EXPORT_SYMBOL(iommu_bio_merge);
8869 +
8870 +__init int iommu_setup(char *p)
8871 +{
8872 +    return 1;
8873 +}
8874 +#endif
8875 +
8876 +struct dma_coherent_mem {
8877 +       void            *virt_base;
8878 +       u32             device_base;
8879 +       int             size;
8880 +       int             flags;
8881 +       unsigned long   *bitmap;
8882 +};
8883 +
8884 +#define IOMMU_BUG_ON(test)                             \
8885 +do {                                                   \
8886 +       if (unlikely(test)) {                           \
8887 +               printk(KERN_ALERT "Fatal DMA error! "   \
8888 +                      "Please use 'swiotlb=force'\n"); \
8889 +               BUG();                                  \
8890 +       }                                               \
8891 +} while (0)
8892 +
8893 +int
8894 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8895 +          enum dma_data_direction direction)
8896 +{
8897 +       int i, rc;
8898 +
8899 +       if (direction == DMA_NONE)
8900 +               BUG();
8901 +       WARN_ON(nents == 0 || sg[0].length == 0);
8902 +
8903 +       if (swiotlb) {
8904 +               rc = swiotlb_map_sg(hwdev, sg, nents, direction);
8905 +       } else {
8906 +               for (i = 0; i < nents; i++ ) {
8907 +                       sg[i].dma_address =
8908 +                               page_to_bus(sg[i].page) + sg[i].offset;
8909 +                       sg[i].dma_length  = sg[i].length;
8910 +                       BUG_ON(!sg[i].page);
8911 +                       IOMMU_BUG_ON(address_needs_mapping(
8912 +                               hwdev, sg[i].dma_address));
8913 +               }
8914 +               rc = nents;
8915 +       }
8916 +
8917 +       flush_write_buffers();
8918 +       return rc;
8919 +}
8920 +EXPORT_SYMBOL(dma_map_sg);
8921 +
8922 +void
8923 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8924 +            enum dma_data_direction direction)
8925 +{
8926 +       BUG_ON(direction == DMA_NONE);
8927 +       if (swiotlb)
8928 +               swiotlb_unmap_sg(hwdev, sg, nents, direction);
8929 +}
8930 +EXPORT_SYMBOL(dma_unmap_sg);
8931 +
8932 +/*
8933 + * XXX This file is also used by xenLinux/ia64. 
8934 + * "defined(__i386__) || defined (__x86_64__)" means "!defined(__ia64__)".
8935 + * This #if workaround should be removed once this file is merged back into
8936 + * i386's pci-dma or is moved to drivers/xen/core.
8937 + */
8938 +#if defined(__i386__) || defined(__x86_64__)
8939 +dma_addr_t
8940 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
8941 +            size_t size, enum dma_data_direction direction)
8942 +{
8943 +       dma_addr_t dma_addr;
8944 +
8945 +       BUG_ON(direction == DMA_NONE);
8946 +
8947 +       if (swiotlb) {
8948 +               dma_addr = swiotlb_map_page(
8949 +                       dev, page, offset, size, direction);
8950 +       } else {
8951 +               dma_addr = page_to_bus(page) + offset;
8952 +               IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
8953 +       }
8954 +
8955 +       return dma_addr;
8956 +}
8957 +EXPORT_SYMBOL(dma_map_page);
8958 +
8959 +void
8960 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
8961 +              enum dma_data_direction direction)
8962 +{
8963 +       BUG_ON(direction == DMA_NONE);
8964 +       if (swiotlb)
8965 +               swiotlb_unmap_page(dev, dma_address, size, direction);
8966 +}
8967 +EXPORT_SYMBOL(dma_unmap_page);
8968 +#endif /* defined(__i386__) || defined(__x86_64__) */
8969 +
8970 +int
8971 +dma_mapping_error(dma_addr_t dma_addr)
8972 +{
8973 +       if (swiotlb)
8974 +               return swiotlb_dma_mapping_error(dma_addr);
8975 +       return 0;
8976 +}
8977 +EXPORT_SYMBOL(dma_mapping_error);
8978 +
8979 +int
8980 +dma_supported(struct device *dev, u64 mask)
8981 +{
8982 +       if (swiotlb)
8983 +               return swiotlb_dma_supported(dev, mask);
8984 +       /*
8985 +        * By default we'll BUG when an infeasible DMA is requested, and
8986 +        * request swiotlb=force (see IOMMU_BUG_ON).
8987 +        */
8988 +       return 1;
8989 +}
8990 +EXPORT_SYMBOL(dma_supported);
8991 +
8992 +void *dma_alloc_coherent(struct device *dev, size_t size,
8993 +                          dma_addr_t *dma_handle, gfp_t gfp)
8994 +{
8995 +       void *ret;
8996 +       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8997 +       unsigned int order = get_order(size);
8998 +       unsigned long vstart;
8999 +       /* ignore region specifiers */
9000 +       gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
9001 +
9002 +       if (mem) {
9003 +               int page = bitmap_find_free_region(mem->bitmap, mem->size,
9004 +                                                    order);
9005 +               if (page >= 0) {
9006 +                       *dma_handle = mem->device_base + (page << PAGE_SHIFT);
9007 +                       ret = mem->virt_base + (page << PAGE_SHIFT);
9008 +                       memset(ret, 0, size);
9009 +                       return ret;
9010 +               }
9011 +               if (mem->flags & DMA_MEMORY_EXCLUSIVE)
9012 +                       return NULL;
9013 +       }
9014 +
9015 +       if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
9016 +               gfp |= GFP_DMA;
9017 +
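+       /*
+        * Pages from the allocator are contiguous only in pseudo-physical
+        * space; exchange them below for a machine-contiguous region so the
+        * device sees a single linear DMA buffer.
+        */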
9018 +       vstart = __get_free_pages(gfp, order);
9019 +       ret = (void *)vstart;
9020 +
9021 +       if (ret != NULL) {
9022 +               /* NB. Hardcode 31 address bits for now: aacraid limitation. */
9023 +               if (xen_create_contiguous_region(vstart, order, 31) != 0) {
9024 +                       free_pages(vstart, order);
9025 +                       return NULL;
9026 +               }
9027 +               memset(ret, 0, size);
9028 +               *dma_handle = virt_to_bus(ret);
9029 +       }
9030 +       return ret;
9031 +}
9032 +EXPORT_SYMBOL(dma_alloc_coherent);
9033 +
9034 +void dma_free_coherent(struct device *dev, size_t size,
9035 +                        void *vaddr, dma_addr_t dma_handle)
9036 +{
9037 +       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
9038 +       int order = get_order(size);
9039 +       
9040 +       if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
9041 +               int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
9042 +
9043 +               bitmap_release_region(mem->bitmap, page, order);
9044 +       } else {
9045 +               xen_destroy_contiguous_region((unsigned long)vaddr, order);
9046 +               free_pages((unsigned long)vaddr, order);
9047 +       }
9048 +}
9049 +EXPORT_SYMBOL(dma_free_coherent);
9050 +
9051 +#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
9052 +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
9053 +                               dma_addr_t device_addr, size_t size, int flags)
9054 +{
9055 +       void __iomem *mem_base;
9056 +       int pages = size >> PAGE_SHIFT;
9057 +       int bitmap_size = (pages + 31)/32;
9058 +
9059 +       if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
9060 +               goto out;
9061 +       if (!size)
9062 +               goto out;
9063 +       if (dev->dma_mem)
9064 +               goto out;
9065 +
9066 +       /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
9067 +
9068 +       mem_base = ioremap(bus_addr, size);
9069 +       if (!mem_base)
9070 +               goto out;
9071 +
9072 +       dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
9073 +       if (!dev->dma_mem)
9074 +               goto out;
9075 +       memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
9076 +       dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
9077 +       if (!dev->dma_mem->bitmap)
9078 +               goto free1_out;
9079 +       memset(dev->dma_mem->bitmap, 0, bitmap_size);
9080 +
9081 +       dev->dma_mem->virt_base = mem_base;
9082 +       dev->dma_mem->device_base = device_addr;
9083 +       dev->dma_mem->size = pages;
9084 +       dev->dma_mem->flags = flags;
9085 +
9086 +       if (flags & DMA_MEMORY_MAP)
9087 +               return DMA_MEMORY_MAP;
9088 +
9089 +       return DMA_MEMORY_IO;
9090 +
9091 + free1_out:
9092 +       kfree(dev->dma_mem->bitmap);
9093 + out:
9094 +       return 0;
9095 +}
9096 +EXPORT_SYMBOL(dma_declare_coherent_memory);
9097 +
9098 +void dma_release_declared_memory(struct device *dev)
9099 +{
9100 +       struct dma_coherent_mem *mem = dev->dma_mem;
9101 +       
9102 +       if(!mem)
9103 +               return;
9104 +       dev->dma_mem = NULL;
9105 +       iounmap(mem->virt_base);
9106 +       kfree(mem->bitmap);
9107 +       kfree(mem);
9108 +}
9109 +EXPORT_SYMBOL(dma_release_declared_memory);
9110 +
9111 +void *dma_mark_declared_memory_occupied(struct device *dev,
9112 +                                       dma_addr_t device_addr, size_t size)
9113 +{
9114 +       struct dma_coherent_mem *mem = dev->dma_mem;
9115 +       int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
9116 +       int pos, err;
9117 +
9118 +       if (!mem)
9119 +               return ERR_PTR(-EINVAL);
9120 +
9121 +       pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
9122 +       err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
9123 +       if (err != 0)
9124 +               return ERR_PTR(err);
9125 +       return mem->virt_base + (pos << PAGE_SHIFT);
9126 +}
9127 +EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
9128 +#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
9129 +
9130 +dma_addr_t
9131 +dma_map_single(struct device *dev, void *ptr, size_t size,
9132 +              enum dma_data_direction direction)
9133 +{
9134 +       dma_addr_t dma;
9135 +
9136 +       if (direction == DMA_NONE)
9137 +               BUG();
9138 +       WARN_ON(size == 0);
9139 +
9140 +       if (swiotlb) {
9141 +               dma = swiotlb_map_single(dev, ptr, size, direction);
9142 +       } else {
9143 +               dma = virt_to_bus(ptr);
9144 +               IOMMU_BUG_ON(range_straddles_page_boundary(ptr, size));
9145 +               IOMMU_BUG_ON(address_needs_mapping(dev, dma));
9146 +       }
9147 +
9148 +       flush_write_buffers();
9149 +       return dma;
9150 +}
9151 +EXPORT_SYMBOL(dma_map_single);
9152 +
9153 +void
9154 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
9155 +                enum dma_data_direction direction)
9156 +{
9157 +       if (direction == DMA_NONE)
9158 +               BUG();
9159 +       if (swiotlb)
9160 +               swiotlb_unmap_single(dev, dma_addr, size, direction);
9161 +}
9162 +EXPORT_SYMBOL(dma_unmap_single);
9163 +
9164 +void
9165 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
9166 +                       enum dma_data_direction direction)
9167 +{
9168 +       if (swiotlb)
9169 +               swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
9170 +}
9171 +EXPORT_SYMBOL(dma_sync_single_for_cpu);
9172 +
9173 +void
9174 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
9175 +                           enum dma_data_direction direction)
9176 +{
9177 +       if (swiotlb)
9178 +               swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
9179 +}
9180 +EXPORT_SYMBOL(dma_sync_single_for_device);
9181 diff -urNp linux-2.6/arch/i386/kernel/process-xen.c new/arch/i386/kernel/process-xen.c
9182 --- linux-2.6/arch/i386/kernel/process-xen.c    1970-01-01 01:00:00.000000000 +0100
9183 +++ new/arch/i386/kernel/process-xen.c  2006-06-07 13:15:16.000000000 +0200
9184 @@ -0,0 +1,812 @@
9185 +/*
9186 + *  linux/arch/i386/kernel/process.c
9187 + *
9188 + *  Copyright (C) 1995  Linus Torvalds
9189 + *
9190 + *  Pentium III FXSR, SSE support
9191 + *     Gareth Hughes <gareth@valinux.com>, May 2000
9192 + */
9193 +
9194 +/*
9195 + * This file handles the architecture-dependent parts of process handling..
9196 + */
9197 +
9198 +#include <stdarg.h>
9199 +
9200 +#include <linux/cpu.h>
9201 +#include <linux/errno.h>
9202 +#include <linux/sched.h>
9203 +#include <linux/fs.h>
9204 +#include <linux/kernel.h>
9205 +#include <linux/mm.h>
9206 +#include <linux/elfcore.h>
9207 +#include <linux/smp.h>
9208 +#include <linux/smp_lock.h>
9209 +#include <linux/stddef.h>
9210 +#include <linux/slab.h>
9211 +#include <linux/vmalloc.h>
9212 +#include <linux/user.h>
9213 +#include <linux/a.out.h>
9214 +#include <linux/interrupt.h>
9215 +#include <linux/config.h>
9216 +#include <linux/utsname.h>
9217 +#include <linux/delay.h>
9218 +#include <linux/reboot.h>
9219 +#include <linux/init.h>
9220 +#include <linux/mc146818rtc.h>
9221 +#include <linux/module.h>
9222 +#include <linux/kallsyms.h>
9223 +#include <linux/ptrace.h>
9224 +#include <linux/random.h>
9225 +
9226 +#include <asm/uaccess.h>
9227 +#include <asm/pgtable.h>
9228 +#include <asm/system.h>
9229 +#include <asm/io.h>
9230 +#include <asm/ldt.h>
9231 +#include <asm/processor.h>
9232 +#include <asm/i387.h>
9233 +#include <asm/desc.h>
9234 +#include <asm/vm86.h>
9235 +#ifdef CONFIG_MATH_EMULATION
9236 +#include <asm/math_emu.h>
9237 +#endif
9238 +
9239 +#include <xen/interface/physdev.h>
9240 +#include <xen/interface/vcpu.h>
9241 +#include <xen/cpu_hotplug.h>
9242 +
9243 +#include <linux/err.h>
9244 +
9245 +#include <asm/tlbflush.h>
9246 +#include <asm/cpu.h>
9250 +
9251 +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
9252 +
9253 +static int hlt_counter;
9254 +
9255 +unsigned long boot_option_idle_override = 0;
9256 +EXPORT_SYMBOL(boot_option_idle_override);
9257 +
9258 +/*
9259 + * Return saved PC of a blocked thread.
9260 + */
9261 +unsigned long thread_saved_pc(struct task_struct *tsk)
9262 +{
9263 +       return ((unsigned long *)tsk->thread.esp)[3];
9264 +}
9265 +
9266 +/*
9267 + * Powermanagement idle function, if any..
9268 + */
9269 +void (*pm_idle)(void);
9270 +EXPORT_SYMBOL(pm_idle);
9271 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
9272 +
9273 +void disable_hlt(void)
9274 +{
9275 +       hlt_counter++;
9276 +}
9277 +
9278 +EXPORT_SYMBOL(disable_hlt);
9279 +
9280 +void enable_hlt(void)
9281 +{
9282 +       hlt_counter--;
9283 +}
9284 +
9285 +EXPORT_SYMBOL(enable_hlt);
9286 +
9287 +/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
9288 +void xen_idle(void)
9289 +{
9290 +       local_irq_disable();
9291 +
9292 +       if (need_resched())
9293 +               local_irq_enable();
9294 +       else {
9295 +               clear_thread_flag(TIF_POLLING_NRFLAG);
9296 +               smp_mb__after_clear_bit();
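+               /* Under Xen, safe_halt() yields this VCPU to the hypervisor
+                * until the next event instead of executing a native hlt. */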
9297 +               safe_halt();
9298 +               set_thread_flag(TIF_POLLING_NRFLAG);
9299 +       }
9300 +}
9301 +#ifdef CONFIG_APM_MODULE
9302 +EXPORT_SYMBOL(default_idle);
9303 +#endif
9304 +
9305 +#ifdef CONFIG_HOTPLUG_CPU
9306 +extern cpumask_t cpu_initialized;
9307 +static inline void play_dead(void)
9308 +{
9309 +       idle_task_exit();
9310 +       local_irq_disable();
9311 +       cpu_clear(smp_processor_id(), cpu_initialized);
9312 +       preempt_enable_no_resched();
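+       /* Take this VCPU offline; execution resumes here once it is brought
+        * back up, and cpu_bringup() then reinitialises it. */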
9313 +       HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
9314 +       cpu_bringup();
9315 +}
9316 +#else
9317 +static inline void play_dead(void)
9318 +{
9319 +       BUG();
9320 +}
9321 +#endif /* CONFIG_HOTPLUG_CPU */
9322 +
9323 +/*
9324 + * The idle thread. There's no useful work to be
9325 + * done, so just try to conserve power and have a
9326 + * low exit latency (ie sit in a loop waiting for
9327 + * somebody to say that they'd like to reschedule)
9328 + */
9329 +void cpu_idle(void)
9330 +{
9331 +       int cpu = smp_processor_id();
9332 +
9333 +       set_thread_flag(TIF_POLLING_NRFLAG);
9334 +
9335 +       /* endless idle loop with no priority at all */
9336 +       while (1) {
9337 +               while (!need_resched()) {
9338 +
9339 +                       if (__get_cpu_var(cpu_idle_state))
9340 +                               __get_cpu_var(cpu_idle_state) = 0;
9341 +
9342 +                       rmb();
9343 +
9344 +                       if (cpu_is_offline(cpu))
9345 +                               play_dead();
9346 +
9347 +                       __get_cpu_var(irq_stat).idle_timestamp = jiffies;
9348 +                       xen_idle();
9349 +               }
9350 +               preempt_enable_no_resched();
9351 +               schedule();
9352 +               preempt_disable();
9353 +       }
9354 +}
9355 +
9356 +void cpu_idle_wait(void)
9357 +{
9358 +       unsigned int cpu, this_cpu = get_cpu();
9359 +       cpumask_t map;
9360 +
9361 +       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
9362 +       put_cpu();
9363 +
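+       /* Flag every online CPU, then poll until each one has passed through
+        * the idle loop (clearing its flag) or gone offline. */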
9364 +       cpus_clear(map);
9365 +       for_each_online_cpu(cpu) {
9366 +               per_cpu(cpu_idle_state, cpu) = 1;
9367 +               cpu_set(cpu, map);
9368 +       }
9369 +
9370 +       __get_cpu_var(cpu_idle_state) = 0;
9371 +
9372 +       wmb();
9373 +       do {
9374 +               ssleep(1);
9375 +               for_each_online_cpu(cpu) {
9376 +                       if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
9377 +                               cpu_clear(cpu, map);
9378 +               }
9379 +               cpus_and(map, map, cpu_online_map);
9380 +       } while (!cpus_empty(map));
9381 +}
9382 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
9383 +
9384 +/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
9385 +/* Always use xen_idle() instead. */
9386 +void __devinit select_idle_routine(const struct cpuinfo_x86 *c) {}
9387 +
9388 +void show_regs(struct pt_regs * regs)
9389 +{
9390 +       unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
9391 +
9392 +       printk("\n");
9393 +       printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
9394 +       printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
9395 +       print_symbol("EIP is at %s\n", regs->eip);
9396 +
9397 +       if (user_mode_vm(regs))
9398 +               printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
9399 +       printk(" EFLAGS: %08lx    %s  (%s %.*s)\n",
9400 +              regs->eflags, print_tainted(), system_utsname.release,
9401 +              (int)strcspn(system_utsname.version, " "),
9402 +              system_utsname.version);
9403 +       printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
9404 +               regs->eax,regs->ebx,regs->ecx,regs->edx);
9405 +       printk("ESI: %08lx EDI: %08lx EBP: %08lx",
9406 +               regs->esi, regs->edi, regs->ebp);
9407 +       printk(" DS: %04x ES: %04x\n",
9408 +               0xffff & regs->xds,0xffff & regs->xes);
9409 +
9410 +       cr0 = read_cr0();
9411 +       cr2 = read_cr2();
9412 +       cr3 = read_cr3();
9413 +       cr4 = read_cr4_safe();
9414 +       printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
9415 +       show_trace(NULL, &regs->esp);
9416 +}
9417 +
9418 +/*
9419 + * This gets run with %ebx containing the
9420 + * function to call, and %edx containing
9421 + * the "args".
9422 + */
9423 +extern void kernel_thread_helper(void);
9424 +__asm__(".section .text\n"
9425 +       ".align 4\n"
9426 +       "kernel_thread_helper:\n\t"
9427 +       "movl %edx,%eax\n\t"
9428 +       "pushl %edx\n\t"
9429 +       "call *%ebx\n\t"
9430 +       "pushl %eax\n\t"
9431 +       "call do_exit\n"
9432 +       ".previous");
9433 +
9434 +/*
9435 + * Create a kernel thread
9436 + */
9437 +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
9438 +{
9439 +       struct pt_regs regs;
9440 +
9441 +       memset(&regs, 0, sizeof(regs));
9442 +
9443 +       regs.ebx = (unsigned long) fn;
9444 +       regs.edx = (unsigned long) arg;
9445 +
9446 +       regs.xds = __USER_DS;
9447 +       regs.xes = __USER_DS;
9448 +       regs.orig_eax = -1;
9449 +       regs.eip = (unsigned long) kernel_thread_helper;
9450 +       regs.xcs = GET_KERNEL_CS();
9451 +       regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
9452 +
9453 +       /* Ok, create the new process.. */
9454 +       return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
9455 +}
9456 +EXPORT_SYMBOL(kernel_thread);
9457 +
9458 +/*
9459 + * Free current thread data structures etc..
9460 + */
9461 +void exit_thread(void)
9462 +{
9463 +       struct task_struct *tsk = current;
9464 +       struct thread_struct *t = &tsk->thread;
9465 +
9466 +       /* The process may have allocated an io port bitmap... nuke it. */
9467 +       if (unlikely(NULL != t->io_bitmap_ptr)) {
9468 +               struct physdev_set_iobitmap set_iobitmap = { 0 };
9469 +               HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
9470 +               kfree(t->io_bitmap_ptr);
9471 +               t->io_bitmap_ptr = NULL;
9472 +       }
9473 +}
9474 +
9475 +void flush_thread(void)
9476 +{
9477 +       struct task_struct *tsk = current;
9478 +
9479 +       memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
9480 +       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
9481 +       /*
9482 +        * Forget coprocessor state..
9483 +        */
9484 +       clear_fpu(tsk);
9485 +       clear_used_math();
9486 +}
9487 +
9488 +void release_thread(struct task_struct *dead_task)
9489 +{
9490 +       BUG_ON(dead_task->mm);
9491 +       release_vm86_irqs(dead_task);
9492 +}
9493 +
9494 +/*
9495 + * This gets called before we allocate a new thread and copy
9496 + * the current task into it.
9497 + */
9498 +void prepare_to_copy(struct task_struct *tsk)
9499 +{
9500 +       unlazy_fpu(tsk);
9501 +}
9502 +
9503 +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
9504 +       unsigned long unused,
9505 +       struct task_struct * p, struct pt_regs * regs)
9506 +{
9507 +       struct pt_regs * childregs;
9508 +       struct task_struct *tsk;
9509 +       int err;
9510 +
9511 +       childregs = task_pt_regs(p);
9512 +       *childregs = *regs;
9513 +       childregs->eax = 0;
9514 +       childregs->esp = esp;
9515 +
9516 +       p->thread.esp = (unsigned long) childregs;
9517 +       p->thread.esp0 = (unsigned long) (childregs+1);
9518 +
9519 +       p->thread.eip = (unsigned long) ret_from_fork;
9520 +
9521 +       savesegment(fs,p->thread.fs);
9522 +       savesegment(gs,p->thread.gs);
9523 +
9524 +       tsk = current;
9525 +       if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) {
9526 +               p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
9527 +               if (!p->thread.io_bitmap_ptr) {
9528 +                       p->thread.io_bitmap_max = 0;
9529 +                       return -ENOMEM;
9530 +               }
9531 +               memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
9532 +                       IO_BITMAP_BYTES);
9533 +       }
9534 +
9535 +       /*
9536 +        * Set a new TLS for the child thread?
9537 +        */
9538 +       if (clone_flags & CLONE_SETTLS) {
9539 +               struct desc_struct *desc;
9540 +               struct user_desc info;
9541 +               int idx;
9542 +
9543 +               err = -EFAULT;
9544 +               if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
9545 +                       goto out;
9546 +               err = -EINVAL;
9547 +               if (LDT_empty(&info))
9548 +                       goto out;
9549 +
9550 +               idx = info.entry_number;
9551 +               if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9552 +                       goto out;
9553 +
9554 +               desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
9555 +               desc->a = LDT_entry_a(&info);
9556 +               desc->b = LDT_entry_b(&info);
9557 +       }
9558 +
9559 +       p->thread.iopl = current->thread.iopl;
9560 +
9561 +       err = 0;
9562 + out:
9563 +       if (err && p->thread.io_bitmap_ptr) {
9564 +               kfree(p->thread.io_bitmap_ptr);
9565 +               p->thread.io_bitmap_max = 0;
9566 +       }
9567 +       return err;
9568 +}
9569 +
9570 +/*
9571 + * fill in the user structure for a core dump..
9572 + */
9573 +void dump_thread(struct pt_regs * regs, struct user * dump)
9574 +{
9575 +       int i;
9576 +
9577 +/* changed the size calculations - should hopefully work better. lbt */
9578 +       dump->magic = CMAGIC;
9579 +       dump->start_code = 0;
9580 +       dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
9581 +       dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
9582 +       dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
9583 +       dump->u_dsize -= dump->u_tsize;
9584 +       dump->u_ssize = 0;
9585 +       for (i = 0; i < 8; i++)
9586 +               dump->u_debugreg[i] = current->thread.debugreg[i];  
9587 +
9588 +       if (dump->start_stack < TASK_SIZE)
9589 +               dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
9590 +
9591 +       dump->regs.ebx = regs->ebx;
9592 +       dump->regs.ecx = regs->ecx;
9593 +       dump->regs.edx = regs->edx;
9594 +       dump->regs.esi = regs->esi;
9595 +       dump->regs.edi = regs->edi;
9596 +       dump->regs.ebp = regs->ebp;
9597 +       dump->regs.eax = regs->eax;
9598 +       dump->regs.ds = regs->xds;
9599 +       dump->regs.es = regs->xes;
9600 +       savesegment(fs,dump->regs.fs);
9601 +       savesegment(gs,dump->regs.gs);
9602 +       dump->regs.orig_eax = regs->orig_eax;
9603 +       dump->regs.eip = regs->eip;
9604 +       dump->regs.cs = regs->xcs;
9605 +       dump->regs.eflags = regs->eflags;
9606 +       dump->regs.esp = regs->esp;
9607 +       dump->regs.ss = regs->xss;
9608 +
9609 +       dump->u_fpvalid = dump_fpu (regs, &dump->i387);
9610 +}
9611 +EXPORT_SYMBOL(dump_thread);
9612 +
9613 +/* 
9614 + * Capture the user space registers if the task is not running (in user space)
9615 + */
9616 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
9617 +{
9618 +       struct pt_regs ptregs = *task_pt_regs(tsk);
9619 +       ptregs.xcs &= 0xffff;
9620 +       ptregs.xds &= 0xffff;
9621 +       ptregs.xes &= 0xffff;
9622 +       ptregs.xss &= 0xffff;
9623 +
9624 +       elf_core_copy_regs(regs, &ptregs);
9625 +
9626 +       return 1;
9627 +}
9628 +
9629 +/*
9630 + * This function selects if the context switch from prev to next
9631 + * has to tweak the TSC disable bit in the cr4.
9632 + */
9633 +static inline void disable_tsc(struct task_struct *prev_p,
9634 +                              struct task_struct *next_p)
9635 +{
9636 +       struct thread_info *prev, *next;
9637 +
9638 +       /*
9639 +        * gcc should eliminate the ->thread_info dereference if
9640 +        * has_secure_computing returns 0 at compile time (SECCOMP=n).
9641 +        */
9642 +       prev = task_thread_info(prev_p);
9643 +       next = task_thread_info(next_p);
9644 +
9645 +       if (has_secure_computing(prev) || has_secure_computing(next)) {
9646 +               /* slow path here */
9647 +               if (has_secure_computing(prev) &&
9648 +                   !has_secure_computing(next)) {
9649 +                       write_cr4(read_cr4() & ~X86_CR4_TSD);
9650 +               } else if (!has_secure_computing(prev) &&
9651 +                          has_secure_computing(next))
9652 +                       write_cr4(read_cr4() | X86_CR4_TSD);
9653 +       }
9654 +}
9655 +
9656 +/*
9657 + *     switch_to(x,y) should switch tasks from x to y.
9658 + *
9659 + * We fsave/fwait so that an exception goes off at the right time
9660 + * (as a call from the fsave or fwait in effect) rather than to
9661 + * the wrong process. Lazy FP saving no longer makes any sense
9662 + * with modern CPU's, and this simplifies a lot of things (SMP
9663 + * and UP become the same).
9664 + *
9665 + * NOTE! We used to use the x86 hardware context switching. The
9666 + * reason for not using it any more becomes apparent when you
9667 + * try to recover gracefully from saved state that is no longer
9668 + * valid (stale segment register values in particular). With the
9669 + * hardware task-switch, there is no way to fix up bad state in
9670 + * a reasonable manner.
9671 + *
9672 + * The fact that Intel documents the hardware task-switching to
9673 + * be slow is a fairly red herring - this code is not noticeably
9674 + * faster. However, there _is_ some room for improvement here,
9675 + * so the performance issues may eventually be a valid point.
9676 + * More important, however, is the fact that this allows us much
9677 + * more flexibility.
9678 + *
9679 + * The return value (in %eax) will be the "prev" task after
9680 + * the task-switch, and shows up in ret_from_fork in entry.S,
9681 + * for example.
9682 + */
9683 +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
9684 +{
9685 +       struct thread_struct *prev = &prev_p->thread,
9686 +                                *next = &next_p->thread;
9687 +       int cpu = smp_processor_id();
9688 +#ifndef CONFIG_X86_NO_TSS
9689 +       struct tss_struct *tss = &per_cpu(init_tss, cpu);
9690 +#endif
9691 +       struct physdev_set_iopl iopl_op;
9692 +       struct physdev_set_iobitmap iobmp_op;
9693 +       multicall_entry_t _mcl[8], *mcl = _mcl;
9694 +
9695 +       /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
9696 +
9697 +       /*
9698 +        * This is basically '__unlazy_fpu', except that we queue a
9699 +        * multicall to indicate FPU task switch, rather than
9700 +        * synchronously trapping to Xen.
9701 +        */
9702 +       if (prev_p->thread_info->status & TS_USEDFPU) {
9703 +               __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
9704 +               mcl->op      = __HYPERVISOR_fpu_taskswitch;
9705 +               mcl->args[0] = 1;
9706 +               mcl++;
9707 +       }
9708 +#if 0 /* lazy fpu sanity check */
9709 +       else BUG_ON(!(read_cr0() & 8));
9710 +#endif
9711 +
9712 +       /*
9713 +        * Reload esp0.
9714 +        * This is load_esp0(tss, next) with a multicall.
9715 +        */
9716 +       mcl->op      = __HYPERVISOR_stack_switch;
9717 +       mcl->args[0] = __KERNEL_DS;
9718 +       mcl->args[1] = next->esp0;
9719 +       mcl++;
9720 +
9721 +       /*
9722 +        * Load the per-thread Thread-Local Storage descriptor.
9723 +        * This is load_TLS(next, cpu) with multicalls.
9724 +        */
9725 +#define C(i) do {                                                      \
9726 +       if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||    \
9727 +                    next->tls_array[i].b != prev->tls_array[i].b)) {   \
9728 +               mcl->op = __HYPERVISOR_update_descriptor;               \
9729 +               *(u64 *)&mcl->args[0] = virt_to_machine(                \
9730 +                       &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
9731 +               *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i];    \
9732 +               mcl++;                                                  \
9733 +       }                                                               \
9734 +} while (0)
9735 +       C(0); C(1); C(2);
9736 +#undef C
9737 +
9738 +       if (unlikely(prev->iopl != next->iopl)) {
9739 +               iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
9740 +               mcl->op      = __HYPERVISOR_physdev_op;
9741 +               mcl->args[0] = PHYSDEVOP_set_iopl;
9742 +               mcl->args[1] = (unsigned long)&iopl_op;
9743 +               mcl++;
9744 +       }
9745 +
9746 +       if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
9747 +               iobmp_op.bitmap   = (char *)next->io_bitmap_ptr;
9748 +               iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
9749 +               mcl->op      = __HYPERVISOR_physdev_op;
9750 +               mcl->args[0] = PHYSDEVOP_set_iobitmap;
9751 +               mcl->args[1] = (unsigned long)&iobmp_op;
9752 +               mcl++;
9753 +       }
9754 +
9755 +       (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
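+       /* Issue all queued updates (FPU switch, kernel stack, TLS
+        * descriptors, iopl, IO bitmap) to Xen in one batched hypercall. */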
9756 +
9757 +       /*
9758 +        * Restore %fs and %gs if needed.
9759 +        *
9760 +        * Glibc normally makes %fs be zero, and %gs is one of
9761 +        * the TLS segments.
9762 +        */
9763 +       if (unlikely(next->fs))
9764 +               loadsegment(fs, next->fs);
9765 +
9766 +       if (next->gs)
9767 +               loadsegment(gs, next->gs);
9768 +
9769 +       /*
9770 +        * Now maybe reload the debug registers
9771 +        */
9772 +       if (unlikely(next->debugreg[7])) {
9773 +               set_debugreg(next->debugreg[0], 0);
9774 +               set_debugreg(next->debugreg[1], 1);
9775 +               set_debugreg(next->debugreg[2], 2);
9776 +               set_debugreg(next->debugreg[3], 3);
9777 +               /* no 4 and 5 */
9778 +               set_debugreg(next->debugreg[6], 6);
9779 +               set_debugreg(next->debugreg[7], 7);
9780 +       }
9781 +
9782 +       disable_tsc(prev_p, next_p);
9783 +
9784 +       return prev_p;
9785 +}
9786 +
9787 +asmlinkage int sys_fork(struct pt_regs regs)
9788 +{
9789 +       return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9790 +}
9791 +
9792 +asmlinkage int sys_clone(struct pt_regs regs)
9793 +{
9794 +       unsigned long clone_flags;
9795 +       unsigned long newsp;
9796 +       int __user *parent_tidptr, *child_tidptr;
9797 +
9798 +       clone_flags = regs.ebx;
9799 +       newsp = regs.ecx;
9800 +       parent_tidptr = (int __user *)regs.edx;
9801 +       child_tidptr = (int __user *)regs.edi;
9802 +       if (!newsp)
9803 +               newsp = regs.esp;
9804 +       return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
9805 +}
9806 +
9807 +/*
9808 + * This is trivial, and on the face of it looks like it
9809 + * could equally well be done in user mode.
9810 + *
9811 + * Not so, for quite unobvious reasons - register pressure.
9812 + * In user mode vfork() cannot have a stack frame, and if
9813 + * done by calling the "clone()" system call directly, you
9814 + * do not have enough call-clobbered registers to hold all
9815 + * the information you need.
9816 + */
9817 +asmlinkage int sys_vfork(struct pt_regs regs)
9818 +{
9819 +       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9820 +}
9821 +
9822 +/*
9823 + * sys_execve() executes a new program.
9824 + */
9825 +asmlinkage int sys_execve(struct pt_regs regs)
9826 +{
9827 +       int error;
9828 +       char * filename;
9829 +
9830 +       filename = getname((char __user *) regs.ebx);
9831 +       error = PTR_ERR(filename);
9832 +       if (IS_ERR(filename))
9833 +               goto out;
9834 +       error = do_execve(filename,
9835 +                       (char __user * __user *) regs.ecx,
9836 +                       (char __user * __user *) regs.edx,
9837 +                       &regs);
9838 +       if (error == 0) {
9839 +               task_lock(current);
9840 +               current->ptrace &= ~PT_DTRACE;
9841 +               task_unlock(current);
9842 +               /* Make sure we don't return using sysenter.. */
9843 +               set_thread_flag(TIF_IRET);
9844 +       }
9845 +       putname(filename);
9846 +out:
9847 +       return error;
9848 +}
9849 +
9850 +#define top_esp                (THREAD_SIZE - sizeof(unsigned long))
9851 +#define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
9852 +
9853 +unsigned long get_wchan(struct task_struct *p)
9854 +{
9855 +       unsigned long ebp, esp, eip;
9856 +       unsigned long stack_page;
9857 +       int count = 0;
9858 +       if (!p || p == current || p->state == TASK_RUNNING)
9859 +               return 0;
9860 +       stack_page = (unsigned long)task_stack_page(p);
9861 +       esp = p->thread.esp;
9862 +       if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
9863 +               return 0;
9864 +       /* include/asm-i386/system.h:switch_to() pushes ebp last. */
9865 +       ebp = *(unsigned long *) esp;
9866 +       do {
9867 +               if (ebp < stack_page || ebp > top_ebp+stack_page)
9868 +                       return 0;
9869 +               eip = *(unsigned long *) (ebp+4);
9870 +               if (!in_sched_functions(eip))
9871 +                       return eip;
9872 +               ebp = *(unsigned long *) ebp;
9873 +       } while (count++ < 16);
9874 +       return 0;
9875 +}
9876 +
9877 +/*
9878 + * sys_alloc_thread_area: get a yet unused TLS descriptor index.
9879 + */
9880 +static int get_free_idx(void)
9881 +{
9882 +       struct thread_struct *t = &current->thread;
9883 +       int idx;
9884 +
9885 +       for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
9886 +               if (desc_empty(t->tls_array + idx))
9887 +                       return idx + GDT_ENTRY_TLS_MIN;
9888 +       return -ESRCH;
9889 +}
9890 +
9891 +/*
9892 + * Set a given TLS descriptor:
9893 + */
9894 +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
9895 +{
9896 +       struct thread_struct *t = &current->thread;
9897 +       struct user_desc info;
9898 +       struct desc_struct *desc;
9899 +       int cpu, idx;
9900 +
9901 +       if (copy_from_user(&info, u_info, sizeof(info)))
9902 +               return -EFAULT;
9903 +       idx = info.entry_number;
9904 +
9905 +       /*
9906 +        * index -1 means the kernel should try to find and
9907 +        * allocate an empty descriptor:
9908 +        */
9909 +       if (idx == -1) {
9910 +               idx = get_free_idx();
9911 +               if (idx < 0)
9912 +                       return idx;
9913 +               if (put_user(idx, &u_info->entry_number))
9914 +                       return -EFAULT;
9915 +       }
9916 +
9917 +       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9918 +               return -EINVAL;
9919 +
9920 +       desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
9921 +
9922 +       /*
9923 +        * We must not get preempted while modifying the TLS.
9924 +        */
9925 +       cpu = get_cpu();
9926 +
9927 +       if (LDT_empty(&info)) {
9928 +               desc->a = 0;
9929 +               desc->b = 0;
9930 +       } else {
9931 +               desc->a = LDT_entry_a(&info);
9932 +               desc->b = LDT_entry_b(&info);
9933 +       }
9934 +       load_TLS(t, cpu);
9935 +
9936 +       put_cpu();
9937 +
9938 +       return 0;
9939 +}
9940 +
9941 +/*
9942 + * Get the current Thread-Local Storage area:
9943 + */
9944 +
9945 +#define GET_BASE(desc) ( \
9946 +       (((desc)->a >> 16) & 0x0000ffff) | \
9947 +       (((desc)->b << 16) & 0x00ff0000) | \
9948 +       ( (desc)->b        & 0xff000000)   )
9949 +
9950 +#define GET_LIMIT(desc) ( \
9951 +       ((desc)->a & 0x0ffff) | \
9952 +        ((desc)->b & 0xf0000) )
9953 +       
9954 +#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
9955 +#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
9956 +#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
9957 +#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
9958 +#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
9959 +#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
9960 +
9961 +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
9962 +{
9963 +       struct user_desc info;
9964 +       struct desc_struct *desc;
9965 +       int idx;
9966 +
9967 +       if (get_user(idx, &u_info->entry_number))
9968 +               return -EFAULT;
9969 +       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9970 +               return -EINVAL;
9971 +
9972 +       memset(&info, 0, sizeof(info));
9973 +
9974 +       desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
9975 +
9976 +       info.entry_number = idx;
9977 +       info.base_addr = GET_BASE(desc);
9978 +       info.limit = GET_LIMIT(desc);
9979 +       info.seg_32bit = GET_32BIT(desc);
9980 +       info.contents = GET_CONTENTS(desc);
9981 +       info.read_exec_only = !GET_WRITABLE(desc);
9982 +       info.limit_in_pages = GET_LIMIT_PAGES(desc);
9983 +       info.seg_not_present = !GET_PRESENT(desc);
9984 +       info.useable = GET_USEABLE(desc);
9985 +
9986 +       if (copy_to_user(u_info, &info, sizeof(info)))
9987 +               return -EFAULT;
9988 +       return 0;
9989 +}
9990 +
9991 +unsigned long arch_align_stack(unsigned long sp)
9992 +{
9993 +       if (randomize_va_space)
9994 +               sp -= get_random_int() % 8192;
9995 +       return sp & ~0xf;
9996 +}
9997 diff -urNp linux-2.6/arch/i386/kernel/quirks-xen.c new/arch/i386/kernel/quirks-xen.c
9998 --- linux-2.6/arch/i386/kernel/quirks-xen.c     1970-01-01 01:00:00.000000000 +0100
9999 +++ new/arch/i386/kernel/quirks-xen.c   2006-05-09 12:32:35.000000000 +0200
10000 @@ -0,0 +1,48 @@
10001 +/*
10002 + * This file contains work-arounds for x86 and x86_64 platform bugs.
10003 + */
10004 +#include <linux/config.h>
10005 +#include <linux/pci.h>
10006 +#include <linux/irq.h>
10007 +
10008 +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
10009 +
10010 +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
10011 +{
10012 +       u8 config, rev;
10013 +       u32 word;
10014 +
10015 +       /* BIOS may enable hardware IRQ balancing for
10016 +        * E7520/E7320/E7525(revision ID 0x9 and below)
10017 +        * based platforms.
10018 +        * Disable SW irqbalance/affinity on those platforms.
10019 +        */
10020 +       pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
10021 +       if (rev > 0x9)
10022 +               return;
10023 +
10024 +       printk(KERN_INFO "Intel E7520/7320/7525 detected.");
10025 +
10026 +       /* enable access to config space*/
10027 +       pci_read_config_byte(dev, 0xf4, &config);
10028 +       pci_write_config_byte(dev, 0xf4, config|0x2);
10029 +
10030 +       /* read xTPR register */
10031 +       raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
10032 +
10033 +       if (!(word & (1 << 13))) {
10034 +               dom0_op_t op;
10035 +               printk(KERN_INFO "Disabling irq balancing and affinity\n");
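+               /* Under Xen the quirk is applied by the hypervisor rather
+                * than by the kernel, so request it via DOM0_PLATFORM_QUIRK. */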
10036 +               op.cmd = DOM0_PLATFORM_QUIRK;
10037 +               op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
10038 +               (void)HYPERVISOR_dom0_op(&op);
10039 +       }
10040 +
10041 +       /* put back the original value for config space*/
10042 +       if (!(config & 0x2))
10043 +               pci_write_config_byte(dev, 0xf4, config);
10044 +}
10045 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  quirk_intel_irqbalance);
10046 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  quirk_intel_irqbalance);
10047 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  quirk_intel_irqbalance);
10048 +#endif
10049 diff -urNp linux-2.6/arch/i386/kernel/setup-xen.c new/arch/i386/kernel/setup-xen.c
10050 --- linux-2.6/arch/i386/kernel/setup-xen.c      1970-01-01 01:00:00.000000000 +0100
10051 +++ new/arch/i386/kernel/setup-xen.c    2006-06-28 14:32:13.000000000 +0200
10052 @@ -0,0 +1,1827 @@
10053 +/*
10054 + *  linux/arch/i386/kernel/setup.c
10055 + *
10056 + *  Copyright (C) 1995  Linus Torvalds
10057 + *
10058 + *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10059 + *
10060 + *  Memory region support
10061 + *     David Parsons <orc@pell.chi.il.us>, July-August 1999
10062 + *
10063 + *  Added E820 sanitization routine (removes overlapping memory regions);
10064 + *  Brian Moyle <bmoyle@mvista.com>, February 2001
10065 + *
10066 + * Moved CPU detection code to cpu/${cpu}.c
10067 + *    Patrick Mochel <mochel@osdl.org>, March 2002
10068 + *
10069 + *  Provisions for empty E820 memory regions (reported by certain BIOSes).
10070 + *  Alex Achenbach <xela@slit.de>, December 2002.
10071 + *
10072 + */
10073 +
10074 +/*
10075 + * This file handles the architecture-dependent parts of initialization
10076 + */
10077 +
10078 +#include <linux/config.h>
10079 +#include <linux/sched.h>
10080 +#include <linux/mm.h>
10081 +#include <linux/mmzone.h>
10082 +#include <linux/tty.h>
10083 +#include <linux/ioport.h>
10084 +#include <linux/acpi.h>
10085 +#include <linux/apm_bios.h>
10086 +#include <linux/initrd.h>
10087 +#include <linux/bootmem.h>
10088 +#include <linux/seq_file.h>
10089 +#include <linux/platform_device.h>
10090 +#include <linux/console.h>
10091 +#include <linux/mca.h>
10092 +#include <linux/root_dev.h>
10093 +#include <linux/highmem.h>
10094 +#include <linux/module.h>
10095 +#include <linux/efi.h>
10096 +#include <linux/init.h>
10097 +#include <linux/edd.h>
10098 +#include <linux/nodemask.h>
10099 +#include <linux/kernel.h>
10100 +#include <linux/percpu.h>
10101 +#include <linux/notifier.h>
10102 +#include <linux/kexec.h>
10103 +#include <linux/crash_dump.h>
10104 +#include <linux/dmi.h>
10105 +#include <linux/pfn.h>
10106 +
10107 +#include <video/edid.h>
10108 +
10109 +#include <asm/apic.h>
10110 +#include <asm/e820.h>
10111 +#include <asm/mpspec.h>
10112 +#include <asm/setup.h>
10113 +#include <asm/arch_hooks.h>
10114 +#include <asm/sections.h>
10115 +#include <asm/io_apic.h>
10116 +#include <asm/ist.h>
10117 +#include <asm/io.h>
10118 +#include <asm/hypervisor.h>
10119 +#include <xen/interface/physdev.h>
10120 +#include <xen/interface/memory.h>
10121 +#include <xen/features.h>
10122 +#include "setup_arch_pre.h"
10123 +#include <bios_ebda.h>
10124 +
10125 +/* Forward Declaration. */
10126 +void __init find_max_pfn(void);
10127 +
10128 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
10129 +static struct notifier_block xen_panic_block = {
10130 +       xen_panic_event, NULL, 0 /* try to go last */
10131 +};
10132 +
10133 +extern char hypercall_page[PAGE_SIZE];
10134 +EXPORT_SYMBOL(hypercall_page);
10135 +
10136 +int disable_pse __devinitdata = 0;
10137 +
10138 +/*
10139 + * Machine setup..
10140 + */
10141 +
10142 +#ifdef CONFIG_EFI
10143 +int efi_enabled = 0;
10144 +EXPORT_SYMBOL(efi_enabled);
10145 +#endif
10146 +
10147 +/* cpu data as detected by the assembly code in head.S */
10148 +struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
10149 +/* common cpu data for all cpus */
10150 +struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
10151 +EXPORT_SYMBOL(boot_cpu_data);
10152 +
10153 +unsigned long mmu_cr4_features;
10154 +
10155 +#ifdef CONFIG_ACPI
10156 +       int acpi_disabled = 0;
10157 +#else
10158 +       int acpi_disabled = 1;
10159 +#endif
10160 +EXPORT_SYMBOL(acpi_disabled);
10161 +
10162 +#ifdef CONFIG_ACPI
10163 +int __initdata acpi_force = 0;
10164 +extern acpi_interrupt_flags    acpi_sci_flags;
10165 +#endif
10166 +
10167 +/* for MCA, but anyone else can use it if they want */
10168 +unsigned int machine_id;
10169 +#ifdef CONFIG_MCA
10170 +EXPORT_SYMBOL(machine_id);
10171 +#endif
10172 +unsigned int machine_submodel_id;
10173 +unsigned int BIOS_revision;
10174 +unsigned int mca_pentium_flag;
10175 +
10176 +/* For PCI or other memory-mapped resources */
10177 +unsigned long pci_mem_start = 0x10000000;
10178 +#ifdef CONFIG_PCI
10179 +EXPORT_SYMBOL(pci_mem_start);
10180 +#endif
10181 +
10182 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
10183 +int bootloader_type;
10184 +
10185 +/* user-defined highmem size */
10186 +static unsigned int highmem_pages = -1;
10187 +
10188 +/*
10189 + * Setup options
10190 + */
10191 +struct drive_info_struct { char dummy[32]; } drive_info;
10192 +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
10193 +    defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
10194 +EXPORT_SYMBOL(drive_info);
10195 +#endif
10196 +struct screen_info screen_info;
10197 +EXPORT_SYMBOL(screen_info);
10198 +struct apm_info apm_info;
10199 +EXPORT_SYMBOL(apm_info);
10200 +struct sys_desc_table_struct {
10201 +       unsigned short length;
10202 +       unsigned char table[0];
10203 +};
10204 +struct edid_info edid_info;
10205 +EXPORT_SYMBOL_GPL(edid_info);
10206 +struct ist_info ist_info;
10207 +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
10208 +       defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
10209 +EXPORT_SYMBOL(ist_info);
10210 +#endif
10211 +struct e820map e820;
10212 +
10213 +extern void early_cpu_init(void);
10214 +extern void generic_apic_probe(char *);
10215 +extern int root_mountflags;
10216 +
10217 +unsigned long saved_videomode;
10218 +
10219 +#define RAMDISK_IMAGE_START_MASK       0x07FF
10220 +#define RAMDISK_PROMPT_FLAG            0x8000
10221 +#define RAMDISK_LOAD_FLAG              0x4000  
10222 +
10223 +static char command_line[COMMAND_LINE_SIZE];
10224 +
10225 +unsigned char __initdata boot_params[PARAM_SIZE];
10226 +
10227 +static struct resource data_resource = {
10228 +       .name   = "Kernel data",
10229 +       .start  = 0,
10230 +       .end    = 0,
10231 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
10232 +};
10233 +
10234 +static struct resource code_resource = {
10235 +       .name   = "Kernel code",
10236 +       .start  = 0,
10237 +       .end    = 0,
10238 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
10239 +};
10240 +
10241 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
10242 +static struct resource system_rom_resource = {
10243 +       .name   = "System ROM",
10244 +       .start  = 0xf0000,
10245 +       .end    = 0xfffff,
10246 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10247 +};
10248 +
10249 +static struct resource extension_rom_resource = {
10250 +       .name   = "Extension ROM",
10251 +       .start  = 0xe0000,
10252 +       .end    = 0xeffff,
10253 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10254 +};
10255 +
10256 +static struct resource adapter_rom_resources[] = { {
10257 +       .name   = "Adapter ROM",
10258 +       .start  = 0xc8000,
10259 +       .end    = 0,
10260 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10261 +}, {
10262 +       .name   = "Adapter ROM",
10263 +       .start  = 0,
10264 +       .end    = 0,
10265 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10266 +}, {
10267 +       .name   = "Adapter ROM",
10268 +       .start  = 0,
10269 +       .end    = 0,
10270 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10271 +}, {
10272 +       .name   = "Adapter ROM",
10273 +       .start  = 0,
10274 +       .end    = 0,
10275 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10276 +}, {
10277 +       .name   = "Adapter ROM",
10278 +       .start  = 0,
10279 +       .end    = 0,
10280 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10281 +}, {
10282 +       .name   = "Adapter ROM",
10283 +       .start  = 0,
10284 +       .end    = 0,
10285 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10286 +} };
10287 +
10288 +#define ADAPTER_ROM_RESOURCES \
10289 +       (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
10290 +
10291 +static struct resource video_rom_resource = {
10292 +       .name   = "Video ROM",
10293 +       .start  = 0xc0000,
10294 +       .end    = 0xc7fff,
10295 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10296 +};
10297 +#endif
10298 +
10299 +static struct resource video_ram_resource = {
10300 +       .name   = "Video RAM area",
10301 +       .start  = 0xa0000,
10302 +       .end    = 0xbffff,
10303 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
10304 +};
10305 +
10306 +static struct resource standard_io_resources[] = { {
10307 +       .name   = "dma1",
10308 +       .start  = 0x0000,
10309 +       .end    = 0x001f,
10310 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10311 +}, {
10312 +       .name   = "pic1",
10313 +       .start  = 0x0020,
10314 +       .end    = 0x0021,
10315 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10316 +}, {
10317 +       .name   = "timer0",
10318 +       .start  = 0x0040,
10319 +       .end    = 0x0043,
10320 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10321 +}, {
10322 +       .name   = "timer1",
10323 +       .start  = 0x0050,
10324 +       .end    = 0x0053,
10325 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10326 +}, {
10327 +       .name   = "keyboard",
10328 +       .start  = 0x0060,
10329 +       .end    = 0x006f,
10330 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10331 +}, {
10332 +       .name   = "dma page reg",
10333 +       .start  = 0x0080,
10334 +       .end    = 0x008f,
10335 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10336 +}, {
10337 +       .name   = "pic2",
10338 +       .start  = 0x00a0,
10339 +       .end    = 0x00a1,
10340 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10341 +}, {
10342 +       .name   = "dma2",
10343 +       .start  = 0x00c0,
10344 +       .end    = 0x00df,
10345 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10346 +}, {
10347 +       .name   = "fpu",
10348 +       .start  = 0x00f0,
10349 +       .end    = 0x00ff,
10350 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10351 +} };
10352 +
10353 +#define STANDARD_IO_RESOURCES \
10354 +       (sizeof standard_io_resources / sizeof standard_io_resources[0])
10355 +
10356 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
10357 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
10358 +
10359 +static int __init romchecksum(unsigned char *rom, unsigned long length)
10360 +{
10361 +       unsigned char *p, sum = 0;
10362 +
10363 +       for (p = rom; p < rom + length; p++)
10364 +               sum += *p;
10365 +       return sum == 0;
10366 +}
10367 +
10368 +static void __init probe_roms(void)
10369 +{
10370 +       unsigned long start, length, upper;
10371 +       unsigned char *rom;
10372 +       int           i;
10373 +
10374 +       /* Nothing to do if not running in dom0. */
10375 +       if (!(xen_start_info->flags & SIF_INITDOMAIN))
10376 +               return;
10377 +
10378 +       /* video rom */
10379 +       upper = adapter_rom_resources[0].start;
10380 +       for (start = video_rom_resource.start; start < upper; start += 2048) {
10381 +               rom = isa_bus_to_virt(start);
10382 +               if (!romsignature(rom))
10383 +                       continue;
10384 +
10385 +               video_rom_resource.start = start;
10386 +
10387 +               /* 0 < length <= 0x7f * 512, historically */
10388 +               length = rom[2] * 512;
10389 +
10390 +               /* if checksum okay, trust length byte */
10391 +               if (length && romchecksum(rom, length))
10392 +                       video_rom_resource.end = start + length - 1;
10393 +
10394 +               request_resource(&iomem_resource, &video_rom_resource);
10395 +               break;
10396 +       }
10397 +
10398 +       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
10399 +       if (start < upper)
10400 +               start = upper;
10401 +
10402 +       /* system rom */
10403 +       request_resource(&iomem_resource, &system_rom_resource);
10404 +       upper = system_rom_resource.start;
10405 +
10406 +       /* check for extension rom (ignore length byte!) */
10407 +       rom = isa_bus_to_virt(extension_rom_resource.start);
10408 +       if (romsignature(rom)) {
10409 +               length = extension_rom_resource.end - extension_rom_resource.start + 1;
10410 +               if (romchecksum(rom, length)) {
10411 +                       request_resource(&iomem_resource, &extension_rom_resource);
10412 +                       upper = extension_rom_resource.start;
10413 +               }
10414 +       }
10415 +
10416 +       /* check for adapter roms on 2k boundaries */
10417 +       for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
10418 +               rom = isa_bus_to_virt(start);
10419 +               if (!romsignature(rom))
10420 +                       continue;
10421 +
10422 +               /* 0 < length <= 0x7f * 512, historically */
10423 +               length = rom[2] * 512;
10424 +
10425 +               /* but accept any length that fits if checksum okay */
10426 +               if (!length || start + length > upper || !romchecksum(rom, length))
10427 +                       continue;
10428 +
10429 +               adapter_rom_resources[i].start = start;
10430 +               adapter_rom_resources[i].end = start + length - 1;
10431 +               request_resource(&iomem_resource, &adapter_rom_resources[i]);
10432 +
10433 +               start = adapter_rom_resources[i++].end & ~2047UL;
10434 +       }
10435 +}
10436 +#endif
10437 +
10438 +/*
10439 + * Point at the empty zero page to start with. We map the real shared_info
10440 + * page as soon as fixmap is up and running.
10441 + */
10442 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
10443 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
10444 +
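+/*
+ * PFN -> MFN translation table for this domain, plus the frame lists
+ * through which its location is published to the hypervisor.
+ */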
10445 +unsigned long *phys_to_machine_mapping;
10446 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
10447 +EXPORT_SYMBOL(phys_to_machine_mapping);
10448 +
10449 +/* Raw start-of-day parameters from the hypervisor. */
10450 +start_info_t *xen_start_info;
10451 +EXPORT_SYMBOL(xen_start_info);
10452 +
10453 +static void __init add_memory_region(unsigned long long start,
10454 +                                  unsigned long long size, int type)
10455 +{
10456 +       int x;
10457 +
10458 +       if (!efi_enabled) {
10459 +               x = e820.nr_map;
10460 +
10461 +               if (x == E820MAX) {
10462 +                       printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
10463 +                       return;
10464 +               }
10465 +
10466 +               e820.map[x].addr = start;
10467 +               e820.map[x].size = size;
10468 +               e820.map[x].type = type;
10469 +               e820.nr_map++;
10470 +       }
10471 +} /* add_memory_region */
10472 +
10473 +static void __init limit_regions(unsigned long long size)
10474 +{
10475 +       unsigned long long current_addr = 0;
10476 +       int i;
10477 +
10478 +       if (efi_enabled) {
10479 +               efi_memory_desc_t *md;
10480 +               void *p;
10481 +
10482 +               for (p = memmap.map, i = 0; p < memmap.map_end;
10483 +                       p += memmap.desc_size, i++) {
10484 +                       md = p;
10485 +                       current_addr = md->phys_addr + (md->num_pages << 12);
10486 +                       if (md->type == EFI_CONVENTIONAL_MEMORY) {
10487 +                               if (current_addr >= size) {
10488 +                                       md->num_pages -=
10489 +                                               (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
10490 +                                       memmap.nr_map = i + 1;
10491 +                                       return;
10492 +                               }
10493 +                       }
10494 +               }
10495 +       }
10496 +       for (i = 0; i < e820.nr_map; i++) {
10497 +               current_addr = e820.map[i].addr + e820.map[i].size;
10498 +               if (current_addr < size)
10499 +                       continue;
10500 +
10501 +               if (e820.map[i].type != E820_RAM)
10502 +                       continue;
10503 +
10504 +               if (e820.map[i].addr >= size) {
10505 +                       /*
10506 +                        * This region starts past the end of the
10507 +                        * requested size, skip it completely.
10508 +                        */
10509 +                       e820.nr_map = i;
10510 +               } else {
10511 +                       e820.nr_map = i + 1;
10512 +                       e820.map[i].size -= current_addr - size;
10513 +               }
10514 +               return;
10515 +       }
10516 +#ifdef CONFIG_XEN
10517 +       if (i==e820.nr_map && current_addr < size) {
10518 +               /*
10519 +                 * The e820 map finished before our requested size so
10520 +                 * extend the final entry to the requested address.
10521 +                 */
10522 +               --i;
10523 +               if (e820.map[i].type == E820_RAM)
10524 +                       e820.map[i].size -= current_addr - size;
10525 +               else
10526 +                       add_memory_region(current_addr, size - current_addr, E820_RAM);
10527 +       }
10528 +#endif
10529 +}
10530 +
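A minimal sketch (illustrative only, not part of the patch) of the non-EFI path of limit_regions(); the single-entry map below is assumed purely for the example:

        /* assume one RAM entry covering the first 1 GiB */
        e820.nr_map = 1;
        e820.map[0].addr = 0x0ULL;
        e820.map[0].size = 0x40000000ULL;       /* 1 GiB */
        e820.map[0].type = E820_RAM;

        limit_regions(0x20000000ULL);           /* e.g. the effect of "mem=512M" */

        /* afterwards: e820.nr_map == 1 and e820.map[0].size == 0x20000000 */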
10531 +#define E820_DEBUG     1
10532 +
10533 +static void __init print_memory_map(char *who)
10534 +{
10535 +       int i;
10536 +
10537 +       for (i = 0; i < e820.nr_map; i++) {
10538 +               printk(" %s: %016Lx - %016Lx ", who,
10539 +                       e820.map[i].addr,
10540 +                       e820.map[i].addr + e820.map[i].size);
10541 +               switch (e820.map[i].type) {
10542 +               case E820_RAM:  printk("(usable)\n");
10543 +                               break;
10544 +               case E820_RESERVED:
10545 +                               printk("(reserved)\n");
10546 +                               break;
10547 +               case E820_ACPI:
10548 +                               printk("(ACPI data)\n");
10549 +                               break;
10550 +               case E820_NVS:
10551 +                               printk("(ACPI NVS)\n");
10552 +                               break;
10553 +               default:        printk("type %lu\n", e820.map[i].type);
10554 +                               break;
10555 +               }
10556 +       }
10557 +}
10558 +
10559 +/*
10560 + * Sanitize the BIOS e820 map.
10561 + *
10562 + * Some e820 responses include overlapping entries.  The following 
10563 + * replaces the original e820 map with a new one, removing overlaps.
10564 + *
10565 + */
10566 +struct change_member {
10567 +       struct e820entry *pbios; /* pointer to original bios entry */
10568 +       unsigned long long addr; /* address for this change point */
10569 +};
10570 +static struct change_member change_point_list[2*E820MAX] __initdata;
10571 +static struct change_member *change_point[2*E820MAX] __initdata;
10572 +static struct e820entry *overlap_list[E820MAX] __initdata;
10573 +static struct e820entry new_bios[E820MAX] __initdata;
10574 +
10575 +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
10576 +{
10577 +       struct change_member *change_tmp;
10578 +       unsigned long current_type, last_type;
10579 +       unsigned long long last_addr;
10580 +       int chgidx, still_changing;
10581 +       int overlap_entries;
10582 +       int new_bios_entry;
10583 +       int old_nr, new_nr, chg_nr;
10584 +       int i;
10585 +
10586 +       /*
10587 +               Visually we're performing the following (1,2,3,4 = memory types)...
10588 +
10589 +               Sample memory map (w/overlaps):
10590 +                  ____22__________________
10591 +                  ______________________4_
10592 +                  ____1111________________
10593 +                  _44_____________________
10594 +                  11111111________________
10595 +                  ____________________33__
10596 +                  ___________44___________
10597 +                  __________33333_________
10598 +                  ______________22________
10599 +                  ___________________2222_
10600 +                  _________111111111______
10601 +                  _____________________11_
10602 +                  _________________4______
10603 +
10604 +               Sanitized equivalent (no overlap):
10605 +                  1_______________________
10606 +                  _44_____________________
10607 +                  ___1____________________
10608 +                  ____22__________________
10609 +                  ______11________________
10610 +                  _________1______________
10611 +                  __________3_____________
10612 +                  ___________44___________
10613 +                  _____________33_________
10614 +                  _______________2________
10615 +                  ________________1_______
10616 +                  _________________4______
10617 +                  ___________________2____
10618 +                  ____________________33__
10619 +                  ______________________4_
10620 +       */
10621 +
10622 +       /* if there's only one memory region, don't bother */
10623 +       if (*pnr_map < 2)
10624 +               return -1;
10625 +
10626 +       old_nr = *pnr_map;
10627 +
10628 +       /* bail out if we find any unreasonable addresses in bios map */
10629 +       for (i=0; i<old_nr; i++)
10630 +               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
10631 +                       return -1;
10632 +
10633 +       /* create pointers for initial change-point information (for sorting) */
10634 +       for (i=0; i < 2*old_nr; i++)
10635 +               change_point[i] = &change_point_list[i];
10636 +
10637 +       /* record all known change-points (starting and ending addresses),
10638 +          omitting those that are for empty memory regions */
10639 +       chgidx = 0;
10640 +       for (i=0; i < old_nr; i++)      {
10641 +               if (biosmap[i].size != 0) {
10642 +                       change_point[chgidx]->addr = biosmap[i].addr;
10643 +                       change_point[chgidx++]->pbios = &biosmap[i];
10644 +                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
10645 +                       change_point[chgidx++]->pbios = &biosmap[i];
10646 +               }
10647 +       }
10648 +       chg_nr = chgidx;        /* true number of change-points */
10649 +
10650 +       /* sort change-point list by memory addresses (low -> high) */
10651 +       still_changing = 1;
10652 +       while (still_changing)  {
10653 +               still_changing = 0;
10654 +               for (i=1; i < chg_nr; i++)  {
10655 +                       /* swap if addresses are out of ascending order, or if, */
10656 +                       /* at the same address, a region start follows a region end */
10657 +                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
10658 +                               ((change_point[i]->addr == change_point[i-1]->addr) &&
10659 +                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
10660 +                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
10661 +                          )
10662 +                       {
10663 +                               change_tmp = change_point[i];
10664 +                               change_point[i] = change_point[i-1];
10665 +                               change_point[i-1] = change_tmp;
10666 +                               still_changing=1;
10667 +                       }
10668 +               }
10669 +       }
10670 +
10671 +       /* create a new bios memory map, removing overlaps */
10672 +       overlap_entries=0;       /* number of entries in the overlap table */
10673 +       new_bios_entry=0;        /* index for creating new bios map entries */
10674 +       last_type = 0;           /* start with undefined memory type */
10675 +       last_addr = 0;           /* start with 0 as last starting address */
10676 +       /* loop through change-points, determining effect on the new bios map */
10677 +       for (chgidx=0; chgidx < chg_nr; chgidx++)
10678 +       {
10679 +               /* keep track of all overlapping bios entries */
10680 +               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
10681 +               {
10682 +                       /* add map entry to overlap list (> 1 entry implies an overlap) */
10683 +                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
10684 +               }
10685 +               else
10686 +               {
10687 +                       /* remove entry from list (order independent, so swap with last) */
10688 +                       for (i=0; i<overlap_entries; i++)
10689 +                       {
10690 +                               if (overlap_list[i] == change_point[chgidx]->pbios)
10691 +                                       overlap_list[i] = overlap_list[overlap_entries-1];
10692 +                       }
10693 +                       overlap_entries--;
10694 +               }
10695 +               /* if there are overlapping entries, decide which "type" to use */
10696 +               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
10697 +               current_type = 0;
10698 +               for (i=0; i<overlap_entries; i++)
10699 +                       if (overlap_list[i]->type > current_type)
10700 +                               current_type = overlap_list[i]->type;
10701 +               /* continue building up new bios map based on this information */
10702 +               if (current_type != last_type)  {
10703 +                       if (last_type != 0)      {
10704 +                               new_bios[new_bios_entry].size =
10705 +                                       change_point[chgidx]->addr - last_addr;
10706 +                               /* move forward only if the new size was non-zero */
10707 +                               if (new_bios[new_bios_entry].size != 0)
10708 +                                       if (++new_bios_entry >= E820MAX)
10709 +                                               break;  /* no more space left for new bios entries */
10710 +                       }
10711 +                       if (current_type != 0)  {
10712 +                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
10713 +                               new_bios[new_bios_entry].type = current_type;
10714 +                               last_addr=change_point[chgidx]->addr;
10715 +                       }
10716 +                       last_type = current_type;
10717 +               }
10718 +       }
10719 +       new_nr = new_bios_entry;   /* retain count for new bios entries */
10720 +
10721 +       /* copy new bios mapping into original location */
10722 +       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
10723 +       *pnr_map = new_nr;
10724 +
10725 +       return 0;
10726 +}
10727 +
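For illustration only (an assumed two-entry map, not taken from the sources), sanitize_e820_map() resolves an overlap by letting the higher-numbered type win in the shared range:

        /*
         * Input (overlapping):
         *   { addr = 0x00000, size = 0xA0000, type = E820_RAM      }
         *   { addr = 0x90000, size = 0x10000, type = E820_RESERVED }
         *
         * Output of sanitize_e820_map(biosmap, &nr_map):
         *   { addr = 0x00000, size = 0x90000, type = E820_RAM      }
         *   { addr = 0x90000, size = 0x10000, type = E820_RESERVED }
         */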
10728 +/*
10729 + * Copy the BIOS e820 map into a safe place.
10730 + *
10731 + * Sanity-check it while we're at it..
10732 + *
10733 + * If we're lucky and live on a modern system, the setup code
10734 + * will have given us a memory map that we can use to properly
10735 + * set up memory.  If we aren't, we'll fake a memory map.
10736 + *
10737 + * We check to see that the memory map contains at least 2 elements
10738 + * before we'll use it, because the detection code in setup.S may
10739 + * not be perfect and almost every PC known to man has two memory
10740 + * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
10741 + * ThinkPad 560x, for example, does not cooperate with the memory
10742 + * detection code.)
10743 + */
10744 +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
10745 +{
10746 +#ifndef CONFIG_XEN
10747 +       /* Only one memory region (or negative)? Ignore it */
10748 +       if (nr_map < 2)
10749 +               return -1;
10750 +#else
10751 +       BUG_ON(nr_map < 1);
10752 +#endif
10753 +
10754 +       do {
10755 +               unsigned long long start = biosmap->addr;
10756 +               unsigned long long size = biosmap->size;
10757 +               unsigned long long end = start + size;
10758 +               unsigned long type = biosmap->type;
10759 +
10760 +               /* Overflow in 64 bits? Ignore the memory map. */
10761 +               if (start > end)
10762 +                       return -1;
10763 +
10764 +#ifndef CONFIG_XEN
10765 +               /*
10766 +                * Some BIOSes claim RAM in the 640k - 1M region.
10767 +                * Not right. Fix it up.
10768 +                */
10769 +               if (type == E820_RAM) {
10770 +                       if (start < 0x100000ULL && end > 0xA0000ULL) {
10771 +                               if (start < 0xA0000ULL)
10772 +                                       add_memory_region(start, 0xA0000ULL-start, type);
10773 +                               if (end <= 0x100000ULL)
10774 +                                       continue;
10775 +                               start = 0x100000ULL;
10776 +                               size = end - start;
10777 +                       }
10778 +               }
10779 +#endif
10780 +               add_memory_region(start, size, type);
10781 +       } while (biosmap++,--nr_map);
10782 +       return 0;
10783 +}
10784 +
10785 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
10786 +struct edd edd;
10787 +#ifdef CONFIG_EDD_MODULE
10788 +EXPORT_SYMBOL(edd);
10789 +#endif
10790 +/**
10791 + * copy_edd() - Copy the BIOS EDD information
10792 + *              from boot_params into a safe place.
10793 + *
10794 + */
10795 +static inline void copy_edd(void)
10796 +{
10797 +     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
10798 +     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
10799 +     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
10800 +     edd.edd_info_nr = EDD_NR;
10801 +}
10802 +#else
10803 +static inline void copy_edd(void)
10804 +{
10805 +}
10806 +#endif
10807 +
10808 +/*
10809 + * Do NOT EVER look at the BIOS memory size location.
10810 + * It does not work on many machines.
10811 + */
10812 +#define LOWMEMSIZE()   (0x9f000)
10813 +
10814 +static void __init parse_cmdline_early (char ** cmdline_p)
10815 +{
10816 +       char c = ' ', *to = command_line, *from = saved_command_line;
10817 +       int len = 0, max_cmdline;
10818 +       int userdef = 0;
10819 +
10820 +       if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
10821 +               max_cmdline = COMMAND_LINE_SIZE;
10822 +       memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
10823 +       /* Save unparsed command line copy for /proc/cmdline */
10824 +       saved_command_line[max_cmdline-1] = '\0';
10825 +
10826 +       for (;;) {
10827 +               if (c != ' ')
10828 +                       goto next_char;
10829 +               /*
10830 +                * "mem=nopentium" disables the 4MB page tables.
10831 +                * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
10832 +                * to <mem>, overriding the bios size.
10833 +                * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
10834 +                * <start> to <start>+<mem>, overriding the bios size.
10835 +                *
10836 +                * HPA tells me bootloaders need to parse mem=, so no new
10837 +                * option should be mem=  [also see Documentation/i386/boot.txt]
10838 +                */
10839 +               if (!memcmp(from, "mem=", 4)) {
10840 +                       if (to != command_line)
10841 +                               to--;
10842 +                       if (!memcmp(from+4, "nopentium", 9)) {
10843 +                               from += 9+4;
10844 +                               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
10845 +                               disable_pse = 1;
10846 +                       } else {
10847 +                               /* If the user specifies memory size, we
10848 +                                * limit the BIOS-provided memory map to
10849 +                                * that size. exactmap can be used to specify
10850 +                                * the exact map. mem=number can be used to
10851 +                                * trim the existing memory map.
10852 +                                */
10853 +                               unsigned long long mem_size;
10854 +
10855 +                               mem_size = memparse(from+4, &from);
10856 +                               limit_regions(mem_size);
10857 +                               userdef=1;
10858 +                       }
10859 +               }
10860 +
10861 +               else if (!memcmp(from, "memmap=", 7)) {
10862 +                       if (to != command_line)
10863 +                               to--;
10864 +                       if (!memcmp(from+7, "exactmap", 8)) {
10865 +#ifdef CONFIG_CRASH_DUMP
10866 +                               /* If we are doing a crash dump, we
10867 +                                * still need to know the real mem
10868 +                                * size before original memory map is
10869 +                                * reset.
10870 +                                */
10871 +                               find_max_pfn();
10872 +                               saved_max_pfn = max_pfn;
10873 +#endif
10874 +                               from += 8+7;
10875 +                               e820.nr_map = 0;
10876 +                               userdef = 1;
10877 +                       } else {
10878 +                               /* If the user specifies memory size, we
10879 +                                * limit the BIOS-provided memory map to
10880 +                                * that size. exactmap can be used to specify
10881 +                                * the exact map. mem=number can be used to
10882 +                                * trim the existing memory map.
10883 +                                */
10884 +                               unsigned long long start_at, mem_size;
10885 +
10886 +                               mem_size = memparse(from+7, &from);
10887 +                               if (*from == '@') {
10888 +                                       start_at = memparse(from+1, &from);
10889 +                                       add_memory_region(start_at, mem_size, E820_RAM);
10890 +                               } else if (*from == '#') {
10891 +                                       start_at = memparse(from+1, &from);
10892 +                                       add_memory_region(start_at, mem_size, E820_ACPI);
10893 +                               } else if (*from == '$') {
10894 +                                       start_at = memparse(from+1, &from);
10895 +                                       add_memory_region(start_at, mem_size, E820_RESERVED);
10896 +                               } else {
10897 +                                       limit_regions(mem_size);
10898 +                                       userdef=1;
10899 +                               }
10900 +                       }
10901 +               }
10902 +
10903 +               else if (!memcmp(from, "noexec=", 7))
10904 +                       noexec_setup(from + 7);
10905 +
10906 +
10907 +#ifdef  CONFIG_X86_MPPARSE
10908 +               /*
10909 +                * If the BIOS enumerates physical processors before logical,
10910 +                * maxcpus=N at enumeration-time can be used to disable HT.
10911 +                */
10912 +               else if (!memcmp(from, "maxcpus=", 8)) {
10913 +                       extern unsigned int maxcpus;
10914 +
10915 +                       maxcpus = simple_strtoul(from + 8, NULL, 0);
10916 +               }
10917 +#endif
10918 +
10919 +#ifdef CONFIG_ACPI
10920 +               /* "acpi=off" disables both ACPI table parsing and interpreter */
10921 +               else if (!memcmp(from, "acpi=off", 8)) {
10922 +                       disable_acpi();
10923 +               }
10924 +
10925 +               /* acpi=force to override the blacklist */
10926 +               else if (!memcmp(from, "acpi=force", 10)) {
10927 +                       acpi_force = 1;
10928 +                       acpi_ht = 1;
10929 +                       acpi_disabled = 0;
10930 +               }
10931 +
10932 +               /* acpi=strict disables out-of-spec workarounds */
10933 +               else if (!memcmp(from, "acpi=strict", 11)) {
10934 +                       acpi_strict = 1;
10935 +               }
10936 +
10937 +               /* Limit ACPI just to boot-time to enable HT */
10938 +               else if (!memcmp(from, "acpi=ht", 7)) {
10939 +                       if (!acpi_force)
10940 +                               disable_acpi();
10941 +                       acpi_ht = 1;
10942 +               }
10943 +               
10944 +               /* "pci=noacpi" disables ACPI IRQ routing and PCI scanning */
10945 +               else if (!memcmp(from, "pci=noacpi", 10)) {
10946 +                       acpi_disable_pci();
10947 +               }
10948 +               /* "acpi=noirq" disables ACPI interrupt routing */
10949 +               else if (!memcmp(from, "acpi=noirq", 10)) {
10950 +                       acpi_noirq_set();
10951 +               }
10952 +
10953 +               else if (!memcmp(from, "acpi_sci=edge", 13))
10954 +                       acpi_sci_flags.trigger =  1;
10955 +
10956 +               else if (!memcmp(from, "acpi_sci=level", 14))
10957 +                       acpi_sci_flags.trigger = 3;
10958 +
10959 +               else if (!memcmp(from, "acpi_sci=high", 13))
10960 +                       acpi_sci_flags.polarity = 1;
10961 +
10962 +               else if (!memcmp(from, "acpi_sci=low", 12))
10963 +                       acpi_sci_flags.polarity = 3;
10964 +
10965 +#ifdef CONFIG_X86_IO_APIC
10966 +               else if (!memcmp(from, "acpi_skip_timer_override", 24))
10967 +                       acpi_skip_timer_override = 1;
10968 +
10969 +               if (!memcmp(from, "disable_timer_pin_1", 19))
10970 +                       disable_timer_pin_1 = 1;
10971 +               if (!memcmp(from, "enable_timer_pin_1", 18))
10972 +                       disable_timer_pin_1 = -1;
10973 +
10974 +               /* disable IO-APIC */
10975 +               else if (!memcmp(from, "noapic", 6))
10976 +                       disable_ioapic_setup();
10977 +#endif /* CONFIG_X86_IO_APIC */
10978 +#endif /* CONFIG_ACPI */
10979 +
10980 +#ifdef CONFIG_X86_LOCAL_APIC
10981 +               /* enable local APIC */
10982 +               else if (!memcmp(from, "lapic", 5))
10983 +                       lapic_enable();
10984 +
10985 +               /* disable local APIC */
10986 +               else if (!memcmp(from, "nolapic", 7))
10987 +                       lapic_disable();
10988 +#endif /* CONFIG_X86_LOCAL_APIC */
10989 +
10990 +#ifdef CONFIG_KEXEC
10991 +               /* crashkernel=size@addr specifies the location to reserve for
10992 +                * a crash kernel.  By reserving this memory we guarantee
10993 +                * that Linux never sets it up as a DMA target.
10994 +                * Useful for holding code to do something appropriate
10995 +                * after a kernel panic.
10996 +                */
10997 +               else if (!memcmp(from, "crashkernel=", 12)) {
10998 +                       unsigned long size, base;
10999 +                       size = memparse(from+12, &from);
11000 +                       if (*from == '@') {
11001 +                               base = memparse(from+1, &from);
11002 +                               /* FIXME: Do I want a sanity check
11003 +                                * to validate the memory range?
11004 +                                */
11005 +                               crashk_res.start = base;
11006 +                               crashk_res.end   = base + size - 1;
11007 +                       }
11008 +               }
11009 +#endif
11010 +#ifdef CONFIG_PROC_VMCORE
11011 +               /* elfcorehdr= specifies the location of elf core header
11012 +                * stored by the crashed kernel.
11013 +                */
11014 +               else if (!memcmp(from, "elfcorehdr=", 11))
11015 +                       elfcorehdr_addr = memparse(from+11, &from);
11016 +#endif
11017 +
11018 +               /*
11019 +                * highmem=size forces highmem to be exactly 'size' bytes.
11020 +                * This works even on boxes that have no highmem otherwise.
11021 +                * This also works to reduce highmem size on bigger boxes.
11022 +                */
11023 +               else if (!memcmp(from, "highmem=", 8))
11024 +                       highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
11025 +       
11026 +               /*
11027 +                * vmalloc=size forces the vmalloc area to be exactly 'size'
11028 +                * bytes. This can be used to increase (or decrease) the
11029 +                * vmalloc area - the default is 128m.
11030 +                */
11031 +               else if (!memcmp(from, "vmalloc=", 8))
11032 +                       __VMALLOC_RESERVE = memparse(from+8, &from);
11033 +
11034 +       next_char:
11035 +               c = *(from++);
11036 +               if (!c)
11037 +                       break;
11038 +               if (COMMAND_LINE_SIZE <= ++len)
11039 +                       break;
11040 +               *(to++) = c;
11041 +       }
11042 +       *to = '\0';
11043 +       *cmdline_p = command_line;
11044 +       if (userdef) {
11045 +               printk(KERN_INFO "user-defined physical RAM map:\n");
11046 +               print_memory_map("user");
11047 +       }
11048 +}
11049 +
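Some illustrative command lines and the paths they take through parse_cmdline_early() above (examples assumed for clarity, not exhaustive):

        /*
         *   mem=512M            -> limit_regions(512 MiB), userdef = 1
         *   memmap=exactmap memmap=640K@0 memmap=511M@1M
         *                       -> clear e820, then add exact E820_RAM regions
         *   memmap=64M$0x30000000
         *                       -> add_memory_region(0x30000000, 64 MiB, E820_RESERVED)
         *   acpi=off            -> disable_acpi(): no table parsing, no interpreter
         */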
11050 +/*
11051 + * Callback for efi_memory_walk.
11052 + */
11053 +static int __init
11054 +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
11055 +{
11056 +       unsigned long *max_pfn = arg, pfn;
11057 +
11058 +       if (start < end) {
11059 +               pfn = PFN_UP(end -1);
11060 +               if (pfn > *max_pfn)
11061 +                       *max_pfn = pfn;
11062 +       }
11063 +       return 0;
11064 +}
11065 +
11066 +static int __init
11067 +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
11068 +{
11069 +       memory_present(0, start, end);
11070 +       return 0;
11071 +}
11072 +
11073 + /*
11074 +  * This function checks if the entire range <start,end> is mapped with type.
11075 +  *
11076 +  * Note: this function only works correctly if the e820 table is sorted and
11077 +  * non-overlapping, which is the case.
11078 +  */
11079 +int __init
11080 +e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
11081 +{
11082 +       u64 start = s;
11083 +       u64 end = e;
11084 +       int i;
11085 +       for (i = 0; i < e820.nr_map; i++) {
11086 +               struct e820entry *ei = &e820.map[i];
11087 +               if (type && ei->type != type)
11088 +                       continue;
11089 +               /* does the e820 entry overlap (at least partly) the <start,end> range? */
11090 +               if (ei->addr >= end || ei->addr + ei->size <= start)
11091 +                       continue;
11092 +               /* if the entry covers the beginning of <start,end>, move
11093 +                * start up to the end of the entry, since that part is covered
11094 +                */
11095 +               if (ei->addr <= start)
11096 +                       start = ei->addr + ei->size;
11097 +               /* if start is now at or beyond end, we're done, full
11098 +                * coverage */
11099 +               if (start >= end)
11100 +                       return 1; /* we're done */
11101 +       }
11102 +       return 0;
11103 +}
11104 +
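A hedged usage sketch of e820_all_mapped(); the mmio_base/mmio_len names are purely hypothetical and only show the calling convention (exclusive end, type to match):

        /* hypothetical caller: warn if a would-be MMIO window is really RAM */
        if (e820_all_mapped(mmio_base, mmio_base + mmio_len, E820_RAM))
                printk(KERN_WARNING "0x%lx-0x%lx is mapped as RAM, not MMIO\n",
                       mmio_base, mmio_base + mmio_len);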
11105 +/*
11106 + * Find the highest page frame number we have available
11107 + */
11108 +void __init find_max_pfn(void)
11109 +{
11110 +       int i;
11111 +
11112 +       max_pfn = 0;
11113 +       if (efi_enabled) {
11114 +               efi_memmap_walk(efi_find_max_pfn, &max_pfn);
11115 +               efi_memmap_walk(efi_memory_present_wrapper, NULL);
11116 +               return;
11117 +       }
11118 +
11119 +       for (i = 0; i < e820.nr_map; i++) {
11120 +               unsigned long start, end;
11121 +               /* RAM? */
11122 +               if (e820.map[i].type != E820_RAM)
11123 +                       continue;
11124 +               start = PFN_UP(e820.map[i].addr);
11125 +               end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
11126 +               if (start >= end)
11127 +                       continue;
11128 +               if (end > max_pfn)
11129 +                       max_pfn = end;
11130 +               memory_present(0, start, end);
11131 +       }
11132 +}
11133 +
11134 +/*
11135 + * Determine low and high memory ranges:
11136 + */
11137 +unsigned long __init find_max_low_pfn(void)
11138 +{
11139 +       unsigned long max_low_pfn;
11140 +
11141 +       max_low_pfn = max_pfn;
11142 +       if (max_low_pfn > MAXMEM_PFN) {
11143 +               if (highmem_pages == -1)
11144 +                       highmem_pages = max_pfn - MAXMEM_PFN;
11145 +               if (highmem_pages + MAXMEM_PFN < max_pfn)
11146 +                       max_pfn = MAXMEM_PFN + highmem_pages;
11147 +               if (highmem_pages + MAXMEM_PFN > max_pfn) {
11148 +                       printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
11149 +                       highmem_pages = 0;
11150 +               }
11151 +               max_low_pfn = MAXMEM_PFN;
11152 +#ifndef CONFIG_HIGHMEM
11153 +               /* Maximum memory usable is what is directly addressable */
11154 +               printk(KERN_WARNING "Warning only %ldMB will be used.\n",
11155 +                                       MAXMEM>>20);
11156 +               if (max_pfn > MAX_NONPAE_PFN)
11157 +                       printk(KERN_WARNING "Use a PAE enabled kernel.\n");
11158 +               else
11159 +                       printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
11160 +               max_pfn = MAXMEM_PFN;
11161 +#else /* !CONFIG_HIGHMEM */
11162 +#ifndef CONFIG_X86_PAE
11163 +               if (max_pfn > MAX_NONPAE_PFN) {
11164 +                       max_pfn = MAX_NONPAE_PFN;
11165 +                       printk(KERN_WARNING "Warning only 4GB will be used.\n");
11166 +                       printk(KERN_WARNING "Use a PAE enabled kernel.\n");
11167 +               }
11168 +#endif /* !CONFIG_X86_PAE */
11169 +#endif /* !CONFIG_HIGHMEM */
11170 +       } else {
11171 +               if (highmem_pages == -1)
11172 +                       highmem_pages = 0;
11173 +#ifdef CONFIG_HIGHMEM
11174 +               if (highmem_pages >= max_pfn) {
11175 +                       printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
11176 +                       highmem_pages = 0;
11177 +               }
11178 +               if (highmem_pages) {
11179 +                       if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
11180 +                               printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
11181 +                               highmem_pages = 0;
11182 +                       }
11183 +                       max_low_pfn -= highmem_pages;
11184 +               }
11185 +#else
11186 +               if (highmem_pages)
11187 +                       printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
11188 +#endif
11189 +       }
11190 +       return max_low_pfn;
11191 +}
11192 +
11193 +/*
11194 + * Free all available memory for boot time allocation.  Used
11195 + * as a callback function by efi_memory_walk()
11196 + */
11197 +
11198 +static int __init
11199 +free_available_memory(unsigned long start, unsigned long end, void *arg)
11200 +{
11201 +       /* check max_low_pfn */
11202 +       if (start >= ((max_low_pfn + 1) << PAGE_SHIFT))
11203 +               return 0;
11204 +       if (end >= ((max_low_pfn + 1) << PAGE_SHIFT))
11205 +               end = (max_low_pfn + 1) << PAGE_SHIFT;
11206 +       if (start < end)
11207 +               free_bootmem(start, end - start);
11208 +
11209 +       return 0;
11210 +}
11211 +/*
11212 + * Register fully available low RAM pages with the bootmem allocator.
11213 + */
11214 +static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
11215 +{
11216 +       int i;
11217 +
11218 +       if (efi_enabled) {
11219 +               efi_memmap_walk(free_available_memory, NULL);
11220 +               return;
11221 +       }
11222 +       for (i = 0; i < e820.nr_map; i++) {
11223 +               unsigned long curr_pfn, last_pfn, size;
11224 +               /*
11225 +                * Reserve usable low memory
11226 +                */
11227 +               if (e820.map[i].type != E820_RAM)
11228 +                       continue;
11229 +               /*
11230 +                * We are rounding up the start address of usable memory:
11231 +                */
11232 +               curr_pfn = PFN_UP(e820.map[i].addr);
11233 +               if (curr_pfn >= max_low_pfn)
11234 +                       continue;
11235 +               /*
11236 +                * ... and at the end of the usable range downwards:
11237 +                */
11238 +               last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
11239 +
11240 +#ifdef CONFIG_XEN
11241 +               /*
11242 +                 * Truncate to the number of actual pages currently
11243 +                 * present.
11244 +                 */
11245 +               if (last_pfn > xen_start_info->nr_pages)
11246 +                       last_pfn = xen_start_info->nr_pages;
11247 +#endif
11248 +
11249 +               if (last_pfn > max_low_pfn)
11250 +                       last_pfn = max_low_pfn;
11251 +
11252 +               /*
11253 +                * .. finally, did all the rounding and playing
11254 +                * around just make the area go away?
11255 +                */
11256 +               if (last_pfn <= curr_pfn)
11257 +                       continue;
11258 +
11259 +               size = last_pfn - curr_pfn;
11260 +               free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
11261 +       }
11262 +}
11263 +
11264 +#ifndef CONFIG_XEN
11265 +/*
11266 + * workaround for Dell systems that neglect to reserve EBDA
11267 + */
11268 +static void __init reserve_ebda_region(void)
11269 +{
11270 +       unsigned int addr;
11271 +       addr = get_bios_ebda();
11272 +       if (addr)
11273 +               reserve_bootmem(addr, PAGE_SIZE);       
11274 +}
11275 +#endif
11276 +
11277 +#ifndef CONFIG_NEED_MULTIPLE_NODES
11278 +void __init setup_bootmem_allocator(void);
11279 +static unsigned long __init setup_memory(void)
11280 +{
11281 +       /*
11282 +        * partially used pages are not usable - thus
11283 +        * we are rounding upwards:
11284 +        */
11285 +       min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
11286 +               xen_start_info->nr_pt_frames;
11287 +
11288 +       find_max_pfn();
11289 +
11290 +       max_low_pfn = find_max_low_pfn();
11291 +
11292 +#ifdef CONFIG_HIGHMEM
11293 +       highstart_pfn = highend_pfn = max_pfn;
11294 +       if (max_pfn > max_low_pfn) {
11295 +               highstart_pfn = max_low_pfn;
11296 +       }
11297 +       printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
11298 +               pages_to_mb(highend_pfn - highstart_pfn));
11299 +#endif
11300 +       printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
11301 +                       pages_to_mb(max_low_pfn));
11302 +
11303 +       setup_bootmem_allocator();
11304 +
11305 +       return max_low_pfn;
11306 +}
11307 +
11308 +void __init zone_sizes_init(void)
11309 +{
11310 +       unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
11311 +       unsigned int max_dma, low;
11312 +
11313 +       /*
11314 +        * XEN: Our notion of "DMA memory" is fake when running over Xen.
11315 +        * We simply put all RAM in the DMA zone so that those drivers which
11316 +        * needlessly specify GFP_DMA do not get starved of RAM unnecessarily.
11317 +        * Those drivers that *do* require lowmem are screwed anyway when
11318 +        * running over Xen!
11319 +        */
11320 +       max_dma = max_low_pfn;
11321 +       low = max_low_pfn;
11322 +
11323 +       if (low < max_dma)
11324 +               zones_size[ZONE_DMA] = low;
11325 +       else {
11326 +               zones_size[ZONE_DMA] = max_dma;
11327 +               zones_size[ZONE_NORMAL] = low - max_dma;
11328 +#ifdef CONFIG_HIGHMEM
11329 +               zones_size[ZONE_HIGHMEM] = highend_pfn - low;
11330 +#endif
11331 +       }
11332 +       free_area_init(zones_size);
11333 +}
11334 +#else
11335 +extern unsigned long __init setup_memory(void);
11336 +extern void zone_sizes_init(void);
11337 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
11338 +
11339 +void __init setup_bootmem_allocator(void)
11340 +{
11341 +       unsigned long bootmap_size;
11342 +       /*
11343 +        * Initialize the boot-time allocator (with low memory only):
11344 +        */
11345 +       bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
11346 +
11347 +       register_bootmem_low_pages(max_low_pfn);
11348 +
11349 +       /*
11350 +        * Reserve the bootmem bitmap itself as well. We do this in two
11351 +        * steps (first step was init_bootmem()) because this catches
11352 +        * the (very unlikely) case of us accidentally initializing the
11353 +        * bootmem allocator with an invalid RAM area.
11354 +        */
11355 +       reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
11356 +                        bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
11357 +
11358 +#ifndef CONFIG_XEN
11359 +       /*
11360 +        * reserve physical page 0 - it's a special BIOS page on many boxes,
11361 +        * enabling clean reboots, SMP operation, laptop functions.
11362 +        */
11363 +       reserve_bootmem(0, PAGE_SIZE);
11364 +
11365 +       /* reserve EBDA region, it's a 4K region */
11366 +       reserve_ebda_region();
11367 +
11368 +       /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
11369 +          PCI prefetch into it (errata #56). Usually the page is reserved anyway,
11370 +          unless you have no PS/2 mouse plugged in. */
11371 +       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
11372 +           boot_cpu_data.x86 == 6)
11373 +            reserve_bootmem(0xa0000 - 4096, 4096);
11374 +
11375 +#ifdef CONFIG_SMP
11376 +       /*
11377 +        * But first pinch a few for the stack/trampoline stuff
11378 +        * FIXME: Don't need the extra page at 4K, but need to fix
11379 +        * trampoline before removing it. (see the GDT stuff)
11380 +        */
11381 +       reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
11382 +#endif
11383 +#ifdef CONFIG_ACPI_SLEEP
11384 +       /*
11385 +        * Reserve low memory region for sleep support.
11386 +        */
11387 +       acpi_reserve_bootmem();
11388 +#endif
11389 +#endif /* !CONFIG_XEN */
11390 +
11391 +#ifdef CONFIG_BLK_DEV_INITRD
11392 +       if (xen_start_info->mod_start) {
11393 +               if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
11394 +                       /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
11395 +                       initrd_start = INITRD_START + PAGE_OFFSET;
11396 +                       initrd_end = initrd_start+INITRD_SIZE;
11397 +                       initrd_below_start_ok = 1;
11398 +               }
11399 +               else {
11400 +                       printk(KERN_ERR "initrd extends beyond end of memory "
11401 +                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
11402 +                           INITRD_START + INITRD_SIZE,
11403 +                           max_low_pfn << PAGE_SHIFT);
11404 +                       initrd_start = 0;
11405 +               }
11406 +       }
11407 +#endif
11408 +#ifdef CONFIG_KEXEC
11409 +       if (crashk_res.start != crashk_res.end)
11410 +               reserve_bootmem(crashk_res.start,
11411 +                       crashk_res.end - crashk_res.start + 1);
11412 +#endif
11413 +
11414 +       if (!xen_feature(XENFEAT_auto_translated_physmap))
11415 +               phys_to_machine_mapping =
11416 +                       (unsigned long *)xen_start_info->mfn_list;
11417 +}
11418 +
11419 +/*
11420 + * The node 0 pgdat is initialized before all of these because
11421 + * it's needed for bootmem.  node>0 pgdats have their virtual
11422 + * space allocated before the pagetables are in place to access
11423 + * them, so they can't be cleared then.
11424 + *
11425 + * This should all compile down to nothing when NUMA is off.
11426 + */
11427 +void __init remapped_pgdat_init(void)
11428 +{
11429 +       int nid;
11430 +
11431 +       for_each_online_node(nid) {
11432 +               if (nid != 0)
11433 +                       memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
11434 +       }
11435 +}
11436 +
11437 +/*
11438 + * Request address space for all standard RAM and ROM resources
11439 + * and also for regions reported as reserved by the e820.
11440 + */
11441 +static void __init
11442 +legacy_init_iomem_resources(struct e820entry *e820, int nr_map,
11443 +                           struct resource *code_resource,
11444 +                           struct resource *data_resource)
11445 +{
11446 +       int i;
11447 +
11448 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
11449 +       probe_roms();
11450 +#endif
11451 +
11452 +       for (i = 0; i < nr_map; i++) {
11453 +               struct resource *res;
11454 +               if (e820[i].addr + e820[i].size > 0x100000000ULL)
11455 +                       continue;
11456 +               res = alloc_bootmem_low(sizeof(struct resource));
11457 +               switch (e820[i].type) {
11458 +               case E820_RAM:  res->name = "System RAM"; break;
11459 +               case E820_ACPI: res->name = "ACPI Tables"; break;
11460 +               case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
11461 +               default:        res->name = "reserved";
11462 +               }
11463 +               res->start = e820[i].addr;
11464 +               res->end = res->start + e820[i].size - 1;
11465 +               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
11466 +               request_resource(&iomem_resource, res);
11467 +               if (e820[i].type == E820_RAM) {
11468 +                       /*
11469 +                        *  We don't know which RAM region contains kernel data,
11470 +                        *  so we try it repeatedly and let the resource manager
11471 +                        *  test it.
11472 +                        */
11473 +                       request_resource(res, code_resource);
11474 +                       request_resource(res, data_resource);
11475 +#ifdef CONFIG_KEXEC
11476 +                       request_resource(res, &crashk_res);
11477 +#endif
11478 +               }
11479 +       }
11480 +}
11481 +
11482 +/*
11483 + * Locate an unused range of the physical address space below 4G which
11484 + * can be used for PCI mappings.
11485 + */
11486 +static void __init
11487 +e820_setup_gap(struct e820entry *e820, int nr_map)
11488 +{
11489 +       unsigned long gapstart, gapsize, round;
11490 +       unsigned long long last;
11491 +       int i;
11492 +
11493 +       /*
11494 +        * Search for the biggest gap in the low 32 bits of the e820
11495 +        * memory space.
11496 +        */
11497 +       last = 0x100000000ull;
11498 +       gapstart = 0x10000000;
11499 +       gapsize = 0x400000;
11500 +       i = nr_map;
11501 +       while (--i >= 0) {
11502 +               unsigned long long start = e820[i].addr;
11503 +               unsigned long long end = start + e820[i].size;
11504 +
11505 +               /*
11506 +                * Since "last" is at most 4GB, we know we'll
11507 +                * fit in 32 bits if this condition is true
11508 +                */
11509 +               if (last > end) {
11510 +                       unsigned long gap = last - end;
11511 +
11512 +                       if (gap > gapsize) {
11513 +                               gapsize = gap;
11514 +                               gapstart = end;
11515 +                       }
11516 +               }
11517 +               if (start < last)
11518 +                       last = start;
11519 +       }
11520 +
11521 +       /*
11522 +        * See how much we want to round up: start off with
11523 +        * rounding to the next 1MB area.
11524 +        */
11525 +       round = 0x100000;
11526 +       while ((gapsize >> 4) > round)
11527 +               round += round;
11528 +       /* Fun with two's complement */
11529 +       pci_mem_start = (gapstart + round) & -round;
11530 +
11531 +       printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
11532 +               pci_mem_start, gapstart, gapsize);
11533 +}
11534 +
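A worked example of the rounding above (numbers assumed for illustration):

        /*
         * gapstart = 0xC0000000, gapsize = 0x40000000 (a 1 GiB gap):
         *   round doubles from 0x100000 while (gapsize >> 4) > round,
         *   stopping at 0x4000000 (64 MiB), so
         *   pci_mem_start = (0xC0000000 + 0x4000000) & -0x4000000
         *                 = 0xC4000000.
         */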
11535 +/*
11536 + * Request address space for all standard resources
11537 + */
11538 +static void __init register_memory(void)
11539 +{
11540 +#ifdef CONFIG_XEN
11541 +       struct e820entry *machine_e820;
11542 +       struct xen_memory_map memmap;
11543 +#endif
11544 +       int           i;
11545 +
11546 +       /* Not dom0? Just register the pseudo-physical map and return. */
11547 +       if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
11548 +               legacy_init_iomem_resources(e820.map, e820.nr_map,
11549 +                                           &code_resource, &data_resource);
11550 +               return;
11551 +       }
11552 +
11553 +#ifdef CONFIG_XEN
11554 +       machine_e820 = alloc_bootmem_low_pages(PAGE_SIZE);
11555 +
11556 +       memmap.nr_entries = E820MAX;
11557 +       set_xen_guest_handle(memmap.buffer, machine_e820);
11558 +
11559 +       BUG_ON(HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap));
11560 +
11561 +       legacy_init_iomem_resources(machine_e820, memmap.nr_entries,
11562 +                                   &code_resource, &data_resource);
11563 +#else
11564 +       if (efi_enabled)
11565 +               efi_initialize_iomem_resources(&code_resource, &data_resource);
11566 +       else
11567 +               legacy_init_iomem_resources(e820.map, e820.nr_map,
11568 +                                           &code_resource, &data_resource);
11569 +#endif
11570 +
11571 +       /* EFI systems may still have VGA */
11572 +       request_resource(&iomem_resource, &video_ram_resource);
11573 +
11574 +       /* request I/O space for devices used on all i[345]86 PCs */
11575 +       for (i = 0; i < STANDARD_IO_RESOURCES; i++)
11576 +               request_resource(&ioport_resource, &standard_io_resources[i]);
11577 +
11578 +#ifdef CONFIG_XEN
11579 +       e820_setup_gap(machine_e820, memmap.nr_entries);
11580 +       free_bootmem(__pa(machine_e820), PAGE_SIZE);
11581 +#else
11582 +       e820_setup_gap(e820.map, e820.nr_map);
11583 +#endif
11584 +}
11585 +
11586 +static char * __init machine_specific_memory_setup(void);
11587 +
11588 +#ifdef CONFIG_MCA
11589 +static void set_mca_bus(int x)
11590 +{
11591 +       MCA_bus = x;
11592 +}
11593 +#else
11594 +static void set_mca_bus(int x) { }
11595 +#endif
11596 +
11597 +/*
11598 + * Determine if we were loaded by an EFI loader.  If so, then we have also been
11599 + * passed the efi memmap, systab, etc., so we should use these data structures
11600 + * for initialization.  Note, the efi init code path is determined by the
11601 + * global efi_enabled. This allows the same kernel image to be used on existing
11602 + * systems (with a traditional BIOS) as well as on EFI systems.
11603 + */
11604 +void __init setup_arch(char **cmdline_p)
11605 +{
11606 +       int i, j, k, fpp;
11607 +       struct physdev_set_iopl set_iopl;
11608 +       unsigned long max_low_pfn;
11609 +
11610 +       /* Force a quick death if the kernel panics (not domain 0). */
11611 +       extern int panic_timeout;
11612 +       if (!panic_timeout && !(xen_start_info->flags & SIF_INITDOMAIN))
11613 +               panic_timeout = 1;
11614 +
11615 +       /* Register a call for panic conditions. */
11616 +       atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
11617 +
11618 +       HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
11619 +       HYPERVISOR_vm_assist(VMASST_CMD_enable,
11620 +                            VMASST_TYPE_writable_pagetables);
11621 +
11622 +       memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
11623 +       pre_setup_arch_hook();
11624 +       early_cpu_init();
11625 +
11626 +       /*
11627 +        * FIXME: This isn't an official loader_type right
11628 +        * now but does currently work with elilo.
11629 +        * If we were configured as an EFI kernel, check to make
11630 +        * sure that we were loaded correctly from elilo and that
11631 +        * the system table is valid.  If not, then initialize normally.
11632 +        */
11633 +#ifdef CONFIG_EFI
11634 +       if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
11635 +               efi_enabled = 1;
11636 +#endif
11637 +
11638 +       /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
11639 +          properly.  Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
11640 +       */
11641 +       ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
11642 +       drive_info = DRIVE_INFO;
11643 +       screen_info = SCREEN_INFO;
11644 +       edid_info = EDID_INFO;
11645 +       apm_info.bios = APM_BIOS_INFO;
11646 +       ist_info = IST_INFO;
11647 +       saved_videomode = VIDEO_MODE;
11648 +       if( SYS_DESC_TABLE.length != 0 ) {
11649 +               set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
11650 +               machine_id = SYS_DESC_TABLE.table[0];
11651 +               machine_submodel_id = SYS_DESC_TABLE.table[1];
11652 +               BIOS_revision = SYS_DESC_TABLE.table[2];
11653 +       }
11654 +       bootloader_type = LOADER_TYPE;
11655 +
11656 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
11657 +               /* This is drawn from a dump from vgacon:startup in
11658 +                * standard Linux. */
11659 +               screen_info.orig_video_mode = 3; 
11660 +               screen_info.orig_video_isVGA = 1;
11661 +               screen_info.orig_video_lines = 25;
11662 +               screen_info.orig_video_cols = 80;
11663 +               screen_info.orig_video_ega_bx = 3;
11664 +               screen_info.orig_video_points = 16;
11665 +       } else
11666 +               screen_info.orig_video_isVGA = 0;
11667 +
11668 +#ifdef CONFIG_BLK_DEV_RAM
11669 +       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
11670 +       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
11671 +       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
11672 +#endif
11673 +
11674 +       setup_xen_features();
11675 +
11676 +       ARCH_SETUP
11677 +       if (efi_enabled)
11678 +               efi_init();
11679 +       else {
11680 +               printk(KERN_INFO "BIOS-provided physical RAM map:\n");
11681 +               print_memory_map(machine_specific_memory_setup());
11682 +       }
11683 +
11684 +       copy_edd();
11685 +
11686 +       if (!MOUNT_ROOT_RDONLY)
11687 +               root_mountflags &= ~MS_RDONLY;
11688 +       init_mm.start_code = (unsigned long) _text;
11689 +       init_mm.end_code = (unsigned long) _etext;
11690 +       init_mm.end_data = (unsigned long) _edata;
11691 +       init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
11692 +                      xen_start_info->nr_pt_frames) << PAGE_SHIFT;
11693 +
11694 +       code_resource.start = virt_to_phys(_text);
11695 +       code_resource.end = virt_to_phys(_etext)-1;
11696 +       data_resource.start = virt_to_phys(_etext);
11697 +       data_resource.end = virt_to_phys(_edata)-1;
11698 +
11699 +       parse_cmdline_early(cmdline_p);
11700 +
11701 +#ifdef CONFIG_EARLY_PRINTK
11702 +       {
11703 +               char *s = strstr(*cmdline_p, "earlyprintk=");
11704 +               if (s) {
11705 +                       setup_early_printk(strchr(s, '=') + 1);
11706 +                       printk("early console enabled\n");
11707 +               }
11708 +       }
11709 +#endif
11710 +
11711 +       max_low_pfn = setup_memory();
11712 +
11713 +       /*
11714 +        * NOTE: before this point _nobody_ is allowed to allocate
11715 +        * any memory using the bootmem allocator.  Although the
11716 +        * allocator is now initialised, only the first 8MB of the kernel
11717 +        * virtual address space has been mapped.  All allocations made before
11718 +        * paging_init() completes must use the alloc_bootmem_low_pages()
11719 +        * variant (which allocates DMA'able memory) and care must be taken
11720 +        * not to exceed the 8MB limit.
11721 +        */
11722 +
11723 +#ifdef CONFIG_SMP
11724 +       smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
11725 +#endif
11726 +       paging_init();
11727 +       remapped_pgdat_init();
11728 +       sparse_init();
11729 +       zone_sizes_init();
11730 +
11731 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
11732 +       /*
11733 +        * Find and reserve possible boot-time SMP configuration:
11734 +        */
11735 +       find_smp_config();
11736 +#endif
11737 +
11738 +       /* Make sure we have a correctly sized P->M table. */
11739 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
11740 +               phys_to_machine_mapping = alloc_bootmem_low_pages(
11741 +                    max_pfn * sizeof(unsigned long));
11742 +               memset(phys_to_machine_mapping, ~0,
11743 +                      max_pfn * sizeof(unsigned long));
11744 +               memcpy(phys_to_machine_mapping,
11745 +                      (unsigned long *)xen_start_info->mfn_list,
11746 +                      xen_start_info->nr_pages * sizeof(unsigned long));
11747 +               free_bootmem(
11748 +                    __pa(xen_start_info->mfn_list),
11749 +                    PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
11750 +                                    sizeof(unsigned long))));
11751 +
11752 +               /*
11753 +                * Initialise the two-level list of frames that describes the
11754 +                * frames making up the p2m table. Used by save/restore.
11755 +                */
11756 +               pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
11757 +               HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
11758 +                    virt_to_mfn(pfn_to_mfn_frame_list_list);
11759 +
11760 +               fpp = PAGE_SIZE/sizeof(unsigned long);
11761 +               for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
11762 +                       if ((j % fpp) == 0) {
11763 +                               k++;
11764 +                               BUG_ON(k>=16);
11765 +                               pfn_to_mfn_frame_list[k] =
11766 +                                       alloc_bootmem_low_pages(PAGE_SIZE);
11767 +                               pfn_to_mfn_frame_list_list[k] =
11768 +                                       virt_to_mfn(pfn_to_mfn_frame_list[k]);
11769 +                               j=0;
11770 +                       }
11771 +                       pfn_to_mfn_frame_list[k][j] =
11772 +                               virt_to_mfn(&phys_to_machine_mapping[i]);
11773 +               }
11774 +               HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
11775 +       }
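A note on the block above: with 4 KB pages and 4-byte PFN entries on i386, fpp = 1024, each page of pfn_to_mfn_frame_list[] references 1024 p2m frames, and the 16-entry frame-list-list therefore covers at most 16*1024*1024 PFNs (64 GB of pseudo-physical memory) before the BUG_ON(k>=16) would trip. A minimal user-space sketch of that index arithmetic, assuming those i386 constants (illustrative only, not part of the patch):

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PTR_SIZE   4UL                 /* sizeof(unsigned long) on i386 */

int main(void)
{
	unsigned long fpp = PAGE_SIZE / PTR_SIZE;   /* p2m entries per frame */
	unsigned long pfn = 123456;                 /* arbitrary example PFN */

	/* Which p2m frame holds this PFN, which frame-list page points at
	 * that frame, and at which slot - mirroring the i/j/k loop above. */
	unsigned long p2m_frame = pfn / fpp;
	unsigned long k = p2m_frame / fpp;          /* pfn_to_mfn_frame_list_list index */
	unsigned long j = p2m_frame % fpp;          /* pfn_to_mfn_frame_list[k] index   */

	printf("fpp=%lu  pfn=%lu -> frame_list[%lu][%lu]\n", fpp, pfn, k, j);
	printf("addressable PFNs: %lu\n", 16 * fpp * fpp);
	return 0;
}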
11776 +
11777 +       /*
11778 +        * NOTE: at this point the bootmem allocator is fully available.
11779 +        */
11780 +
11781 +       if (xen_start_info->flags & SIF_INITDOMAIN)
11782 +               dmi_scan_machine();
11783 +
11784 +#ifdef CONFIG_X86_GENERICARCH
11785 +       generic_apic_probe(*cmdline_p);
11786 +#endif 
11787 +       if (efi_enabled)
11788 +               efi_map_memmap();
11789 +
11790 +       set_iopl.iopl = 1;
11791 +       HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
11792 +
11793 +#ifdef CONFIG_X86_IO_APIC
11794 +       check_acpi_pci();       /* Checks more than just ACPI actually */
11795 +#endif
11796 +
11797 +#ifdef CONFIG_ACPI
11798 +       if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
11799 +               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
11800 +               acpi_disabled = 1;
11801 +               acpi_ht = 0;
11802 +       }
11803 +
11804 +       /*
11805 +        * Parse the ACPI tables for possible boot-time SMP configuration.
11806 +        */
11807 +       acpi_boot_table_init();
11808 +       acpi_boot_init();
11809 +
11810 +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
11811 +       if (def_to_bigsmp)
11812 +               printk(KERN_WARNING "More than 8 CPUs detected and "
11813 +                       "CONFIG_X86_PC cannot handle it.\nUse "
11814 +                       "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
11815 +#endif
11816 +#endif
11817 +#ifdef CONFIG_X86_LOCAL_APIC
11818 +       if (smp_found_config)
11819 +               get_smp_config();
11820 +#endif
11821 +#if defined(CONFIG_XEN) && defined(CONFIG_SMP)
11822 +       prefill_possible_map();
11823 +#endif
11824 +
11825 +       register_memory();
11826 +
11827 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
11828 +               if (!(xen_start_info->flags & SIF_PRIVILEGED))
11829 +                       panic("Xen granted us console access "
11830 +                             "but not privileged status");
11831 +
11832 +#ifdef CONFIG_VT
11833 +#if defined(CONFIG_VGA_CONSOLE)
11834 +               if (!efi_enabled ||
11835 +                   (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
11836 +                       conswitchp = &vga_con;
11837 +#elif defined(CONFIG_DUMMY_CONSOLE)
11838 +               conswitchp = &dummy_con;
11839 +#endif
11840 +#endif
11841 +       } else {
11842 +               extern int console_use_vt;
11843 +               console_use_vt = 0;
11844 +       }
11845 +}
11846 +
11847 +static int
11848 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
11849 +{
11850 +       HYPERVISOR_shutdown(SHUTDOWN_crash);
11851 +       /* we're never actually going to get here... */
11852 +       return NOTIFY_DONE;
11853 +}
11854 +
11855 +static __init int add_pcspkr(void)
11856 +{
11857 +       struct platform_device *pd;
11858 +       int ret;
11859 +
11860 +       pd = platform_device_alloc("pcspkr", -1);
11861 +       if (!pd)
11862 +               return -ENOMEM;
11863 +
11864 +       ret = platform_device_add(pd);
11865 +       if (ret)
11866 +               platform_device_put(pd);
11867 +
11868 +       return ret;
11869 +}
11870 +device_initcall(add_pcspkr);
11871 +
11872 +#include "setup_arch_post.h"
11873 +/*
11874 + * Local Variables:
11875 + * mode:c
11876 + * c-file-style:"k&r"
11877 + * c-basic-offset:8
11878 + * End:
11879 + */
11880 diff -urNp linux-2.6/arch/i386/kernel/smp-xen.c new/arch/i386/kernel/smp-xen.c
11881 --- linux-2.6/arch/i386/kernel/smp-xen.c        1970-01-01 01:00:00.000000000 +0100
11882 +++ new/arch/i386/kernel/smp-xen.c      2006-07-07 15:10:04.000000000 +0200
11883 @@ -0,0 +1,614 @@
11884 +/*
11885 + *     Intel SMP support routines.
11886 + *
11887 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
11888 + *     (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
11889 + *
11890 + *     This code is released under the GNU General Public License version 2 or
11891 + *     later.
11892 + */
11893 +
11894 +#include <linux/init.h>
11895 +
11896 +#include <linux/mm.h>
11897 +#include <linux/delay.h>
11898 +#include <linux/spinlock.h>
11899 +#include <linux/smp_lock.h>
11900 +#include <linux/kernel_stat.h>
11901 +#include <linux/mc146818rtc.h>
11902 +#include <linux/cache.h>
11903 +#include <linux/interrupt.h>
11904 +#include <linux/cpu.h>
11905 +#include <linux/module.h>
11906 +
11907 +#include <asm/mtrr.h>
11908 +#include <asm/tlbflush.h>
11909 +#if 0
11910 +#include <mach_apic.h>
11911 +#endif
11912 +#include <xen/evtchn.h>
11913 +
11914 +/*
11915 + *     Some notes on x86 processor bugs affecting SMP operation:
11916 + *
11917 + *     Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
11918 + *     The Linux implications for SMP are handled as follows:
11919 + *
11920 + *     Pentium III / [Xeon]
11921 + *             None of the E1AP-E3AP errata are visible to the user.
11922 + *
11923 + *     E1AP.   see PII A1AP
11924 + *     E2AP.   see PII A2AP
11925 + *     E3AP.   see PII A3AP
11926 + *
11927 + *     Pentium II / [Xeon]
11928 + *             None of the A1AP-A3AP errata are visible to the user.
11929 + *
11930 + *     A1AP.   see PPro 1AP
11931 + *     A2AP.   see PPro 2AP
11932 + *     A3AP.   see PPro 7AP
11933 + *
11934 + *     Pentium Pro
11935 + *             None of 1AP-9AP errata are visible to the normal user,
11936 + *     except occasional delivery of 'spurious interrupt' as trap #15.
11937 + *     This is very rare and a non-problem.
11938 + *
11939 + *     1AP.    Linux maps APIC as non-cacheable
11940 + *     2AP.    worked around in hardware
11941 + *     3AP.    fixed in C0 and above steppings microcode update.
11942 + *             Linux does not use excessive STARTUP_IPIs.
11943 + *     4AP.    worked around in hardware
11944 + *     5AP.    symmetric IO mode (normal Linux operation) not affected.
11945 + *             'noapic' mode has vector 0xf filled out properly.
11946 + *     6AP.    'noapic' mode might be affected - fixed in later steppings
11947 + *     7AP.    We do not assume writes to the LVT deasserting IRQs
11948 + *     8AP.    We do not enable low power mode (deep sleep) during MP bootup
11949 + *     9AP.    We do not use mixed mode
11950 + *
11951 + *     Pentium
11952 + *             There is a marginal case where REP MOVS on 100MHz SMP
11953 + *     machines with B stepping processors can fail. XXX should provide
11954 + *     an L1cache=Writethrough or L1cache=off option.
11955 + *
11956 + *             B stepping CPUs may hang. There are hardware workarounds
11957 + *     for this. We warn about it in case your board doesn't have the
11958 + *     workarounds. Basically that's so I can tell anyone with a B stepping
11959 + *     CPU and SMP problems "tough".
11960 + *
11961 + *     Specific items [From Pentium Processor Specification Update]
11962 + *
11963 + *     1AP.    Linux doesn't use remote read
11964 + *     2AP.    Linux doesn't trust APIC errors
11965 + *     3AP.    We work around this
11966 + *     4AP.    Linux never generated 3 interrupts of the same priority
11967 + *             to cause a lost local interrupt.
11968 + *     5AP.    Remote read is never used
11969 + *     6AP.    not affected - worked around in hardware
11970 + *     7AP.    not affected - worked around in hardware
11971 + *     8AP.    worked around in hardware - we get explicit CS errors if not
11972 + *     9AP.    only 'noapic' mode affected. Might generate spurious
11973 + *             interrupts, we log only the first one and count the
11974 + *             rest silently.
11975 + *     10AP.   not affected - worked around in hardware
11976 + *     11AP.   Linux reads the APIC between writes to avoid this, as per
11977 + *             the documentation. Make sure you preserve this as it affects
11978 + *             the C stepping chips too.
11979 + *     12AP.   not affected - worked around in hardware
11980 + *     13AP.   not affected - worked around in hardware
11981 + *     14AP.   we always deassert INIT during bootup
11982 + *     15AP.   not affected - worked around in hardware
11983 + *     16AP.   not affected - worked around in hardware
11984 + *     17AP.   not affected - worked around in hardware
11985 + *     18AP.   not affected - worked around in hardware
11986 + *     19AP.   not affected - worked around in BIOS
11987 + *
11988 + *     If this sounds worrying, believe me: these bugs are either ___RARE___
11989 + *     or are signal timing bugs worked around in hardware, and there's
11990 + *     nothing of note with C stepping upwards.
11991 + */
11992 +
11993 +DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
11994 +
11995 +/*
11996 + * the following functions deal with sending IPIs between CPUs.
11997 + *
11998 + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
11999 + */
12000 +
12001 +static inline int __prepare_ICR (unsigned int shortcut, int vector)
12002 +{
12003 +       return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL;
12004 +}
12005 +
12006 +static inline int __prepare_ICR2 (unsigned int mask)
12007 +{
12008 +       return SET_APIC_DEST_FIELD(mask);
12009 +}
12010 +
12011 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
12012 +
12013 +static inline void __send_IPI_one(unsigned int cpu, int vector)
12014 +{
12015 +       int irq = per_cpu(ipi_to_irq, cpu)[vector];
12016 +       BUG_ON(irq < 0);
12017 +       notify_remote_via_irq(irq);
12018 +}
12019 +
12020 +void __send_IPI_shortcut(unsigned int shortcut, int vector)
12021 +{
12022 +       int cpu;
12023 +
12024 +       switch (shortcut) {
12025 +       case APIC_DEST_SELF:
12026 +               __send_IPI_one(smp_processor_id(), vector);
12027 +               break;
12028 +       case APIC_DEST_ALLBUT:
12029 +               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
12030 +                       if (cpu == smp_processor_id())
12031 +                               continue;
12032 +                       if (cpu_isset(cpu, cpu_online_map)) {
12033 +                               __send_IPI_one(cpu, vector);
12034 +                       }
12035 +               }
12036 +               break;
12037 +       default:
12038 +               printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
12039 +                      vector);
12040 +               break;
12041 +       }
12042 +}
12043 +
12044 +void fastcall send_IPI_self(int vector)
12045 +{
12046 +       __send_IPI_shortcut(APIC_DEST_SELF, vector);
12047 +}
12048 +
12049 +/*
12050 + * This is only used on smaller machines.
12051 + */
12052 +void send_IPI_mask_bitmask(cpumask_t mask, int vector)
12053 +{
12054 +       unsigned long flags;
12055 +       unsigned int cpu;
12056 +
12057 +       local_irq_save(flags);
12058 +       WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
12059 +
12060 +       for (cpu = 0; cpu < NR_CPUS; ++cpu) {
12061 +               if (cpu_isset(cpu, mask)) {
12062 +                       __send_IPI_one(cpu, vector);
12063 +               }
12064 +       }
12065 +
12066 +       local_irq_restore(flags);
12067 +}
12068 +
12069 +void send_IPI_mask_sequence(cpumask_t mask, int vector)
12070 +{
12071 +
12072 +       send_IPI_mask_bitmask(mask, vector);
12073 +}
12074 +
12075 +#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
12076 +
12077 +#if 0 /* XEN */
12078 +/*
12079 + *     Smarter SMP flushing macros. 
12080 + *             c/o Linus Torvalds.
12081 + *
12082 + *     These mean you can really definitely utterly forget about
12083 + *     writing to user space from interrupts. (It's not allowed anyway).
12084 + *
12085 + *     Optimizations Manfred Spraul <manfred@colorfullife.com>
12086 + */
12087 +
12088 +static cpumask_t flush_cpumask;
12089 +static struct mm_struct * flush_mm;
12090 +static unsigned long flush_va;
12091 +static DEFINE_SPINLOCK(tlbstate_lock);
12092 +#define FLUSH_ALL      0xffffffff
12093 +
12094 +/*
12095 + * We cannot call mmdrop() because we are in interrupt context, 
12096 + * instead update mm->cpu_vm_mask.
12097 + *
12098 + * We need to reload %cr3 since the page tables may be going
12099 + * away from under us..
12100 + */
12101 +static inline void leave_mm (unsigned long cpu)
12102 +{
12103 +       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
12104 +               BUG();
12105 +       cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
12106 +       load_cr3(swapper_pg_dir);
12107 +}
12108 +
12109 +/*
12110 + *
12111 + * The flush IPI assumes that a thread switch happens in this order:
12112 + * [cpu0: the cpu that switches]
12113 + * 1) switch_mm() either 1a) or 1b)
12114 + * 1a) thread switch to a different mm
12115 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
12116 + *     Stop ipi delivery for the old mm. This is not synchronized with
12117 + *     the other cpus, but smp_invalidate_interrupt ignores flush ipis
12118 + *     for the wrong mm, and in the worst case we perform a superfluous
12119 + *     tlb flush.
12120 + * 1a2) set cpu_tlbstate to TLBSTATE_OK
12121 + *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
12122 + *     was in lazy tlb mode.
12123 + * 1a3) update cpu_tlbstate[].active_mm
12124 + *     Now cpu0 accepts tlb flushes for the new mm.
12125 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
12126 + *     Now the other cpus will send tlb flush ipis.
12127 + * 1a4) change cr3.
12128 + * 1b) thread switch without mm change
12129 + *     cpu_tlbstate[].active_mm is correct, cpu0 already handles
12130 + *     flush ipis.
12131 + * 1b1) set cpu_tlbstate to TLBSTATE_OK
12132 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
12133 + *     Atomically set the bit [other cpus will start sending flush ipis],
12134 + *     and test the bit.
12135 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
12136 + * 2) switch %%esp, ie current
12137 + *
12138 + * The interrupt must handle 2 special cases:
12139 + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
12140 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
12141 + *   runs in kernel space, the cpu could load tlb entries for user space
12142 + *   pages.
12143 + *
12144 + * The good news is that cpu_tlbstate is local to each cpu, so there are no
12145 + * write/read ordering problems.
12146 + */
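For orientation, the 1a) path described above corresponds to the following abridged sketch of this generation's i386 switch_mm(); LDT reload and the lazy-tlb 1b) branch are omitted, and the function is renamed to make clear this is a sketch rather than the real implementation:

static inline void switch_mm_sketch(struct mm_struct *prev,
				    struct mm_struct *next)
{
	int cpu = smp_processor_id();

	if (likely(prev != next)) {
		/* 1a1) stop flush-ipi delivery for the old mm */
		cpu_clear(cpu, prev->cpu_vm_mask);
		/* 1a2) + 1a3) accept flushes for the new mm from now on */
		per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
		per_cpu(cpu_tlbstate, cpu).active_mm = next;
		/* 1a4) other cpus may now send flush ipis for next ... */
		cpu_set(cpu, next->cpu_vm_mask);
		/* ... and only then do we switch the page tables */
		load_cr3(next->pgd);
	}
}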
12147 +
12148 +/*
12149 + * TLB flush IPI:
12150 + *
12151 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
12152 + * 2) Leave the mm if we are in the lazy tlb mode.
12153 + */
12154 +
12155 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
12156 +                                    struct pt_regs *regs)
12157 +{
12158 +       unsigned long cpu;
12159 +
12160 +       cpu = get_cpu();
12161 +
12162 +       if (!cpu_isset(cpu, flush_cpumask))
12163 +               goto out;
12164 +               /* 
12165 +                * This was a BUG() but until someone can quote me the
12166 +                * line from the intel manual that guarantees an IPI to
12167 +                * multiple CPUs is retried _only_ on the erroring CPUs
12168 +                * it's staying as a return
12169 +                *
12170 +                * BUG();
12171 +                */
12172 +                
12173 +       if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
12174 +               if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
12175 +                       if (flush_va == FLUSH_ALL)
12176 +                               local_flush_tlb();
12177 +                       else
12178 +                               __flush_tlb_one(flush_va);
12179 +               } else
12180 +                       leave_mm(cpu);
12181 +       }
12182 +       smp_mb__before_clear_bit();
12183 +       cpu_clear(cpu, flush_cpumask);
12184 +       smp_mb__after_clear_bit();
12185 +out:
12186 +       put_cpu_no_resched();
12187 +
12188 +       return IRQ_HANDLED;
12189 +}
12190 +
12191 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
12192 +                                               unsigned long va)
12193 +{
12194 +       /*
12195 +        * A couple of (to be removed) sanity checks:
12196 +        *
12197 +        * - current CPU must not be in mask
12198 +        * - mask must exist :)
12199 +        */
12200 +       BUG_ON(cpus_empty(cpumask));
12201 +       BUG_ON(cpu_isset(smp_processor_id(), cpumask));
12202 +       BUG_ON(!mm);
12203 +
12204 +       /* If a CPU which we ran on has gone down, OK. */
12205 +       cpus_and(cpumask, cpumask, cpu_online_map);
12206 +       if (cpus_empty(cpumask))
12207 +               return;
12208 +
12209 +       /*
12210 +        * I'm not happy about this global shared spinlock in the
12211 +        * MM hot path, but we'll see how contended it is.
12212 +        * Temporarily this turns IRQs off, so that lockups are
12213 +        * detected by the NMI watchdog.
12214 +        */
12215 +       spin_lock(&tlbstate_lock);
12216 +       
12217 +       flush_mm = mm;
12218 +       flush_va = va;
12219 +#if NR_CPUS <= BITS_PER_LONG
12220 +       atomic_set_mask(cpumask, &flush_cpumask);
12221 +#else
12222 +       {
12223 +               int k;
12224 +               unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
12225 +               unsigned long *cpu_mask = (unsigned long *)&cpumask;
12226 +               for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
12227 +                       atomic_set_mask(cpu_mask[k], &flush_mask[k]);
12228 +       }
12229 +#endif
12230 +       /*
12231 +        * We have to send the IPI only to
12232 +        * CPUs affected.
12233 +        */
12234 +       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
12235 +
12236 +       while (!cpus_empty(flush_cpumask))
12237 +               /* nothing. lockup detection does not belong here */
12238 +               mb();
12239 +
12240 +       flush_mm = NULL;
12241 +       flush_va = 0;
12242 +       spin_unlock(&tlbstate_lock);
12243 +}
12244 +       
12245 +void flush_tlb_current_task(void)
12246 +{
12247 +       struct mm_struct *mm = current->mm;
12248 +       cpumask_t cpu_mask;
12249 +
12250 +       preempt_disable();
12251 +       cpu_mask = mm->cpu_vm_mask;
12252 +       cpu_clear(smp_processor_id(), cpu_mask);
12253 +
12254 +       local_flush_tlb();
12255 +       if (!cpus_empty(cpu_mask))
12256 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
12257 +       preempt_enable();
12258 +}
12259 +
12260 +void flush_tlb_mm (struct mm_struct * mm)
12261 +{
12262 +       cpumask_t cpu_mask;
12263 +
12264 +       preempt_disable();
12265 +       cpu_mask = mm->cpu_vm_mask;
12266 +       cpu_clear(smp_processor_id(), cpu_mask);
12267 +
12268 +       if (current->active_mm == mm) {
12269 +               if (current->mm)
12270 +                       local_flush_tlb();
12271 +               else
12272 +                       leave_mm(smp_processor_id());
12273 +       }
12274 +       if (!cpus_empty(cpu_mask))
12275 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
12276 +
12277 +       preempt_enable();
12278 +}
12279 +
12280 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
12281 +{
12282 +       struct mm_struct *mm = vma->vm_mm;
12283 +       cpumask_t cpu_mask;
12284 +
12285 +       preempt_disable();
12286 +       cpu_mask = mm->cpu_vm_mask;
12287 +       cpu_clear(smp_processor_id(), cpu_mask);
12288 +
12289 +       if (current->active_mm == mm) {
12290 +               if(current->mm)
12291 +                       __flush_tlb_one(va);
12292 +               else
12293 +                       leave_mm(smp_processor_id());
12294 +       }
12295 +
12296 +       if (!cpus_empty(cpu_mask))
12297 +               flush_tlb_others(cpu_mask, mm, va);
12298 +
12299 +       preempt_enable();
12300 +}
12301 +EXPORT_SYMBOL(flush_tlb_page);
12302 +
12303 +static void do_flush_tlb_all(void* info)
12304 +{
12305 +       unsigned long cpu = smp_processor_id();
12306 +
12307 +       __flush_tlb_all();
12308 +       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
12309 +               leave_mm(cpu);
12310 +}
12311 +
12312 +void flush_tlb_all(void)
12313 +{
12314 +       on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
12315 +}
12316 +
12317 +#else
12318 +
12319 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
12320 +                                    struct pt_regs *regs)
12321 +{ return 0; }
12322 +void flush_tlb_current_task(void)
12323 +{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
12324 +void flush_tlb_mm(struct mm_struct * mm)
12325 +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
12326 +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
12327 +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
12328 +EXPORT_SYMBOL(flush_tlb_page);
12329 +void flush_tlb_all(void)
12330 +{ xen_tlb_flush_all(); }
12331 +
12332 +#endif /* XEN */
12333 +
12334 +/*
12335 + * this function sends a 'reschedule' IPI to another CPU.
12336 + * it goes straight through and wastes no time serializing
12337 + * anything. Worst case is that we lose a reschedule ...
12338 + */
12339 +void smp_send_reschedule(int cpu)
12340 +{
12341 +       WARN_ON(cpu_is_offline(cpu));
12342 +       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
12343 +}
12344 +
12345 +/*
12346 + * Structure and data for smp_call_function(). This is designed to minimise
12347 + * static memory requirements. It also looks cleaner.
12348 + */
12349 +static DEFINE_SPINLOCK(call_lock);
12350 +
12351 +struct call_data_struct {
12352 +       void (*func) (void *info);
12353 +       void *info;
12354 +       atomic_t started;
12355 +       atomic_t finished;
12356 +       int wait;
12357 +};
12358 +
12359 +void lock_ipi_call_lock(void)
12360 +{
12361 +       spin_lock_irq(&call_lock);
12362 +}
12363 +
12364 +void unlock_ipi_call_lock(void)
12365 +{
12366 +       spin_unlock_irq(&call_lock);
12367 +}
12368 +
12369 +static struct call_data_struct *call_data;
12370 +
12371 +/**
12372 + * smp_call_function(): Run a function on all other CPUs.
12373 + * @func: The function to run. This must be fast and non-blocking.
12374 + * @info: An arbitrary pointer to pass to the function.
12375 + * @nonatomic: currently unused.
12376 + * @wait: If true, wait (atomically) until function has completed on other CPUs.
12377 + *
12378 + * Returns 0 on success, else a negative status code. Does not return until
12379 + * remote CPUs are nearly ready to execute <<func>>, are executing it, or have executed it.
12380 + *
12381 + * You must not call this function with disabled interrupts or from a
12382 + * hardware interrupt handler or from a bottom half handler.
12383 + */
12384 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
12385 +                       int wait)
12386 +{
12387 +       struct call_data_struct data;
12388 +       int cpus;
12389 +
12390 +       /* Holding any lock stops cpus from going down. */
12391 +       spin_lock(&call_lock);
12392 +       cpus = num_online_cpus() - 1;
12393 +       if (!cpus) {
12394 +               spin_unlock(&call_lock);
12395 +               return 0;
12396 +       }
12397 +
12398 +       /* Can deadlock when called with interrupts disabled */
12399 +       WARN_ON(irqs_disabled());
12400 +
12401 +       data.func = func;
12402 +       data.info = info;
12403 +       atomic_set(&data.started, 0);
12404 +       data.wait = wait;
12405 +       if (wait)
12406 +               atomic_set(&data.finished, 0);
12407 +
12408 +       call_data = &data;
12409 +       mb();
12410 +       
12411 +       /* Send a message to all other CPUs and wait for them to respond */
12412 +       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
12413 +
12414 +       /* Wait for response */
12415 +       while (atomic_read(&data.started) != cpus)
12416 +               barrier();
12417 +
12418 +       if (wait)
12419 +               while (atomic_read(&data.finished) != cpus)
12420 +                       barrier();
12421 +       spin_unlock(&call_lock);
12422 +
12423 +       return 0;
12424 +}
12425 +EXPORT_SYMBOL(smp_call_function);
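A hypothetical caller, to illustrate the calling convention documented above (not part of the patch): the callback runs in interrupt context on every other online CPU, so it must not sleep, and the caller must not hold any lock the callback might need.

static atomic_t pings = ATOMIC_INIT(0);

static void ping(void *info)
{
	/* Runs in interrupt context on each remote CPU: keep it short, never sleep. */
	atomic_inc(&pings);
}

static void ping_all_cpus(void)
{
	/* nonatomic is unused; wait=1 blocks until every remote CPU ran ping(). */
	smp_call_function(ping, NULL, 0, 1);
	ping(NULL);		/* smp_call_function skips the calling CPU */
	printk(KERN_DEBUG "ping ran on %d cpus\n", atomic_read(&pings));
}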
12426 +
12427 +static void stop_this_cpu (void * dummy)
12428 +{
12429 +       /*
12430 +        * Remove this CPU:
12431 +        */
12432 +       cpu_clear(smp_processor_id(), cpu_online_map);
12433 +       local_irq_disable();
12434 +#if 0
12435 +       disable_local_APIC();
12436 +#endif
12437 +       if (cpu_data[smp_processor_id()].hlt_works_ok)
12438 +               for(;;) halt();
12439 +       for (;;);
12440 +}
12441 +
12442 +/*
12443 + * this function calls the 'stop' function on all other CPUs in the system.
12444 + */
12445 +
12446 +void smp_send_stop(void)
12447 +{
12448 +       smp_call_function(stop_this_cpu, NULL, 1, 0);
12449 +
12450 +       local_irq_disable();
12451 +#if 0
12452 +       disable_local_APIC();
12453 +#endif
12454 +       local_irq_enable();
12455 +}
12456 +
12457 +/*
12458 + * Reschedule call back. Nothing to do,
12459 + * all the work is done automatically when
12460 + * we return from the interrupt.
12461 + */
12462 +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
12463 +                                    struct pt_regs *regs)
12464 +{
12465 +
12466 +       return IRQ_HANDLED;
12467 +}
12468 +
12469 +#include <linux/kallsyms.h>
12470 +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
12471 +                                       struct pt_regs *regs)
12472 +{
12473 +       void (*func) (void *info) = call_data->func;
12474 +       void *info = call_data->info;
12475 +       int wait = call_data->wait;
12476 +
12477 +       /*
12478 +        * Notify initiating CPU that I've grabbed the data and am
12479 +        * about to execute the function
12480 +        */
12481 +       mb();
12482 +       atomic_inc(&call_data->started);
12483 +       /*
12484 +        * At this point the info structure may be out of scope unless wait==1
12485 +        */
12486 +       irq_enter();
12487 +       (*func)(info);
12488 +       irq_exit();
12489 +
12490 +       if (wait) {
12491 +               mb();
12492 +               atomic_inc(&call_data->finished);
12493 +       }
12494 +
12495 +       return IRQ_HANDLED;
12496 +}
12497 +
12498 diff -urNp linux-2.6/arch/i386/kernel/swiotlb.c new/arch/i386/kernel/swiotlb.c
12499 --- linux-2.6/arch/i386/kernel/swiotlb.c        1970-01-01 01:00:00.000000000 +0100
12500 +++ new/arch/i386/kernel/swiotlb.c      2006-06-28 14:32:13.000000000 +0200
12501 @@ -0,0 +1,672 @@
12502 +/*
12503 + * Dynamic DMA mapping support.
12504 + *
12505 + * This implementation is a fallback for platforms that do not support
12506 + * I/O TLBs (aka DMA address translation hardware).
12507 + * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
12508 + * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
12509 + * Copyright (C) 2000, 2003 Hewlett-Packard Co
12510 + *     David Mosberger-Tang <davidm@hpl.hp.com>
12511 + * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
12512 + */
12513 +
12514 +#include <linux/cache.h>
12515 +#include <linux/mm.h>
12516 +#include <linux/module.h>
12517 +#include <linux/pci.h>
12518 +#include <linux/spinlock.h>
12519 +#include <linux/string.h>
12520 +#include <linux/types.h>
12521 +#include <linux/ctype.h>
12522 +#include <linux/init.h>
12523 +#include <linux/bootmem.h>
12524 +#include <linux/highmem.h>
12525 +#include <asm/io.h>
12526 +#include <asm/pci.h>
12527 +#include <asm/dma.h>
12528 +#include <asm/uaccess.h>
12529 +#include <xen/interface/memory.h>
12530 +
12531 +int swiotlb;
12532 +EXPORT_SYMBOL(swiotlb);
12533 +
12534 +#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
12535 +
12536 +#define SG_ENT_PHYS_ADDRESS(sg)        (page_to_bus((sg)->page) + (sg)->offset)
12537 +
12538 +/*
12539 + * Maximum allowable number of contiguous slabs to map,
12540 + * must be a power of 2.  What is the appropriate value?
12541 + * The complexity of {map,unmap}_single is linearly dependent on this value.
12542 + */
12543 +#define IO_TLB_SEGSIZE 128
12544 +
12545 +/*
12546 + * log of the size of each IO TLB slab.  The number of slabs is command line
12547 + * controllable.
12548 + */
12549 +#define IO_TLB_SHIFT 11
12550 +
12551 +/* Width of DMA addresses in the IO TLB. 31 bits is an aacraid limitation. */
12552 +#define IO_TLB_DMA_BITS 31
12553 +
12554 +static int swiotlb_force;
12555 +static char *iotlb_virt_start;
12556 +static unsigned long iotlb_nslabs;
12557 +
12558 +/*
12559 + * Used to do a quick range check in swiotlb_unmap_single and
12560 + * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
12561 + * API.
12562 + */
12563 +static unsigned long iotlb_pfn_start, iotlb_pfn_end;
12564 +
12565 +/* Does the given dma address reside within the swiotlb aperture? */
12566 +static inline int in_swiotlb_aperture(dma_addr_t dev_addr)
12567 +{
12568 +       unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT);
12569 +       return (pfn_valid(pfn)
12570 +               && (pfn >= iotlb_pfn_start)
12571 +               && (pfn < iotlb_pfn_end));
12572 +}
12573 +
12574 +/*
12575 + * When the IOMMU overflows we return a fallback buffer. This sets the size.
12576 + */
12577 +static unsigned long io_tlb_overflow = 32*1024;
12578 +
12579 +void *io_tlb_overflow_buffer;
12580 +
12581 +/*
12582 + * This is a free list describing the number of free entries available from
12583 + * each index
12584 + */
12585 +static unsigned int *io_tlb_list;
12586 +static unsigned int io_tlb_index;
12587 +
12588 +/*
12589 + * We need to save away the original address corresponding to a mapped entry
12590 + * for the sync operations.
12591 + */
12592 +static struct phys_addr {
12593 +       struct page *page;
12594 +       unsigned int offset;
12595 +} *io_tlb_orig_addr;
12596 +
12597 +/*
12598 + * Protect the above data structures in the map and unmap calls
12599 + */
12600 +static DEFINE_SPINLOCK(io_tlb_lock);
12601 +
12602 +static int __init
12603 +setup_io_tlb_npages(char *str)
12604 +{
12605 +       /* Unlike ia64, the argument is the aperture size in megabytes, not 'slabs'! */
12606 +       if (isdigit(*str)) {
12607 +               iotlb_nslabs = simple_strtoul(str, &str, 0) <<
12608 +                       (20 - IO_TLB_SHIFT);
12609 +               iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
12610 +               /* Round up to power of two (xen_create_contiguous_region). */
12611 +               while (iotlb_nslabs & (iotlb_nslabs-1))
12612 +                       iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
12613 +       }
12614 +       if (*str == ',')
12615 +               ++str;
12616 +       /*
12617 +         * NB. 'force' enables the swiotlb, but doesn't force its use for
12618 +         * every DMA like it does on native Linux. 'off' forcibly disables
12619 +         * use of the swiotlb.
12620 +         */
12621 +       if (!strcmp(str, "force"))
12622 +               swiotlb_force = 1;
12623 +       else if (!strcmp(str, "off"))
12624 +               swiotlb_force = -1;
12625 +       return 1;
12626 +}
12627 +__setup("swiotlb=", setup_io_tlb_npages);
12628 +/* make io_tlb_overflow tunable too? */
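The sizing math behind the "swiotlb=<size in MB>[,force|off]" parameter parsed above, as a standalone check; 64 MB matches the default later chosen by swiotlb_init(), and the power-of-two rounding mirrors the loop above (the additional ALIGN to IO_TLB_SEGSIZE is a no-op for these values and is omitted):

#include <stdio.h>

#define IO_TLB_SHIFT   11	/* 2 KB slabs */
#define IO_TLB_SEGSIZE 128	/* at most 128 contiguous slabs per mapping */

int main(void)
{
	unsigned long mb = 64;	/* e.g. "swiotlb=64", or the built-in default */
	unsigned long nslabs = mb << (20 - IO_TLB_SHIFT);

	/* Round up to a power of two for xen_create_contiguous_region();
	 * 64 MB already is one, so the loop does nothing here. */
	while (nslabs & (nslabs - 1))
		nslabs += nslabs & ~(nslabs - 1);

	printf("%lu MB aperture -> %lu slabs of %u bytes, "
	       "largest single mapping %u KB\n",
	       mb, nslabs, 1u << IO_TLB_SHIFT,
	       (unsigned)((IO_TLB_SEGSIZE << IO_TLB_SHIFT) >> 10));
	return 0;
}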
12629 +
12630 +/*
12631 + * Statically reserve bounce buffer space and initialize bounce buffer data
12632 + * structures for the software IO TLB used to implement the PCI DMA API.
12633 + */
12634 +void
12635 +swiotlb_init_with_default_size (size_t default_size)
12636 +{
12637 +       unsigned long i, bytes;
12638 +
12639 +       if (!iotlb_nslabs) {
12640 +               iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
12641 +               iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
12642 +               /* Round up to power of two (xen_create_contiguous_region). */
12643 +               while (iotlb_nslabs & (iotlb_nslabs-1))
12644 +                       iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
12645 +       }
12646 +
12647 +       bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
12648 +
12649 +       /*
12650 +        * Get IO TLB memory from the low pages
12651 +        */
12652 +       iotlb_virt_start = alloc_bootmem_low_pages(bytes);
12653 +       if (!iotlb_virt_start)
12654 +               panic("Cannot allocate SWIOTLB buffer!\n"
12655 +                     "Use dom0_mem Xen boot parameter to reserve\n"
12656 +                     "some DMA memory (e.g., dom0_mem=-128M).\n");
12657 +
12658 +       for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) {
12659 +               int rc = xen_create_contiguous_region(
12660 +                       (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT),
12661 +                       get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
12662 +                       IO_TLB_DMA_BITS);
12663 +               BUG_ON(rc);
12664 +       }
12665 +
12666 +       /*
12667 +        * Allocate and initialize the free list array.  This array is used
12668 +        * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
12669 +        */
12670 +       io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
12671 +       for (i = 0; i < iotlb_nslabs; i++)
12672 +               io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
12673 +       io_tlb_index = 0;
12674 +       io_tlb_orig_addr = alloc_bootmem(
12675 +               iotlb_nslabs * sizeof(*io_tlb_orig_addr));
12676 +
12677 +       /*
12678 +        * Get the overflow emergency buffer
12679 +        */
12680 +       io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
12681 +
12682 +       iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT;
12683 +       iotlb_pfn_end   = iotlb_pfn_start + (bytes >> PAGE_SHIFT);
12684 +
12685 +       printk(KERN_INFO "Software IO TLB enabled: \n"
12686 +              " Aperture:     %lu megabytes\n"
12687 +              " Kernel range: 0x%016lx - 0x%016lx\n",
12688 +              bytes >> 20,
12689 +              (unsigned long)iotlb_virt_start,
12690 +              (unsigned long)iotlb_virt_start + bytes);
12691 +}
12692 +
12693 +void
12694 +swiotlb_init(void)
12695 +{
12696 +       long ram_end;
12697 +       size_t defsz = 64 * (1 << 20); /* 64MB default size */
12698 +
12699 +       if (swiotlb_force == 1) {
12700 +               swiotlb = 1;
12701 +       } else if ((swiotlb_force != -1) &&
12702 +                  is_running_on_xen() &&
12703 +                  (xen_start_info->flags & SIF_INITDOMAIN)) {
12704 +               /* Domain 0 always has a swiotlb. */
12705 +               ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
12706 +               if (ram_end <= 0x7ffff)
12707 +                       defsz = 2 * (1 << 20); /* 2MB on systems with <2GB of memory. */
12708 +               swiotlb = 1;
12709 +       }
12710 +
12711 +       if (swiotlb)
12712 +               swiotlb_init_with_default_size(defsz);
12713 +       else
12714 +               printk(KERN_INFO "Software IO TLB disabled\n");
12715 +}
12716 +
12717 +/*
12718 + * We use __copy_to_user_inatomic to transfer to the host buffer because the
12719 + * buffer may be mapped read-only (e.g., in the blkback driver) but lower-level
12720 + * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
12721 + * unnecessary copy from the aperture to the host buffer, and a page fault.
12722 + */
12723 +static void
12724 +__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
12725 +{
12726 +       if (PageHighMem(buffer.page)) {
12727 +               size_t len, bytes;
12728 +               char *dev, *host, *kmp;
12729 +               len = size;
12730 +               while (len != 0) {
12731 +                       if (((bytes = len) + buffer.offset) > PAGE_SIZE)
12732 +                               bytes = PAGE_SIZE - buffer.offset;
12733 +                       kmp  = kmap_atomic(buffer.page, KM_SWIOTLB);
12734 +                       dev  = dma_addr + size - len;
12735 +                       host = kmp + buffer.offset;
12736 +                       if (dir == DMA_FROM_DEVICE) {
12737 +                               if (__copy_to_user_inatomic(host, dev, bytes))
12738 +                                       /* inaccessible */;
12739 +                       } else
12740 +                               memcpy(dev, host, bytes);
12741 +                       kunmap_atomic(kmp, KM_SWIOTLB);
12742 +                       len -= bytes;
12743 +                       buffer.page++;
12744 +                       buffer.offset = 0;
12745 +               }
12746 +       } else {
12747 +               char *host = (char *)phys_to_virt(
12748 +                       page_to_pseudophys(buffer.page)) + buffer.offset;
12749 +               if (dir == DMA_FROM_DEVICE) {
12750 +                       if (__copy_to_user_inatomic(host, dma_addr, size))
12751 +                               /* inaccessible */;
12752 +               } else if (dir == DMA_TO_DEVICE)
12753 +                       memcpy(dma_addr, host, size);
12754 +       }
12755 +}
12756 +
12757 +/*
12758 + * Allocates bounce buffer and returns its kernel virtual address.
12759 + */
12760 +static void *
12761 +map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
12762 +{
12763 +       unsigned long flags;
12764 +       char *dma_addr;
12765 +       unsigned int nslots, stride, index, wrap;
12766 +       int i;
12767 +
12768 +       /*
12769 +        * For mappings greater than a page, we limit the stride (and
12770 +        * hence alignment) to a page size.
12771 +        */
12772 +       nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
12773 +       if (size > PAGE_SIZE)
12774 +               stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
12775 +       else
12776 +               stride = 1;
12777 +
12778 +       BUG_ON(!nslots);
12779 +
12780 +       /*
12781 +        * Find suitable number of IO TLB entries size that will fit this
12782 +        * request and allocate a buffer from that IO TLB pool.
12783 +        */
12784 +       spin_lock_irqsave(&io_tlb_lock, flags);
12785 +       {
12786 +               wrap = index = ALIGN(io_tlb_index, stride);
12787 +
12788 +               if (index >= iotlb_nslabs)
12789 +                       wrap = index = 0;
12790 +
12791 +               do {
12792 +                       /*
12793 +                        * If we find a slot that indicates we have 'nslots'
12794 +                        * number of contiguous buffers, we allocate the
12795 +                        * buffers from that slot and mark the entries as '0'
12796 +                        * indicating unavailable.
12797 +                        */
12798 +                       if (io_tlb_list[index] >= nslots) {
12799 +                               int count = 0;
12800 +
12801 +                               for (i = index; i < (int)(index + nslots); i++)
12802 +                                       io_tlb_list[i] = 0;
12803 +                               for (i = index - 1;
12804 +                                    (OFFSET(i, IO_TLB_SEGSIZE) !=
12805 +                                     IO_TLB_SEGSIZE -1) && io_tlb_list[i];
12806 +                                    i--)
12807 +                                       io_tlb_list[i] = ++count;
12808 +                               dma_addr = iotlb_virt_start +
12809 +                                       (index << IO_TLB_SHIFT);
12810 +
12811 +                               /*
12812 +                                * Update the indices to avoid searching in
12813 +                                * the next round.
12814 +                                */
12815 +                               io_tlb_index = 
12816 +                                       ((index + nslots) < iotlb_nslabs
12817 +                                        ? (index + nslots) : 0);
12818 +
12819 +                               goto found;
12820 +                       }
12821 +                       index += stride;
12822 +                       if (index >= iotlb_nslabs)
12823 +                               index = 0;
12824 +               } while (index != wrap);
12825 +
12826 +               spin_unlock_irqrestore(&io_tlb_lock, flags);
12827 +               return NULL;
12828 +       }
12829 +  found:
12830 +       spin_unlock_irqrestore(&io_tlb_lock, flags);
12831 +
12832 +       /*
12833 +        * Save away the mapping from the original address to the DMA address.
12834 +        * This is needed when we sync the memory.  Then we sync the buffer if
12835 +        * needed.
12836 +        */
12837 +       io_tlb_orig_addr[index] = buffer;
12838 +       if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
12839 +               __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
12840 +
12841 +       return dma_addr;
12842 +}
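The io_tlb_list[] bookkeeping used above is a run-length scheme: each entry counts how many contiguous free slots start at that index, capped at its IO_TLB_SEGSIZE-aligned segment, so the allocator only has to test io_tlb_list[index] >= nslots. A toy, self-contained model of the "mark allocated and back-fill the counts" step, with the segment size shrunk to 8 purely for readability:

#include <stdio.h>

#define SEGSIZE 8			/* stands in for IO_TLB_SEGSIZE */
#define NSLABS  16
#define OFFSET(i, a) ((i) & ((a) - 1))

static unsigned int list[NSLABS];

static void alloc_slots(int index, int nslots)
{
	int i, count = 0;

	for (i = index; i < index + nslots; i++)
		list[i] = 0;		/* the allocated run is now unavailable */
	for (i = index - 1;
	     OFFSET(i, SEGSIZE) != SEGSIZE - 1 && list[i]; i--)
		list[i] = ++count;	/* preceding free slots see a shorter run */
}

int main(void)
{
	int i;

	for (i = 0; i < NSLABS; i++)	/* initial state: whole segments free */
		list[i] = SEGSIZE - OFFSET(i, SEGSIZE);
	alloc_slots(4, 2);		/* take slots 4 and 5 */
	for (i = 0; i < NSLABS; i++)
		printf("%u ", list[i]);
	printf("\n");			/* prints: 4 3 2 1 0 0 2 1 8 7 6 5 4 3 2 1 */
	return 0;
}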
12843 +
12844 +/*
12845 + * dma_addr is the kernel virtual address of the bounce buffer to unmap.
12846 + */
12847 +static void
12848 +unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
12849 +{
12850 +       unsigned long flags;
12851 +       int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
12852 +       int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
12853 +       struct phys_addr buffer = io_tlb_orig_addr[index];
12854 +
12855 +       /*
12856 +        * First, sync the memory before unmapping the entry
12857 +        */
12858 +       if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
12859 +               __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
12860 +
12861 +       /*
12862 +        * Return the buffer to the free list by setting the corresponding
12863 +        * entries to indicate the number of contiguous entries available.
12864 +        * While returning the entries to the free list, we merge the entries
12865 +        * with slots below and above the pool being returned.
12866 +        */
12867 +       spin_lock_irqsave(&io_tlb_lock, flags);
12868 +       {
12869 +               count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
12870 +                        io_tlb_list[index + nslots] : 0);
12871 +               /*
12872 +                * Step 1: return the slots to the free list, merging the
12873 +                * slots with succeeding slots
12874 +                */
12875 +               for (i = index + nslots - 1; i >= index; i--)
12876 +                       io_tlb_list[i] = ++count;
12877 +               /*
12878 +                * Step 2: merge the returned slots with the preceding slots,
12879 +                * if available (non zero)
12880 +                */
12881 +               for (i = index - 1;
12882 +                    (OFFSET(i, IO_TLB_SEGSIZE) !=
12883 +                     IO_TLB_SEGSIZE -1) && io_tlb_list[i];
12884 +                    i--)
12885 +                       io_tlb_list[i] = ++count;
12886 +       }
12887 +       spin_unlock_irqrestore(&io_tlb_lock, flags);
12888 +}
12889 +
12890 +static void
12891 +sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
12892 +{
12893 +       int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
12894 +       struct phys_addr buffer = io_tlb_orig_addr[index];
12895 +       BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE));
12896 +       __sync_single(buffer, dma_addr, size, dir);
12897 +}
12898 +
12899 +static void
12900 +swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
12901 +{
12902 +       /*
12903 +        * Ran out of IOMMU space for this operation. This is very bad.
12904 +        * Unfortunately the drivers cannot handle this operation properly
12905 +        * unless they check for pci_dma_mapping_error (most don't).
12906 +        * When the mapping is small enough, return a static buffer to limit
12907 +        * the damage, or panic when the transfer is too big.
12908 +        */
12909 +       printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
12910 +              "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?");
12911 +
12912 +       if (size > io_tlb_overflow && do_panic) {
12913 +               if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
12914 +                       panic("PCI-DMA: Memory would be corrupted\n");
12915 +               if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
12916 +                       panic("PCI-DMA: Random memory would be DMAed\n");
12917 +       }
12918 +}
12919 +
12920 +/*
12921 + * Map a single buffer of the indicated size for DMA in streaming mode.  The
12922 + * PCI address to use is returned.
12923 + *
12924 + * Once the device is given the dma address, the device owns this memory until
12925 + * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
12926 + */
12927 +dma_addr_t
12928 +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
12929 +{
12930 +       dma_addr_t dev_addr = virt_to_bus(ptr);
12931 +       void *map;
12932 +       struct phys_addr buffer;
12933 +
12934 +       BUG_ON(dir == DMA_NONE);
12935 +
12936 +       /*
12937 +        * If the pointer passed in happens to be in the device's DMA window,
12938 +        * we can safely return the device addr and not worry about bounce
12939 +        * buffering it.
12940 +        */
12941 +       if (!range_straddles_page_boundary(ptr, size) &&
12942 +           !address_needs_mapping(hwdev, dev_addr))
12943 +               return dev_addr;
12944 +
12945 +       /*
12946 +        * Oh well, have to allocate and map a bounce buffer.
12947 +        */
12948 +       buffer.page   = virt_to_page(ptr);
12949 +       buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
12950 +       map = map_single(hwdev, buffer, size, dir);
12951 +       if (!map) {
12952 +               swiotlb_full(hwdev, size, dir, 1);
12953 +               map = io_tlb_overflow_buffer;
12954 +       }
12955 +
12956 +       dev_addr = virt_to_bus(map);
12957 +       return dev_addr;
12958 +}
12959 +
12960 +/*
12961 + * Unmap a single streaming mode DMA translation.  The dma_addr and size must
12962 + * match what was provided in a previous swiotlb_map_single call.  All
12963 + * other usages are undefined.
12964 + *
12965 + * After this call, reads by the cpu to the buffer are guaranteed to see
12966 + * whatever the device wrote there.
12967 + */
12968 +void
12969 +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
12970 +                    int dir)
12971 +{
12972 +       BUG_ON(dir == DMA_NONE);
12973 +       if (in_swiotlb_aperture(dev_addr))
12974 +               unmap_single(hwdev, bus_to_virt(dev_addr), size, dir);
12975 +}
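For orientation, a hypothetical driver-side fragment built on the map/unmap pair above (not part of the patch; in this tree the generic dma_map_single()/dma_unmap_single() wrappers are expected to end up here when the swiotlb is active):

/* Hypothetical fragment - illustration only. */
static int send_buffer(struct device *dev, void *buf, size_t len)
{
	dma_addr_t bus;

	bus = swiotlb_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (swiotlb_dma_mapping_error(bus))
		return -ENOMEM;		/* only the overflow buffer was left */

	/* ... program the device with 'bus' and wait for completion ... */

	swiotlb_unmap_single(dev, bus, len, DMA_TO_DEVICE);
	return 0;
}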
12976 +
12977 +/*
12978 + * Make physical memory consistent for a single streaming mode DMA translation
12979 + * after a transfer.
12980 + *
12981 + * If you perform a swiotlb_map_single() but wish to interrogate the buffer
12982 + * using the cpu, yet do not wish to tear down the PCI dma mapping, you must
12983 + * call this function before doing so.  At the next point you give the PCI dma
12984 + * address back to the card, you must first perform a
12985 + * swiotlb_dma_sync_for_device, and then the device again owns the buffer
12986 + */
12987 +void
12988 +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
12989 +                           size_t size, int dir)
12990 +{
12991 +       BUG_ON(dir == DMA_NONE);
12992 +       if (in_swiotlb_aperture(dev_addr))
12993 +               sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
12994 +}
12995 +
12996 +void
12997 +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
12998 +                              size_t size, int dir)
12999 +{
13000 +       BUG_ON(dir == DMA_NONE);
13001 +       if (in_swiotlb_aperture(dev_addr))
13002 +               sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
13003 +}
13004 +
13005 +/*
13006 + * Map a set of buffers described by scatterlist in streaming mode for DMA.
13007 + * This is the scatter-gather version of the above swiotlb_map_single
13008 + * interface.  Here the scatter gather list elements are each tagged with the
13009 + * appropriate dma address and length.  They are obtained via
13010 + * sg_dma_{address,length}(SG).
13011 + *
13012 + * NOTE: An implementation may be able to use a smaller number of
13013 + *       DMA address/length pairs than there are SG table elements.
13014 + *       (for example via virtual mapping capabilities)
13015 + *       The routine returns the number of addr/length pairs actually
13016 + *       used, at most nents.
13017 + *
13018 + * Device ownership issues as mentioned above for swiotlb_map_single are the
13019 + * same here.
13020 + */
13021 +int
13022 +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
13023 +              int dir)
13024 +{
13025 +       struct phys_addr buffer;
13026 +       dma_addr_t dev_addr;
13027 +       char *map;
13028 +       int i;
13029 +
13030 +       BUG_ON(dir == DMA_NONE);
13031 +
13032 +       for (i = 0; i < nelems; i++, sg++) {
13033 +               dev_addr = SG_ENT_PHYS_ADDRESS(sg);
13034 +               if (address_needs_mapping(hwdev, dev_addr)) {
13035 +                       buffer.page   = sg->page;
13036 +                       buffer.offset = sg->offset;
13037 +                       map = map_single(hwdev, buffer, sg->length, dir);
13038 +                       if (!map) {
13039 +                               /* Don't panic here, we expect map_sg users
13040 +                                  to do proper error handling. */
13041 +                               swiotlb_full(hwdev, sg->length, dir, 0);
13042 +                               swiotlb_unmap_sg(hwdev, sg - i, i, dir);
13043 +                               sg[0].dma_length = 0;
13044 +                               return 0;
13045 +                       }
13046 +                       sg->dma_address = (dma_addr_t)virt_to_bus(map);
13047 +               } else
13048 +                       sg->dma_address = dev_addr;
13049 +               sg->dma_length = sg->length;
13050 +       }
13051 +       return nelems;
13052 +}
13053 +
13054 +/*
13055 + * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
13056 + * concerning calls here are the same as for swiotlb_unmap_single() above.
13057 + */
13058 +void
13059 +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
13060 +                int dir)
13061 +{
13062 +       int i;
13063 +
13064 +       BUG_ON(dir == DMA_NONE);
13065 +
13066 +       for (i = 0; i < nelems; i++, sg++)
13067 +               if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
13068 +                       unmap_single(hwdev, 
13069 +                                    (void *)bus_to_virt(sg->dma_address),
13070 +                                    sg->dma_length, dir);
13071 +}
13072 +
13073 +/*
13074 + * Make physical memory consistent for a set of streaming mode DMA translations
13075 + * after a transfer.
13076 + *
13077 + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
13078 + * and usage.
13079 + */
13080 +void
13081 +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
13082 +                       int nelems, int dir)
13083 +{
13084 +       int i;
13085 +
13086 +       BUG_ON(dir == DMA_NONE);
13087 +
13088 +       for (i = 0; i < nelems; i++, sg++)
13089 +               if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
13090 +                       sync_single(hwdev,
13091 +                                   (void *)bus_to_virt(sg->dma_address),
13092 +                                   sg->dma_length, dir);
13093 +}
13094 +
13095 +void
13096 +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
13097 +                          int nelems, int dir)
13098 +{
13099 +       int i;
13100 +
13101 +       BUG_ON(dir == DMA_NONE);
13102 +
13103 +       for (i = 0; i < nelems; i++, sg++)
13104 +               if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
13105 +                       sync_single(hwdev,
13106 +                                   (void *)bus_to_virt(sg->dma_address),
13107 +                                   sg->dma_length, dir);
13108 +}
13109 +
13110 +dma_addr_t
13111 +swiotlb_map_page(struct device *hwdev, struct page *page,
13112 +                unsigned long offset, size_t size,
13113 +                enum dma_data_direction direction)
13114 +{
13115 +       struct phys_addr buffer;
13116 +       dma_addr_t dev_addr;
13117 +       char *map;
13118 +
13119 +       dev_addr = page_to_bus(page) + offset;
13120 +       if (address_needs_mapping(hwdev, dev_addr)) {
13121 +               buffer.page   = page;
13122 +               buffer.offset = offset;
13123 +               map = map_single(hwdev, buffer, size, direction);
13124 +               if (!map) {
13125 +                       swiotlb_full(hwdev, size, direction, 1);
13126 +                       map = io_tlb_overflow_buffer;
13127 +               }
13128 +               dev_addr = (dma_addr_t)virt_to_bus(map);
13129 +       }
13130 +
13131 +       return dev_addr;
13132 +}
13133 +
13134 +void
13135 +swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
13136 +                  size_t size, enum dma_data_direction direction)
13137 +{
13138 +       BUG_ON(direction == DMA_NONE);
13139 +       if (in_swiotlb_aperture(dma_address))
13140 +               unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
13141 +}
13142 +
13143 +int
13144 +swiotlb_dma_mapping_error(dma_addr_t dma_addr)
13145 +{
13146 +       return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
13147 +}
13148 +
13149 +/*
13150 + * Return whether the given PCI device DMA address mask can be supported
13151 + * properly.  For example, if your device can only drive the low 24-bits
13152 + * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
13153 + * this function.
13154 + */
13155 +int
13156 +swiotlb_dma_supported (struct device *hwdev, u64 mask)
13157 +{
13158 +       return (mask >= ((1UL << IO_TLB_DMA_BITS) - 1));
13159 +}
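The supported-mask test above is a plain unsigned comparison against the aperture's 31-bit addressing limit; a two-case standalone check with illustrative masks:

#include <stdio.h>

#define IO_TLB_DMA_BITS 31

int main(void)
{
	unsigned long long limit = (1ULL << IO_TLB_DMA_BITS) - 1;

	/* A 32-bit-capable device passes, a 24-bit ISA-style mask does not. */
	printf("mask 0xffffffff supported: %d\n", 0xffffffffULL >= limit);
	printf("mask 0x00ffffff supported: %d\n", 0x00ffffffULL >= limit);
	return 0;
}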
13160 +
13161 +EXPORT_SYMBOL(swiotlb_init);
13162 +EXPORT_SYMBOL(swiotlb_map_single);
13163 +EXPORT_SYMBOL(swiotlb_unmap_single);
13164 +EXPORT_SYMBOL(swiotlb_map_sg);
13165 +EXPORT_SYMBOL(swiotlb_unmap_sg);
13166 +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
13167 +EXPORT_SYMBOL(swiotlb_sync_single_for_device);
13168 +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
13169 +EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
13170 +EXPORT_SYMBOL(swiotlb_map_page);
13171 +EXPORT_SYMBOL(swiotlb_unmap_page);
13172 +EXPORT_SYMBOL(swiotlb_dma_mapping_error);
13173 +EXPORT_SYMBOL(swiotlb_dma_supported);
13174 diff -urNp linux-2.6/arch/i386/kernel/sysenter.c new/arch/i386/kernel/sysenter.c
13175 --- linux-2.6/arch/i386/kernel/sysenter.c       2006-07-03 14:14:14.000000000 +0200
13176 +++ new/arch/i386/kernel/sysenter.c     2006-05-09 12:32:35.000000000 +0200
13177 @@ -13,16 +13,22 @@
13178  #include <linux/gfp.h>
13179  #include <linux/string.h>
13180  #include <linux/elf.h>
13181 +#include <linux/mm.h>
13182  
13183  #include <asm/cpufeature.h>
13184  #include <asm/msr.h>
13185  #include <asm/pgtable.h>
13186  #include <asm/unistd.h>
13187  
13188 +#ifdef CONFIG_XEN
13189 +#include <xen/interface/callback.h>
13190 +#endif
13191 +
13192  extern asmlinkage void sysenter_entry(void);
13193  
13194  void enable_sep_cpu(void)
13195  {
13196 +#ifndef CONFIG_X86_NO_TSS
13197         int cpu = get_cpu();
13198         struct tss_struct *tss = &per_cpu(init_tss, cpu);
13199  
13200 @@ -37,6 +43,7 @@ void enable_sep_cpu(void)
13201         wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0);
13202         wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
13203         put_cpu();      
13204 +#endif
13205  }
13206  
13207  /*
13208 @@ -45,23 +52,100 @@ void enable_sep_cpu(void)
13209   */
13210  extern const char vsyscall_int80_start, vsyscall_int80_end;
13211  extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
13212 +static void *syscall_page;
13213  
13214  int __init sysenter_setup(void)
13215  {
13216 -       void *page = (void *)get_zeroed_page(GFP_ATOMIC);
13217 +       syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
13218  
13219 -       __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
13220 +#ifdef CONFIG_XEN
13221 +       if (boot_cpu_has(X86_FEATURE_SEP)) {
13222 +               struct callback_register sysenter = {
13223 +                       .type = CALLBACKTYPE_sysenter,
13224 +                       .address = { __KERNEL_CS, (unsigned long)sysenter_entry },
13225 +               };
13226  
13227 -       if (!boot_cpu_has(X86_FEATURE_SEP)) {
13228 -               memcpy(page,
13229 -                      &vsyscall_int80_start,
13230 -                      &vsyscall_int80_end - &vsyscall_int80_start);
13231 +               if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
13232 +                       clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
13233 +       }
13234 +#endif
13235 +
13236 +       if (boot_cpu_has(X86_FEATURE_SEP)) {
13237 +               memcpy(syscall_page,
13238 +                      &vsyscall_sysenter_start,
13239 +                      &vsyscall_sysenter_end - &vsyscall_sysenter_start);
13240                 return 0;
13241         }
13242  
13243 -       memcpy(page,
13244 -              &vsyscall_sysenter_start,
13245 -              &vsyscall_sysenter_end - &vsyscall_sysenter_start);
13246 +       memcpy(syscall_page,
13247 +              &vsyscall_int80_start,
13248 +              &vsyscall_int80_end - &vsyscall_int80_start);
13249 +
13250 +       return 0;
13251 +}
13252 +
13253 +static struct page*
13254 +syscall_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
13255 +{
13256 +       struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
13257 +       get_page(p);
13258 +       return p;
13259 +}
13260 +
13261 +/* Prevent VMA merging */
13262 +static void syscall_vma_close(struct vm_area_struct *vma)
13263 +{
13264 +}
13265 +
13266 +static struct vm_operations_struct syscall_vm_ops = {
13267 +       .close = syscall_vma_close,
13268 +       .nopage = syscall_nopage,
13269 +};
13270  
13271 +/* Setup a VMA at program startup for the vsyscall page */
13272 +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
13273 +{
13274 +       struct vm_area_struct *vma;
13275 +       struct mm_struct *mm = current->mm;
13276 +       int ret;
13277 +
13278 +       vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
13279 +       if (!vma)
13280 +               return -ENOMEM;
13281 +
13282 +       memset(vma, 0, sizeof(struct vm_area_struct));
13283 +       /* Could randomize here */
13284 +       vma->vm_start = VSYSCALL_BASE;
13285 +       vma->vm_end = VSYSCALL_BASE + PAGE_SIZE;
13286 +       /* MAYWRITE to allow gdb to COW and set breakpoints */
13287 +       vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
13288 +       vma->vm_flags |= mm->def_flags;
13289 +       vma->vm_page_prot = protection_map[vma->vm_flags & 7];
13290 +       vma->vm_ops = &syscall_vm_ops;
13291 +       vma->vm_mm = mm;
13292 +
13293 +       down_write(&mm->mmap_sem);
13294 +       if ((ret = insert_vm_struct(mm, vma))) {
13295 +               up_write(&mm->mmap_sem);
13296 +               kmem_cache_free(vm_area_cachep, vma);
13297 +               return ret;
13298 +       }
13299 +       mm->total_vm++;
13300 +       up_write(&mm->mmap_sem);
13301 +       return 0;
13302 +}
13303 +
13304 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
13305 +{
13306 +       return NULL;
13307 +}
13308 +
13309 +int in_gate_area(struct task_struct *task, unsigned long addr)
13310 +{
13311 +       return 0;
13312 +}
13313 +
13314 +int in_gate_area_no_task(unsigned long addr)
13315 +{
13316         return 0;
13317  }
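The vsyscall page mapped by arch_setup_additional_pages() above is advertised to each process through the ELF auxiliary vector; the sketch below (illustrative userspace code, not part of the patch) shows how a program can locate the AT_SYSINFO entry point, much as glibc does when picking its syscall mechanism:

        #include <stdio.h>
        #include <elf.h>

        int main(int argc, char **argv, char **envp)
        {
                Elf32_auxv_t *auxv;

                /* The auxiliary vector starts right after the environment block. */
                while (*envp++)
                        ;
                for (auxv = (Elf32_auxv_t *)envp; auxv->a_type != AT_NULL; auxv++)
                        if (auxv->a_type == AT_SYSINFO)
                                printf("vsyscall entry at %#lx\n",
                                       (unsigned long)auxv->a_un.a_val);
                return 0;
        }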
13318 diff -urNp linux-2.6/arch/i386/kernel/time-xen.c new/arch/i386/kernel/time-xen.c
13319 --- linux-2.6/arch/i386/kernel/time-xen.c       1970-01-01 01:00:00.000000000 +0100
13320 +++ new/arch/i386/kernel/time-xen.c     2006-07-07 15:10:03.000000000 +0200
13321 @@ -0,0 +1,1109 @@
13322 +/*
13323 + *  linux/arch/i386/kernel/time.c
13324 + *
13325 + *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
13326 + *
13327 + * This file contains the PC-specific time handling details:
13328 + * reading the RTC at bootup, etc..
13329 + * 1994-07-02    Alan Modra
13330 + *     fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
13331 + * 1995-03-26    Markus Kuhn
13332 + *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
13333 + *      precision CMOS clock update
13334 + * 1996-05-03    Ingo Molnar
13335 + *      fixed time warps in do_[slow|fast]_gettimeoffset()
13336 + * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
13337 + *             "A Kernel Model for Precision Timekeeping" by Dave Mills
13338 + * 1998-09-05    (Various)
13339 + *     More robust do_fast_gettimeoffset() algorithm implemented
13340 + *     (works with APM, Cyrix 6x86MX and Centaur C6),
13341 + *     monotonic gettimeofday() with fast_get_timeoffset(),
13342 + *     drift-proof precision TSC calibration on boot
13343 + *     (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
13344 + *     Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
13345 + *     ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
13346 + * 1998-12-16    Andrea Arcangeli
13347 + *     Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
13348 + *     because was not accounting lost_ticks.
13349 + * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
13350 + *     Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
13351 + *     serialize accesses to xtime/lost_ticks).
13352 + */
13353 +
13354 +#include <linux/errno.h>
13355 +#include <linux/sched.h>
13356 +#include <linux/kernel.h>
13357 +#include <linux/param.h>
13358 +#include <linux/string.h>
13359 +#include <linux/mm.h>
13360 +#include <linux/interrupt.h>
13361 +#include <linux/time.h>
13362 +#include <linux/delay.h>
13363 +#include <linux/init.h>
13364 +#include <linux/smp.h>
13365 +#include <linux/module.h>
13366 +#include <linux/sysdev.h>
13367 +#include <linux/bcd.h>
13368 +#include <linux/efi.h>
13369 +#include <linux/mca.h>
13370 +#include <linux/sysctl.h>
13371 +#include <linux/percpu.h>
13372 +#include <linux/kernel_stat.h>
13373 +#include <linux/posix-timers.h>
13374 +
13375 +#include <asm/io.h>
13376 +#include <asm/smp.h>
13377 +#include <asm/irq.h>
13378 +#include <asm/msr.h>
13379 +#include <asm/delay.h>
13380 +#include <asm/mpspec.h>
13381 +#include <asm/uaccess.h>
13382 +#include <asm/processor.h>
13383 +#include <asm/timer.h>
13384 +#include <asm/sections.h>
13385 +
13386 +#include "mach_time.h"
13387 +
13388 +#include <linux/timex.h>
13389 +#include <linux/config.h>
13390 +
13391 +#include <asm/hpet.h>
13392 +
13393 +#include <asm/arch_hooks.h>
13394 +
13395 +#include <xen/evtchn.h>
13396 +#include <xen/interface/vcpu.h>
13397 +
13398 +#if defined (__i386__)
13399 +#include <asm/i8259.h>
13400 +#endif
13401 +
13402 +int pit_latch_buggy;              /* extern */
13403 +
13404 +#if defined(__x86_64__)
13405 +unsigned long vxtime_hz = PIT_TICK_RATE;
13406 +struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
13407 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
13408 +unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
13409 +struct timespec __xtime __section_xtime;
13410 +struct timezone __sys_tz __section_sys_tz;
13411 +#endif
13412 +
13413 +unsigned int cpu_khz;  /* Detected as we calibrate the TSC */
13414 +EXPORT_SYMBOL(cpu_khz);
13415 +
13416 +extern unsigned long wall_jiffies;
13417 +
13418 +DEFINE_SPINLOCK(rtc_lock);
13419 +EXPORT_SYMBOL(rtc_lock);
13420 +
13421 +#if defined (__i386__)
13422 +#include <asm/i8253.h>
13423 +#endif
13424 +
13425 +DEFINE_SPINLOCK(i8253_lock);
13426 +EXPORT_SYMBOL(i8253_lock);
13427 +
13428 +extern struct init_timer_opts timer_tsc_init;
13429 +extern struct timer_opts timer_tsc;
13430 +#define timer_none timer_tsc
13431 +struct timer_opts *cur_timer __read_mostly = &timer_tsc;
13432 +
13433 +/* These are periodically updated in shared_info, and then copied here. */
13434 +struct shadow_time_info {
13435 +       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
13436 +       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
13437 +       u32 tsc_to_nsec_mul;
13438 +       u32 tsc_to_usec_mul;
13439 +       int tsc_shift;
13440 +       u32 version;
13441 +};
13442 +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
13443 +static struct timespec shadow_tv;
13444 +static u32 shadow_tv_version;
13445 +
13446 +/* Keep track of last time we did processing/updating of jiffies and xtime. */
13447 +static u64 processed_system_time;   /* System time (ns) at last processing. */
13448 +static DEFINE_PER_CPU(u64, processed_system_time);
13449 +
13450 +/* How much CPU time was spent blocked and how much was 'stolen'? */
13451 +static DEFINE_PER_CPU(u64, processed_stolen_time);
13452 +static DEFINE_PER_CPU(u64, processed_blocked_time);
13453 +
13454 +/* Current runstate of each CPU (updated automatically by the hypervisor). */
13455 +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
13456 +
13457 +/* Must be signed, as it's compared with s64 quantities which can be -ve. */
13458 +#define NS_PER_TICK (1000000000LL/HZ)
13459 +
13460 +static inline void __normalize_time(time_t *sec, s64 *nsec)
13461 +{
13462 +       while (*nsec >= NSEC_PER_SEC) {
13463 +               (*nsec) -= NSEC_PER_SEC;
13464 +               (*sec)++;
13465 +       }
13466 +       while (*nsec < 0) {
13467 +               (*nsec) += NSEC_PER_SEC;
13468 +               (*sec)--;
13469 +       }
13470 +}
13471 +
13472 +/* Does this guest OS track Xen time, or set its wall clock independently? */
13473 +static int independent_wallclock = 0;
13474 +static int __init __independent_wallclock(char *str)
13475 +{
13476 +       independent_wallclock = 1;
13477 +       return 1;
13478 +}
13479 +__setup("independent_wallclock", __independent_wallclock);
13480 +
13481 +/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
13482 +static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
13483 +static int __init __permitted_clock_jitter(char *str)
13484 +{
13485 +       permitted_clock_jitter = simple_strtoul(str, NULL, 0);
13486 +       return 1;
13487 +}
13488 +__setup("permitted_clock_jitter=", __permitted_clock_jitter);
13489 +
13490 +int tsc_disable __devinitdata = 0;
13491 +
13492 +static void delay_tsc(unsigned long loops)
13493 +{
13494 +       unsigned long bclock, now;
13495 +
13496 +       rdtscl(bclock);
13497 +       do {
13498 +               rep_nop();
13499 +               rdtscl(now);
13500 +       } while ((now - bclock) < loops);
13501 +}
13502 +
13503 +struct timer_opts timer_tsc = {
13504 +       .name = "tsc",
13505 +       .delay = delay_tsc,
13506 +};
13507 +
13508 +/*
13509 + * Scale a 64-bit delta: apply the shift, then multiply by a 32-bit
13510 + * fraction, yielding a 64-bit result.
13511 + */
13512 +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
13513 +{
13514 +       u64 product;
13515 +#ifdef __i386__
13516 +       u32 tmp1, tmp2;
13517 +#endif
13518 +
13519 +       if (shift < 0)
13520 +               delta >>= -shift;
13521 +       else
13522 +               delta <<= shift;
13523 +
13524 +#ifdef __i386__
13525 +       __asm__ (
13526 +               "mul  %5       ; "
13527 +               "mov  %4,%%eax ; "
13528 +               "mov  %%edx,%4 ; "
13529 +               "mul  %5       ; "
13530 +               "xor  %5,%5    ; "
13531 +               "add  %4,%%eax ; "
13532 +               "adc  %5,%%edx ; "
13533 +               : "=A" (product), "=r" (tmp1), "=r" (tmp2)
13534 +               : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
13535 +#else
13536 +       __asm__ (
13537 +               "mul %%rdx ; shrd $32,%%rdx,%%rax"
13538 +               : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
13539 +#endif
13540 +
13541 +       return product;
13542 +}
13543 +
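The inline assembly above is just a 64x32-bit fixed-point multiply; the portable restatement below (for documentation only, not part of the patch) makes the arithmetic explicit: after applying the shift, the result is (delta * mul_frac) >> 32, i.e. the middle 64 bits of the 96-bit product.

        static inline u64 scale_delta_portable(u64 delta, u32 mul_frac, int shift)
        {
                u64 lo, hi;

                if (shift < 0)
                        delta >>= -shift;
                else
                        delta <<= shift;

                lo = (u64)(u32)delta * mul_frac;        /* low 32 bits of delta  */
                hi = (delta >> 32) * mul_frac;          /* high 32 bits of delta */

                /* (lo + (hi << 32)) >> 32, truncated to 64 bits as in the asm. */
                return (lo >> 32) + hi;
        }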
13544 +#if defined (__i386__)
13545 +int read_current_timer(unsigned long *timer_val)
13546 +{
13547 +       rdtscl(*timer_val);
13548 +       return 0;
13549 +}
13550 +#endif
13551 +
13552 +void init_cpu_khz(void)
13553 +{
13554 +       u64 __cpu_khz = 1000000ULL << 32;
13555 +       struct vcpu_time_info *info;
13556 +       info = &HYPERVISOR_shared_info->vcpu_info[0].time;
13557 +       do_div(__cpu_khz, info->tsc_to_system_mul);
13558 +       if (info->tsc_shift < 0)
13559 +               cpu_khz = __cpu_khz << -info->tsc_shift;
13560 +       else
13561 +               cpu_khz = __cpu_khz >> info->tsc_shift;
13562 +}
13563 +
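To see why this yields the CPU frequency: Xen publishes tsc_to_system_mul and tsc_shift so that system time advances as ns = ((tsc << tsc_shift) * tsc_to_system_mul) >> 32, i.e. one TSC cycle lasts tsc_to_system_mul * 2^(tsc_shift - 32) nanoseconds. Inverting that gives the quotient computed above:

        cpu_khz = 10^6 * 2^32 / tsc_to_system_mul, shifted right by tsc_shift
                  (or left by -tsc_shift when the shift is negative)

        /* Example (values purely illustrative): a 2 GHz TSC would be published
         * as tsc_to_system_mul = 2^31, tsc_shift = 0, giving
         *      cpu_khz = 10^6 * 2^32 / 2^31 = 2,000,000 kHz = 2.0 GHz. */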
13564 +static u64 get_nsec_offset(struct shadow_time_info *shadow)
13565 +{
13566 +       u64 now, delta;
13567 +       rdtscll(now);
13568 +       delta = now - shadow->tsc_timestamp;
13569 +       return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
13570 +}
13571 +
13572 +static unsigned long get_usec_offset(struct shadow_time_info *shadow)
13573 +{
13574 +       u64 now, delta;
13575 +       rdtscll(now);
13576 +       delta = now - shadow->tsc_timestamp;
13577 +       return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
13578 +}
13579 +
13580 +static void __update_wallclock(time_t sec, long nsec)
13581 +{
13582 +       long wtm_nsec, xtime_nsec;
13583 +       time_t wtm_sec, xtime_sec;
13584 +       u64 tmp, wc_nsec;
13585 +
13586 +       /* Adjust wall-clock time base based on wall_jiffies ticks. */
13587 +       wc_nsec = processed_system_time;
13588 +       wc_nsec += sec * (u64)NSEC_PER_SEC;
13589 +       wc_nsec += nsec;
13590 +       wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
13591 +
13592 +       /* Split wallclock base into seconds and nanoseconds. */
13593 +       tmp = wc_nsec;
13594 +       xtime_nsec = do_div(tmp, 1000000000);
13595 +       xtime_sec  = (time_t)tmp;
13596 +
13597 +       wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
13598 +       wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
13599 +
13600 +       set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
13601 +       set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
13602 +
13603 +       ntp_clear();
13604 +}
13605 +
13606 +static void update_wallclock(void)
13607 +{
13608 +       shared_info_t *s = HYPERVISOR_shared_info;
13609 +
13610 +       do {
13611 +               shadow_tv_version = s->wc_version;
13612 +               rmb();
13613 +               shadow_tv.tv_sec  = s->wc_sec;
13614 +               shadow_tv.tv_nsec = s->wc_nsec;
13615 +               rmb();
13616 +       } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
13617 +
13618 +       if (!independent_wallclock)
13619 +               __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
13620 +}
13621 +
13622 +/*
13623 + * Reads a consistent set of time-base values from Xen, into a shadow data
13624 + * area.
13625 + */
13626 +static void get_time_values_from_xen(void)
13627 +{
13628 +       shared_info_t           *s = HYPERVISOR_shared_info;
13629 +       struct vcpu_time_info   *src;
13630 +       struct shadow_time_info *dst;
13631 +
13632 +       src = &s->vcpu_info[smp_processor_id()].time;
13633 +       dst = &per_cpu(shadow_time, smp_processor_id());
13634 +
13635 +       do {
13636 +               dst->version = src->version;
13637 +               rmb();
13638 +               dst->tsc_timestamp     = src->tsc_timestamp;
13639 +               dst->system_timestamp  = src->system_time;
13640 +               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
13641 +               dst->tsc_shift         = src->tsc_shift;
13642 +               rmb();
13643 +       } while ((src->version & 1) | (dst->version ^ src->version));
13644 +
13645 +       dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
13646 +}
13647 +
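The retry condition above, (src->version & 1) | (dst->version ^ src->version), is the reader half of a seqlock-style protocol: the snapshot is discarded if the version was odd (update in progress) or changed while copying. For illustration only, the matching writer that the hypervisor is assumed to implement when refreshing vcpu_time_info looks roughly like this (names and barriers are a sketch, not part of the patch):

        static void publish_time_values(struct vcpu_time_info *dst,
                                        const struct vcpu_time_info *val)
        {
                dst->version++;                 /* now odd: readers will retry    */
                wmb();                          /* version write before payload   */
                dst->tsc_timestamp     = val->tsc_timestamp;
                dst->system_time       = val->system_time;
                dst->tsc_to_system_mul = val->tsc_to_system_mul;
                dst->tsc_shift         = val->tsc_shift;
                wmb();                          /* payload before final version   */
                dst->version++;                 /* even again: snapshot is valid  */
        }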
13648 +static inline int time_values_up_to_date(int cpu)
13649 +{
13650 +       struct vcpu_time_info   *src;
13651 +       struct shadow_time_info *dst;
13652 +
13653 +       src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
13654 +       dst = &per_cpu(shadow_time, cpu);
13655 +
13656 +       rmb();
13657 +       return (dst->version == src->version);
13658 +}
13659 +
13660 +/*
13661 + * This is a special lock that is owned by the CPU and holds the index
13662 + * register we are working with.  It is required for NMI access to the
13663 + * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
13664 + */
13665 +volatile unsigned long cmos_lock = 0;
13666 +EXPORT_SYMBOL(cmos_lock);
13667 +
13668 +/* Routines for accessing the CMOS RAM/RTC. */
13669 +unsigned char rtc_cmos_read(unsigned char addr)
13670 +{
13671 +       unsigned char val;
13672 +       lock_cmos_prefix(addr);
13673 +       outb_p(addr, RTC_PORT(0));
13674 +       val = inb_p(RTC_PORT(1));
13675 +       lock_cmos_suffix(addr);
13676 +       return val;
13677 +}
13678 +EXPORT_SYMBOL(rtc_cmos_read);
13679 +
13680 +void rtc_cmos_write(unsigned char val, unsigned char addr)
13681 +{
13682 +       lock_cmos_prefix(addr);
13683 +       outb_p(addr, RTC_PORT(0));
13684 +       outb_p(val, RTC_PORT(1));
13685 +       lock_cmos_suffix(addr);
13686 +}
13687 +EXPORT_SYMBOL(rtc_cmos_write);
13688 +
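For illustration (not part of the patch), this is how the accessors above are typically consumed; RTC_SECONDS, RTC_CONTROL, RTC_DM_BINARY and BCD_TO_BIN() come from <linux/mc146818rtc.h> and <linux/bcd.h>, and mach_get_cmos_time() performs the same dance for the full date:

        static unsigned int example_read_rtc_seconds(void)
        {
                unsigned int sec = rtc_cmos_read(RTC_SECONDS);

                /* The clock usually runs in BCD mode; convert if so. */
                if (!(rtc_cmos_read(RTC_CONTROL) & RTC_DM_BINARY))
                        BCD_TO_BIN(sec);
                return sec;
        }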
13689 +/*
13690 + * This version of gettimeofday has microsecond resolution
13691 + * and better than microsecond precision on fast x86 machines with TSC.
13692 + */
13693 +void do_gettimeofday(struct timeval *tv)
13694 +{
13695 +       unsigned long seq;
13696 +       unsigned long usec, sec;
13697 +       unsigned long max_ntp_tick;
13698 +       s64 nsec;
13699 +       unsigned int cpu;
13700 +       struct shadow_time_info *shadow;
13701 +       u32 local_time_version;
13702 +
13703 +       cpu = get_cpu();
13704 +       shadow = &per_cpu(shadow_time, cpu);
13705 +
13706 +       do {
13707 +               unsigned long lost;
13708 +
13709 +               local_time_version = shadow->version;
13710 +               seq = read_seqbegin(&xtime_lock);
13711 +
13712 +               usec = get_usec_offset(shadow);
13713 +               lost = jiffies - wall_jiffies;
13714 +
13715 +               /*
13716 +                * If time_adjust is negative then NTP is slowing the clock,
13717 +                * so make sure not to go into the next possible interval.
13718 +                * Better to lose some accuracy than have time go backwards.
13719 +                */
13720 +               if (unlikely(time_adjust < 0)) {
13721 +                       max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
13722 +                       usec = min(usec, max_ntp_tick);
13723 +
13724 +                       if (lost)
13725 +                               usec += lost * max_ntp_tick;
13726 +               }
13727 +               else if (unlikely(lost))
13728 +                       usec += lost * (USEC_PER_SEC / HZ);
13729 +
13730 +               sec = xtime.tv_sec;
13731 +               usec += (xtime.tv_nsec / NSEC_PER_USEC);
13732 +
13733 +               nsec = shadow->system_timestamp - processed_system_time;
13734 +               __normalize_time(&sec, &nsec);
13735 +               usec += (long)nsec / NSEC_PER_USEC;
13736 +
13737 +               if (unlikely(!time_values_up_to_date(cpu))) {
13738 +                       /*
13739 +                        * We may have blocked for a long time,
13740 +                        * rendering our calculations invalid
13741 +                        * (e.g. the time delta may have
13742 +                        * overflowed). Detect that and recalculate
13743 +                        * with fresh values.
13744 +                        */
13745 +                       get_time_values_from_xen();
13746 +                       continue;
13747 +               }
13748 +       } while (read_seqretry(&xtime_lock, seq) ||
13749 +                (local_time_version != shadow->version));
13750 +
13751 +       put_cpu();
13752 +
13753 +       while (usec >= USEC_PER_SEC) {
13754 +               usec -= USEC_PER_SEC;
13755 +               sec++;
13756 +       }
13757 +
13758 +       tv->tv_sec = sec;
13759 +       tv->tv_usec = usec;
13760 +}
13761 +
13762 +EXPORT_SYMBOL(do_gettimeofday);
13763 +
13764 +int do_settimeofday(struct timespec *tv)
13765 +{
13766 +       time_t sec;
13767 +       s64 nsec;
13768 +       unsigned int cpu;
13769 +       struct shadow_time_info *shadow;
13770 +       dom0_op_t op;
13771 +
13772 +       if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
13773 +               return -EINVAL;
13774 +
13775 +       cpu = get_cpu();
13776 +       shadow = &per_cpu(shadow_time, cpu);
13777 +
13778 +       write_seqlock_irq(&xtime_lock);
13779 +
13780 +       /*
13781 +        * Ensure we aren't blocked for so long that our time delta overflows.
13782 +        * If that were to happen our shadow time values would be stale, so
13783 +        * retry with fresh ones.
13784 +        */
13785 +       for (;;) {
13786 +               nsec = tv->tv_nsec - get_nsec_offset(shadow);
13787 +               if (time_values_up_to_date(cpu))
13788 +                       break;
13789 +               get_time_values_from_xen();
13790 +       }
13791 +       sec = tv->tv_sec;
13792 +       __normalize_time(&sec, &nsec);
13793 +
13794 +       if ((xen_start_info->flags & SIF_INITDOMAIN) &&
13795 +           !independent_wallclock) {
13796 +               op.cmd = DOM0_SETTIME;
13797 +               op.u.settime.secs        = sec;
13798 +               op.u.settime.nsecs       = nsec;
13799 +               op.u.settime.system_time = shadow->system_timestamp;
13800 +               HYPERVISOR_dom0_op(&op);
13801 +               update_wallclock();
13802 +       } else if (independent_wallclock) {
13803 +               nsec -= shadow->system_timestamp;
13804 +               __normalize_time(&sec, &nsec);
13805 +               __update_wallclock(sec, nsec);
13806 +       }
13807 +
13808 +       write_sequnlock_irq(&xtime_lock);
13809 +
13810 +       put_cpu();
13811 +
13812 +       clock_was_set();
13813 +       return 0;
13814 +}
13815 +
13816 +EXPORT_SYMBOL(do_settimeofday);
13817 +
13818 +static void sync_xen_wallclock(unsigned long dummy);
13819 +static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
13820 +static void sync_xen_wallclock(unsigned long dummy)
13821 +{
13822 +       time_t sec;
13823 +       s64 nsec;
13824 +       dom0_op_t op;
13825 +
13826 +       if (!ntp_synced() || independent_wallclock ||
13827 +           !(xen_start_info->flags & SIF_INITDOMAIN))
13828 +               return;
13829 +
13830 +       write_seqlock_irq(&xtime_lock);
13831 +
13832 +       sec  = xtime.tv_sec;
13833 +       nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
13834 +       __normalize_time(&sec, &nsec);
13835 +
13836 +       op.cmd = DOM0_SETTIME;
13837 +       op.u.settime.secs        = sec;
13838 +       op.u.settime.nsecs       = nsec;
13839 +       op.u.settime.system_time = processed_system_time;
13840 +       HYPERVISOR_dom0_op(&op);
13841 +
13842 +       update_wallclock();
13843 +
13844 +       write_sequnlock_irq(&xtime_lock);
13845 +
13846 +       /* Once per minute. */
13847 +       mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
13848 +}
13849 +
13850 +static int set_rtc_mmss(unsigned long nowtime)
13851 +{
13852 +       int retval;
13853 +
13854 +       WARN_ON(irqs_disabled());
13855 +
13856 +       if (independent_wallclock || !(xen_start_info->flags & SIF_INITDOMAIN))
13857 +               return 0;
13858 +
13859 +       /* gets recalled with irq locally disabled */
13860 +       spin_lock_irq(&rtc_lock);
13861 +       if (efi_enabled)
13862 +               retval = efi_set_rtc_mmss(nowtime);
13863 +       else
13864 +               retval = mach_set_rtc_mmss(nowtime);
13865 +       spin_unlock_irq(&rtc_lock);
13866 +
13867 +       return retval;
13868 +}
13869 +
13870 +/* monotonic_clock(): returns # of nanoseconds passed since time_init()
13871 + *             Note: This function is required to return accurate
13872 + *             time even in the absence of multiple timer ticks.
13873 + */
13874 +unsigned long long monotonic_clock(void)
13875 +{
13876 +       int cpu = get_cpu();
13877 +       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
13878 +       u64 time;
13879 +       u32 local_time_version;
13880 +
13881 +       do {
13882 +               local_time_version = shadow->version;
13883 +               barrier();
13884 +               time = shadow->system_timestamp + get_nsec_offset(shadow);
13885 +               if (!time_values_up_to_date(cpu))
13886 +                       get_time_values_from_xen();
13887 +               barrier();
13888 +       } while (local_time_version != shadow->version);
13889 +
13890 +       put_cpu();
13891 +
13892 +       return time;
13893 +}
13894 +EXPORT_SYMBOL(monotonic_clock);
13895 +
13896 +unsigned long long sched_clock(void)
13897 +{
13898 +       return monotonic_clock();
13899 +}
13900 +
13901 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
13902 +unsigned long profile_pc(struct pt_regs *regs)
13903 +{
13904 +       unsigned long pc = instruction_pointer(regs);
13905 +
13906 +#ifdef __x86_64__
13907 +       /* Assume the lock function has either no stack frame or only a single word.
13908 +          This checks if the address on the stack looks like a kernel text address.
13909 +          There is a small window for false hits, but in that case the tick
13910 +          is just accounted to the spinlock function.
13911 +          Better would be to write these functions in assembler again
13912 +          and check exactly. */
13913 +       if (in_lock_functions(pc)) {
13914 +               char *v = *(char **)regs->rsp;
13915 +               if ((v >= _stext && v <= _etext) ||
13916 +                       (v >= _sinittext && v <= _einittext) ||
13917 +                       (v >= (char *)MODULES_VADDR  && v <= (char *)MODULES_END))
13918 +                       return (unsigned long)v;
13919 +               return ((unsigned long *)regs->rsp)[1];
13920 +       }
13921 +#else
13922 +       if (in_lock_functions(pc))
13923 +               return *(unsigned long *)(regs->ebp + 4);
13924 +#endif
13925 +
13926 +       return pc;
13927 +}
13928 +EXPORT_SYMBOL(profile_pc);
13929 +#endif
13930 +
13931 +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
13932 +{
13933 +       s64 delta, delta_cpu, stolen, blocked;
13934 +       u64 sched_time;
13935 +       int i, cpu = smp_processor_id();
13936 +       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
13937 +       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
13938 +
13939 +       write_seqlock(&xtime_lock);
13940 +
13941 +       do {
13942 +               get_time_values_from_xen();
13943 +
13944 +               /* Obtain a consistent snapshot of elapsed wallclock cycles. */
13945 +               delta = delta_cpu =
13946 +                       shadow->system_timestamp + get_nsec_offset(shadow);
13947 +               delta     -= processed_system_time;
13948 +               delta_cpu -= per_cpu(processed_system_time, cpu);
13949 +
13950 +               /*
13951 +                * Obtain a consistent snapshot of stolen/blocked cycles. We
13952 +                * can use state_entry_time to detect if we get preempted here.
13953 +                */
13954 +               do {
13955 +                       sched_time = runstate->state_entry_time;
13956 +                       barrier();
13957 +                       stolen = runstate->time[RUNSTATE_runnable] +
13958 +                               runstate->time[RUNSTATE_offline] -
13959 +                               per_cpu(processed_stolen_time, cpu);
13960 +                       blocked = runstate->time[RUNSTATE_blocked] -
13961 +                               per_cpu(processed_blocked_time, cpu);
13962 +                       barrier();
13963 +               } while (sched_time != runstate->state_entry_time);
13964 +       } while (!time_values_up_to_date(cpu));
13965 +
13966 +       if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
13967 +            unlikely(delta_cpu < -(s64)permitted_clock_jitter))
13968 +           && printk_ratelimit()) {
13969 +               printk("Timer ISR/%d: Time went backwards: "
13970 +                      "delta=%lld delta_cpu=%lld shadow=%lld "
13971 +                      "off=%lld processed=%lld cpu_processed=%lld\n",
13972 +                      cpu, delta, delta_cpu, shadow->system_timestamp,
13973 +                      (s64)get_nsec_offset(shadow),
13974 +                      processed_system_time,
13975 +                      per_cpu(processed_system_time, cpu));
13976 +               for (i = 0; i < num_online_cpus(); i++)
13977 +                       printk(" %d: %lld\n", i,
13978 +                              per_cpu(processed_system_time, i));
13979 +       }
13980 +
13981 +       /* System-wide jiffy work. */
13982 +       while (delta >= NS_PER_TICK) {
13983 +               delta -= NS_PER_TICK;
13984 +               processed_system_time += NS_PER_TICK;
13985 +               do_timer(regs);
13986 +       }
13987 +
13988 +       if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
13989 +               update_wallclock();
13990 +               clock_was_set();
13991 +       }
13992 +
13993 +       write_sequnlock(&xtime_lock);
13994 +
13995 +       /*
13996 +        * Account stolen ticks.
13997 +        * HACK: Passing NULL to account_steal_time()
13998 +        * ensures that the ticks are accounted as stolen.
13999 +        */
14000 +       if ((stolen > 0) && (delta_cpu > 0)) {
14001 +               delta_cpu -= stolen;
14002 +               if (unlikely(delta_cpu < 0))
14003 +                       stolen += delta_cpu; /* clamp local-time progress */
14004 +               do_div(stolen, NS_PER_TICK);
14005 +               per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
14006 +               per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
14007 +               account_steal_time(NULL, (cputime_t)stolen);
14008 +       }
14009 +
14010 +       /*
14011 +        * Account blocked ticks.
14012 +        * HACK: Passing idle_task to account_steal_time()
14013 +        * ensures that the ticks are accounted as idle/wait.
14014 +        */
14015 +       if ((blocked > 0) && (delta_cpu > 0)) {
14016 +               delta_cpu -= blocked;
14017 +               if (unlikely(delta_cpu < 0))
14018 +                       blocked += delta_cpu; /* clamp local-time progress */
14019 +               do_div(blocked, NS_PER_TICK);
14020 +               per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
14021 +               per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
14022 +               account_steal_time(idle_task(cpu), (cputime_t)blocked);
14023 +       }
14024 +
14025 +       /* Account user/system ticks. */
14026 +       if (delta_cpu > 0) {
14027 +               do_div(delta_cpu, NS_PER_TICK);
14028 +               per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
14029 +               if (user_mode(regs))
14030 +                       account_user_time(current, (cputime_t)delta_cpu);
14031 +               else
14032 +                       account_system_time(current, HARDIRQ_OFFSET,
14033 +                                           (cputime_t)delta_cpu);
14034 +       }
14035 +
14036 +       /* Local timer processing (see update_process_times()). */
14037 +       run_local_timers();
14038 +       if (rcu_pending(cpu))
14039 +               rcu_check_callbacks(cpu, user_mode(regs));
14040 +       scheduler_tick();
14041 +       run_posix_cpu_timers(current);
14042 +
14043 +       return IRQ_HANDLED;
14044 +}
14045 +
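A small worked example may help with the accounting above: do_div() divides its 64-bit first argument in place and returns the remainder, which is why the quotient is multiplied back by NS_PER_TICK so the per-CPU counters only ever advance by whole ticks, leaving the sub-tick remainder for the next interrupt. Assuming HZ=100 purely for the arithmetic (NS_PER_TICK = 10,000,000 ns):

        /* 25 ms of stolen time observed since the last interrupt */
        u64 stolen = 25000000;
        do_div(stolen, NS_PER_TICK);            /* stolen == 2 ticks; 5 ms remainder
                                                   is picked up next interrupt     */
        per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;   /* +20 ms   */
        account_steal_time(NULL, (cputime_t)stolen);                   /* 2 ticks  */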
14046 +static void init_missing_ticks_accounting(int cpu)
14047 +{
14048 +       struct vcpu_register_runstate_memory_area area;
14049 +       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
14050 +
14051 +       memset(runstate, 0, sizeof(*runstate));
14052 +
14053 +       area.addr.v = runstate;
14054 +       HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
14055 +
14056 +       per_cpu(processed_blocked_time, cpu) =
14057 +               runstate->time[RUNSTATE_blocked];
14058 +       per_cpu(processed_stolen_time, cpu) =
14059 +               runstate->time[RUNSTATE_runnable] +
14060 +               runstate->time[RUNSTATE_offline];
14061 +}
14062 +
14063 +/* not static: needed by APM */
14064 +unsigned long get_cmos_time(void)
14065 +{
14066 +       unsigned long retval;
14067 +
14068 +       spin_lock(&rtc_lock);
14069 +
14070 +       if (efi_enabled)
14071 +               retval = efi_get_time();
14072 +       else
14073 +               retval = mach_get_cmos_time();
14074 +
14075 +       spin_unlock(&rtc_lock);
14076 +
14077 +       return retval;
14078 +}
14079 +EXPORT_SYMBOL(get_cmos_time);
14080 +
14081 +static void sync_cmos_clock(unsigned long dummy);
14082 +
14083 +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
14084 +
14085 +static void sync_cmos_clock(unsigned long dummy)
14086 +{
14087 +       struct timeval now, next;
14088 +       int fail = 1;
14089 +
14090 +       /*
14091 +        * If we have an externally synchronized Linux clock, then update
14092 +        * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
14093 +        * called as close as possible to 500 ms before the new second starts.
14094 +        * This code is run on a timer.  If the clock is set, that timer
14095 +        * may not expire at the correct time.  Thus, we adjust...
14096 +        */
14097 +       if (!ntp_synced())
14098 +               /*
14099 +                * Not synced, exit, do not restart a timer (if one is
14100 +                * running, let it run out).
14101 +                */
14102 +               return;
14103 +
14104 +       do_gettimeofday(&now);
14105 +       if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
14106 +           now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
14107 +               fail = set_rtc_mmss(now.tv_sec);
14108 +
14109 +       next.tv_usec = USEC_AFTER - now.tv_usec;
14110 +       if (next.tv_usec <= 0)
14111 +               next.tv_usec += USEC_PER_SEC;
14112 +
14113 +       if (!fail)
14114 +               next.tv_sec = 659;
14115 +       else
14116 +               next.tv_sec = 0;
14117 +
14118 +       if (next.tv_usec >= USEC_PER_SEC) {
14119 +               next.tv_sec++;
14120 +               next.tv_usec -= USEC_PER_SEC;
14121 +       }
14122 +       mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
14123 +}
14124 +
14125 +void notify_arch_cmos_timer(void)
14126 +{
14127 +       mod_timer(&sync_cmos_timer, jiffies + 1);
14128 +       mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
14129 +}
14130 +
14131 +static long clock_cmos_diff, sleep_start;
14132 +
14133 +static struct timer_opts *last_timer;
14134 +static int timer_suspend(struct sys_device *dev, pm_message_t state)
14135 +{
14136 +       /*
14137 +        * Save the system-time/CMOS offset so timer_resume() can restore the clock
14138 +        */
14139 +       clock_cmos_diff = -get_cmos_time();
14140 +       clock_cmos_diff += get_seconds();
14141 +       sleep_start = get_cmos_time();
14142 +       last_timer = cur_timer;
14143 +       cur_timer = &timer_none;
14144 +       if (last_timer->suspend)
14145 +               last_timer->suspend(state);
14146 +       return 0;
14147 +}
14148 +
14149 +static int timer_resume(struct sys_device *dev)
14150 +{
14151 +       unsigned long flags;
14152 +       unsigned long sec;
14153 +       unsigned long sleep_length;
14154 +
14155 +#ifdef CONFIG_HPET_TIMER
14156 +       if (is_hpet_enabled())
14157 +               hpet_reenable();
14158 +#endif
14159 +       sec = get_cmos_time() + clock_cmos_diff;
14160 +       sleep_length = (get_cmos_time() - sleep_start) * HZ;
14161 +       write_seqlock_irqsave(&xtime_lock, flags);
14162 +       xtime.tv_sec = sec;
14163 +       xtime.tv_nsec = 0;
14164 +       jiffies_64 += sleep_length;
14165 +       wall_jiffies += sleep_length;
14166 +       write_sequnlock_irqrestore(&xtime_lock, flags);
14167 +       if (last_timer->resume)
14168 +               last_timer->resume();
14169 +       cur_timer = last_timer;
14170 +       last_timer = NULL;
14171 +       touch_softlockup_watchdog();
14172 +       return 0;
14173 +}
14174 +
14175 +static struct sysdev_class timer_sysclass = {
14176 +       .resume = timer_resume,
14177 +       .suspend = timer_suspend,
14178 +       set_kset_name("timer"),
14179 +};
14180 +
14181 +
14182 +/* XXX this driverfs stuff should probably go elsewhere later -john */
14183 +static struct sys_device device_timer = {
14184 +       .id     = 0,
14185 +       .cls    = &timer_sysclass,
14186 +};
14187 +
14188 +static int time_init_device(void)
14189 +{
14190 +       int error = sysdev_class_register(&timer_sysclass);
14191 +       if (!error)
14192 +               error = sysdev_register(&device_timer);
14193 +       return error;
14194 +}
14195 +
14196 +device_initcall(time_init_device);
14197 +
14198 +#ifdef CONFIG_HPET_TIMER
14199 +extern void (*late_time_init)(void);
14200 +/* Duplicate of time_init() below, with hpet_enable part added */
14201 +static void __init hpet_time_init(void)
14202 +{
14203 +       xtime.tv_sec = get_cmos_time();
14204 +       xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
14205 +       set_normalized_timespec(&wall_to_monotonic,
14206 +               -xtime.tv_sec, -xtime.tv_nsec);
14207 +
14208 +       if ((hpet_enable() >= 0) && hpet_use_timer) {
14209 +               printk("Using HPET for base-timer\n");
14210 +       }
14211 +
14212 +       cur_timer = select_timer();
14213 +       printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
14214 +
14215 +       time_init_hook();
14216 +}
14217 +#endif
14218 +
14219 +/* Dynamically-mapped IRQ. */
14220 +DEFINE_PER_CPU(int, timer_irq);
14221 +
14222 +extern void (*late_time_init)(void);
14223 +static void setup_cpu0_timer_irq(void)
14224 +{
14225 +       per_cpu(timer_irq, 0) =
14226 +               bind_virq_to_irqhandler(
14227 +                       VIRQ_TIMER,
14228 +                       0,
14229 +                       timer_interrupt,
14230 +                       SA_INTERRUPT,
14231 +                       "timer0",
14232 +                       NULL);
14233 +       BUG_ON(per_cpu(timer_irq, 0) < 0);
14234 +}
14235 +
14236 +void __init time_init(void)
14237 +{
14238 +#ifdef CONFIG_HPET_TIMER
14239 +       if (is_hpet_capable()) {
14240 +               /*
14241 +                * HPET initialization needs to do memory-mapped io. So, let
14242 +                * us do a late initialization after mem_init().
14243 +                */
14244 +               late_time_init = hpet_time_init;
14245 +               return;
14246 +       }
14247 +#endif
14248 +       get_time_values_from_xen();
14249 +
14250 +       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
14251 +       per_cpu(processed_system_time, 0) = processed_system_time;
14252 +       init_missing_ticks_accounting(0);
14253 +
14254 +       update_wallclock();
14255 +
14256 +       init_cpu_khz();
14257 +       printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
14258 +              cpu_khz / 1000, cpu_khz % 1000);
14259 +
14260 +#if defined(__x86_64__)
14261 +       vxtime.mode = VXTIME_TSC;
14262 +       vxtime.quot = (1000000L << 32) / vxtime_hz;
14263 +       vxtime.tsc_quot = (1000L << 32) / cpu_khz;
14264 +       sync_core();
14265 +       rdtscll(vxtime.last_tsc);
14266 +#endif
14267 +
14268 +       /* Cannot request_irq() until kmem is initialised. */
14269 +       late_time_init = setup_cpu0_timer_irq;
14270 +}
14271 +
14272 +/* Convert jiffies to system time. */
14273 +u64 jiffies_to_st(unsigned long j)
14274 +{
14275 +       unsigned long seq;
14276 +       long delta;
14277 +       u64 st;
14278 +
14279 +       do {
14280 +               seq = read_seqbegin(&xtime_lock);
14281 +               delta = j - jiffies;
14282 +               /* NB. The next check can trigger in some wrap-around cases,
14283 +                * but that's ok: we'll just end up with a shorter timeout. */
14284 +               if (delta < 1)
14285 +                       delta = 1;
14286 +               st = processed_system_time + (delta * (u64)NS_PER_TICK);
14287 +       } while (read_seqretry(&xtime_lock, seq));
14288 +
14289 +       return st;
14290 +}
14291 +EXPORT_SYMBOL(jiffies_to_st);
14292 +
14293 +/*
14294 + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
14295 + * These functions are based on implementations from arch/s390/kernel/time.c
14296 + */
14297 +static void stop_hz_timer(void)
14298 +{
14299 +       unsigned int cpu = smp_processor_id();
14300 +       unsigned long j;
14301 +
14302 +       cpu_set(cpu, nohz_cpu_mask);
14303 +
14304 +       /* See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs  */
14305 +       /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a  */
14306 +       /* value of rcp->cur that matches rdp->quiescbatch and allows us to  */
14307 +       /* stop the hz timer then the cpumasks created for subsequent values */
14308 +       /* of cur in rcu_start_batch are guaranteed to pick up the updated   */
14309 +       /* nohz_cpu_mask and so will not depend on this cpu.                 */
14310 +
14311 +       smp_mb();
14312 +
14313 +       /* Leave ourselves in 'tick mode' if rcu or softirq or timer pending. */
14314 +       if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
14315 +           (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
14316 +               cpu_clear(cpu, nohz_cpu_mask);
14317 +               j = jiffies + 1;
14318 +       }
14319 +
14320 +       BUG_ON(HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0);
14321 +}
14322 +
14323 +static void start_hz_timer(void)
14324 +{
14325 +       cpu_clear(smp_processor_id(), nohz_cpu_mask);
14326 +}
14327 +
14328 +void safe_halt(void)
14329 +{
14330 +       stop_hz_timer();
14331 +       /* Blocking includes an implicit local_irq_enable(). */
14332 +       HYPERVISOR_block();
14333 +       start_hz_timer();
14334 +}
14335 +EXPORT_SYMBOL(safe_halt);
14336 +
14337 +void halt(void)
14338 +{
14339 +       if (irqs_disabled())
14340 +               HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
14341 +}
14342 +EXPORT_SYMBOL(halt);
14343 +
14344 +/* No locking required. We are the only CPU running, and interrupts are off. */
14345 +void time_resume(void)
14346 +{
14347 +       init_cpu_khz();
14348 +
14349 +       get_time_values_from_xen();
14350 +
14351 +       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
14352 +       per_cpu(processed_system_time, 0) = processed_system_time;
14353 +       init_missing_ticks_accounting(0);
14354 +
14355 +       update_wallclock();
14356 +}
14357 +
14358 +#ifdef CONFIG_SMP
14359 +static char timer_name[NR_CPUS][15];
14360 +
14361 +void local_setup_timer(unsigned int cpu)
14362 +{
14363 +       int seq;
14364 +
14365 +       BUG_ON(cpu == 0);
14366 +
14367 +       do {
14368 +               seq = read_seqbegin(&xtime_lock);
14369 +               /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
14370 +               per_cpu(processed_system_time, cpu) =
14371 +                       per_cpu(shadow_time, 0).system_timestamp;
14372 +               init_missing_ticks_accounting(cpu);
14373 +       } while (read_seqretry(&xtime_lock, seq));
14374 +
14375 +       sprintf(timer_name[cpu], "timer%d", cpu);
14376 +       per_cpu(timer_irq, cpu) =
14377 +               bind_virq_to_irqhandler(
14378 +                       VIRQ_TIMER,
14379 +                       cpu,
14380 +                       timer_interrupt,
14381 +                       SA_INTERRUPT,
14382 +                       timer_name[cpu],
14383 +                       NULL);
14384 +       BUG_ON(per_cpu(timer_irq, cpu) < 0);
14385 +}
14386 +
14387 +void local_teardown_timer(unsigned int cpu)
14388 +{
14389 +       BUG_ON(cpu == 0);
14390 +       unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
14391 +}
14392 +#endif
14393 +
14394 +/*
14395 + * /proc/sys/xen: This really belongs in another file. It can stay here for
14396 + * now however.
14397 + */
14398 +static ctl_table xen_subtable[] = {
14399 +       {
14400 +               .ctl_name       = 1,
14401 +               .procname       = "independent_wallclock",
14402 +               .data           = &independent_wallclock,
14403 +               .maxlen         = sizeof(independent_wallclock),
14404 +               .mode           = 0644,
14405 +               .proc_handler   = proc_dointvec
14406 +       },
14407 +       {
14408 +               .ctl_name       = 2,
14409 +               .procname       = "permitted_clock_jitter",
14410 +               .data           = &permitted_clock_jitter,
14411 +               .maxlen         = sizeof(permitted_clock_jitter),
14412 +               .mode           = 0644,
14413 +               .proc_handler   = proc_doulongvec_minmax
14414 +       },
14415 +       { 0 }
14416 +};
14417 +static ctl_table xen_table[] = {
14418 +       {
14419 +               .ctl_name       = 123,
14420 +               .procname       = "xen",
14421 +               .mode           = 0555,
14422 +               .child          = xen_subtable},
14423 +       { 0 }
14424 +};
14425 +static int __init xen_sysctl_init(void)
14426 +{
14427 +       (void)register_sysctl_table(xen_table, 0);
14428 +       return 0;
14429 +}
14430 +__initcall(xen_sysctl_init);
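Once registered, the two knobs above appear as ordinary procfs files, /proc/sys/xen/independent_wallclock and /proc/sys/xen/permitted_clock_jitter, so a guest can change its wallclock behaviour at run time without the boot parameters handled earlier in this file. A minimal, illustrative userspace example (not part of the patch):

        #include <stdio.h>

        int main(void)
        {
                FILE *f = fopen("/proc/sys/xen/independent_wallclock", "w");

                if (!f)
                        return 1;               /* not a Xen guest, or no permission */
                fputs("1\n", f);                /* stop tracking Xen's wallclock     */
                return fclose(f) ? 1 : 0;
        }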
14431 diff -urNp linux-2.6/arch/i386/kernel/traps.c new/arch/i386/kernel/traps.c
14432 --- linux-2.6/arch/i386/kernel/traps.c  2006-07-03 14:14:14.000000000 +0200
14433 +++ new/arch/i386/kernel/traps.c        2006-05-23 18:37:09.000000000 +0200
14434 @@ -607,18 +607,11 @@ static void mem_parity_error(unsigned ch
14435  
14436  static void io_check_error(unsigned char reason, struct pt_regs * regs)
14437  {
14438 -       unsigned long i;
14439 -
14440         printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
14441         show_registers(regs);
14442  
14443         /* Re-enable the IOCK line, wait for a few seconds */
14444 -       reason = (reason & 0xf) | 8;
14445 -       outb(reason, 0x61);
14446 -       i = 2000;
14447 -       while (--i) udelay(1000);
14448 -       reason &= ~8;
14449 -       outb(reason, 0x61);
14450 +       clear_io_check_error(reason);
14451  }
14452  
14453  static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
14454 diff -urNp linux-2.6/arch/i386/kernel/traps-xen.c new/arch/i386/kernel/traps-xen.c
14455 --- linux-2.6/arch/i386/kernel/traps-xen.c      1970-01-01 01:00:00.000000000 +0100
14456 +++ new/arch/i386/kernel/traps-xen.c    2006-05-23 18:37:09.000000000 +0200
14457 @@ -0,0 +1,1136 @@
14458 +/*
14459 + *  linux/arch/i386/traps.c
14460 + *
14461 + *  Copyright (C) 1991, 1992  Linus Torvalds
14462 + *
14463 + *  Pentium III FXSR, SSE support
14464 + *     Gareth Hughes <gareth@valinux.com>, May 2000
14465 + */
14466 +
14467 +/*
14468 + * 'Traps.c' handles hardware traps and faults after we have saved some
14469 + * state in 'asm.s'.
14470 + */
14471 +#include <linux/config.h>
14472 +#include <linux/sched.h>
14473 +#include <linux/kernel.h>
14474 +#include <linux/string.h>
14475 +#include <linux/errno.h>
14476 +#include <linux/timer.h>
14477 +#include <linux/mm.h>
14478 +#include <linux/init.h>
14479 +#include <linux/delay.h>
14480 +#include <linux/spinlock.h>
14481 +#include <linux/interrupt.h>
14482 +#include <linux/highmem.h>
14483 +#include <linux/kallsyms.h>
14484 +#include <linux/ptrace.h>
14485 +#include <linux/utsname.h>
14486 +#include <linux/kprobes.h>
14487 +#include <linux/kexec.h>
14488 +
14489 +#ifdef CONFIG_EISA
14490 +#include <linux/ioport.h>
14491 +#include <linux/eisa.h>
14492 +#endif
14493 +
14494 +#ifdef CONFIG_MCA
14495 +#include <linux/mca.h>
14496 +#endif
14497 +
14498 +#include <asm/processor.h>
14499 +#include <asm/system.h>
14500 +#include <asm/uaccess.h>
14501 +#include <asm/io.h>
14502 +#include <asm/atomic.h>
14503 +#include <asm/debugreg.h>
14504 +#include <asm/desc.h>
14505 +#include <asm/i387.h>
14506 +#include <asm/nmi.h>
14507 +
14508 +#include <asm/smp.h>
14509 +#include <asm/arch_hooks.h>
14510 +#include <asm/kdebug.h>
14511 +
14512 +#include <linux/module.h>
14513 +
14514 +#include "mach_traps.h"
14515 +
14516 +asmlinkage int system_call(void);
14517 +
14518 +struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
14519 +               { 0, 0 }, { 0, 0 } };
14520 +
14521 +/* Do we ignore FPU interrupts ? */
14522 +char ignore_fpu_irq = 0;
14523 +
14524 +#ifndef CONFIG_X86_NO_IDT
14525 +/*
14526 + * The IDT has to be page-aligned to simplify the Pentium
14527 + * F0 0F bug workaround.. We have a special link segment
14528 + * for this.
14529 + */
14530 +struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
14531 +#endif
14532 +
14533 +asmlinkage void divide_error(void);
14534 +asmlinkage void debug(void);
14535 +asmlinkage void nmi(void);
14536 +asmlinkage void int3(void);
14537 +asmlinkage void overflow(void);
14538 +asmlinkage void bounds(void);
14539 +asmlinkage void invalid_op(void);
14540 +asmlinkage void device_not_available(void);
14541 +asmlinkage void coprocessor_segment_overrun(void);
14542 +asmlinkage void invalid_TSS(void);
14543 +asmlinkage void segment_not_present(void);
14544 +asmlinkage void stack_segment(void);
14545 +asmlinkage void general_protection(void);
14546 +asmlinkage void page_fault(void);
14547 +asmlinkage void coprocessor_error(void);
14548 +asmlinkage void simd_coprocessor_error(void);
14549 +asmlinkage void alignment_check(void);
14550 +#ifndef CONFIG_XEN
14551 +asmlinkage void spurious_interrupt_bug(void);
14552 +#else
14553 +asmlinkage void fixup_4gb_segment(void);
14554 +#endif
14555 +asmlinkage void machine_check(void);
14556 +
14557 +static int kstack_depth_to_print = 24;
14558 +ATOMIC_NOTIFIER_HEAD(i386die_chain);
14559 +
14560 +int register_die_notifier(struct notifier_block *nb)
14561 +{
14562 +       vmalloc_sync_all();
14563 +       return atomic_notifier_chain_register(&i386die_chain, nb);
14564 +}
14565 +EXPORT_SYMBOL(register_die_notifier);
14566 +
14567 +int unregister_die_notifier(struct notifier_block *nb)
14568 +{
14569 +       return atomic_notifier_chain_unregister(&i386die_chain, nb);
14570 +}
14571 +EXPORT_SYMBOL(unregister_die_notifier);
14572 +
14573 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
14574 +{
14575 +       return  p > (void *)tinfo &&
14576 +               p < (void *)tinfo + THREAD_SIZE - 3;
14577 +}
14578 +
14579 +/*
14580 + * Print CONFIG_STACK_BACKTRACE_COLS address/symbol entries per line.
14581 + */
14582 +static inline int print_addr_and_symbol(unsigned long addr, char *log_lvl,
14583 +                                       int printed)
14584 +{
14585 +       if (!printed)
14586 +               printk(log_lvl);
14587 +
14588 +#if CONFIG_STACK_BACKTRACE_COLS == 1
14589 +       printk(" [<%08lx>] ", addr);
14590 +#else
14591 +       printk(" <%08lx> ", addr);
14592 +#endif
14593 +       print_symbol("%s", addr);
14594 +
14595 +       printed = (printed + 1) % CONFIG_STACK_BACKTRACE_COLS;
14596 +       if (printed)
14597 +               printk(" ");
14598 +       else
14599 +               printk("\n");
14600 +
14601 +       return printed;
14602 +}
14603 +
14604 +static inline unsigned long print_context_stack(struct thread_info *tinfo,
14605 +                               unsigned long *stack, unsigned long ebp,
14606 +                               char *log_lvl)
14607 +{
14608 +       unsigned long addr;
14609 +       int printed = 0; /* nr of entries already printed on current line */
14610 +
14611 +#ifdef CONFIG_FRAME_POINTER
14612 +       while (valid_stack_ptr(tinfo, (void *)ebp)) {
14613 +               addr = *(unsigned long *)(ebp + 4);
14614 +               printed = print_addr_and_symbol(addr, log_lvl, printed);
14615 +               ebp = *(unsigned long *)ebp;
14616 +       }
14617 +#else
14618 +       while (valid_stack_ptr(tinfo, stack)) {
14619 +               addr = *stack++;
14620 +               if (__kernel_text_address(addr))
14621 +                       printed = print_addr_and_symbol(addr, log_lvl, printed);
14622 +       }
14623 +#endif
14624 +       if (printed)
14625 +               printk("\n");
14626 +
14627 +       return ebp;
14628 +}
14629 +
14630 +static void show_trace_log_lvl(struct task_struct *task,
14631 +                              unsigned long *stack, char *log_lvl)
14632 +{
14633 +       unsigned long ebp;
14634 +
14635 +       if (!task)
14636 +               task = current;
14637 +
14638 +       if (task == current) {
14639 +               /* Grab ebp right from our regs */
14640 +               asm ("movl %%ebp, %0" : "=r" (ebp) : );
14641 +       } else {
14642 +               /* ebp is the last reg pushed by switch_to */
14643 +               ebp = *(unsigned long *) task->thread.esp;
14644 +       }
14645 +
14646 +       while (1) {
14647 +               struct thread_info *context;
14648 +               context = (struct thread_info *)
14649 +                       ((unsigned long)stack & (~(THREAD_SIZE - 1)));
14650 +               ebp = print_context_stack(context, stack, ebp, log_lvl);
14651 +               stack = (unsigned long*)context->previous_esp;
14652 +               if (!stack)
14653 +                       break;
14654 +               printk("%s =======================\n", log_lvl);
14655 +       }
14656 +}
14657 +
14658 +void show_trace(struct task_struct *task, unsigned long * stack)
14659 +{
14660 +       show_trace_log_lvl(task, stack, "");
14661 +}
14662 +
14663 +static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,
14664 +                              char *log_lvl)
14665 +{
14666 +       unsigned long *stack;
14667 +       int i;
14668 +
14669 +       if (esp == NULL) {
14670 +               if (task)
14671 +                       esp = (unsigned long*)task->thread.esp;
14672 +               else
14673 +                       esp = (unsigned long *)&esp;
14674 +       }
14675 +
14676 +       stack = esp;
14677 +       printk(log_lvl);
14678 +       for(i = 0; i < kstack_depth_to_print; i++) {
14679 +               if (kstack_end(stack))
14680 +                       break;
14681 +               if (i && ((i % 8) == 0))
14682 +                       printk("\n%s       ", log_lvl);
14683 +               printk("%08lx ", *stack++);
14684 +       }
14685 +       printk("\n%sCall Trace:\n", log_lvl);
14686 +       show_trace_log_lvl(task, esp, log_lvl);
14687 +}
14688 +
14689 +void show_stack(struct task_struct *task, unsigned long *esp)
14690 +{
14691 +       printk("       ");
14692 +       show_stack_log_lvl(task, esp, "");
14693 +}
14694 +
14695 +/*
14696 + * The architecture-independent dump_stack generator
14697 + */
14698 +void dump_stack(void)
14699 +{
14700 +       unsigned long stack;
14701 +
14702 +       show_trace(current, &stack);
14703 +}
14704 +
14705 +EXPORT_SYMBOL(dump_stack);
14706 +
14707 +void show_registers(struct pt_regs *regs)
14708 +{
14709 +       int i;
14710 +       int in_kernel = 1;
14711 +       unsigned long esp;
14712 +       unsigned short ss;
14713 +
14714 +       esp = (unsigned long) (&regs->esp);
14715 +       savesegment(ss, ss);
14716 +       if (user_mode_vm(regs)) {
14717 +               in_kernel = 0;
14718 +               esp = regs->esp;
14719 +               ss = regs->xss & 0xffff;
14720 +       }
14721 +       print_modules();
14722 +       printk(KERN_EMERG "CPU:    %d\nEIP:    %04x:[<%08lx>]    %s VLI\n"
14723 +                       "EFLAGS: %08lx   (%s %.*s) \n",
14724 +               smp_processor_id(), 0xffff & regs->xcs, regs->eip,
14725 +               print_tainted(), regs->eflags, system_utsname.release,
14726 +               (int)strcspn(system_utsname.version, " "),
14727 +               system_utsname.version);
14728 +       print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
14729 +       printk(KERN_EMERG "eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
14730 +               regs->eax, regs->ebx, regs->ecx, regs->edx);
14731 +       printk(KERN_EMERG "esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
14732 +               regs->esi, regs->edi, regs->ebp, esp);
14733 +       printk(KERN_EMERG "ds: %04x   es: %04x   ss: %04x\n",
14734 +               regs->xds & 0xffff, regs->xes & 0xffff, ss);
14735 +       printk(KERN_EMERG "Process %s (pid: %d, threadinfo=%p task=%p)",
14736 +               current->comm, current->pid, current_thread_info(), current);
14737 +       /*
14738 +        * When in-kernel, we also print out the stack and code at the
14739 +        * time of the fault..
14740 +        */
14741 +       if (in_kernel) {
14742 +               u8 __user *eip;
14743 +
14744 +               printk("\n" KERN_EMERG "Stack: ");
14745 +               show_stack_log_lvl(NULL, (unsigned long *)esp, KERN_EMERG);
14746 +
14747 +               printk(KERN_EMERG "Code: ");
14748 +
14749 +               eip = (u8 __user *)regs->eip - 43;
14750 +               for (i = 0; i < 64; i++, eip++) {
14751 +                       unsigned char c;
14752 +
14753 +                       if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
14754 +                               printk(" Bad EIP value.");
14755 +                               break;
14756 +                       }
14757 +                       if (eip == (u8 __user *)regs->eip)
14758 +                               printk("<%02x> ", c);
14759 +                       else
14760 +                               printk("%02x ", c);
14761 +               }
14762 +       }
14763 +       printk("\n");
14764 +}      
14765 +
14766 +static void handle_BUG(struct pt_regs *regs)
14767 +{
14768 +       unsigned short ud2;
14769 +       unsigned short line;
14770 +       char *file;
14771 +       char c;
14772 +       unsigned long eip;
14773 +
14774 +       eip = regs->eip;
14775 +
14776 +       if (eip < PAGE_OFFSET)
14777 +               goto no_bug;
14778 +       if (__get_user(ud2, (unsigned short __user *)eip))
14779 +               goto no_bug;
14780 +       if (ud2 != 0x0b0f)
14781 +               goto no_bug;
14782 +       if (__get_user(line, (unsigned short __user *)(eip + 2)))
14783 +               goto bug;
14784 +       if (__get_user(file, (char * __user *)(eip + 4)) ||
14785 +               (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
14786 +               file = "<bad filename>";
14787 +
14788 +       printk(KERN_EMERG "------------[ cut here ]------------\n");
14789 +       printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
14790 +
14791 +no_bug:
14792 +       return;
14793 +
14794 +       /* Here we know it was a BUG but file-n-line is unavailable */
14795 +bug:
14796 +       printk(KERN_EMERG "Kernel BUG\n");
14797 +}
14798 +
14799 +/* This is gone through when something in the kernel
14800 + * has done something bad and is about to be terminated.
14801 +*/
14802 +void die(const char * str, struct pt_regs * regs, long err)
14803 +{
14804 +       static struct {
14805 +               spinlock_t lock;
14806 +               u32 lock_owner;
14807 +               int lock_owner_depth;
14808 +       } die = {
14809 +               .lock =                 SPIN_LOCK_UNLOCKED,
14810 +               .lock_owner =           -1,
14811 +               .lock_owner_depth =     0
14812 +       };
14813 +       static int die_counter;
14814 +       unsigned long flags;
14815 +
14816 +       oops_enter();
14817 +
14818 +       if (die.lock_owner != raw_smp_processor_id()) {
14819 +               console_verbose();
14820 +               spin_lock_irqsave(&die.lock, flags);
14821 +               die.lock_owner = smp_processor_id();
14822 +               die.lock_owner_depth = 0;
14823 +               bust_spinlocks(1);
14824 +       }
14825 +       else
14826 +               local_save_flags(flags);
14827 +
14828 +       if (++die.lock_owner_depth < 3) {
14829 +               int nl = 0;
14830 +               unsigned long esp;
14831 +               unsigned short ss;
14832 +
14833 +               handle_BUG(regs);
14834 +               printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
14835 +#ifdef CONFIG_PREEMPT
14836 +               printk(KERN_EMERG "PREEMPT ");
14837 +               nl = 1;
14838 +#endif
14839 +#ifdef CONFIG_SMP
14840 +               if (!nl)
14841 +                       printk(KERN_EMERG);
14842 +               printk("SMP ");
14843 +               nl = 1;
14844 +#endif
14845 +#ifdef CONFIG_DEBUG_PAGEALLOC
14846 +               if (!nl)
14847 +                       printk(KERN_EMERG);
14848 +               printk("DEBUG_PAGEALLOC");
14849 +               nl = 1;
14850 +#endif
14851 +               if (nl)
14852 +                       printk("\n");
14853 +               if (notify_die(DIE_OOPS, str, regs, err,
14854 +                                       current->thread.trap_no, SIGSEGV) !=
14855 +                               NOTIFY_STOP) {
14856 +                       show_registers(regs);
14857 +                       /* Executive summary in case the oops scrolled away */
14858 +                       esp = (unsigned long) (&regs->esp);
14859 +                       savesegment(ss, ss);
14860 +                       if (user_mode(regs)) {
14861 +                               esp = regs->esp;
14862 +                               ss = regs->xss & 0xffff;
14863 +                       }
14864 +                       printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
14865 +                       print_symbol("%s", regs->eip);
14866 +                       printk(" SS:ESP %04x:%08lx\n", ss, esp);
14867 +               }
14868 +               else
14869 +                       regs = NULL;
14870 +       } else
14871 +               printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
14872 +
14873 +       bust_spinlocks(0);
14874 +       die.lock_owner = -1;
14875 +       spin_unlock_irqrestore(&die.lock, flags);
14876 +
14877 +       if (!regs)
14878 +               return;
14879 +
14880 +       if (kexec_should_crash(current))
14881 +               crash_kexec(regs);
14882 +
14883 +       if (in_interrupt())
14884 +               panic("Fatal exception in interrupt");
14885 +
14886 +       if (panic_on_oops) {
14887 +               printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n");
14888 +               ssleep(5);
14889 +               panic("Fatal exception");
14890 +       }
14891 +       oops_exit();
14892 +       do_exit(SIGSEGV);
14893 +}
14894 +
14895 +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
14896 +{
14897 +       if (!user_mode_vm(regs))
14898 +               die(str, regs, err);
14899 +}
14900 +
14901 +static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
14902 +                             struct pt_regs * regs, long error_code,
14903 +                             siginfo_t *info)
14904 +{
14905 +       struct task_struct *tsk = current;
14906 +       tsk->thread.error_code = error_code;
14907 +       tsk->thread.trap_no = trapnr;
14908 +
14909 +       if (regs->eflags & VM_MASK) {
14910 +               if (vm86)
14911 +                       goto vm86_trap;
14912 +               goto trap_signal;
14913 +       }
14914 +
14915 +       if (!user_mode(regs))
14916 +               goto kernel_trap;
14917 +
14918 +       trap_signal: {
14919 +               if (info)
14920 +                       force_sig_info(signr, info, tsk);
14921 +               else
14922 +                       force_sig(signr, tsk);
14923 +               return;
14924 +       }
14925 +
14926 +       kernel_trap: {
14927 +               if (!fixup_exception(regs))
14928 +                       die(str, regs, error_code);
14929 +               return;
14930 +       }
14931 +
14932 +       vm86_trap: {
14933 +               int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
14934 +               if (ret) goto trap_signal;
14935 +               return;
14936 +       }
14937 +}
14938 +
14939 +#define DO_ERROR(trapnr, signr, str, name) \
14940 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
14941 +{ \
14942 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
14943 +                                               == NOTIFY_STOP) \
14944 +               return; \
14945 +       do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
14946 +}
14947 +
14948 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
14949 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
14950 +{ \
14951 +       siginfo_t info; \
14952 +       info.si_signo = signr; \
14953 +       info.si_errno = 0; \
14954 +       info.si_code = sicode; \
14955 +       info.si_addr = (void __user *)siaddr; \
14956 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
14957 +                                               == NOTIFY_STOP) \
14958 +               return; \
14959 +       do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
14960 +}
14961 +
14962 +#define DO_VM86_ERROR(trapnr, signr, str, name) \
14963 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
14964 +{ \
14965 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
14966 +                                               == NOTIFY_STOP) \
14967 +               return; \
14968 +       do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
14969 +}
14970 +
14971 +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
14972 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
14973 +{ \
14974 +       siginfo_t info; \
14975 +       info.si_signo = signr; \
14976 +       info.si_errno = 0; \
14977 +       info.si_code = sicode; \
14978 +       info.si_addr = (void __user *)siaddr; \
14979 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
14980 +                                               == NOTIFY_STOP) \
14981 +               return; \
14982 +       do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
14983 +}
14984 +
14985 +DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->eip)
14986 +#ifndef CONFIG_KPROBES
14987 +DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
14988 +#endif
14989 +DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
14990 +DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
14991 +DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip)
14992 +DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
14993 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
14994 +DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
14995 +DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
14996 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
14997 +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
14998 +
14999 +fastcall void __kprobes do_general_protection(struct pt_regs * regs,
15000 +                                             long error_code)
15001 +{
15002 +       current->thread.error_code = error_code;
15003 +       current->thread.trap_no = 13;
15004 +
15005 +       if (regs->eflags & VM_MASK)
15006 +               goto gp_in_vm86;
15007 +
15008 +       if (!user_mode(regs))
15009 +               goto gp_in_kernel;
15010 +
15011 +       current->thread.error_code = error_code;
15012 +       current->thread.trap_no = 13;
15013 +       force_sig(SIGSEGV, current);
15014 +       return;
15015 +
15016 +gp_in_vm86:
15017 +       local_irq_enable();
15018 +       handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
15019 +       return;
15020 +
15021 +gp_in_kernel:
15022 +       if (!fixup_exception(regs)) {
15023 +               if (notify_die(DIE_GPF, "general protection fault", regs,
15024 +                               error_code, 13, SIGSEGV) == NOTIFY_STOP)
15025 +                       return;
15026 +               die("general protection fault", regs, error_code);
15027 +       }
15028 +}
15029 +
15030 +static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
15031 +{
15032 +       printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
15033 +                       "to continue\n");
15034 +       printk(KERN_EMERG "You probably have a hardware problem with your RAM "
15035 +                       "chips\n");
15036 +
15037 +       /* Clear and disable the memory parity error line. */
15038 +       clear_mem_error(reason);
15039 +}
15040 +
15041 +static void io_check_error(unsigned char reason, struct pt_regs * regs)
15042 +{
15043 +       printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
15044 +       show_registers(regs);
15045 +
15046 +       /* Re-enable the IOCK line, wait for a few seconds */
15047 +       clear_io_check_error(reason);
15048 +}
15049 +
15050 +static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
15051 +{
15052 +#ifdef CONFIG_MCA
15053 +       /* Might actually be able to figure out what the guilty party
15054 +       * is. */
15055 +       if( MCA_bus ) {
15056 +               mca_handle_nmi();
15057 +               return;
15058 +       }
15059 +#endif
15060 +       printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
15061 +               reason, smp_processor_id());
15062 +       printk("Dazed and confused, but trying to continue\n");
15063 +       printk("Do you have a strange power saving mode enabled?\n");
15064 +}
15065 +
15066 +static DEFINE_SPINLOCK(nmi_print_lock);
15067 +
15068 +void die_nmi (struct pt_regs *regs, const char *msg)
15069 +{
15070 +       if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
15071 +           NOTIFY_STOP)
15072 +               return;
15073 +
15074 +       spin_lock(&nmi_print_lock);
15075 +       /*
15076 +       * We are in trouble anyway, let's at least try
15077 +       * to get a message out.
15078 +       */
15079 +       bust_spinlocks(1);
15080 +       printk(KERN_EMERG "%s", msg);
15081 +       printk(" on CPU%d, eip %08lx, registers:\n",
15082 +               smp_processor_id(), regs->eip);
15083 +       show_registers(regs);
15084 +       printk(KERN_EMERG "console shuts up ...\n");
15085 +       console_silent();
15086 +       spin_unlock(&nmi_print_lock);
15087 +       bust_spinlocks(0);
15088 +
15089 +       /* If we are in kernel we are probably nested up pretty bad
15090 +        * and might as well get out now while we still can.
15091 +       */
15092 +       if (!user_mode_vm(regs)) {
15093 +               current->thread.trap_no = 2;
15094 +               crash_kexec(regs);
15095 +       }
15096 +
15097 +       do_exit(SIGSEGV);
15098 +}
15099 +
15100 +static void default_do_nmi(struct pt_regs * regs)
15101 +{
15102 +       unsigned char reason = 0;
15103 +
15104 +       /* Only the BSP gets external NMIs from the system.  */
15105 +       if (!smp_processor_id())
15106 +               reason = get_nmi_reason();
15107 +
15108 +       if (!(reason & 0xc0)) {
15109 +               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
15110 +                                                       == NOTIFY_STOP)
15111 +                       return;
15112 +#ifdef CONFIG_X86_LOCAL_APIC
15113 +               /*
15114 +                * Ok, so this is none of the documented NMI sources,
15115 +                * so it must be the NMI watchdog.
15116 +                */
15117 +               if (nmi_watchdog) {
15118 +                       nmi_watchdog_tick(regs);
15119 +                       return;
15120 +               }
15121 +#endif
15122 +               unknown_nmi_error(reason, regs);
15123 +               return;
15124 +       }
15125 +       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
15126 +               return;
15127 +       if (reason & 0x80)
15128 +               mem_parity_error(reason, regs);
15129 +       if (reason & 0x40)
15130 +               io_check_error(reason, regs);
15131 +       /*
15132 +        * Reassert NMI in case it became active meanwhile
15133 +        * as it's edge-triggered.
15134 +        */
15135 +       reassert_nmi();
15136 +}
15137 +
15138 +static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
15139 +{
15140 +       return 0;
15141 +}
15142 +
15143 +static nmi_callback_t nmi_callback = dummy_nmi_callback;
15144 +
15145 +fastcall void do_nmi(struct pt_regs * regs, long error_code)
15146 +{
15147 +       int cpu;
15148 +
15149 +       nmi_enter();
15150 +
15151 +       cpu = smp_processor_id();
15152 +
15153 +       ++nmi_count(cpu);
15154 +
15155 +       if (!rcu_dereference(nmi_callback)(regs, cpu))
15156 +               default_do_nmi(regs);
15157 +
15158 +       nmi_exit();
15159 +}
15160 +
15161 +void set_nmi_callback(nmi_callback_t callback)
15162 +{
15163 +       vmalloc_sync_all();
15164 +       rcu_assign_pointer(nmi_callback, callback);
15165 +}
15166 +EXPORT_SYMBOL_GPL(set_nmi_callback);
15167 +
15168 +void unset_nmi_callback(void)
15169 +{
15170 +       nmi_callback = dummy_nmi_callback;
15171 +}
15172 +EXPORT_SYMBOL_GPL(unset_nmi_callback);
15173 +
15174 +#ifdef CONFIG_KPROBES
15175 +fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
15176 +{
15177 +       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
15178 +                       == NOTIFY_STOP)
15179 +               return;
15180 +       /* This is an interrupt gate, because kprobes wants interrupts
15181 +       disabled.  Normal trap handlers don't. */
15182 +       restore_interrupts(regs);
15183 +       do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
15184 +}
15185 +#endif
15186 +
15187 +/*
15188 + * Our handling of the processor debug registers is non-trivial.
15189 + * We do not clear them on entry and exit from the kernel. Therefore
15190 + * it is possible to get a watchpoint trap here from inside the kernel.
15191 + * However, the code in ./ptrace.c has ensured that the user can
15192 + * only set watchpoints on userspace addresses. Therefore the in-kernel
15193 + * watchpoint trap can only occur in code which is reading/writing
15194 + * from user space. Such code must not hold kernel locks (since it
15195 + * can equally take a page fault), therefore it is safe to call
15196 + * force_sig_info even though that claims and releases locks.
15197 + * 
15198 + * Code in ./signal.c ensures that the debug control register
15199 + * is restored before we deliver any signal, and therefore that
15200 + * user code runs with the correct debug control register even though
15201 + * we clear it here.
15202 + *
15203 + * Being careful here means that we don't have to be as careful in a
15204 + * lot of more complicated places (task switching can be a bit lazy
15205 + * about restoring all the debug state, and ptrace doesn't have to
15206 + * find every occurrence of the TF bit that could be saved away even
15207 + * by user code)
15208 + */
15209 +fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
15210 +{
15211 +       unsigned int condition;
15212 +       struct task_struct *tsk = current;
15213 +
15214 +       get_debugreg(condition, 6);
15215 +
15216 +       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
15217 +                                       SIGTRAP) == NOTIFY_STOP)
15218 +               return;
15219 +       /* It's safe to allow irq's after DR6 has been saved */
15220 +       if (regs->eflags & X86_EFLAGS_IF)
15221 +               local_irq_enable();
15222 +
15223 +       /* Mask out spurious debug traps due to lazy DR7 setting */
15224 +       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
15225 +               if (!tsk->thread.debugreg[7])
15226 +                       goto clear_dr7;
15227 +       }
15228 +
15229 +       if (regs->eflags & VM_MASK)
15230 +               goto debug_vm86;
15231 +
15232 +       /* Save debug status register where ptrace can see it */
15233 +       tsk->thread.debugreg[6] = condition;
15234 +
15235 +       /*
15236 +        * Single-stepping through TF: make sure we ignore any events in
15237 +        * kernel space (but re-enable TF when returning to user mode).
15238 +        */
15239 +       if (condition & DR_STEP) {
15240 +               /*
15241 +                * We already checked v86 mode above, so we can
15242 +                * check for kernel mode by just checking the CPL
15243 +                * of CS.
15244 +                */
15245 +               if (!user_mode(regs))
15246 +                       goto clear_TF_reenable;
15247 +       }
15248 +
15249 +       /* Ok, finally something we can handle */
15250 +       send_sigtrap(tsk, regs, error_code);
15251 +
15252 +       /* Disable additional traps. They'll be re-enabled when
15253 +        * the signal is delivered.
15254 +        */
15255 +clear_dr7:
15256 +       set_debugreg(0, 7);
15257 +       return;
15258 +
15259 +debug_vm86:
15260 +       handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
15261 +       return;
15262 +
15263 +clear_TF_reenable:
15264 +       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
15265 +       regs->eflags &= ~TF_MASK;
15266 +       return;
15267 +}
15268 +
15269 +/*
15270 + * Note that we play around with the 'TS' bit in an attempt to get
15271 + * the correct behaviour even in the presence of the asynchronous
15272 + * IRQ13 behaviour
15273 + */
15274 +void math_error(void __user *eip)
15275 +{
15276 +       struct task_struct * task;
15277 +       siginfo_t info;
15278 +       unsigned short cwd, swd;
15279 +
15280 +       /*
15281 +        * Save the info for the exception handler and clear the error.
15282 +        */
15283 +       task = current;
15284 +       save_init_fpu(task);
15285 +       task->thread.trap_no = 16;
15286 +       task->thread.error_code = 0;
15287 +       info.si_signo = SIGFPE;
15288 +       info.si_errno = 0;
15289 +       info.si_code = __SI_FAULT;
15290 +       info.si_addr = eip;
15291 +       /*
15292 +        * (~cwd & swd) will mask out exceptions that are not set to unmasked
15293 +        * status.  0x3f is the exception bits in these regs, 0x200 is the
15294 +        * C1 reg you need in case of a stack fault, 0x040 is the stack
15295 +        * fault bit.  We should only be taking one exception at a time,
15296 +        * so if this combination doesn't produce any single exception,
15297 +        * then we have a bad program that isn't synchronizing its FPU usage
15298 +        * and it will suffer the consequences since we won't be able to
15299 +        * fully reproduce the context of the exception
15300 +        */
15301 +       cwd = get_fpu_cwd(task);
15302 +       swd = get_fpu_swd(task);
15303 +       switch (swd & ~cwd & 0x3f) {
15304 +               case 0x000: /* No unmasked exception */
15305 +                       return;
15306 +               default:    /* Multiple exceptions */
15307 +                       break;
15308 +               case 0x001: /* Invalid Op */
15309 +                       /*
15310 +                        * swd & 0x240 == 0x040: Stack Underflow
15311 +                        * swd & 0x240 == 0x240: Stack Overflow
15312 +                        * User must clear the SF bit (0x40) if set
15313 +                        */
15314 +                       info.si_code = FPE_FLTINV;
15315 +                       break;
15316 +               case 0x002: /* Denormalize */
15317 +               case 0x010: /* Underflow */
15318 +                       info.si_code = FPE_FLTUND;
15319 +                       break;
15320 +               case 0x004: /* Zero Divide */
15321 +                       info.si_code = FPE_FLTDIV;
15322 +                       break;
15323 +               case 0x008: /* Overflow */
15324 +                       info.si_code = FPE_FLTOVF;
15325 +                       break;
15326 +               case 0x020: /* Precision */
15327 +                       info.si_code = FPE_FLTRES;
15328 +                       break;
15329 +       }
15330 +       force_sig_info(SIGFPE, &info, task);
15331 +}
15332 +
15333 +fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
15334 +{
15335 +       ignore_fpu_irq = 1;
15336 +       math_error((void __user *)regs->eip);
15337 +}
15338 +
15339 +static void simd_math_error(void __user *eip)
15340 +{
15341 +       struct task_struct * task;
15342 +       siginfo_t info;
15343 +       unsigned short mxcsr;
15344 +
15345 +       /*
15346 +        * Save the info for the exception handler and clear the error.
15347 +        */
15348 +       task = current;
15349 +       save_init_fpu(task);
15350 +       task->thread.trap_no = 19;
15351 +       task->thread.error_code = 0;
15352 +       info.si_signo = SIGFPE;
15353 +       info.si_errno = 0;
15354 +       info.si_code = __SI_FAULT;
15355 +       info.si_addr = eip;
15356 +       /*
15357 +        * The SIMD FPU exceptions are handled a little differently, as there
15358 +        * is only a single status/control register.  Thus, to determine which
15359 +        * unmasked exception was caught we must mask the exception mask bits
15360 +        * at 0x1f80, and then use these to mask the exception bits at 0x3f.
15361 +        */
15362 +       mxcsr = get_fpu_mxcsr(task);
15363 +       switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
15364 +               case 0x000:
15365 +               default:
15366 +                       break;
15367 +               case 0x001: /* Invalid Op */
15368 +                       info.si_code = FPE_FLTINV;
15369 +                       break;
15370 +               case 0x002: /* Denormalize */
15371 +               case 0x010: /* Underflow */
15372 +                       info.si_code = FPE_FLTUND;
15373 +                       break;
15374 +               case 0x004: /* Zero Divide */
15375 +                       info.si_code = FPE_FLTDIV;
15376 +                       break;
15377 +               case 0x008: /* Overflow */
15378 +                       info.si_code = FPE_FLTOVF;
15379 +                       break;
15380 +               case 0x020: /* Precision */
15381 +                       info.si_code = FPE_FLTRES;
15382 +                       break;
15383 +       }
15384 +       force_sig_info(SIGFPE, &info, task);
15385 +}
15386 +
15387 +fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
15388 +                                         long error_code)
15389 +{
15390 +       if (cpu_has_xmm) {
15391 +               /* Handle SIMD FPU exceptions on PIII+ processors. */
15392 +               ignore_fpu_irq = 1;
15393 +               simd_math_error((void __user *)regs->eip);
15394 +       } else {
15395 +               /*
15396 +                * Handle strange cache flush from user space exception
15397 +                * in all other cases.  This is undocumented behaviour.
15398 +                */
15399 +               if (regs->eflags & VM_MASK) {
15400 +                       handle_vm86_fault((struct kernel_vm86_regs *)regs,
15401 +                                         error_code);
15402 +                       return;
15403 +               }
15404 +               current->thread.trap_no = 19;
15405 +               current->thread.error_code = error_code;
15406 +               die_if_kernel("cache flush denied", regs, error_code);
15407 +               force_sig(SIGSEGV, current);
15408 +       }
15409 +}
15410 +
15411 +#ifndef CONFIG_XEN
15412 +fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
15413 +                                         long error_code)
15414 +{
15415 +#if 0
15416 +       /* No need to warn about this any longer. */
15417 +       printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
15418 +#endif
15419 +}
15420 +
15421 +fastcall void setup_x86_bogus_stack(unsigned char * stk)
15422 +{
15423 +       unsigned long *switch16_ptr, *switch32_ptr;
15424 +       struct pt_regs *regs;
15425 +       unsigned long stack_top, stack_bot;
15426 +       unsigned short iret_frame16_off;
15427 +       int cpu = smp_processor_id();
15428 +       /* reserve the space on 32bit stack for the magic switch16 pointer */
15429 +       memmove(stk, stk + 8, sizeof(struct pt_regs));
15430 +       switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
15431 +       regs = (struct pt_regs *)stk;
15432 +       /* now the switch32 on 16bit stack */
15433 +       stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
15434 +       stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
15435 +       switch32_ptr = (unsigned long *)(stack_top - 8);
15436 +       iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
15437 +       /* copy iret frame on 16bit stack */
15438 +       memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
15439 +       /* fill in the switch pointers */
15440 +       switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
15441 +       switch16_ptr[1] = __ESPFIX_SS;
15442 +       switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
15443 +               8 - CPU_16BIT_STACK_SIZE;
15444 +       switch32_ptr[1] = __KERNEL_DS;
15445 +}
15446 +
15447 +fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
15448 +{
15449 +       unsigned long *switch32_ptr;
15450 +       unsigned char *stack16, *stack32;
15451 +       unsigned long stack_top, stack_bot;
15452 +       int len;
15453 +       int cpu = smp_processor_id();
15454 +       stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
15455 +       stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
15456 +       switch32_ptr = (unsigned long *)(stack_top - 8);
15457 +       /* copy the data from 16bit stack to 32bit stack */
15458 +       len = CPU_16BIT_STACK_SIZE - 8 - sp;
15459 +       stack16 = (unsigned char *)(stack_bot + sp);
15460 +       stack32 = (unsigned char *)
15461 +               (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
15462 +       memcpy(stack32, stack16, len);
15463 +       return stack32;
15464 +}
15465 +#endif
15466 +
15467 +/*
15468 + *  'math_state_restore()' saves the current math information in the
15469 + * old math state array, and gets the new ones from the current task
15470 + *
15471 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
15472 + * Don't touch unless you *really* know how it works.
15473 + *
15474 + * Must be called with kernel preemption disabled (in this case,
15475 + * local interrupts are disabled at the call-site in entry.S).
15476 + */
15477 +asmlinkage void math_state_restore(struct pt_regs regs)
15478 +{
15479 +       struct thread_info *thread = current_thread_info();
15480 +       struct task_struct *tsk = thread->task;
15481 +
15482 +       /* NB. 'clts' is done for us by Xen during virtual trap. */
15483 +       if (!tsk_used_math(tsk))
15484 +               init_fpu(tsk);
15485 +       restore_fpu(tsk);
15486 +       thread->status |= TS_USEDFPU;   /* So we fnsave on switch_to() */
15487 +}
15488 +
15489 +#ifndef CONFIG_MATH_EMULATION
15490 +
15491 +asmlinkage void math_emulate(long arg)
15492 +{
15493 +       printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
15494 +       printk(KERN_EMERG "killing %s.\n",current->comm);
15495 +       force_sig(SIGFPE,current);
15496 +       schedule();
15497 +}
15498 +
15499 +#endif /* CONFIG_MATH_EMULATION */
15500 +
15501 +#ifdef CONFIG_X86_F00F_BUG
15502 +void __init trap_init_f00f_bug(void)
15503 +{
15504 +       __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
15505 +
15506 +       /*
15507 +        * Update the IDT descriptor and reload the IDT so that
15508 +        * it uses the read-only mapped virtual address.
15509 +        */
15510 +       idt_descr.address = fix_to_virt(FIX_F00F_IDT);
15511 +       load_idt(&idt_descr);
15512 +}
15513 +#endif
15514 +
15515 +
15516 +/*
15517 + * NB. All these are "trap gates" (i.e. events_mask isn't set) except
15518 + * for those that specify <dpl>|4 in the second field.
15519 + */
15520 +static trap_info_t trap_table[] = {
15521 +       {  0, 0, __KERNEL_CS, (unsigned long)divide_error               },
15522 +       {  1, 0|4, __KERNEL_CS, (unsigned long)debug                    },
15523 +       {  3, 3|4, __KERNEL_CS, (unsigned long)int3                     },
15524 +       {  4, 3, __KERNEL_CS, (unsigned long)overflow                   },
15525 +       {  5, 0, __KERNEL_CS, (unsigned long)bounds                     },
15526 +       {  6, 0, __KERNEL_CS, (unsigned long)invalid_op                 },
15527 +       {  7, 0|4, __KERNEL_CS, (unsigned long)device_not_available     },
15528 +       {  9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
15529 +       { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS                },
15530 +       { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present        },
15531 +       { 12, 0, __KERNEL_CS, (unsigned long)stack_segment              },
15532 +       { 13, 0, __KERNEL_CS, (unsigned long)general_protection         },
15533 +       { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault               },
15534 +       { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment          },
15535 +       { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error          },
15536 +       { 17, 0, __KERNEL_CS, (unsigned long)alignment_check            },
15537 +#ifdef CONFIG_X86_MCE
15538 +       { 18, 0, __KERNEL_CS, (unsigned long)machine_check              },
15539 +#endif
15540 +       { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error     },
15541 +       { SYSCALL_VECTOR,  3, __KERNEL_CS, (unsigned long)system_call   },
15542 +       {  0, 0,           0, 0                                         }
15543 +};
15544 +
15545 +void __init trap_init(void)
15546 +{
15547 +       HYPERVISOR_set_trap_table(trap_table);
15548 +
15549 +       if (cpu_has_fxsr) {
15550 +               /*
15551 +                * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
15552 +                * Generates a compile-time "error: zero width for bit-field" if
15553 +                * the alignment is wrong.
15554 +                */
15555 +               struct fxsrAlignAssert {
15556 +                       int _:!(offsetof(struct task_struct,
15557 +                                       thread.i387.fxsave) & 15);
15558 +               };
15559 +
15560 +               printk(KERN_INFO "Enabling fast FPU save and restore... ");
15561 +               set_in_cr4(X86_CR4_OSFXSR);
15562 +               printk("done.\n");
15563 +       }
15564 +       if (cpu_has_xmm) {
15565 +               printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
15566 +                               "support... ");
15567 +               set_in_cr4(X86_CR4_OSXMMEXCPT);
15568 +               printk("done.\n");
15569 +       }
15570 +
15571 +       /*
15572 +        * Should be a barrier for any external CPU state.
15573 +        */
15574 +       cpu_init();
15575 +}
15576 +
15577 +void smp_trap_init(trap_info_t *trap_ctxt)
15578 +{
15579 +       trap_info_t *t = trap_table;
15580 +
15581 +       for (t = trap_table; t->address; t++) {
15582 +               trap_ctxt[t->vector].flags = t->flags;
15583 +               trap_ctxt[t->vector].cs = t->cs;
15584 +               trap_ctxt[t->vector].address = t->address;
15585 +       }
15586 +}
15587 +
15588 +static int __init kstack_setup(char *s)
15589 +{
15590 +       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
15591 +       return 1;
15592 +}
15593 +__setup("kstack=", kstack_setup);
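For reference, a minimal sketch of a client of the NMI callback interface defined above (set_nmi_callback()/unset_nmi_callback()): do_nmi() invokes the registered callback first and only falls back to default_do_nmi() when it returns 0. The module and function names below are hypothetical and the block is illustrative only, not part of this patch.

/* Illustrative only -- not part of linux-2.6-xen.patch. */
#include <linux/module.h>
#include <linux/init.h>
#include <asm/nmi.h>

/* Runs in NMI context; return nonzero to claim the NMI (skipping
 * default_do_nmi()), or 0 to let the default handling proceed. */
static int example_nmi_handler(struct pt_regs *regs, int cpu)
{
	return 0;	/* this sketch never claims the NMI */
}

static int __init example_init(void)
{
	set_nmi_callback(example_nmi_handler);
	return 0;
}

static void __exit example_exit(void)
{
	unset_nmi_callback();
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");	/* set_nmi_callback() is EXPORT_SYMBOL_GPL above */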
15594 diff -urNp linux-2.6/arch/i386/kernel/vm86.c new/arch/i386/kernel/vm86.c
15595 --- linux-2.6/arch/i386/kernel/vm86.c   2006-07-03 14:14:14.000000000 +0200
15596 +++ new/arch/i386/kernel/vm86.c 2006-06-07 13:29:36.000000000 +0200
15597 @@ -98,7 +98,9 @@
15598  struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
15599  struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
15600  {
15601 +#ifndef CONFIG_X86_NO_TSS
15602         struct tss_struct *tss;
15603 +#endif
15604         struct pt_regs *ret;
15605         unsigned long tmp;
15606  
15607 @@ -123,12 +125,16 @@ struct pt_regs * fastcall save_v86_state
15608                 do_exit(SIGSEGV);
15609         }
15610  
15611 +#ifndef CONFIG_X86_NO_TSS
15612         tss = &per_cpu(init_tss, get_cpu());
15613 +#endif
15614         current->thread.esp0 = current->thread.saved_esp0;
15615         current->thread.sysenter_cs = __KERNEL_CS;
15616         load_esp0(tss, &current->thread);
15617         current->thread.saved_esp0 = 0;
15618 +#ifndef CONFIG_X86_NO_TSS
15619         put_cpu();
15620 +#endif
15621  
15622         loadsegment(fs, current->thread.saved_fs);
15623         loadsegment(gs, current->thread.saved_gs);
15624 @@ -252,7 +258,9 @@ out:
15625  
15626  static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
15627  {
15628 +#ifndef CONFIG_X86_NO_TSS
15629         struct tss_struct *tss;
15630 +#endif
15631         long eax;
15632  /*
15633   * make sure the vm86() system call doesn't try to do anything silly
15634 @@ -297,12 +305,16 @@ static void do_sys_vm86(struct kernel_vm
15635         savesegment(fs, tsk->thread.saved_fs);
15636         savesegment(gs, tsk->thread.saved_gs);
15637  
15638 +#ifndef CONFIG_X86_NO_TSS
15639         tss = &per_cpu(init_tss, get_cpu());
15640 +#endif
15641         tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
15642         if (cpu_has_sep)
15643                 tsk->thread.sysenter_cs = 0;
15644         load_esp0(tss, &tsk->thread);
15645 +#ifndef CONFIG_X86_NO_TSS
15646         put_cpu();
15647 +#endif
15648  
15649         tsk->thread.screen_bitmap = info->screen_bitmap;
15650         if (info->flags & VM86_SCREEN_BITMAP)
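A note on the CONFIG_X86_NO_TSS guards above: the load_esp0(tss, ...) call sites are left unchanged even though the tss variable is no longer declared when CONFIG_X86_NO_TSS is set. That presumably compiles because the Xen headers define load_esp0() as a macro that never expands its first argument. The stub below is only a hypothetical sketch of such a definition; the hypercall it uses is an assumption, not taken from this patch.

/* Hypothetical sketch -- not part of linux-2.6-xen.patch. */
#ifdef CONFIG_X86_NO_TSS
/* The 'tss' argument is deliberately never expanded, so callers may pass
 * an identifier that is not declared in their scope. */
#define load_esp0(tss, thread) do {					\
	if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0))	\
		BUG();							\
} while (0)
#endif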
15651 diff -urNp linux-2.6/arch/i386/kernel/vsyscall-note-xen.S new/arch/i386/kernel/vsyscall-note-xen.S
15652 --- linux-2.6/arch/i386/kernel/vsyscall-note-xen.S      1970-01-01 01:00:00.000000000 +0100
15653 +++ new/arch/i386/kernel/vsyscall-note-xen.S    2006-05-09 12:32:36.000000000 +0200
15654 @@ -0,0 +1,32 @@
15655 +/*
15656 + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
15657 + * Here we can supply some information useful to userland.
15658 + * First we get the vanilla i386 note that supplies the kernel version info.
15659 + */
15660 +
15661 +#include "vsyscall-note.S"
15662 +
15663 +/*
15664 + * Now we add a special note telling glibc's dynamic linker a fake hardware
15665 + * flavor that it will use to choose the search path for libraries in the
15666 + * same way it uses real hardware capabilities like "mmx".
15667 + * We supply "nosegneg" as the fake capability, to indicate that we
15668 + * do not like negative offsets in instructions using segment overrides,
15669 + * since we implement those inefficiently.  This makes it possible to
15670 + * install libraries optimized to avoid those access patterns in someplace
15671 + * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d file
15672 + * corresponding to the bits here is needed to make ldconfig work right.
15673 + * It should contain:
15674 + *     hwcap 0 nosegneg
15675 + * to match the mapping of bit to name that we give here.
15676 + */
15677 +#define NOTE_KERNELCAP_BEGIN(ncaps, mask) \
15678 +       ASM_ELF_NOTE_BEGIN(".note.kernelcap", "a", "GNU", 2) \
15679 +       .long ncaps, mask
15680 +#define NOTE_KERNELCAP(bit, name) \
15681 +       .byte bit; .asciz name
15682 +#define NOTE_KERNELCAP_END ASM_ELF_NOTE_END
15683 +
15684 +NOTE_KERNELCAP_BEGIN(1, 1)
15685 +NOTE_KERNELCAP(1, "nosegneg")  /* Change 1 back to 0 when glibc is fixed! */
15686 +NOTE_KERNELCAP_END
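The ldconfig side that the comment above asks for amounts to a one-line configuration file; only the file name below is an example, the contents come straight from the note above.

# /etc/ld.so.conf.d/nosegneg.conf  (example file name)
hwcap 0 nosegneg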
15687 diff -urNp linux-2.6/arch/i386/mach-xen/Makefile new/arch/i386/mach-xen/Makefile
15688 --- linux-2.6/arch/i386/mach-xen/Makefile       1970-01-01 01:00:00.000000000 +0100
15689 +++ new/arch/i386/mach-xen/Makefile     2006-05-09 12:32:36.000000000 +0200
15690 @@ -0,0 +1,5 @@
15691 +#
15692 +# Makefile for the linux kernel.
15693 +#
15694 +
15695 +obj-y                          := setup.o
15696 diff -urNp linux-2.6/arch/i386/mach-xen/setup.c new/arch/i386/mach-xen/setup.c
15697 --- linux-2.6/arch/i386/mach-xen/setup.c        1970-01-01 01:00:00.000000000 +0100
15698 +++ new/arch/i386/mach-xen/setup.c      2006-05-09 12:32:36.000000000 +0200
15699 @@ -0,0 +1,49 @@
15700 +/*
15701 + *     Machine specific setup for generic
15702 + */
15703 +
15704 +#include <linux/config.h>
15705 +#include <linux/smp.h>
15706 +#include <linux/init.h>
15707 +#include <linux/interrupt.h>
15708 +#include <asm/acpi.h>
15709 +#include <asm/arch_hooks.h>
15710 +
15711 +#ifdef CONFIG_HOTPLUG_CPU
15712 +#define DEFAULT_SEND_IPI       (1)
15713 +#else
15714 +#define DEFAULT_SEND_IPI       (0)
15715 +#endif
15716 +
15717 +int no_broadcast=DEFAULT_SEND_IPI;
15718 +
15719 +static __init int no_ipi_broadcast(char *str)
15720 +{
15721 +       get_option(&str, &no_broadcast);
15722 +       printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
15723 +                                                                                       "IPI Broadcast");
15724 +       return 1;
15725 +}
15726 +
15727 +__setup("no_ipi_broadcast", no_ipi_broadcast);
15728 +
15729 +static int __init print_ipi_mode(void)
15730 +{
15731 +       printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
15732 +                                                                                       "Shortcut");
15733 +       return 0;
15734 +}
15735 +
15736 +late_initcall(print_ipi_mode);
15737 +
15738 +/**
15739 + * pre_setup_arch_hook - hook called prior to any setup_arch() execution
15740 + *
15741 + * Description:
15742 + *     generally used to activate any machine specific identification
15743 + *     routines that may be needed before setup_arch() runs.  On VISWS
15744 + *     this is used to get the board revision and type.
15745 + **/
15746 +void __init pre_setup_arch_hook(void)
15747 +{
15748 +}
15749 diff -urNp linux-2.6/arch/i386/Makefile new/arch/i386/Makefile
15750 --- linux-2.6/arch/i386/Makefile        2006-07-03 14:14:14.000000000 +0200
15751 +++ new/arch/i386/Makefile      2006-05-09 12:32:33.000000000 +0200
15752 @@ -48,6 +48,11 @@ CFLAGS                               += $(shell if [ $(call cc-vers
15753  
15754  CFLAGS += $(cflags-y)
15755  
15756 +cppflags-$(CONFIG_XEN) += \
15757 +       -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
15758 +
15759 +CPPFLAGS += $(cppflags-y)
15760 +
15761  # Default subarch .c files
15762  mcore-y  := mach-default
15763  
15764 @@ -71,6 +76,10 @@ mcore-$(CONFIG_X86_BIGSMP)   := mach-defau
15765  mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit
15766  mcore-$(CONFIG_X86_SUMMIT)  := mach-default
15767  
15768 +# Xen subarch support
15769 +mflags-$(CONFIG_X86_XEN)       := -Iinclude/asm-i386/mach-xen
15770 +mcore-$(CONFIG_X86_XEN)                := mach-xen
15771 +
15772  # generic subarchitecture
15773  mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic
15774  mcore-$(CONFIG_X86_GENERICARCH) := mach-default
15775 @@ -105,6 +114,19 @@ boot := arch/i386/boot
15776  PHONY += zImage bzImage compressed zlilo bzlilo \
15777           zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
15778  
15779 +ifdef CONFIG_XEN
15780 +CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS)
15781 +head-y := arch/i386/kernel/head-xen.o arch/i386/kernel/init_task-xen.o
15782 +boot := arch/i386/boot-xen
15783 +.PHONY: vmlinuz
15784 +all: vmlinuz
15785 +
15786 +vmlinuz: vmlinux
15787 +       $(Q)$(MAKE) $(build)=$(boot) $@
15788 +
15789 +install:
15790 +       $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@
15791 +else
15792  all: bzImage
15793  
15794  # KBUILD_IMAGE specify target image being built
15795 @@ -127,6 +149,7 @@ fdimage fdimage144 fdimage288 isoimage: 
15796  
15797  install:
15798         $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
15799 +endif
15800  
15801  archclean:
15802         $(Q)$(MAKE) $(clean)=arch/i386/boot
15803 @@ -145,3 +168,4 @@ endef
15804  CLEAN_FILES += arch/$(ARCH)/boot/fdimage \
15805                arch/$(ARCH)/boot/image.iso \
15806                arch/$(ARCH)/boot/mtools.conf
15807 +CLEAN_FILES += vmlinuz vmlinux-stripped
15808 diff -urNp linux-2.6/arch/i386/mm/fault-xen.c new/arch/i386/mm/fault-xen.c
15809 --- linux-2.6/arch/i386/mm/fault-xen.c  1970-01-01 01:00:00.000000000 +0100
15810 +++ new/arch/i386/mm/fault-xen.c        2006-06-28 14:32:13.000000000 +0200
15811 @@ -0,0 +1,734 @@
15812 +/*
15813 + *  linux/arch/i386/mm/fault.c
15814 + *
15815 + *  Copyright (C) 1995  Linus Torvalds
15816 + */
15817 +
15818 +#include <linux/signal.h>
15819 +#include <linux/sched.h>
15820 +#include <linux/kernel.h>
15821 +#include <linux/errno.h>
15822 +#include <linux/string.h>
15823 +#include <linux/types.h>
15824 +#include <linux/ptrace.h>
15825 +#include <linux/mman.h>
15826 +#include <linux/mm.h>
15827 +#include <linux/smp.h>
15828 +#include <linux/smp_lock.h>
15829 +#include <linux/interrupt.h>
15830 +#include <linux/init.h>
15831 +#include <linux/tty.h>
15832 +#include <linux/vt_kern.h>             /* For unblank_screen() */
15833 +#include <linux/highmem.h>
15834 +#include <linux/module.h>
15835 +#include <linux/kprobes.h>
15836 +
15837 +#include <asm/system.h>
15838 +#include <asm/uaccess.h>
15839 +#include <asm/desc.h>
15840 +#include <asm/kdebug.h>
15841 +
15842 +extern void die(const char *,struct pt_regs *,long);
15843 +
15844 +/*
15845 + * Unlock any spinlocks which will prevent us from getting the
15846 + * message out 
15847 + */
15848 +void bust_spinlocks(int yes)
15849 +{
15850 +       int loglevel_save = console_loglevel;
15851 +
15852 +       if (yes) {
15853 +               oops_in_progress = 1;
15854 +               return;
15855 +       }
15856 +#ifdef CONFIG_VT
15857 +       unblank_screen();
15858 +#endif
15859 +       oops_in_progress = 0;
15860 +       /*
15861 +        * OK, the message is on the console.  Now we call printk()
15862 +        * without oops_in_progress set so that printk will give klogd
15863 +        * a poke.  Hold onto your hats...
15864 +        */
15865 +       console_loglevel = 15;          /* NMI oopser may have shut the console up */
15866 +       printk(" ");
15867 +       console_loglevel = loglevel_save;
15868 +}
15869 +
15870 +/*
15871 + * Return EIP plus the CS segment base.  The segment limit is also
15872 + * adjusted, clamped to the kernel/user address space (whichever is
15873 + * appropriate), and returned in *eip_limit.
15874 + *
15875 + * The segment is checked, because it might have been changed by another
15876 + * task between the original faulting instruction and here.
15877 + *
15878 + * If CS is no longer a valid code segment, or if EIP is beyond the
15879 + * limit, or if it is a kernel address when CS is not a kernel segment,
15880 + * then the returned value will be greater than *eip_limit.
15881 + * 
15882 + * This is slow, but is very rarely executed.
15883 + */
15884 +static inline unsigned long get_segment_eip(struct pt_regs *regs,
15885 +                                           unsigned long *eip_limit)
15886 +{
15887 +       unsigned long eip = regs->eip;
15888 +       unsigned seg = regs->xcs & 0xffff;
15889 +       u32 seg_ar, seg_limit, base, *desc;
15890 +
15891 +       /* The standard kernel/user address space limit. */
15892 +       *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
15893 +
15894 +       /* Unlikely, but must come before segment checks. */
15895 +       if (unlikely((regs->eflags & VM_MASK) != 0))
15896 +               return eip + (seg << 4);
15897 +       
15898 +       /* By far the most common cases. */
15899 +       if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
15900 +               return eip;
15901 +
15902 +       /* Check the segment exists, is within the current LDT/GDT size,
15903 +          that kernel/user (ring 0..3) has the appropriate privilege,
15904 +          that it's a code segment, and get the limit. */
15905 +       __asm__ ("larl %3,%0; lsll %3,%1"
15906 +                : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
15907 +       if ((~seg_ar & 0x9800) || eip > seg_limit) {
15908 +               *eip_limit = 0;
15909 +               return 1;        /* So that returned eip > *eip_limit. */
15910 +       }
15911 +
15912 +       /* Get the GDT/LDT descriptor base. 
15913 +          When you look for races in this code remember that
15914 +          LDT and other horrors are only used in user space. */
15915 +       if (seg & (1<<2)) {
15916 +               /* Must lock the LDT while reading it. */
15917 +               down(&current->mm->context.sem);
15918 +               desc = current->mm->context.ldt;
15919 +               desc = (void *)desc + (seg & ~7);
15920 +       } else {
15921 +               /* Must disable preemption while reading the GDT. */
15922 +               desc = (u32 *)get_cpu_gdt_table(get_cpu());
15923 +               desc = (void *)desc + (seg & ~7);
15924 +       }
15925 +
15926 +       /* Decode the code segment base from the descriptor */
15927 +       base = get_desc_base((unsigned long *)desc);
15928 +
15929 +       if (seg & (1<<2)) { 
15930 +               up(&current->mm->context.sem);
15931 +       } else
15932 +               put_cpu();
15933 +
15934 +       /* Adjust EIP and segment limit, and clamp at the kernel limit.
15935 +          It's legitimate for segments to wrap at 0xffffffff. */
15936 +       seg_limit += base;
15937 +       if (seg_limit < *eip_limit && seg_limit >= base)
15938 +               *eip_limit = seg_limit;
15939 +       return eip + base;
15940 +}
15941 +
15942 +/* 
15943 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
15944 + * Check that here and ignore it.
15945 + */
15946 +static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
15947 +{ 
15948 +       unsigned long limit;
15949 +       unsigned long instr = get_segment_eip (regs, &limit);
15950 +       int scan_more = 1;
15951 +       int prefetch = 0; 
15952 +       int i;
15953 +
15954 +       for (i = 0; scan_more && i < 15; i++) { 
15955 +               unsigned char opcode;
15956 +               unsigned char instr_hi;
15957 +               unsigned char instr_lo;
15958 +
15959 +               if (instr > limit)
15960 +                       break;
15961 +               if (__get_user(opcode, (unsigned char __user *) instr))
15962 +                       break; 
15963 +
15964 +               instr_hi = opcode & 0xf0; 
15965 +               instr_lo = opcode & 0x0f; 
15966 +               instr++;
15967 +
15968 +               switch (instr_hi) { 
15969 +               case 0x20:
15970 +               case 0x30:
15971 +                       /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
15972 +                       scan_more = ((instr_lo & 7) == 0x6);
15973 +                       break;
15974 +                       
15975 +               case 0x60:
15976 +                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
15977 +                       scan_more = (instr_lo & 0xC) == 0x4;
15978 +                       break;          
15979 +               case 0xF0:
15980 +                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
15981 +                       scan_more = !instr_lo || (instr_lo>>1) == 1;
15982 +                       break;                  
15983 +               case 0x00:
15984 +                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
15985 +                       scan_more = 0;
15986 +                       if (instr > limit)
15987 +                               break;
15988 +                       if (__get_user(opcode, (unsigned char __user *) instr))
15989 +                               break;
15990 +                       prefetch = (instr_lo == 0xF) &&
15991 +                               (opcode == 0x0D || opcode == 0x18);
15992 +                       break;                  
15993 +               default:
15994 +                       scan_more = 0;
15995 +                       break;
15996 +               } 
15997 +       }
15998 +       return prefetch;
15999 +}
16000 +
16001 +static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
16002 +                             unsigned long error_code)
16003 +{
16004 +       if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
16005 +                    boot_cpu_data.x86 >= 6)) {
16006 +               /* Catch an obscure case of prefetch inside an NX page. */
16007 +               if (nx_enabled && (error_code & 16))
16008 +                       return 0;
16009 +               return __is_prefetch(regs, addr);
16010 +       }
16011 +       return 0;
16012 +} 
16013 +
16014 +static noinline void force_sig_info_fault(int si_signo, int si_code,
16015 +       unsigned long address, struct task_struct *tsk)
16016 +{
16017 +       siginfo_t info;
16018 +
16019 +       info.si_signo = si_signo;
16020 +       info.si_errno = 0;
16021 +       info.si_code = si_code;
16022 +       info.si_addr = (void __user *)address;
16023 +       force_sig_info(si_signo, &info, tsk);
16024 +}
16025 +
16026 +fastcall void do_invalid_op(struct pt_regs *, unsigned long);
16027 +
16028 +#ifdef CONFIG_X86_PAE
16029 +static void dump_fault_path(unsigned long address)
16030 +{
16031 +       unsigned long *p, page;
16032 +       unsigned long mfn; 
16033 +
16034 +       page = read_cr3();
16035 +       p  = (unsigned long *)__va(page);
16036 +       p += (address >> 30) * 2;
16037 +       printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
16038 +       if (p[0] & 1) {
16039 +               mfn  = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20); 
16040 +               page = mfn_to_pfn(mfn) << PAGE_SHIFT; 
16041 +               p  = (unsigned long *)__va(page);
16042 +               address &= 0x3fffffff;
16043 +               p += (address >> 21) * 2;
16044 +               printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n", 
16045 +                      page, p[1], p[0]);
16046 +#ifndef CONFIG_HIGHPTE
16047 +               if (p[0] & 1) {
16048 +                       mfn  = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20); 
16049 +                       page = mfn_to_pfn(mfn) << PAGE_SHIFT; 
16050 +                       p  = (unsigned long *) __va(page);
16051 +                       address &= 0x001fffff;
16052 +                       p += (address >> 12) * 2;
16053 +                       printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
16054 +                              page, p[1], p[0]);
16055 +               }
16056 +#endif
16057 +       }
16058 +}
16059 +#else
16060 +static void dump_fault_path(unsigned long address)
16061 +{
16062 +       unsigned long page;
16063 +
16064 +       page = read_cr3();
16065 +       page = ((unsigned long *) __va(page))[address >> 22];
16066 +       if (oops_may_print())
16067 +               printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
16068 +                      machine_to_phys(page));
16069 +       /*
16070 +        * We must not directly access the pte in the highpte
16071 +        * case, the page table might be allocated in highmem.
16072 +        * And let's rather not kmap-atomic the pte, just in case
16073 +        * it's allocated already.
16074 +        */
16075 +#ifndef CONFIG_HIGHPTE
16076 +       if ((page & 1) && oops_may_print()) {
16077 +               page &= PAGE_MASK;
16078 +               address &= 0x003ff000;
16079 +               page = machine_to_phys(page);
16080 +               page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
16081 +               printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
16082 +                      machine_to_phys(page));
16083 +       }
16084 +#endif
16085 +}
16086 +#endif
16087 +
16088 +static int spurious_fault(struct pt_regs *regs,
16089 +                         unsigned long address,
16090 +                         unsigned long error_code)
16091 +{
16092 +       pgd_t *pgd;
16093 +       pud_t *pud;
16094 +       pmd_t *pmd;
16095 +       pte_t *pte;
16096 +
16097 +#ifdef CONFIG_XEN
16098 +       /* Faults in hypervisor area are never spurious. */
16099 +       if (address >= HYPERVISOR_VIRT_START)
16100 +               return 0;
16101 +#endif
16102 +
16103 +       /* Reserved-bit violation or user access to kernel space? */
16104 +       if (error_code & 0x0c)
16105 +               return 0;
16106 +
16107 +       pgd = init_mm.pgd + pgd_index(address);
16108 +       if (!pgd_present(*pgd))
16109 +               return 0;
16110 +
16111 +       pud = pud_offset(pgd, address);
16112 +       if (!pud_present(*pud))
16113 +               return 0;
16114 +
16115 +       pmd = pmd_offset(pud, address);
16116 +       if (!pmd_present(*pmd))
16117 +               return 0;
16118 +
16119 +       pte = pte_offset_kernel(pmd, address);
16120 +       if (!pte_present(*pte))
16121 +               return 0;
16122 +       if ((error_code & 0x02) && !pte_write(*pte))
16123 +               return 0;
16124 +#ifdef CONFIG_X86_PAE
16125 +       if ((error_code & 0x10) && (pte_val(*pte) & _PAGE_NX))
16126 +               return 0;
16127 +#endif
16128 +
16129 +       return 1;
16130 +}
16131 +
16132 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
16133 +{
16134 +       unsigned index = pgd_index(address);
16135 +       pgd_t *pgd_k;
16136 +       pud_t *pud, *pud_k;
16137 +       pmd_t *pmd, *pmd_k;
16138 +
16139 +       pgd += index;
16140 +       pgd_k = init_mm.pgd + index;
16141 +
16142 +       if (!pgd_present(*pgd_k))
16143 +               return NULL;
16144 +
16145 +       /*
16146 +        * set_pgd(pgd, *pgd_k); here would be useless on PAE
16147 +        * and redundant with the set_pmd() on non-PAE. As would
16148 +        * set_pud.
16149 +        */
16150 +
16151 +       pud = pud_offset(pgd, address);
16152 +       pud_k = pud_offset(pgd_k, address);
16153 +       if (!pud_present(*pud_k))
16154 +               return NULL;
16155 +
16156 +       pmd = pmd_offset(pud, address);
16157 +       pmd_k = pmd_offset(pud_k, address);
16158 +       if (!pmd_present(*pmd_k))
16159 +               return NULL;
16160 +       if (!pmd_present(*pmd))
16161 +#ifndef CONFIG_XEN
16162 +               set_pmd(pmd, *pmd_k);
16163 +#else
16164 +               /*
16165 +                * When running on Xen we must launder *pmd_k through
16166 +                * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
16167 +                */
16168 +               set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
16169 +#endif
16170 +       else
16171 +               BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
16172 +       return pmd_k;
16173 +}
16174 +
16175 +/*
16176 + * Handle a fault on the vmalloc or module mapping area
16177 + *
16178 + * This assumes no large pages in there.
16179 + */
16180 +static inline int vmalloc_fault(unsigned long address)
16181 +{
16182 +       unsigned long pgd_paddr;
16183 +       pmd_t *pmd_k;
16184 +       pte_t *pte_k;
16185 +       /*
16186 +        * Synchronize this task's top level page-table
16187 +        * with the 'reference' page table.
16188 +        *
16189 +        * Do _not_ use "current" here. We might be inside
16190 +        * an interrupt in the middle of a task switch..
16191 +        */
16192 +       pgd_paddr = read_cr3();
16193 +       pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
16194 +       if (!pmd_k)
16195 +               return -1;
16196 +       pte_k = pte_offset_kernel(pmd_k, address);
16197 +       if (!pte_present(*pte_k))
16198 +               return -1;
16199 +       return 0;
16200 +}
16201 +
16202 +/*
16203 + * This routine handles page faults.  It determines the address,
16204 + * and the problem, and then passes it off to one of the appropriate
16205 + * routines.
16206 + *
16207 + * error_code:
16208 + *     bit 0 == 0 means no page found, 1 means protection fault
16209 + *     bit 1 == 0 means read, 1 means write
16210 + *     bit 2 == 0 means kernel, 1 means user-mode
16211 + *     bit 3 == 1 means use of reserved bit detected
16212 + *     bit 4 == 1 means fault was an instruction fetch
16213 + */
16214 +fastcall void __kprobes do_page_fault(struct pt_regs *regs,
16215 +                                     unsigned long error_code)
16216 +{
16217 +       struct task_struct *tsk;
16218 +       struct mm_struct *mm;
16219 +       struct vm_area_struct * vma;
16220 +       unsigned long address;
16221 +       int write, si_code;
16222 +
16223 +       /* get the address */
16224 +        address = read_cr2();
16225 +
16226 +       /* Set the "privileged fault" bit to something sane. */
16227 +       error_code &= ~4;
16228 +       error_code |= (regs->xcs & 2) << 1;
16229 +       if (regs->eflags & X86_EFLAGS_VM)
16230 +               error_code |= 4;
16231 +
16232 +       tsk = current;
16233 +
16234 +       si_code = SEGV_MAPERR;
16235 +
16236 +       /*
16237 +        * We fault-in kernel-space virtual memory on-demand. The
16238 +        * 'reference' page table is init_mm.pgd.
16239 +        *
16240 +        * NOTE! We MUST NOT take any locks for this case. We may
16241 +        * be in an interrupt or a critical region, and should
16242 +        * only copy the information from the master page table,
16243 +        * nothing more.
16244 +        *
16245 +        * This verifies that the fault happens in kernel space
16246 +        * (error_code & 4) == 0, and that the fault was not a
16247 +        * protection error (error_code & 9) == 0.
16248 +        */
16249 +       if (unlikely(address >= TASK_SIZE)) {
16250 +#ifdef CONFIG_XEN
16251 +               /* Faults in hypervisor area can never be patched up. */
16252 +               if (address >= HYPERVISOR_VIRT_START)
16253 +                       goto bad_area_nosemaphore;
16254 +#endif
16255 +               if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
16256 +                       return;
16257 +               /* Can take a spurious fault if mapping changes R/O -> R/W. */
16258 +               if (spurious_fault(regs, address, error_code))
16259 +                       return;
16260 +               if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
16261 +                                               SIGSEGV) == NOTIFY_STOP)
16262 +                       return;
16263 +               /*
16264 +                * Don't take the mm semaphore here. If we fixup a prefetch
16265 +                * fault we could otherwise deadlock.
16266 +                */
16267 +               goto bad_area_nosemaphore;
16268 +       }
16269 +
16270 +       if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
16271 +                                       SIGSEGV) == NOTIFY_STOP)
16272 +               return;
16273 +
16274 +       /* It's safe to allow irq's after cr2 has been saved and the vmalloc
16275 +          fault has been handled. */
16276 +       if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
16277 +               local_irq_enable();
16278 +
16279 +       mm = tsk->mm;
16280 +
16281 +       /*
16282 +        * If we're in an interrupt, have no user context or are running in an
16283 +        * atomic region then we must not take the fault..
16284 +        */
16285 +       if (in_atomic() || !mm)
16286 +               goto bad_area_nosemaphore;
16287 +
16288 +       /* When running in the kernel we expect faults to occur only to
16289 +        * addresses in user space.  All other faults represent errors in the
16290 +        * kernel and should generate an OOPS.  Unfortunately, in the case of an
16291 +        * erroneous fault occurring in a code path which already holds mmap_sem
16292 +        * we will deadlock attempting to validate the fault against the
16293 +        * address space.  Luckily the kernel only validly references user
16294 +        * space from well defined areas of code, which are listed in the
16295 +        * exceptions table.
16296 +        *
16297 +        * As the vast majority of faults will be valid we will only perform
16298 +        * the source reference check when there is a possibility of a deadlock.
16299 +        * Attempt to lock the address space, if we cannot we then validate the
16300 +        * source.  If this is invalid we can skip the address space check,
16301 +        * thus avoiding the deadlock.
16302 +        */
16303 +       if (!down_read_trylock(&mm->mmap_sem)) {
16304 +               if ((error_code & 4) == 0 &&
16305 +                   !search_exception_tables(regs->eip))
16306 +                       goto bad_area_nosemaphore;
16307 +               down_read(&mm->mmap_sem);
16308 +       }
16309 +
16310 +       vma = find_vma(mm, address);
16311 +       if (!vma)
16312 +               goto bad_area;
16313 +       if (vma->vm_start <= address)
16314 +               goto good_area;
16315 +       if (!(vma->vm_flags & VM_GROWSDOWN))
16316 +               goto bad_area;
16317 +       if (error_code & 4) {
16318 +               /*
16319 +                * accessing the stack below %esp is always a bug.
16320 +                * The "+ 32" is there due to some instructions (like
16321 +                * pusha) doing post-decrement on the stack and that
16322 +                * doesn't show up until later..
16323 +                */
16324 +               if (address + 32 < regs->esp)
16325 +                       goto bad_area;
16326 +       }
16327 +       if (expand_stack(vma, address))
16328 +               goto bad_area;
16329 +/*
16330 + * Ok, we have a good vm_area for this memory access, so
16331 + * we can handle it..
16332 + */
16333 +good_area:
16334 +       si_code = SEGV_ACCERR;
16335 +       write = 0;
16336 +       switch (error_code & 3) {
16337 +               default:        /* 3: write, present */
16338 +#ifdef TEST_VERIFY_AREA
16339 +                       if (regs->cs == GET_KERNEL_CS())
16340 +                               printk("WP fault at %08lx\n", regs->eip);
16341 +#endif
16342 +                       /* fall through */
16343 +               case 2:         /* write, not present */
16344 +                       if (!(vma->vm_flags & VM_WRITE))
16345 +                               goto bad_area;
16346 +                       write++;
16347 +                       break;
16348 +               case 1:         /* read, present */
16349 +                       goto bad_area;
16350 +               case 0:         /* read, not present */
16351 +                       if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
16352 +                               goto bad_area;
16353 +       }
16354 +
16355 + survive:
16356 +       /*
16357 +        * If for any reason at all we couldn't handle the fault,
16358 +        * make sure we exit gracefully rather than endlessly redo
16359 +        * the fault.
16360 +        */
16361 +       switch (handle_mm_fault(mm, vma, address, write)) {
16362 +               case VM_FAULT_MINOR:
16363 +                       tsk->min_flt++;
16364 +                       break;
16365 +               case VM_FAULT_MAJOR:
16366 +                       tsk->maj_flt++;
16367 +                       break;
16368 +               case VM_FAULT_SIGBUS:
16369 +                       goto do_sigbus;
16370 +               case VM_FAULT_OOM:
16371 +                       goto out_of_memory;
16372 +               default:
16373 +                       BUG();
16374 +       }
16375 +
16376 +       /*
16377 +        * Did it hit the DOS screen memory VA from vm86 mode?
16378 +        */
16379 +       if (regs->eflags & VM_MASK) {
16380 +               unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
16381 +               if (bit < 32)
16382 +                       tsk->thread.screen_bitmap |= 1 << bit;
16383 +       }
16384 +       up_read(&mm->mmap_sem);
16385 +       return;
16386 +
16387 +/*
16388 + * Something tried to access memory that isn't in our memory map..
16389 + * Fix it, but check if it's kernel or user first..
16390 + */
16391 +bad_area:
16392 +       up_read(&mm->mmap_sem);
16393 +
16394 +bad_area_nosemaphore:
16395 +       /* User mode accesses just cause a SIGSEGV */
16396 +       if (error_code & 4) {
16397 +               /* 
16398 +                * Valid to do another page fault here because this one came 
16399 +                * from user space.
16400 +                */
16401 +               if (is_prefetch(regs, address, error_code))
16402 +                       return;
16403 +
16404 +               tsk->thread.cr2 = address;
16405 +               /* Kernel addresses are always protection faults */
16406 +               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
16407 +               tsk->thread.trap_no = 14;
16408 +               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
16409 +               return;
16410 +       }
16411 +
16412 +#ifdef CONFIG_X86_F00F_BUG
16413 +       /*
16414 +        * Pentium F0 0F C7 C8 bug workaround.
16415 +        */
16416 +       if (boot_cpu_data.f00f_bug) {
16417 +               unsigned long nr;
16418 +               
16419 +               nr = (address - idt_descr.address) >> 3;
16420 +
16421 +               if (nr == 6) {
16422 +                       do_invalid_op(regs, 0);
16423 +                       return;
16424 +               }
16425 +       }
16426 +#endif
16427 +
16428 +no_context:
16429 +       /* Are we prepared to handle this kernel fault?  */
16430 +       if (fixup_exception(regs))
16431 +               return;
16432 +
16433 +       /* 
16434 +        * Valid to do another page fault here, because if this fault
16435 +        * had been triggered by is_prefetch, fixup_exception would have 
16436 +        * handled it.
16437 +        */
16438 +       if (is_prefetch(regs, address, error_code))
16439 +               return;
16440 +
16441 +/*
16442 + * Oops. The kernel tried to access some bad page. We'll have to
16443 + * terminate things with extreme prejudice.
16444 + */
16445 +
16446 +       bust_spinlocks(1);
16447 +
16448 +       if (oops_may_print()) {
16449 +       #ifdef CONFIG_X86_PAE
16450 +               if (error_code & 16) {
16451 +                       pte_t *pte = lookup_address(address);
16452 +
16453 +                       if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
16454 +                               printk(KERN_CRIT "kernel tried to execute "
16455 +                                       "NX-protected page - exploit attempt? "
16456 +                                       "(uid: %d)\n", current->uid);
16457 +               }
16458 +       #endif
16459 +               if (address < PAGE_SIZE)
16460 +                       printk(KERN_ALERT "BUG: unable to handle kernel NULL "
16461 +                                       "pointer dereference");
16462 +               else
16463 +                       printk(KERN_ALERT "BUG: unable to handle kernel paging"
16464 +                                       " request");
16465 +               printk(" at virtual address %08lx\n",address);
16466 +               printk(KERN_ALERT " printing eip:\n");
16467 +               printk("%08lx\n", regs->eip);
16468 +               dump_fault_path(address);
16469 +       }
16470 +       tsk->thread.cr2 = address;
16471 +       tsk->thread.trap_no = 14;
16472 +       tsk->thread.error_code = error_code;
16473 +       die("Oops", regs, error_code);
16474 +       bust_spinlocks(0);
16475 +       do_exit(SIGKILL);
16476 +
16477 +/*
16478 + * We ran out of memory, or some other thing happened to us that made
16479 + * us unable to handle the page fault gracefully.
16480 + */
16481 +out_of_memory:
16482 +       up_read(&mm->mmap_sem);
16483 +       if (tsk->pid == 1) {
16484 +               yield();
16485 +               down_read(&mm->mmap_sem);
16486 +               goto survive;
16487 +       }
16488 +       printk("VM: killing process %s\n", tsk->comm);
16489 +       if (error_code & 4)
16490 +               do_exit(SIGKILL);
16491 +       goto no_context;
16492 +
16493 +do_sigbus:
16494 +       up_read(&mm->mmap_sem);
16495 +
16496 +       /* Kernel mode? Handle exceptions or die */
16497 +       if (!(error_code & 4))
16498 +               goto no_context;
16499 +
16500 +       /* User space => ok to do another page fault */
16501 +       if (is_prefetch(regs, address, error_code))
16502 +               return;
16503 +
16504 +       tsk->thread.cr2 = address;
16505 +       tsk->thread.error_code = error_code;
16506 +       tsk->thread.trap_no = 14;
16507 +       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
16508 +}
16509 +
16510 +#ifndef CONFIG_X86_PAE
16511 +void vmalloc_sync_all(void)
16512 +{
16513 +       /*
16514 +        * Note that races in the updates of insync and start aren't
16515 +        * problematic: insync can only get set bits added, and updates to
16516 +        * start are only improving performance (without affecting correctness
16517 +        * if undone).
16518 +        */
16519 +       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
16520 +       static unsigned long start = TASK_SIZE;
16521 +       unsigned long address;
16522 +
16523 +       BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
16524 +       for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
16525 +               if (!test_bit(pgd_index(address), insync)) {
16526 +                       unsigned long flags;
16527 +                       struct page *page;
16528 +
16529 +                       spin_lock_irqsave(&pgd_lock, flags);
16530 +                       for (page = pgd_list; page; page =
16531 +                                       (struct page *)page->index)
16532 +                               if (!vmalloc_sync_one(page_address(page),
16533 +                                                               address)) {
16534 +                                       BUG_ON(page != pgd_list);
16535 +                                       break;
16536 +                               }
16537 +                       spin_unlock_irqrestore(&pgd_lock, flags);
16538 +                       if (!page)
16539 +                               set_bit(pgd_index(address), insync);
16540 +               }
16541 +               if (address == start && test_bit(pgd_index(address), insync))
16542 +                       start = address + PGDIR_SIZE;
16543 +       }
16544 +}
16545 +#endif
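The do_page_fault() comment above lists the hardware error_code bits (bit 0 protection, bit 1 write, bit 2 user, bit 3 reserved bit, bit 4 instruction fetch), and spurious_fault() tests the same bits through the 0x02, 0x0c and 0x10 masks. A minimal stand-alone sketch of how those bits decode; decode_fault_error_code() is a hypothetical helper for illustration only, not part of the patch:

#include <stdio.h>

/* Decode the five error_code bits documented in do_page_fault() above. */
static void decode_fault_error_code(unsigned long error_code)
{
        printf("%s, %s, %s%s%s\n",
               (error_code & 0x01) ? "protection fault" : "page not present", /* bit 0 */
               (error_code & 0x02) ? "write access" : "read access",          /* bit 1 */
               (error_code & 0x04) ? "user mode" : "kernel mode",             /* bit 2 */
               (error_code & 0x08) ? ", reserved bit set" : "",               /* bit 3 */
               (error_code & 0x10) ? ", instruction fetch" : "");             /* bit 4 */
}

int main(void)
{
        decode_fault_error_code(0x03); /* kernel write to a present, read-only page */
        decode_fault_error_code(0x14); /* user-mode instruction fetch of a not-present page */
        return 0;
}

An error_code of 0x03, for instance, lands in the default (write, present) arm of the good_area switch above, while 0x14 takes the case 0 (read, not present) arm after the user-mode checks.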
16546 diff -urNp linux-2.6/arch/i386/mm/highmem-xen.c new/arch/i386/mm/highmem-xen.c
16547 --- linux-2.6/arch/i386/mm/highmem-xen.c        1970-01-01 01:00:00.000000000 +0100
16548 +++ new/arch/i386/mm/highmem-xen.c      2006-07-07 15:10:03.000000000 +0200
16549 @@ -0,0 +1,133 @@
16550 +#include <linux/highmem.h>
16551 +#include <linux/module.h>
16552 +
16553 +void *kmap(struct page *page)
16554 +{
16555 +       might_sleep();
16556 +       if (!PageHighMem(page))
16557 +               return page_address(page);
16558 +       return kmap_high(page);
16559 +}
16560 +
16561 +void kunmap(struct page *page)
16562 +{
16563 +       if (in_interrupt())
16564 +               BUG();
16565 +       if (!PageHighMem(page))
16566 +               return;
16567 +       kunmap_high(page);
16568 +}
16569 +
16570 +/*
16571 + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
16572 + * no global lock is needed and because the kmap code must perform a global TLB
16573 + * invalidation when the kmap pool wraps.
16574 + *
16575 + * However when holding an atomic kmap it is not legal to sleep, so atomic
16576 + * kmaps are appropriate for short, tight code paths only.
16577 + */
16578 +static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
16579 +{
16580 +       enum fixed_addresses idx;
16581 +       unsigned long vaddr;
16582 +
16583 +       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
16584 +       inc_preempt_count();
16585 +       if (!PageHighMem(page))
16586 +               return page_address(page);
16587 +
16588 +       idx = type + KM_TYPE_NR*smp_processor_id();
16589 +       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
16590 +#ifdef CONFIG_DEBUG_HIGHMEM
16591 +       if (!pte_none(*(kmap_pte-idx)))
16592 +               BUG();
16593 +#endif
16594 +       set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
16595 +
16596 +       return (void*) vaddr;
16597 +}
16598 +
16599 +void *kmap_atomic(struct page *page, enum km_type type)
16600 +{
16601 +       return __kmap_atomic(page, type, kmap_prot);
16602 +}
16603 +
16604 +/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
16605 +void *kmap_atomic_pte(struct page *page, enum km_type type)
16606 +{
16607 +       return __kmap_atomic(page, type, PAGE_KERNEL_RO);
16608 +}
16609 +
16610 +void kunmap_atomic(void *kvaddr, enum km_type type)
16611 +{
16612 +#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
16613 +       unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
16614 +       enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
16615 +
16616 +       if (vaddr < FIXADDR_START) { // FIXME
16617 +               dec_preempt_count();
16618 +               preempt_check_resched();
16619 +               return;
16620 +       }
16621 +#endif
16622 +
16623 +#if defined(CONFIG_DEBUG_HIGHMEM)
16624 +       if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
16625 +               BUG();
16626 +
16627 +       /*
16628 +        * force other mappings to Oops if they try to access
16629 +        * this pte without first remapping it
16630 +        */
16631 +       pte_clear(&init_mm, vaddr, kmap_pte-idx);
16632 +       __flush_tlb_one(vaddr);
16633 +#elif defined(CONFIG_XEN)
16634 +       /*
16635 +        * We must ensure there are no dangling pagetable references when
16636 +        * returning memory to Xen (decrease_reservation).
16637 +        * XXX TODO: We could make this faster by only zapping when
16638 +        * kmap_flush_unused is called but that is trickier and more invasive.
16639 +        */
16640 +       pte_clear(&init_mm, vaddr, kmap_pte-idx);
16641 +#endif
16642 +
16643 +       dec_preempt_count();
16644 +       preempt_check_resched();
16645 +}
16646 +
16647 +/* This is the same as kmap_atomic() but can map memory that doesn't
16648 + * have a struct page associated with it.
16649 + */
16650 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
16651 +{
16652 +       enum fixed_addresses idx;
16653 +       unsigned long vaddr;
16654 +
16655 +       inc_preempt_count();
16656 +
16657 +       idx = type + KM_TYPE_NR*smp_processor_id();
16658 +       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
16659 +       set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
16660 +       __flush_tlb_one(vaddr);
16661 +
16662 +       return (void*) vaddr;
16663 +}
16664 +
16665 +struct page *kmap_atomic_to_page(void *ptr)
16666 +{
16667 +       unsigned long idx, vaddr = (unsigned long)ptr;
16668 +       pte_t *pte;
16669 +
16670 +       if (vaddr < FIXADDR_START)
16671 +               return virt_to_page(ptr);
16672 +
16673 +       idx = virt_to_fix(vaddr);
16674 +       pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
16675 +       return pte_page(*pte);
16676 +}
16677 +
16678 +EXPORT_SYMBOL(kmap);
16679 +EXPORT_SYMBOL(kunmap);
16680 +EXPORT_SYMBOL(kmap_atomic);
16681 +EXPORT_SYMBOL(kunmap_atomic);
16682 +EXPORT_SYMBOL(kmap_atomic_to_page);
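The header comment above stresses that an atomic kmap may only be held across short, tight code paths and never across anything that sleeps. A minimal sketch of the usual calling pattern; copy_to_highpage() is a hypothetical caller, while kmap_atomic(), kunmap_atomic() and the KM_USER0 slot are the 2.6-era interfaces this file implements:

#include <linux/highmem.h>
#include <linux/string.h>

/* Copy a small buffer into a (possibly highmem) page without sleeping. */
static void copy_to_highpage(struct page *page, const void *src, size_t len)
{
        void *vaddr = kmap_atomic(page, KM_USER0); /* grabs a per-CPU fixmap slot, bumps preempt count */

        memcpy(vaddr, src, len);                   /* must not sleep while the mapping is held */
        kunmap_atomic(vaddr, KM_USER0);            /* releases the slot, drops preempt count */
}

Note that on Xen the kunmap_atomic() above clears the fixmap pte even without CONFIG_DEBUG_HIGHMEM, so no dangling page-table reference is left behind when the page is later handed back to the hypervisor via decrease_reservation.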
16683 diff -urNp linux-2.6/arch/i386/mm/hypervisor.c new/arch/i386/mm/hypervisor.c
16684 --- linux-2.6/arch/i386/mm/hypervisor.c 1970-01-01 01:00:00.000000000 +0100
16685 +++ new/arch/i386/mm/hypervisor.c       2006-07-07 15:10:03.000000000 +0200
16686 @@ -0,0 +1,453 @@
16687 +/******************************************************************************
16688 + * mm/hypervisor.c
16689 + * 
16690 + * Update page tables via the hypervisor.
16691 + * 
16692 + * Copyright (c) 2002-2004, K A Fraser
16693 + * 
16694 + * This program is free software; you can redistribute it and/or
16695 + * modify it under the terms of the GNU General Public License version 2
16696 + * as published by the Free Software Foundation; or, when distributed
16697 + * separately from the Linux kernel or incorporated into other
16698 + * software packages, subject to the following license:
16699 + * 
16700 + * Permission is hereby granted, free of charge, to any person obtaining a copy
16701 + * of this source file (the "Software"), to deal in the Software without
16702 + * restriction, including without limitation the rights to use, copy, modify,
16703 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
16704 + * and to permit persons to whom the Software is furnished to do so, subject to
16705 + * the following conditions:
16706 + * 
16707 + * The above copyright notice and this permission notice shall be included in
16708 + * all copies or substantial portions of the Software.
16709 + * 
16710 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16711 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16712 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16713 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16714 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
16715 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
16716 + * IN THE SOFTWARE.
16717 + */
16718 +
16719 +#include <linux/config.h>
16720 +#include <linux/sched.h>
16721 +#include <linux/mm.h>
16722 +#include <linux/vmalloc.h>
16723 +#include <asm/page.h>
16724 +#include <asm/pgtable.h>
16725 +#include <asm/hypervisor.h>
16726 +#include <xen/balloon.h>
16727 +#include <xen/features.h>
16728 +#include <xen/interface/memory.h>
16729 +#include <linux/module.h>
16730 +#include <linux/percpu.h>
16731 +#include <asm/tlbflush.h>
16732 +
16733 +#ifdef CONFIG_X86_64
16734 +#define pmd_val_ma(v) (v).pmd
16735 +#else
16736 +#ifdef CONFIG_X86_PAE
16737 +# define pmd_val_ma(v) ((v).pmd)
16738 +# define pud_val_ma(v) ((v).pgd.pgd)
16739 +#else
16740 +# define pmd_val_ma(v) ((v).pud.pgd.pgd)
16741 +#endif
16742 +#endif
16743 +
16744 +void xen_l1_entry_update(pte_t *ptr, pte_t val)
16745 +{
16746 +       mmu_update_t u;
16747 +       u.ptr = virt_to_machine(ptr);
16748 +       u.val = pte_val_ma(val);
16749 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16750 +}
16751 +
16752 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
16753 +{
16754 +       mmu_update_t u;
16755 +       u.ptr = virt_to_machine(ptr);
16756 +       u.val = pmd_val_ma(val);
16757 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16758 +}
16759 +
16760 +#ifdef CONFIG_X86_PAE
16761 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
16762 +{
16763 +       mmu_update_t u;
16764 +       u.ptr = virt_to_machine(ptr);
16765 +       u.val = pud_val_ma(val);
16766 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16767 +}
16768 +#endif
16769 +
16770 +#ifdef CONFIG_X86_64
16771 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
16772 +{
16773 +       mmu_update_t u;
16774 +       u.ptr = virt_to_machine(ptr);
16775 +       u.val = val.pud;
16776 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16777 +}
16778 +
16779 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
16780 +{
16781 +       mmu_update_t u;
16782 +       u.ptr = virt_to_machine(ptr);
16783 +       u.val = val.pgd;
16784 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16785 +}
16786 +#endif /* CONFIG_X86_64 */
16787 +
16788 +void xen_machphys_update(unsigned long mfn, unsigned long pfn)
16789 +{
16790 +       mmu_update_t u;
16791 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
16792 +               BUG_ON(pfn != mfn);
16793 +               return;
16794 +       }
16795 +       u.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
16796 +       u.val = pfn;
16797 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16798 +}
16799 +
16800 +void xen_pt_switch(unsigned long ptr)
16801 +{
16802 +       struct mmuext_op op;
16803 +       op.cmd = MMUEXT_NEW_BASEPTR;
16804 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
16805 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16806 +}
16807 +
16808 +void xen_new_user_pt(unsigned long ptr)
16809 +{
16810 +       struct mmuext_op op;
16811 +       op.cmd = MMUEXT_NEW_USER_BASEPTR;
16812 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
16813 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16814 +}
16815 +
16816 +void xen_tlb_flush(void)
16817 +{
16818 +       struct mmuext_op op;
16819 +       op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
16820 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16821 +}
16822 +EXPORT_SYMBOL(xen_tlb_flush);
16823 +
16824 +void xen_invlpg(unsigned long ptr)
16825 +{
16826 +       struct mmuext_op op;
16827 +       op.cmd = MMUEXT_INVLPG_LOCAL;
16828 +       op.arg1.linear_addr = ptr & PAGE_MASK;
16829 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16830 +}
16831 +EXPORT_SYMBOL(xen_invlpg);
16832 +
16833 +#ifdef CONFIG_SMP
16834 +
16835 +void xen_tlb_flush_all(void)
16836 +{
16837 +       struct mmuext_op op;
16838 +       op.cmd = MMUEXT_TLB_FLUSH_ALL;
16839 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16840 +}
16841 +
16842 +void xen_tlb_flush_mask(cpumask_t *mask)
16843 +{
16844 +       struct mmuext_op op;
16845 +       if ( cpus_empty(*mask) )
16846 +               return;
16847 +       op.cmd = MMUEXT_TLB_FLUSH_MULTI;
16848 +       op.arg2.vcpumask = mask->bits;
16849 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16850 +}
16851 +
16852 +void xen_invlpg_all(unsigned long ptr)
16853 +{
16854 +       struct mmuext_op op;
16855 +       op.cmd = MMUEXT_INVLPG_ALL;
16856 +       op.arg1.linear_addr = ptr & PAGE_MASK;
16857 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16858 +}
16859 +
16860 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
16861 +{
16862 +       struct mmuext_op op;
16863 +       if ( cpus_empty(*mask) )
16864 +               return;
16865 +       op.cmd = MMUEXT_INVLPG_MULTI;
16866 +       op.arg1.linear_addr = ptr & PAGE_MASK;
16867 +       op.arg2.vcpumask    = mask->bits;
16868 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16869 +}
16870 +
16871 +#endif /* CONFIG_SMP */
16872 +
16873 +void xen_pgd_pin(unsigned long ptr)
16874 +{
16875 +       struct mmuext_op op;
16876 +#ifdef CONFIG_X86_64
16877 +       op.cmd = MMUEXT_PIN_L4_TABLE;
16878 +#elif defined(CONFIG_X86_PAE)
16879 +       op.cmd = MMUEXT_PIN_L3_TABLE;
16880 +#else
16881 +       op.cmd = MMUEXT_PIN_L2_TABLE;
16882 +#endif
16883 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
16884 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16885 +}
16886 +
16887 +void xen_pgd_unpin(unsigned long ptr)
16888 +{
16889 +       struct mmuext_op op;
16890 +       op.cmd = MMUEXT_UNPIN_TABLE;
16891 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
16892 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16893 +}
16894 +
16895 +void xen_set_ldt(unsigned long ptr, unsigned long len)
16896 +{
16897 +       struct mmuext_op op;
16898 +       op.cmd = MMUEXT_SET_LDT;
16899 +       op.arg1.linear_addr = ptr;
16900 +       op.arg2.nr_ents     = len;
16901 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16902 +}
16903 +
16904 +/*
16905 + * Bitmap is indexed by page number. If bit is set, the page is part of a
16906 + * xen_create_contiguous_region() area of memory.
16907 + */
16908 +unsigned long *contiguous_bitmap;
16909 +
16910 +static void contiguous_bitmap_set(
16911 +       unsigned long first_page, unsigned long nr_pages)
16912 +{
16913 +       unsigned long start_off, end_off, curr_idx, end_idx;
16914 +
16915 +       curr_idx  = first_page / BITS_PER_LONG;
16916 +       start_off = first_page & (BITS_PER_LONG-1);
16917 +       end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
16918 +       end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);
16919 +
16920 +       if (curr_idx == end_idx) {
16921 +               contiguous_bitmap[curr_idx] |=
16922 +                       ((1UL<<end_off)-1) & -(1UL<<start_off);
16923 +       } else {
16924 +               contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
16925 +               while ( ++curr_idx < end_idx )
16926 +                       contiguous_bitmap[curr_idx] = ~0UL;
16927 +               contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
16928 +       }
16929 +}
16930 +
16931 +static void contiguous_bitmap_clear(
16932 +       unsigned long first_page, unsigned long nr_pages)
16933 +{
16934 +       unsigned long start_off, end_off, curr_idx, end_idx;
16935 +
16936 +       curr_idx  = first_page / BITS_PER_LONG;
16937 +       start_off = first_page & (BITS_PER_LONG-1);
16938 +       end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
16939 +       end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);
16940 +
16941 +       if (curr_idx == end_idx) {
16942 +               contiguous_bitmap[curr_idx] &=
16943 +                       -(1UL<<end_off) | ((1UL<<start_off)-1);
16944 +       } else {
16945 +               contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
16946 +               while ( ++curr_idx != end_idx )
16947 +                       contiguous_bitmap[curr_idx] = 0;
16948 +               contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
16949 +       }
16950 +}
16951 +
16952 +/* Protected by balloon_lock. */
16953 +#define MAX_CONTIG_ORDER 9 /* 2MB */
16954 +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
16955 +
16956 +/* Ensure multi-page extents are contiguous in machine memory. */
16957 +int xen_create_contiguous_region(
16958 +       unsigned long vstart, unsigned int order, unsigned int address_bits)
16959 +{
16960 +       unsigned long *in_frames = discontig_frames, out_frame;
16961 +       unsigned long  frame, i, flags;
16962 +       long           rc;
16963 +       int            success;
16964 +       struct xen_memory_exchange exchange = {
16965 +               .in = {
16966 +                       .nr_extents   = 1UL << order,
16967 +                       .extent_order = 0,
16968 +                       .domid        = DOMID_SELF
16969 +               },
16970 +               .out = {
16971 +                       .nr_extents   = 1,
16972 +                       .extent_order = order,
16973 +                       .address_bits = address_bits,
16974 +                       .domid        = DOMID_SELF
16975 +               }
16976 +       };
16977 +
16978 +       /*
16979 +        * Currently an auto-translated guest will not perform I/O, nor will
16980 +        * it require PAE page directories below 4GB. Therefore any calls to
16981 +        * this function are redundant and can be ignored.
16982 +        */
16983 +       if (xen_feature(XENFEAT_auto_translated_physmap))
16984 +               return 0;
16985 +
16986 +       if (unlikely(order > MAX_CONTIG_ORDER))
16987 +               return -ENOMEM;
16988 +
16989 +       set_xen_guest_handle(exchange.in.extent_start, in_frames);
16990 +       set_xen_guest_handle(exchange.out.extent_start, &out_frame);
16991 +
16992 +       scrub_pages(vstart, 1 << order);
16993 +
16994 +       balloon_lock(flags);
16995 +
16996 +       /* 1. Zap current PTEs, remembering MFNs. */
16997 +       for (i = 0; i < (1UL<<order); i++) {
16998 +               in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
16999 +               if (HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE),
17000 +                                                __pte_ma(0), 0))
17001 +                       BUG();
17002 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
17003 +                       INVALID_P2M_ENTRY);
17004 +       }
17005 +
17006 +       /* 2. Get a new contiguous memory extent. */
17007 +       out_frame = __pa(vstart) >> PAGE_SHIFT;
17008 +       rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
17009 +       success = (exchange.nr_exchanged == (1UL << order));
17010 +       BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
17011 +       BUG_ON(success && (rc != 0));
17012 +       if (unlikely(rc == -ENOSYS)) {
17013 +               /* Compatibility when XENMEM_exchange is unsupported. */
17014 +               if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
17015 +                                        &exchange.in) != (1UL << order))
17016 +                       BUG();
17017 +               success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
17018 +                                               &exchange.out) == 1);
17019 +               if (!success) {
17020 +                       /* Couldn't get special memory: fall back to normal. */
17021 +                       for (i = 0; i < (1UL<<order); i++)
17022 +                               in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
17023 +                       if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
17024 +                                                &exchange.in) != (1UL<<order))
17025 +                               BUG();
17026 +               }
17027 +       }
17028 +
17029 +       /* 3. Map the new extent in place of old pages. */
17030 +       for (i = 0; i < (1UL<<order); i++) {
17031 +               frame = success ? (out_frame + i) : in_frames[i];
17032 +               if (HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE),
17033 +                                                pfn_pte_ma(frame,
17034 +                                                           PAGE_KERNEL),
17035 +                                                0))
17036 +                       BUG();
17037 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
17038 +       }
17039 +
17040 +       flush_tlb_all();
17041 +
17042 +       if (success)
17043 +               contiguous_bitmap_set(__pa(vstart) >> PAGE_SHIFT,
17044 +                                     1UL << order);
17045 +
17046 +       balloon_unlock(flags);
17047 +
17048 +       return success ? 0 : -ENOMEM;
17049 +}
17050 +
17051 +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
17052 +{
17053 +       unsigned long *out_frames = discontig_frames, in_frame;
17054 +       unsigned long  frame, i, flags;
17055 +       long           rc;
17056 +       int            success;
17057 +       struct xen_memory_exchange exchange = {
17058 +               .in = {
17059 +                       .nr_extents   = 1,
17060 +                       .extent_order = order,
17061 +                       .domid        = DOMID_SELF
17062 +               },
17063 +               .out = {
17064 +                       .nr_extents   = 1UL << order,
17065 +                       .extent_order = 0,
17066 +                       .domid        = DOMID_SELF
17067 +               }
17068 +       };
17069 +
17070 +       if (xen_feature(XENFEAT_auto_translated_physmap) ||
17071 +           !test_bit(__pa(vstart) >> PAGE_SHIFT, contiguous_bitmap))
17072 +               return;
17073 +
17074 +       if (unlikely(order > MAX_CONTIG_ORDER))
17075 +               return;
17076 +
17077 +       set_xen_guest_handle(exchange.in.extent_start, &in_frame);
17078 +       set_xen_guest_handle(exchange.out.extent_start, out_frames);
17079 +
17080 +       scrub_pages(vstart, 1 << order);
17081 +
17082 +       balloon_lock(flags);
17083 +
17084 +       contiguous_bitmap_clear(__pa(vstart) >> PAGE_SHIFT, 1UL << order);
17085 +
17086 +       /* 1. Find start MFN of contiguous extent. */
17087 +       in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);
17088 +
17089 +       /* 2. Zap current PTEs. */
17090 +       for (i = 0; i < (1UL<<order); i++) {
17091 +               if (HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE),
17092 +                                                __pte_ma(0), 0))
17093 +                       BUG();
17094 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
17095 +                       INVALID_P2M_ENTRY);
17096 +               out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
17097 +       }
17098 +
17099 +       /* 3. Do the exchange for non-contiguous MFNs. */
17100 +       rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
17101 +       success = (exchange.nr_exchanged == 1);
17102 +       BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
17103 +       BUG_ON(success && (rc != 0));
17104 +       if (unlikely(rc == -ENOSYS)) {
17105 +               /* Compatibility when XENMEM_exchange is unsupported. */
17106 +               if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
17107 +                                        &exchange.in) != 1)
17108 +                       BUG();
17109 +               if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
17110 +                                        &exchange.out) != (1UL << order))
17111 +                       BUG();
17112 +               success = 1;
17113 +       }
17114 +
17115 +       /* 4. Map new pages in place of old pages. */
17116 +       for (i = 0; i < (1UL<<order); i++) {
17117 +               frame = success ? out_frames[i] : (in_frame + i);
17118 +               if (HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE),
17119 +                                                pfn_pte_ma(frame,
17120 +                                                           PAGE_KERNEL),
17121 +                                                0))
17122 +                       BUG();
17123 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
17124 +       }
17125 +
17126 +       flush_tlb_all();
17127 +
17128 +       balloon_unlock(flags);
17129 +}
17130 +
17131 +#ifdef __i386__
17132 +int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
17133 +{
17134 +       __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
17135 +       maddr_t mach_lp = arbitrary_virt_to_machine(lp);
17136 +       return HYPERVISOR_update_descriptor(
17137 +               mach_lp, (u64)entry_a | ((u64)entry_b<<32));
17138 +}
17139 +#endif
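xen_create_contiguous_region() above exchanges the frames backing 2^order pseudo-physical pages for a single machine-contiguous extent that fits below the given address_bits limit, and xen_destroy_contiguous_region() reverses the exchange. A rough sketch of how a caller might use the pair for a 32-bit addressable buffer; alloc_machine_contiguous() and free_machine_contiguous() are hypothetical wrappers, not part of the patch, and the declarations are assumed to come from asm/hypervisor.h:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <asm/hypervisor.h>

/* Allocate 2^order pages and make them machine-contiguous below 4GB. */
static void *alloc_machine_contiguous(unsigned int order)
{
        unsigned long vstart = __get_free_pages(GFP_KERNEL, order);

        if (!vstart)
                return NULL;
        if (xen_create_contiguous_region(vstart, order, 32)) {
                free_pages(vstart, order);
                return NULL;
        }
        return (void *)vstart;
}

static void free_machine_contiguous(void *vaddr, unsigned int order)
{
        xen_destroy_contiguous_region((unsigned long)vaddr, order);
        free_pages((unsigned long)vaddr, order);
}

The order must stay at or below MAX_CONTIG_ORDER (9, i.e. 2MB), since the zapped frames are staged in the static discontig_frames[] array under balloon_lock.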
17140 diff -urNp linux-2.6/arch/i386/mm/init-xen.c new/arch/i386/mm/init-xen.c
17141 --- linux-2.6/arch/i386/mm/init-xen.c   1970-01-01 01:00:00.000000000 +0100
17142 +++ new/arch/i386/mm/init-xen.c 2006-06-28 14:32:13.000000000 +0200
17143 @@ -0,0 +1,850 @@
17144 +/*
17145 + *  linux/arch/i386/mm/init.c
17146 + *
17147 + *  Copyright (C) 1995  Linus Torvalds
17148 + *
17149 + *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
17150 + */
17151 +
17152 +#include <linux/config.h>
17153 +#include <linux/module.h>
17154 +#include <linux/signal.h>
17155 +#include <linux/sched.h>
17156 +#include <linux/kernel.h>
17157 +#include <linux/errno.h>
17158 +#include <linux/string.h>
17159 +#include <linux/types.h>
17160 +#include <linux/ptrace.h>
17161 +#include <linux/mman.h>
17162 +#include <linux/mm.h>
17163 +#include <linux/hugetlb.h>
17164 +#include <linux/swap.h>
17165 +#include <linux/smp.h>
17166 +#include <linux/init.h>
17167 +#include <linux/highmem.h>
17168 +#include <linux/pagemap.h>
17169 +#include <linux/bootmem.h>
17170 +#include <linux/slab.h>
17171 +#include <linux/proc_fs.h>
17172 +#include <linux/efi.h>
17173 +#include <linux/memory_hotplug.h>
17174 +#include <linux/initrd.h>
17175 +#include <linux/dma-mapping.h>
17176 +#include <linux/scatterlist.h>
17177 +
17178 +#include <asm/processor.h>
17179 +#include <asm/system.h>
17180 +#include <asm/uaccess.h>
17181 +#include <asm/pgtable.h>
17182 +#include <asm/dma.h>
17183 +#include <asm/fixmap.h>
17184 +#include <asm/e820.h>
17185 +#include <asm/apic.h>
17186 +#include <asm/tlb.h>
17187 +#include <asm/tlbflush.h>
17188 +#include <asm/sections.h>
17189 +#include <asm/hypervisor.h>
17190 +#include <asm/swiotlb.h>
17191 +
17192 +extern unsigned long *contiguous_bitmap;
17193 +
17194 +unsigned int __VMALLOC_RESERVE = 128 << 20;
17195 +
17196 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
17197 +unsigned long highstart_pfn, highend_pfn;
17198 +
17199 +static int noinline do_test_wp_bit(void);
17200 +
17201 +/*
17202 + * Creates a middle page table and puts a pointer to it in the
17203 + * given global directory entry. This only returns the gd entry
17204 + * in non-PAE compilation mode, since the middle layer is folded.
17205 + */
17206 +static pmd_t * __init one_md_table_init(pgd_t *pgd)
17207 +{
17208 +       pud_t *pud;
17209 +       pmd_t *pmd_table;
17210 +               
17211 +#ifdef CONFIG_X86_PAE
17212 +       pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
17213 +       make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
17214 +       set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
17215 +       pud = pud_offset(pgd, 0);
17216 +       if (pmd_table != pmd_offset(pud, 0)) 
17217 +               BUG();
17218 +#else
17219 +       pud = pud_offset(pgd, 0);
17220 +       pmd_table = pmd_offset(pud, 0);
17221 +#endif
17222 +
17223 +       return pmd_table;
17224 +}
17225 +
17226 +/*
17227 + * Create a page table and place a pointer to it in a middle page
17228 + * directory entry.
17229 + */
17230 +static pte_t * __init one_page_table_init(pmd_t *pmd)
17231 +{
17232 +       if (pmd_none(*pmd)) {
17233 +               pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
17234 +               make_lowmem_page_readonly(page_table,
17235 +                                         XENFEAT_writable_page_tables);
17236 +               set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
17237 +               if (page_table != pte_offset_kernel(pmd, 0))
17238 +                       BUG();  
17239 +
17240 +               return page_table;
17241 +       }
17242 +       
17243 +       return pte_offset_kernel(pmd, 0);
17244 +}
17245 +
17246 +/*
17247 + * This function initializes a certain range of kernel virtual memory 
17248 + * with new bootmem page tables, everywhere page tables are missing in
17249 + * the given range.
17250 + */
17251 +
17252 +/*
17253 + * NOTE: The pagetables are allocated contiguously in physical memory,
17254 + * so we can cache the location of the first one and move around without 
17255 + * checking the pgd every time.
17256 + */
17257 +static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
17258 +{
17259 +       pgd_t *pgd;
17260 +       pud_t *pud;
17261 +       pmd_t *pmd;
17262 +       int pgd_idx, pmd_idx;
17263 +       unsigned long vaddr;
17264 +
17265 +       vaddr = start;
17266 +       pgd_idx = pgd_index(vaddr);
17267 +       pmd_idx = pmd_index(vaddr);
17268 +       pgd = pgd_base + pgd_idx;
17269 +
17270 +       for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
17271 +               if (pgd_none(*pgd)) 
17272 +                       one_md_table_init(pgd);
17273 +               pud = pud_offset(pgd, vaddr);
17274 +               pmd = pmd_offset(pud, vaddr);
17275 +               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
17276 +                       if (vaddr < HYPERVISOR_VIRT_START && pmd_none(*pmd)) 
17277 +                               one_page_table_init(pmd);
17278 +
17279 +                       vaddr += PMD_SIZE;
17280 +               }
17281 +               pmd_idx = 0;
17282 +       }
17283 +}
17284 +
17285 +static inline int is_kernel_text(unsigned long addr)
17286 +{
17287 +       if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
17288 +               return 1;
17289 +       return 0;
17290 +}
17291 +
17292 +/*
17293 + * This maps the physical memory to kernel virtual address space, a total 
17294 + * of max_low_pfn pages, by creating page tables starting from address 
17295 + * PAGE_OFFSET.
17296 + */
17297 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
17298 +{
17299 +       unsigned long pfn;
17300 +       pgd_t *pgd;
17301 +       pmd_t *pmd;
17302 +       pte_t *pte;
17303 +       int pgd_idx, pmd_idx, pte_ofs;
17304 +
17305 +       unsigned long max_ram_pfn = xen_start_info->nr_pages;
17306 +       if (max_ram_pfn > max_low_pfn)
17307 +               max_ram_pfn = max_low_pfn;
17308 +
17309 +       pgd_idx = pgd_index(PAGE_OFFSET);
17310 +       pgd = pgd_base + pgd_idx;
17311 +       pfn = 0;
17312 +       pmd_idx = pmd_index(PAGE_OFFSET);
17313 +       pte_ofs = pte_index(PAGE_OFFSET);
17314 +
17315 +       for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
17316 +#ifdef CONFIG_XEN
17317 +               /*
17318 +                * Native Linux does not have PAE paging enabled yet at this
17319 +                * point.  When running as a Xen domain we are already in PAE
17320 +                * mode, so we cannot simply hook in an empty
17321 +                * pmd.  That would kill the mappings we are currently
17322 +                * using ...
17323 +                */
17324 +               pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
17325 +#else
17326 +               pmd = one_md_table_init(pgd);
17327 +#endif
17328 +               if (pfn >= max_low_pfn)
17329 +                       continue;
17330 +               pmd += pmd_idx;
17331 +               for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
17332 +                       unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
17333 +                       if (address >= HYPERVISOR_VIRT_START)
17334 +                               continue;
17335 +
17336 +                       /* Map with big pages if possible, otherwise create normal page tables. */
17337 +                       if (cpu_has_pse) {
17338 +                               unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
17339 +
17340 +                               if (is_kernel_text(address) || is_kernel_text(address2))
17341 +                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
17342 +                               else
17343 +                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
17344 +                               pfn += PTRS_PER_PTE;
17345 +                       } else {
17346 +                               pte = one_page_table_init(pmd);
17347 +
17348 +                               pte += pte_ofs;
17349 +                               for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
17350 +                                               /* XEN: Only map initial RAM allocation. */
17351 +                                               if ((pfn >= max_ram_pfn) || pte_present(*pte))
17352 +                                                       continue;
17353 +                                               if (is_kernel_text(address))
17354 +                                                       set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
17355 +                                               else
17356 +                                                       set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
17357 +                               }
17358 +                               pte_ofs = 0;
17359 +                       }
17360 +               }
17361 +               pmd_idx = 0;
17362 +       }
17363 +}
17364 +
17365 +#ifndef CONFIG_XEN
17366 +
17367 +static inline int page_kills_ppro(unsigned long pagenr)
17368 +{
17369 +       if (pagenr >= 0x70000 && pagenr <= 0x7003F)
17370 +               return 1;
17371 +       return 0;
17372 +}
17373 +
17374 +#else
17375 +
17376 +#define page_kills_ppro(p)     0
17377 +
17378 +#endif
17379 +
17380 +extern int is_available_memory(efi_memory_desc_t *);
17381 +
17382 +int page_is_ram(unsigned long pagenr)
17383 +{
17384 +       int i;
17385 +       unsigned long addr, end;
17386 +
17387 +       if (efi_enabled) {
17388 +               efi_memory_desc_t *md;
17389 +               void *p;
17390 +
17391 +               for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
17392 +                       md = p;
17393 +                       if (!is_available_memory(md))
17394 +                               continue;
17395 +                       addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
17396 +                       end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
17397 +
17398 +                       if ((pagenr >= addr) && (pagenr < end))
17399 +                               return 1;
17400 +               }
17401 +               return 0;
17402 +       }
17403 +
17404 +       for (i = 0; i < e820.nr_map; i++) {
17405 +
17406 +               if (e820.map[i].type != E820_RAM)       /* not usable memory */
17407 +                       continue;
17408 +               /*
17409 +                *      !!!FIXME!!! Some BIOSen report areas as RAM that
17410 +                *      are not. Notably the 640->1Mb area. We need a sanity
17411 +                *      check here.
17412 +                */
17413 +               addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
17414 +               end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
17415 +               if  ((pagenr >= addr) && (pagenr < end))
17416 +                       return 1;
17417 +       }
17418 +       return 0;
17419 +}
17420 +
17421 +#ifdef CONFIG_HIGHMEM
17422 +pte_t *kmap_pte;
17423 +pgprot_t kmap_prot;
17424 +
17425 +#define kmap_get_fixmap_pte(vaddr)                                     \
17426 +       pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
17427 +
17428 +static void __init kmap_init(void)
17429 +{
17430 +       unsigned long kmap_vstart;
17431 +
17432 +       /* cache the first kmap pte */
17433 +       kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
17434 +       kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
17435 +
17436 +       kmap_prot = PAGE_KERNEL;
17437 +}
17438 +
17439 +static void __init permanent_kmaps_init(pgd_t *pgd_base)
17440 +{
17441 +       pgd_t *pgd;
17442 +       pud_t *pud;
17443 +       pmd_t *pmd;
17444 +       pte_t *pte;
17445 +       unsigned long vaddr;
17446 +
17447 +       vaddr = PKMAP_BASE;
17448 +       page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
17449 +
17450 +       pgd = swapper_pg_dir + pgd_index(vaddr);
17451 +       pud = pud_offset(pgd, vaddr);
17452 +       pmd = pmd_offset(pud, vaddr);
17453 +       pte = pte_offset_kernel(pmd, vaddr);
17454 +       pkmap_page_table = pte; 
17455 +}
17456 +
17457 +static void __meminit free_new_highpage(struct page *page, int pfn)
17458 +{
17459 +       init_page_count(page);
17460 +       if (pfn < xen_start_info->nr_pages)
17461 +               __free_page(page);
17462 +       totalhigh_pages++;
17463 +}
17464 +
17465 +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
17466 +{
17467 +       if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
17468 +               ClearPageReserved(page);
17469 +               free_new_highpage(page, pfn);
17470 +       } else
17471 +               SetPageReserved(page);
17472 +}
17473 +
17474 +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
17475 +{
17476 +       free_new_highpage(page, pfn);
17477 +       totalram_pages++;
17478 +#ifdef CONFIG_FLATMEM
17479 +       max_mapnr = max(pfn, max_mapnr);
17480 +#endif
17481 +       num_physpages++;
17482 +       return 0;
17483 +}
17484 +
17485 +/*
17486 + * Not currently handling the NUMA case.
17487 + * Assuming a single node and that all memory
17488 + * added dynamically and onlined here is in
17489 + * HIGHMEM.
17490 + */
17491 +void online_page(struct page *page)
17492 +{
17493 +       ClearPageReserved(page);
17494 +       add_one_highpage_hotplug(page, page_to_pfn(page));
17495 +}
17496 +
17497 +
17498 +#ifdef CONFIG_NUMA
17499 +extern void set_highmem_pages_init(int);
17500 +#else
17501 +static void __init set_highmem_pages_init(int bad_ppro)
17502 +{
17503 +       int pfn;
17504 +       for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
17505 +               add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
17506 +       totalram_pages += totalhigh_pages;
17507 +}
17508 +#endif /* CONFIG_NUMA */
17509 +
17510 +#else
17511 +#define kmap_init() do { } while (0)
17512 +#define permanent_kmaps_init(pgd_base) do { } while (0)
17513 +#define set_highmem_pages_init(bad_ppro) do { } while (0)
17514 +#endif /* CONFIG_HIGHMEM */
17515 +
17516 +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
17517 +EXPORT_SYMBOL(__PAGE_KERNEL);
17518 +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
17519 +
17520 +#ifdef CONFIG_NUMA
17521 +extern void __init remap_numa_kva(void);
17522 +#else
17523 +#define remap_numa_kva() do {} while (0)
17524 +#endif
17525 +
17526 +pgd_t *swapper_pg_dir;
17527 +
17528 +static void __init pagetable_init (void)
17529 +{
17530 +       unsigned long vaddr;
17531 +       pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
17532 +
17533 +       swapper_pg_dir = pgd_base;
17534 +       init_mm.pgd    = pgd_base;
17535 +
17536 +       /* Enable PSE if available */
17537 +       if (cpu_has_pse) {
17538 +               set_in_cr4(X86_CR4_PSE);
17539 +       }
17540 +
17541 +       /* Enable PGE if available */
17542 +       if (cpu_has_pge) {
17543 +               set_in_cr4(X86_CR4_PGE);
17544 +               __PAGE_KERNEL |= _PAGE_GLOBAL;
17545 +               __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
17546 +       }
17547 +
17548 +       kernel_physical_mapping_init(pgd_base);
17549 +       remap_numa_kva();
17550 +
17551 +       /*
17552 +        * Fixed mappings, only the page table structure has to be
17553 +        * created - mappings will be set by set_fixmap():
17554 +        */
17555 +       vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
17556 +       page_table_range_init(vaddr, 0, pgd_base);
17557 +
17558 +       permanent_kmaps_init(pgd_base);
17559 +}
17560 +
17561 +#ifdef CONFIG_SOFTWARE_SUSPEND
17562 +/*
17563 + * Swap suspend & friends need this for resume because things like the intel-agp
17564 + * driver might have split up a kernel 4MB mapping.
17565 + */
17566 +char __nosavedata swsusp_pg_dir[PAGE_SIZE]
17567 +       __attribute__ ((aligned (PAGE_SIZE)));
17568 +
17569 +static inline void save_pg_dir(void)
17570 +{
17571 +       memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
17572 +}
17573 +#else
17574 +static inline void save_pg_dir(void)
17575 +{
17576 +}
17577 +#endif
17578 +
17579 +void zap_low_mappings (void)
17580 +{
17581 +       int i;
17582 +
17583 +       save_pg_dir();
17584 +
17585 +       /*
17586 +        * Zap initial low-memory mappings.
17587 +        *
17588 +        * Note that "pgd_clear()" doesn't do it for
17589 +        * us, because pgd_clear() is a no-op on i386.
17590 +        */
17591 +       for (i = 0; i < USER_PTRS_PER_PGD; i++)
17592 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
17593 +               set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
17594 +#else
17595 +               set_pgd(swapper_pg_dir+i, __pgd(0));
17596 +#endif
17597 +       flush_tlb_all();
17598 +}
17599 +
17600 +static int disable_nx __initdata = 0;
17601 +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
17602 +EXPORT_SYMBOL(__supported_pte_mask);
17603 +
17604 +/*
17605 + * noexec = on|off
17606 + *
17607 + * Control non executable mappings.
17608 + *
17609 + * on      Enable
17610 + * off     Disable
17611 + */
17612 +void __init noexec_setup(const char *str)
17613 +{
17614 +       if (!strncmp(str, "on",2) && cpu_has_nx) {
17615 +               __supported_pte_mask |= _PAGE_NX;
17616 +               disable_nx = 0;
17617 +       } else if (!strncmp(str,"off",3)) {
17618 +               disable_nx = 1;
17619 +               __supported_pte_mask &= ~_PAGE_NX;
17620 +       }
17621 +}
17622 +
17623 +int nx_enabled = 0;
17624 +#ifdef CONFIG_X86_PAE
17625 +
17626 +static void __init set_nx(void)
17627 +{
17628 +       unsigned int v[4], l, h;
17629 +
17630 +       if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
17631 +               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
17632 +               if ((v[3] & (1 << 20)) && !disable_nx) {
17633 +                       rdmsr(MSR_EFER, l, h);
17634 +                       l |= EFER_NX;
17635 +                       wrmsr(MSR_EFER, l, h);
17636 +                       nx_enabled = 1;
17637 +                       __supported_pte_mask |= _PAGE_NX;
17638 +               }
17639 +       }
17640 +}
17641 +
17642 +/*
17643 + * Enables/disables executability of a given kernel page and
17644 + * returns the previous setting.
17645 + */
17646 +int __init set_kernel_exec(unsigned long vaddr, int enable)
17647 +{
17648 +       pte_t *pte;
17649 +       int ret = 1;
17650 +
17651 +       if (!nx_enabled)
17652 +               goto out;
17653 +
17654 +       pte = lookup_address(vaddr);
17655 +       BUG_ON(!pte);
17656 +
17657 +       if (!pte_exec_kernel(*pte))
17658 +               ret = 0;
17659 +
17660 +       if (enable)
17661 +               pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
17662 +       else
17663 +               pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
17664 +       __flush_tlb_all();
17665 +out:
17666 +       return ret;
17667 +}
17668 +
17669 +#endif
17670 +
17671 +/*
17672 + * paging_init() sets up the page tables - note that the first 8MB are
17673 + * already mapped by head.S.
17674 + *
17675 + * This routine also unmaps the page at virtual kernel address 0, so
17676 + * that we can trap those pesky NULL-reference errors in the kernel.
17677 + */
17678 +void __init paging_init(void)
17679 +{
17680 +       int i;
17681 +
17682 +#ifdef CONFIG_X86_PAE
17683 +       set_nx();
17684 +       if (nx_enabled)
17685 +               printk("NX (Execute Disable) protection: active\n");
17686 +#endif
17687 +
17688 +       pagetable_init();
17689 +
17690 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
17691 +       /*
17692 +        * We will bail out later - printk doesn't work right now so
17693 +        * the user would just see a hanging kernel.
17694 +        * when running as xen domain we are already in PAE mode at
17695 +        * this point.
17696 +        */
17697 +       if (cpu_has_pae)
17698 +               set_in_cr4(X86_CR4_PAE);
17699 +#endif
17700 +       __flush_tlb_all();
17701 +
17702 +       kmap_init();
17703 +
17704 +       /* Switch to the real shared_info page, and clear the
17705 +        * dummy page. */
17706 +       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
17707 +       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
17708 +       memset(empty_zero_page, 0, sizeof(empty_zero_page));
17709 +
17710 +       /* Setup mapping of lower 1st MB */
17711 +       for (i = 0; i < NR_FIX_ISAMAPS; i++)
17712 +               if (xen_start_info->flags & SIF_PRIVILEGED)
17713 +                       set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
17714 +               else
17715 +                       __set_fixmap(FIX_ISAMAP_BEGIN - i,
17716 +                                    virt_to_machine(empty_zero_page),
17717 +                                    PAGE_KERNEL_RO);
17718 +}
17719 +
17720 +/*
17721 + * Test if the WP bit works in supervisor mode. It isn't supported on 386's
17722 + * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
17723 + * used to involve black magic jumps to work around some nasty CPU bugs,
17724 + * but fortunately the switch to using exceptions got rid of all that.
17725 + */
17726 +
17727 +static void __init test_wp_bit(void)
17728 +{
17729 +       printk("Checking if this processor honours the WP bit even in supervisor mode... ");
17730 +
17731 +       /* Any page-aligned address will do, the test is non-destructive */
17732 +       __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
17733 +       boot_cpu_data.wp_works_ok = do_test_wp_bit();
17734 +       clear_fixmap(FIX_WP_TEST);
17735 +
17736 +       if (!boot_cpu_data.wp_works_ok) {
17737 +               printk("No.\n");
17738 +#ifdef CONFIG_X86_WP_WORKS_OK
17739 +               panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
17740 +#endif
17741 +       } else {
17742 +               printk("Ok.\n");
17743 +       }
17744 +}
17745 +
17746 +static void __init set_max_mapnr_init(void)
17747 +{
17748 +#ifdef CONFIG_HIGHMEM
17749 +       num_physpages = highend_pfn;
17750 +#else
17751 +       num_physpages = max_low_pfn;
17752 +#endif
17753 +#ifdef CONFIG_FLATMEM
17754 +       max_mapnr = num_physpages;
17755 +#endif
17756 +}
17757 +
17758 +static struct kcore_list kcore_mem, kcore_vmalloc; 
17759 +
17760 +void __init mem_init(void)
17761 +{
17762 +       extern int ppro_with_ram_bug(void);
17763 +       int codesize, reservedpages, datasize, initsize;
17764 +       int tmp;
17765 +       int bad_ppro;
17766 +       unsigned long pfn;
17767 +
17768 +       contiguous_bitmap = alloc_bootmem_low_pages(
17769 +               (max_low_pfn + 2*BITS_PER_LONG) >> 3);
17770 +       BUG_ON(!contiguous_bitmap);
17771 +       memset(contiguous_bitmap, 0, (max_low_pfn + 2*BITS_PER_LONG) >> 3);
17772 +
17773 +#if defined(CONFIG_SWIOTLB)
17774 +       swiotlb_init(); 
17775 +#endif
17776 +
17777 +#ifdef CONFIG_FLATMEM
17778 +       if (!mem_map)
17779 +               BUG();
17780 +#endif
17781 +       
17782 +       bad_ppro = ppro_with_ram_bug();
17783 +
17784 +#ifdef CONFIG_HIGHMEM
17785 +       /* check that fixmap and pkmap do not overlap */
17786 +       if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
17787 +               printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
17788 +               printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
17789 +                               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
17790 +               BUG();
17791 +       }
17792 +#endif
17793 +
17794 +       set_max_mapnr_init();
17795 +
17796 +#ifdef CONFIG_HIGHMEM
17797 +       high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
17798 +#else
17799 +       high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
17800 +#endif
17801 +       printk("vmalloc area: %lx-%lx, maxmem %lx\n",
17802 +              VMALLOC_START,VMALLOC_END,MAXMEM);
17803 +       BUG_ON(VMALLOC_START > VMALLOC_END);
17804 +       
17805 +       /* this will put all low memory onto the freelists */
17806 +       totalram_pages += free_all_bootmem();
17807 +       /* XEN: init and count low-mem pages outside initial allocation. */
17808 +       for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
17809 +               ClearPageReserved(&mem_map[pfn]);
17810 +               init_page_count(&mem_map[pfn]);
17811 +               totalram_pages++;
17812 +       }
17813 +
17814 +       reservedpages = 0;
17815 +       for (tmp = 0; tmp < max_low_pfn; tmp++)
17816 +               /*
17817 +                * Only count reserved RAM pages
17818 +                */
17819 +               if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
17820 +                       reservedpages++;
17821 +
17822 +       set_highmem_pages_init(bad_ppro);
17823 +
17824 +       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
17825 +       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
17826 +       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
17827 +
17828 +       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
17829 +       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
17830 +                  VMALLOC_END-VMALLOC_START);
17831 +
17832 +       printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
17833 +               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
17834 +               num_physpages << (PAGE_SHIFT-10),
17835 +               codesize >> 10,
17836 +               reservedpages << (PAGE_SHIFT-10),
17837 +               datasize >> 10,
17838 +               initsize >> 10,
17839 +               (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
17840 +              );
17841 +
17842 +#ifdef CONFIG_X86_PAE
17843 +       if (!cpu_has_pae)
17844 +               panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
17845 +#endif
17846 +       if (boot_cpu_data.wp_works_ok < 0)
17847 +               test_wp_bit();
17848 +
17849 +       /*
17850 +        * Subtle. SMP is doing its boot stuff late (because it has to
17851 +        * fork idle threads) - but it also needs low mappings for the
17852 +        * protected-mode entry to work. We zap these entries only after
17853 +        * the WP-bit has been tested.
17854 +        */
17855 +#ifndef CONFIG_SMP
17856 +       zap_low_mappings();
17857 +#endif
17858 +
17859 +       set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
17860 +}
17861 +
17862 +/*
17863 + * this is for the non-NUMA, single node SMP system case.
17864 + * Specifically, in the case of x86, we will always add
17865 + * memory to the highmem for now.
17866 + */
17867 +#ifdef CONFIG_MEMORY_HOTPLUG
17868 +#ifndef CONFIG_NEED_MULTIPLE_NODES
17869 +int add_memory(u64 start, u64 size)
17870 +{
17871 +       struct pglist_data *pgdata = &contig_page_data;
17872 +       struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
17873 +       unsigned long start_pfn = start >> PAGE_SHIFT;
17874 +       unsigned long nr_pages = size >> PAGE_SHIFT;
17875 +
17876 +       return __add_pages(zone, start_pfn, nr_pages);
17877 +}
17878 +
17879 +int remove_memory(u64 start, u64 size)
17880 +{
17881 +       return -EINVAL;
17882 +}
17883 +#endif
17884 +#endif
17885 +
17886 +kmem_cache_t *pgd_cache;
17887 +kmem_cache_t *pmd_cache;
17888 +
17889 +void __init pgtable_cache_init(void)
17890 +{
17891 +       if (PTRS_PER_PMD > 1) {
17892 +               pmd_cache = kmem_cache_create("pmd",
17893 +                                       PTRS_PER_PMD*sizeof(pmd_t),
17894 +                                       PTRS_PER_PMD*sizeof(pmd_t),
17895 +                                       0,
17896 +                                       pmd_ctor,
17897 +                                       NULL);
17898 +               if (!pmd_cache)
17899 +                       panic("pgtable_cache_init(): cannot create pmd cache");
17900 +       }
17901 +       pgd_cache = kmem_cache_create("pgd",
17902 +#ifndef CONFIG_XEN
17903 +                               PTRS_PER_PGD*sizeof(pgd_t),
17904 +                               PTRS_PER_PGD*sizeof(pgd_t),
17905 +#else
17906 +                               PAGE_SIZE,
17907 +                               PAGE_SIZE,
17908 +#endif
17909 +                               0,
17910 +                               pgd_ctor,
17911 +                               PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
17912 +       if (!pgd_cache)
17913 +               panic("pgtable_cache_init(): Cannot create pgd cache");
17914 +}
17915 +
17916 +/*
17917 + * This function cannot be __init, since exceptions don't work in that
17918 + * section.  Put this after the callers, so that it cannot be inlined.
17919 + */
17920 +static int noinline do_test_wp_bit(void)
17921 +{
17922 +       char tmp_reg;
17923 +       int flag;
17924 +
17925 +       __asm__ __volatile__(
17926 +               "       movb %0,%1      \n"
17927 +               "1:     movb %1,%0      \n"
17928 +               "       xorl %2,%2      \n"
17929 +               "2:                     \n"
17930 +               ".section __ex_table,\"a\"\n"
17931 +               "       .align 4        \n"
17932 +               "       .long 1b,2b     \n"
17933 +               ".previous              \n"
17934 +               :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
17935 +                "=q" (tmp_reg),
17936 +                "=r" (flag)
17937 +               :"2" (1)
17938 +               :"memory");
17939 +       
17940 +       return flag;
17941 +}
17942 +
17943 +#ifdef CONFIG_DEBUG_RODATA
17944 +
17945 +extern char __start_rodata, __end_rodata;
17946 +void mark_rodata_ro(void)
17947 +{
17948 +       unsigned long addr = (unsigned long)&__start_rodata;
17949 +
17950 +       for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
17951 +               change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
17952 +
17953 +       printk ("Write protecting the kernel read-only data: %luk\n",
17954 +                       (unsigned long)(&__end_rodata - &__start_rodata) >> 10);
17955 +
17956 +       /*
17957 +        * change_page_attr() requires a global_flush_tlb() call after it.
17958 +        * We do this after the printk so that if something went wrong in the
17959 +        * change, the printk gets out at least to give a better debug hint
17960 +        * of who is the culprit.
17961 +        */
17962 +       global_flush_tlb();
17963 +}
17964 +#endif
17965 +
17966 +void free_init_pages(char *what, unsigned long begin, unsigned long end)
17967 +{
17968 +       unsigned long addr;
17969 +
17970 +       for (addr = begin; addr < end; addr += PAGE_SIZE) {
17971 +               ClearPageReserved(virt_to_page(addr));
17972 +               init_page_count(virt_to_page(addr));
17973 +               memset((void *)addr, 0xcc, PAGE_SIZE);
17974 +               free_page(addr);
17975 +               totalram_pages++;
17976 +       }
17977 +       printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
17978 +}
17979 +
17980 +void free_initmem(void)
17981 +{
17982 +       free_init_pages("unused kernel memory",
17983 +                       (unsigned long)(&__init_begin),
17984 +                       (unsigned long)(&__init_end));
17985 +}
17986 +
17987 +#ifdef CONFIG_BLK_DEV_INITRD
17988 +void free_initrd_mem(unsigned long start, unsigned long end)
17989 +{
17990 +       free_init_pages("initrd memory", start, end);
17991 +}
17992 +#endif
17993 +
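Among the additions above, set_kernel_exec() flips the NX bit in a kernel PTE's high word and returns the previous state, so callers can restore it afterwards. A minimal usage sketch, assuming CONFIG_X86_PAE and a page-aligned kernel virtual address; the wrapper below is illustrative and not part of the patch:

/* Hypothetical caller: make a kernel page executable around an operation,
 * then put back whatever NX state it had before. */
static void __init with_page_executable(unsigned long vaddr, void (*op)(void))
{
        int was_exec = set_kernel_exec(vaddr, 1);       /* returns previous setting */

        op();                                           /* work that needs the page executable */
        set_kernel_exec(vaddr, was_exec);               /* restore the old NX state */
}
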
17994 diff -urNp linux-2.6/arch/i386/mm/ioremap-xen.c new/arch/i386/mm/ioremap-xen.c
17995 --- linux-2.6/arch/i386/mm/ioremap-xen.c        1970-01-01 01:00:00.000000000 +0100
17996 +++ new/arch/i386/mm/ioremap-xen.c      2006-05-23 21:31:13.000000000 +0200
17997 @@ -0,0 +1,476 @@
17998 +/*
17999 + * arch/i386/mm/ioremap.c
18000 + *
18001 + * Re-map IO memory to kernel address space so that we can access it.
18002 + * This is needed for high PCI addresses that aren't mapped in the
18003 + * 640k-1MB IO memory area on PC's
18004 + *
18005 + * (C) Copyright 1995 1996 Linus Torvalds
18006 + */
18007 +
18008 +#include <linux/vmalloc.h>
18009 +#include <linux/init.h>
18010 +#include <linux/slab.h>
18011 +#include <linux/module.h>
18012 +#include <asm/io.h>
18013 +#include <asm/fixmap.h>
18014 +#include <asm/cacheflush.h>
18015 +#include <asm/tlbflush.h>
18016 +#include <asm/pgtable.h>
18017 +#include <asm/pgalloc.h>
18018 +
18019 +#define ISA_START_ADDRESS      0x0
18020 +#define ISA_END_ADDRESS                0x100000
18021 +
18022 +#if 0 /* not PAE safe */
18023 +/* These hacky macros avoid phys->machine translations. */
18024 +#define __direct_pte(x) ((pte_t) { (x) } )
18025 +#define __direct_mk_pte(page_nr,pgprot) \
18026 +  __direct_pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
18027 +#define direct_mk_pte_phys(physpage, pgprot) \
18028 +  __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot)
18029 +#endif
18030 +
18031 +static int direct_remap_area_pte_fn(pte_t *pte, 
18032 +                                   struct page *pmd_page,
18033 +                                   unsigned long address, 
18034 +                                   void *data)
18035 +{
18036 +       mmu_update_t **v = (mmu_update_t **)data;
18037 +
18038 +       (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
18039 +                    PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
18040 +       (*v)++;
18041 +
18042 +       return 0;
18043 +}
18044 +
18045 +static int __direct_remap_pfn_range(struct mm_struct *mm,
18046 +                                   unsigned long address, 
18047 +                                   unsigned long mfn,
18048 +                                   unsigned long size, 
18049 +                                   pgprot_t prot,
18050 +                                   domid_t  domid)
18051 +{
18052 +       int rc;
18053 +       unsigned long i, start_address;
18054 +       mmu_update_t *u, *v, *w;
18055 +
18056 +       u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
18057 +       if (u == NULL)
18058 +               return -ENOMEM;
18059 +
18060 +       start_address = address;
18061 +
18062 +       flush_cache_all();
18063 +
18064 +       for (i = 0; i < size; i += PAGE_SIZE) {
18065 +               if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
18066 +                       /* Fill in the PTE pointers. */
18067 +                       rc = apply_to_page_range(mm, start_address, 
18068 +                                                address - start_address,
18069 +                                                direct_remap_area_pte_fn, &w);
18070 +                       if (rc)
18071 +                               goto out;
18072 +                       w = u;
18073 +                       rc = -EFAULT;
18074 +                       if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
18075 +                               goto out;
18076 +                       v = u;
18077 +                       start_address = address;
18078 +               }
18079 +
18080 +               /*
18081 +                * Fill in the machine address: PTE ptr is done later by
18082 +                * __direct_remap_area_pages(). 
18083 +                */
18084 +               v->val = pte_val_ma(pfn_pte_ma(mfn, prot));
18085 +
18086 +               mfn++;
18087 +               address += PAGE_SIZE; 
18088 +               v++;
18089 +       }
18090 +
18091 +       if (v != u) {
18092 +               /* get the ptep's filled in */
18093 +               rc = apply_to_page_range(mm, start_address,
18094 +                                        address - start_address,
18095 +                                        direct_remap_area_pte_fn, &w);
18096 +               if (rc)
18097 +                       goto out;
18098 +               rc = -EFAULT;
18099 +               if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
18100 +                       goto out;
18101 +       }
18102 +
18103 +       rc = 0;
18104 +
18105 + out:
18106 +       flush_tlb_all();
18107 +
18108 +       free_page((unsigned long)u);
18109 +
18110 +       return rc;
18111 +}
18112 +
18113 +int direct_remap_pfn_range(struct vm_area_struct *vma,
18114 +                          unsigned long address, 
18115 +                          unsigned long mfn,
18116 +                          unsigned long size, 
18117 +                          pgprot_t prot,
18118 +                          domid_t  domid)
18119 +{
18120 +       /* Same as remap_pfn_range(). */
18121 +       vma->vm_flags |= VM_IO | VM_RESERVED;
18122 +
18123 +       if (domid == DOMID_SELF)
18124 +               return -EINVAL;
18125 +
18126 +       return __direct_remap_pfn_range(
18127 +               vma->vm_mm, address, mfn, size, prot, domid);
18128 +}
18129 +EXPORT_SYMBOL(direct_remap_pfn_range);
18130 +
18131 +int direct_kernel_remap_pfn_range(unsigned long address, 
18132 +                                 unsigned long mfn,
18133 +                                 unsigned long size, 
18134 +                                 pgprot_t prot,
18135 +                                 domid_t  domid)
18136 +{
18137 +       return __direct_remap_pfn_range(
18138 +               &init_mm, address, mfn, size, prot, domid);
18139 +}
18140 +EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
18141 +
18142 +static int lookup_pte_fn(
18143 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
18144 +{
18145 +       uint64_t *ptep = (uint64_t *)data;
18146 +       if (ptep)
18147 +               *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
18148 +                        PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
18149 +       return 0;
18150 +}
18151 +
18152 +int create_lookup_pte_addr(struct mm_struct *mm, 
18153 +                          unsigned long address,
18154 +                          uint64_t *ptep)
18155 +{
18156 +       return apply_to_page_range(mm, address, PAGE_SIZE,
18157 +                                  lookup_pte_fn, ptep);
18158 +}
18159 +
18160 +EXPORT_SYMBOL(create_lookup_pte_addr);
18161 +
18162 +static int noop_fn(
18163 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
18164 +{
18165 +       return 0;
18166 +}
18167 +
18168 +int touch_pte_range(struct mm_struct *mm,
18169 +                   unsigned long address,
18170 +                   unsigned long size)
18171 +{
18172 +       return apply_to_page_range(mm, address, size, noop_fn, NULL);
18173 +} 
18174 +
18175 +EXPORT_SYMBOL(touch_pte_range);
18176 +
18177 +void *vm_map_xen_pages (unsigned long maddr, int vm_size, pgprot_t prot)
18178 +{
18179 +       int error;
18180 +       
18181 +       struct vm_struct *vma;
18182 +       vma = get_vm_area (vm_size, VM_IOREMAP);
18183 +      
18184 +       if (vma == NULL) {
18185 +               printk ("ioremap.c,vm_map_xen_pages(): "
18186 +                       "Failed to get VMA area\n");
18187 +               return NULL;
18188 +       }
18189 +
18190 +       error = direct_kernel_remap_pfn_range((unsigned long) vma->addr,
18191 +                                             maddr >> PAGE_SHIFT, vm_size,
18192 +                                             prot, DOMID_SELF );
18193 +       if (error == 0) {
18194 +               return vma->addr;
18195 +       } else {
18196 +               printk ("ioremap.c,vm_map_xen_pages(): "
18197 +                       "Failed to map xen shared pages into kernel space\n");
18198 +               return NULL;
18199 +       }
18200 +}
18201 +EXPORT_SYMBOL(vm_map_xen_pages);
18202 +
18203 +/*
18204 + * Does @address reside within a non-highmem page that is local to this virtual
18205 + * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
18206 + * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
18207 + * why this works.
18208 + */
18209 +static inline int is_local_lowmem(unsigned long address)
18210 +{
18211 +       extern unsigned long max_low_pfn;
18212 +       return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
18213 +}
18214 +
18215 +/*
18216 + * Generic mapping function (not visible outside):
18217 + */
18218 +
18219 +/*
18220 + * Remap an arbitrary physical address space into the kernel virtual
18221 + * address space. Needed when the kernel wants to access high addresses
18222 + * directly.
18223 + *
18224 + * NOTE! We need to allow non-page-aligned mappings too: we will obviously
18225 + * have to convert them into an offset in a page-aligned mapping, but the
18226 + * caller shouldn't need to know that small detail.
18227 + */
18228 +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
18229 +{
18230 +       void __iomem * addr;
18231 +       struct vm_struct * area;
18232 +       unsigned long offset, last_addr;
18233 +       domid_t domid = DOMID_IO;
18234 +
18235 +       /* Don't allow wraparound or zero size */
18236 +       last_addr = phys_addr + size - 1;
18237 +       if (!size || last_addr < phys_addr)
18238 +               return NULL;
18239 +
18240 +       /*
18241 +        * Don't remap the low PCI/ISA area, it's always mapped..
18242 +        */
18243 +       if (xen_start_info->flags & SIF_PRIVILEGED &&
18244 +           phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
18245 +               return (void __iomem *) isa_bus_to_virt(phys_addr);
18246 +
18247 +       /*
18248 +        * Don't allow anybody to remap normal RAM that we're using..
18249 +        */
18250 +       if (is_local_lowmem(phys_addr)) {
18251 +               char *t_addr, *t_end;
18252 +               struct page *page;
18253 +
18254 +               t_addr = bus_to_virt(phys_addr);
18255 +               t_end = t_addr + (size - 1);
18256 +          
18257 +               for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
18258 +                       if(!PageReserved(page))
18259 +                               return NULL;
18260 +
18261 +               domid = DOMID_SELF;
18262 +       }
18263 +
18264 +       /*
18265 +        * Mappings have to be page-aligned
18266 +        */
18267 +       offset = phys_addr & ~PAGE_MASK;
18268 +       phys_addr &= PAGE_MASK;
18269 +       size = PAGE_ALIGN(last_addr+1) - phys_addr;
18270 +
18271 +       /*
18272 +        * Ok, go for it..
18273 +        */
18274 +       area = get_vm_area(size, VM_IOREMAP | (flags << 20));
18275 +       if (!area)
18276 +               return NULL;
18277 +       area->phys_addr = phys_addr;
18278 +       addr = (void __iomem *) area->addr;
18279 +       flags |= _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
18280 +#ifdef __x86_64__
18281 +       flags |= _PAGE_USER;
18282 +#endif
18283 +       if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
18284 +                                    phys_addr>>PAGE_SHIFT,
18285 +                                    size, __pgprot(flags), domid)) {
18286 +               vunmap((void __force *) addr);
18287 +               return NULL;
18288 +       }
18289 +       return (void __iomem *) (offset + (char __iomem *)addr);
18290 +}
18291 +EXPORT_SYMBOL(__ioremap);
18292 +
18293 +/**
18294 + * ioremap_nocache     -   map bus memory into CPU space
18295 + * @offset:    bus address of the memory
18296 + * @size:      size of the resource to map
18297 + *
18298 + * ioremap_nocache performs a platform specific sequence of operations to
18299 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
18300 + * writew/writel functions and the other mmio helpers. The returned
18301 + * address is not guaranteed to be usable directly as a virtual
18302 + * address. 
18303 + *
18304 + * This version of ioremap ensures that the memory is marked uncachable
18305 + * on the CPU as well as honouring existing caching rules from things like
18306 + * the PCI bus. Note that there are other caches and buffers on many 
18307 + * busses. In particular driver authors should read up on PCI writes
18308 + *
18309 + * It's useful if some control registers are in such an area and
18310 + * write combining or read caching is not desirable:
18311 + * 
18312 + * Must be freed with iounmap.
18313 + */
18314 +
18315 +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
18316 +{
18317 +       unsigned long last_addr;
18318 +       void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
18319 +       if (!p) 
18320 +               return p; 
18321 +
18322 +       /* Guaranteed to be > phys_addr, as per __ioremap() */
18323 +       last_addr = phys_addr + size - 1;
18324 +
18325 +       if (is_local_lowmem(last_addr)) { 
18326 +               struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
18327 +               unsigned long npages;
18328 +
18329 +               phys_addr &= PAGE_MASK;
18330 +
18331 +               /* This might overflow and become zero.. */
18332 +               last_addr = PAGE_ALIGN(last_addr);
18333 +
18334 +               /* .. but that's ok, because modulo-2**n arithmetic will make
18335 +               * the page-aligned "last - first" come out right.
18336 +               */
18337 +               npages = (last_addr - phys_addr) >> PAGE_SHIFT;
18338 +
18339 +               if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { 
18340 +                       iounmap(p); 
18341 +                       p = NULL;
18342 +               }
18343 +               global_flush_tlb();
18344 +       }
18345 +
18346 +       return p;                                       
18347 +}
18348 +EXPORT_SYMBOL(ioremap_nocache);
18349 +
18350 +/**
18351 + * iounmap - Free an IO remapping
18352 + * @addr: virtual address from ioremap_*
18353 + *
18354 + * Caller must ensure there is only one unmapping for the same pointer.
18355 + */
18356 +void iounmap(volatile void __iomem *addr)
18357 +{
18358 +       struct vm_struct *p, *o;
18359 +
18360 +       if ((void __force *)addr <= high_memory)
18361 +               return;
18362 +
18363 +       /*
18364 +        * __ioremap special-cases the PCI/ISA range by not instantiating a
18365 +        * vm_area and by simply returning an address into the kernel mapping
18366 +        * of ISA space.   So handle that here.
18367 +        */
18368 +       if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
18369 +               return;
18370 +
18371 +       addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
18372 +
18373 +       /* Use the vm area unlocked, assuming the caller
18374 +          ensures there isn't another iounmap for the same address
18375 +          in parallel. Reuse of the virtual address is prevented by
18376 +          leaving it in the global lists until we're done with it.
18377 +          cpa takes care of the direct mappings. */
18378 +       read_lock(&vmlist_lock);
18379 +       for (p = vmlist; p; p = p->next) {
18380 +               if (p->addr == addr)
18381 +                       break;
18382 +       }
18383 +       read_unlock(&vmlist_lock);
18384 +
18385 +       if (!p) {
18386 +               printk("iounmap: bad address %p\n", addr);
18387 +               dump_stack();
18388 +               return;
18389 +       }
18390 +
18391 +       /* Reset the direct mapping. Can block */
18392 +       if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
18393 +               /* p->size includes the guard page, but cpa doesn't like that */
18394 +               change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
18395 +                                (p->size - PAGE_SIZE) >> PAGE_SHIFT,
18396 +                                PAGE_KERNEL);
18397 +               global_flush_tlb();
18398 +       } 
18399 +
18400 +       /* Finally remove it */
18401 +       o = remove_vm_area((void *)addr);
18402 +       BUG_ON(p != o || o == NULL);
18403 +       kfree(p); 
18404 +}
18405 +EXPORT_SYMBOL(iounmap);
18406 +
18407 +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
18408 +{
18409 +       unsigned long offset, last_addr;
18410 +       unsigned int nrpages;
18411 +       enum fixed_addresses idx;
18412 +
18413 +       /* Don't allow wraparound or zero size */
18414 +       last_addr = phys_addr + size - 1;
18415 +       if (!size || last_addr < phys_addr)
18416 +               return NULL;
18417 +
18418 +       /*
18419 +        * Don't remap the low PCI/ISA area, it's always mapped..
18420 +        */
18421 +       if (xen_start_info->flags & SIF_PRIVILEGED &&
18422 +           phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
18423 +               return isa_bus_to_virt(phys_addr);
18424 +
18425 +       /*
18426 +        * Mappings have to be page-aligned
18427 +        */
18428 +       offset = phys_addr & ~PAGE_MASK;
18429 +       phys_addr &= PAGE_MASK;
18430 +       size = PAGE_ALIGN(last_addr) - phys_addr;
18431 +
18432 +       /*
18433 +        * Mappings have to fit in the FIX_BTMAP area.
18434 +        */
18435 +       nrpages = size >> PAGE_SHIFT;
18436 +       if (nrpages > NR_FIX_BTMAPS)
18437 +               return NULL;
18438 +
18439 +       /*
18440 +        * Ok, go for it..
18441 +        */
18442 +       idx = FIX_BTMAP_BEGIN;
18443 +       while (nrpages > 0) {
18444 +               set_fixmap(idx, phys_addr);
18445 +               phys_addr += PAGE_SIZE;
18446 +               --idx;
18447 +               --nrpages;
18448 +       }
18449 +       return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
18450 +}
18451 +
18452 +void __init bt_iounmap(void *addr, unsigned long size)
18453 +{
18454 +       unsigned long virt_addr;
18455 +       unsigned long offset;
18456 +       unsigned int nrpages;
18457 +       enum fixed_addresses idx;
18458 +
18459 +       virt_addr = (unsigned long)addr;
18460 +       if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
18461 +               return;
18462 +       if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
18463 +               return;
18464 +       offset = virt_addr & ~PAGE_MASK;
18465 +       nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
18466 +
18467 +       idx = FIX_BTMAP_BEGIN;
18468 +       while (nrpages > 0) {
18469 +               clear_fixmap(idx);
18470 +               --idx;
18471 +               --nrpages;
18472 +       }
18473 +}
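The new ioremap-xen.c above exports direct_remap_pfn_range(), which installs mappings of machine frames (possibly owned by another domain) into a VMA by batching mmu_update_t entries and handing them to HYPERVISOR_mmu_update(). A rough sketch of how a driver's mmap() path could use the export; the helper and the way the target mfn and owning domain reach it are assumptions for illustration, not something this patch defines:

/* Hypothetical mmap() helper: map the whole VMA onto foreign machine
 * frames starting at 'mfn', owned by domain 'dom'.  DOMID_SELF would be
 * rejected by direct_remap_pfn_range() above. */
static int example_map_foreign(struct vm_area_struct *vma,
                               unsigned long mfn, domid_t dom)
{
        return direct_remap_pfn_range(vma, vma->vm_start, mfn,
                                      vma->vm_end - vma->vm_start,
                                      vma->vm_page_prot, dom);
}
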
18474 diff -urNp linux-2.6/arch/i386/mm/Makefile new/arch/i386/mm/Makefile
18475 --- linux-2.6/arch/i386/mm/Makefile     2006-07-03 14:14:15.000000000 +0200
18476 +++ new/arch/i386/mm/Makefile   2006-05-09 12:32:36.000000000 +0200
18477 @@ -8,3 +8,11 @@ obj-$(CONFIG_NUMA) += discontig.o
18478  obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
18479  obj-$(CONFIG_HIGHMEM) += highmem.o
18480  obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
18481 +
18482 +ifdef CONFIG_XEN
18483 +include $(srctree)/scripts/Makefile.xen
18484 +
18485 +obj-y          += hypervisor.o
18486 +
18487 +obj-y := $(call cherrypickxen, $(obj-y))
18488 +endif
18489 diff -urNp linux-2.6/arch/i386/mm/pageattr.c new/arch/i386/mm/pageattr.c
18490 --- linux-2.6/arch/i386/mm/pageattr.c   2006-07-03 14:14:15.000000000 +0200
18491 +++ new/arch/i386/mm/pageattr.c 2006-05-09 12:32:37.000000000 +0200
18492 @@ -85,7 +85,7 @@ static void set_pmd_pte(pte_t *kpte, uns
18493         unsigned long flags;
18494  
18495         set_pte_atomic(kpte, pte);      /* change init_mm */
18496 -       if (PTRS_PER_PMD > 1)
18497 +       if (HAVE_SHARED_KERNEL_PMD)
18498                 return;
18499  
18500         spin_lock_irqsave(&pgd_lock, flags);
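The one-line pageattr.c change above matters for Xen PAE guests, where the kernel pmd is not shared between page directories: when HAVE_SHARED_KERNEL_PMD is false, set_pmd_pte() can no longer return after updating init_mm and has to propagate the new entry into every pgd tracked on pgd_list, the list maintained by pgd_list_add()/pgd_list_del() in pgtable-xen.c below. A sketch of such a walk, using only the page->index linkage those helpers set up; the callback form is illustrative rather than the patch's own loop:

/* Visit every tracked page directory.  pgd_list_add() keeps the next
 * pointer in page->index; the caller is assumed to hold pgd_lock. */
static void example_for_each_pgd(void (*fn)(pgd_t *pgd))
{
        struct page *page;

        for (page = pgd_list; page; page = (struct page *)page->index)
                fn((pgd_t *)page_address(page));
}
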
18501 diff -urNp linux-2.6/arch/i386/mm/pgtable.c new/arch/i386/mm/pgtable.c
18502 --- linux-2.6/arch/i386/mm/pgtable.c    2006-07-03 14:14:15.000000000 +0200
18503 +++ new/arch/i386/mm/pgtable.c  2006-05-09 12:32:37.000000000 +0200
18504 @@ -13,6 +13,7 @@
18505  #include <linux/slab.h>
18506  #include <linux/pagemap.h>
18507  #include <linux/spinlock.h>
18508 +#include <linux/module.h>
18509  
18510  #include <asm/system.h>
18511  #include <asm/pgtable.h>
18512 @@ -138,6 +139,10 @@ void set_pmd_pfn(unsigned long vaddr, un
18513         __flush_tlb_one(vaddr);
18514  }
18515  
18516 +static int nr_fixmaps = 0;
18517 +unsigned long __FIXADDR_TOP = 0xfffff000;
18518 +EXPORT_SYMBOL(__FIXADDR_TOP);
18519 +
18520  void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
18521  {
18522         unsigned long address = __fix_to_virt(idx);
18523 @@ -147,6 +152,13 @@ void __set_fixmap (enum fixed_addresses 
18524                 return;
18525         }
18526         set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
18527 +       nr_fixmaps++;
18528 +}
18529 +
18530 +void set_fixaddr_top(unsigned long top)
18531 +{
18532 +       BUG_ON(nr_fixmaps > 0);
18533 +       __FIXADDR_TOP = top - PAGE_SIZE;
18534  }
18535  
18536  pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
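The pgtable.c hunk above turns the fixmap ceiling into a variable and counts established fixmap slots, so set_fixaddr_top() can only move the ceiling while no fixmap has been set yet (it BUGs otherwise). A minimal ordering sketch; the caller and the chosen boundary are purely illustrative:

/* Hypothetical early-boot caller: must run before the first __set_fixmap(),
 * after which the ceiling is frozen. */
static void __init example_shrink_fixmap(unsigned long new_top)
{
        set_fixaddr_top(new_top);       /* __FIXADDR_TOP becomes new_top - PAGE_SIZE */
}
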
18537 diff -urNp linux-2.6/arch/i386/mm/pgtable-xen.c new/arch/i386/mm/pgtable-xen.c
18538 --- linux-2.6/arch/i386/mm/pgtable-xen.c        1970-01-01 01:00:00.000000000 +0100
18539 +++ new/arch/i386/mm/pgtable-xen.c      2006-06-28 14:32:13.000000000 +0200
18540 @@ -0,0 +1,694 @@
18541 +/*
18542 + *  linux/arch/i386/mm/pgtable.c
18543 + */
18544 +
18545 +#include <linux/config.h>
18546 +#include <linux/sched.h>
18547 +#include <linux/kernel.h>
18548 +#include <linux/errno.h>
18549 +#include <linux/mm.h>
18550 +#include <linux/swap.h>
18551 +#include <linux/smp.h>
18552 +#include <linux/highmem.h>
18553 +#include <linux/slab.h>
18554 +#include <linux/pagemap.h>
18555 +#include <linux/spinlock.h>
18556 +#include <linux/module.h>
18557 +
18558 +#include <asm/system.h>
18559 +#include <asm/pgtable.h>
18560 +#include <asm/pgalloc.h>
18561 +#include <asm/fixmap.h>
18562 +#include <asm/e820.h>
18563 +#include <asm/tlb.h>
18564 +#include <asm/tlbflush.h>
18565 +#include <asm/io.h>
18566 +#include <asm/mmu_context.h>
18567 +
18568 +#include <xen/features.h>
18569 +#include <xen/foreign_page.h>
18570 +#include <asm/hypervisor.h>
18571 +
18572 +static void pgd_test_and_unpin(pgd_t *pgd);
18573 +
18574 +void show_mem(void)
18575 +{
18576 +       int total = 0, reserved = 0;
18577 +       int shared = 0, cached = 0;
18578 +       int highmem = 0;
18579 +       struct page *page;
18580 +       pg_data_t *pgdat;
18581 +       unsigned long i;
18582 +       struct page_state ps;
18583 +       unsigned long flags;
18584 +
18585 +       printk(KERN_INFO "Mem-info:\n");
18586 +       show_free_areas();
18587 +       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
18588 +       for_each_online_pgdat(pgdat) {
18589 +               pgdat_resize_lock(pgdat, &flags);
18590 +               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
18591 +                       page = pgdat_page_nr(pgdat, i);
18592 +                       total++;
18593 +                       if (PageHighMem(page))
18594 +                               highmem++;
18595 +                       if (PageReserved(page))
18596 +                               reserved++;
18597 +                       else if (PageSwapCache(page))
18598 +                               cached++;
18599 +                       else if (page_count(page))
18600 +                               shared += page_count(page) - 1;
18601 +               }
18602 +               pgdat_resize_unlock(pgdat, &flags);
18603 +       }
18604 +       printk(KERN_INFO "%d pages of RAM\n", total);
18605 +       printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
18606 +       printk(KERN_INFO "%d reserved pages\n", reserved);
18607 +       printk(KERN_INFO "%d pages shared\n", shared);
18608 +       printk(KERN_INFO "%d pages swap cached\n", cached);
18609 +
18610 +       get_page_state(&ps);
18611 +       printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
18612 +       printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
18613 +       printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
18614 +       printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
18615 +       printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
18616 +}
18617 +
18618 +/*
18619 + * Associate a virtual page frame with a given physical page frame 
18620 + * and protection flags for that frame.
18621 + */ 
18622 +static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
18623 +{
18624 +       pgd_t *pgd;
18625 +       pud_t *pud;
18626 +       pmd_t *pmd;
18627 +       pte_t *pte;
18628 +
18629 +       pgd = swapper_pg_dir + pgd_index(vaddr);
18630 +       if (pgd_none(*pgd)) {
18631 +               BUG();
18632 +               return;
18633 +       }
18634 +       pud = pud_offset(pgd, vaddr);
18635 +       if (pud_none(*pud)) {
18636 +               BUG();
18637 +               return;
18638 +       }
18639 +       pmd = pmd_offset(pud, vaddr);
18640 +       if (pmd_none(*pmd)) {
18641 +               BUG();
18642 +               return;
18643 +       }
18644 +       pte = pte_offset_kernel(pmd, vaddr);
18645 +       /* <pfn,flags> stored as-is, to permit clearing entries */
18646 +       set_pte(pte, pfn_pte(pfn, flags));
18647 +
18648 +       /*
18649 +        * It's enough to flush this one mapping.
18650 +        * (PGE mappings get flushed as well)
18651 +        */
18652 +       __flush_tlb_one(vaddr);
18653 +}
18654 +
18655 +/*
18656 + * Associate a virtual page frame with a given physical page frame 
18657 + * and protection flags for that frame.
18658 + */ 
18659 +static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn,
18660 +                          pgprot_t flags)
18661 +{
18662 +       pgd_t *pgd;
18663 +       pud_t *pud;
18664 +       pmd_t *pmd;
18665 +       pte_t *pte;
18666 +
18667 +       pgd = swapper_pg_dir + pgd_index(vaddr);
18668 +       if (pgd_none(*pgd)) {
18669 +               BUG();
18670 +               return;
18671 +       }
18672 +       pud = pud_offset(pgd, vaddr);
18673 +       if (pud_none(*pud)) {
18674 +               BUG();
18675 +               return;
18676 +       }
18677 +       pmd = pmd_offset(pud, vaddr);
18678 +       if (pmd_none(*pmd)) {
18679 +               BUG();
18680 +               return;
18681 +       }
18682 +       pte = pte_offset_kernel(pmd, vaddr);
18683 +       /* <pfn,flags> stored as-is, to permit clearing entries */
18684 +       set_pte(pte, pfn_pte_ma(pfn, flags));
18685 +
18686 +       /*
18687 +        * It's enough to flush this one mapping.
18688 +        * (PGE mappings get flushed as well)
18689 +        */
18690 +       __flush_tlb_one(vaddr);
18691 +}
18692 +
18693 +/*
18694 + * Associate a large virtual page frame with a given physical page frame 
18695 + * and protection flags for that frame. pfn is for the base of the page,
18696 + * vaddr is what the page gets mapped to - both must be properly aligned. 
18697 + * The pmd must already be instantiated. Assumes PAE mode.
18698 + */ 
18699 +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
18700 +{
18701 +       pgd_t *pgd;
18702 +       pud_t *pud;
18703 +       pmd_t *pmd;
18704 +
18705 +       if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
18706 +               printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
18707 +               return; /* BUG(); */
18708 +       }
18709 +       if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
18710 +               printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
18711 +               return; /* BUG(); */
18712 +       }
18713 +       pgd = swapper_pg_dir + pgd_index(vaddr);
18714 +       if (pgd_none(*pgd)) {
18715 +               printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
18716 +               return; /* BUG(); */
18717 +       }
18718 +       pud = pud_offset(pgd, vaddr);
18719 +       pmd = pmd_offset(pud, vaddr);
18720 +       set_pmd(pmd, pfn_pmd(pfn, flags));
18721 +       /*
18722 +        * It's enough to flush this one mapping.
18723 +        * (PGE mappings get flushed as well)
18724 +        */
18725 +       __flush_tlb_one(vaddr);
18726 +}
18727 +
18728 +static int nr_fixmaps = 0;
18729 +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
18730 +EXPORT_SYMBOL(__FIXADDR_TOP);
18731 +
18732 +void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
18733 +{
18734 +       unsigned long address = __fix_to_virt(idx);
18735 +
18736 +       if (idx >= __end_of_fixed_addresses) {
18737 +               BUG();
18738 +               return;
18739 +       }
18740 +       switch (idx) {
18741 +       case FIX_WP_TEST:
18742 +#ifdef CONFIG_X86_F00F_BUG
18743 +       case FIX_F00F_IDT:
18744 +#endif
18745 +               set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
18746 +               break;
18747 +       default:
18748 +               set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags);
18749 +               break;
18750 +       }
18751 +       nr_fixmaps++;
18752 +}
18753 +
18754 +void set_fixaddr_top(unsigned long top)
18755 +{
18756 +       BUG_ON(nr_fixmaps > 0);
18757 +       __FIXADDR_TOP = top - PAGE_SIZE;
18758 +}
18759 +
18760 +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
18761 +{
18762 +       pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
18763 +       if (pte)
18764 +               make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
18765 +       return pte;
18766 +}
18767 +
18768 +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
18769 +{
18770 +       struct page *pte;
18771 +
18772 +#ifdef CONFIG_HIGHPTE
18773 +       pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
18774 +#else
18775 +       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
18776 +       if (pte) {
18777 +               SetPageForeign(pte, pte_free);
18778 +               init_page_count(pte);
18779 +       }
18780 +#endif
18781 +       return pte;
18782 +}
18783 +
18784 +void pte_free(struct page *pte)
18785 +{
18786 +       unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
18787 +
18788 +       if (!pte_write(*virt_to_ptep(va)))
18789 +               BUG_ON(HYPERVISOR_update_va_mapping(
18790 +                       va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
18791 +
18792 +       ClearPageForeign(pte);
18793 +       init_page_count(pte);
18794 +
18795 +       __free_page(pte);
18796 +}
18797 +
18798 +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
18799 +{
18800 +       memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18801 +}
18802 +
18803 +/*
18804 + * List of all pgd's needed for non-PAE so it can invalidate entries
18805 + * in both cached and uncached pgd's; not needed for PAE since the
18806 + * kernel pmd is shared. If PAE were not to share the pmd a similar
18807 + * tactic would be needed. This is essentially codepath-based locking
18808 + * against pageattr.c; it is the unique case in which a valid change
18809 + * of kernel pagetables can't be lazily synchronized by vmalloc faults.
18810 + * vmalloc faults work because attached pagetables are never freed.
18811 + * The locking scheme was chosen on the basis of manfred's
18812 + * recommendations and having no core impact whatsoever.
18813 + * -- wli
18814 + */
18815 +DEFINE_SPINLOCK(pgd_lock);
18816 +struct page *pgd_list;
18817 +
18818 +static inline void pgd_list_add(pgd_t *pgd)
18819 +{
18820 +       struct page *page = virt_to_page(pgd);
18821 +       page->index = (unsigned long)pgd_list;
18822 +       if (pgd_list)
18823 +               set_page_private(pgd_list, (unsigned long)&page->index);
18824 +       pgd_list = page;
18825 +       set_page_private(page, (unsigned long)&pgd_list);
18826 +}
18827 +
18828 +static inline void pgd_list_del(pgd_t *pgd)
18829 +{
18830 +       struct page *next, **pprev, *page = virt_to_page(pgd);
18831 +       next = (struct page *)page->index;
18832 +       pprev = (struct page **)page_private(page);
18833 +       *pprev = next;
18834 +       if (next)
18835 +               set_page_private(next, (unsigned long)pprev);
18836 +}
18837 +
18838 +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
18839 +{
18840 +       unsigned long flags;
18841 +
18842 +       if (PTRS_PER_PMD > 1) {
18843 +               if (HAVE_SHARED_KERNEL_PMD)
18844 +                       clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18845 +                                       swapper_pg_dir + USER_PTRS_PER_PGD,
18846 +                                       KERNEL_PGD_PTRS);
18847 +       } else {
18848 +               spin_lock_irqsave(&pgd_lock, flags);
18849 +               clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
18850 +                               swapper_pg_dir + USER_PTRS_PER_PGD,
18851 +                               KERNEL_PGD_PTRS);
18852 +               memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18853 +               pgd_list_add(pgd);
18854 +               spin_unlock_irqrestore(&pgd_lock, flags);
18855 +       }
18856 +}
18857 +
18858 +/* never called when PTRS_PER_PMD > 1 */
18859 +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
18860 +{
18861 +       unsigned long flags; /* can be called from interrupt context */
18862 +
18863 +       spin_lock_irqsave(&pgd_lock, flags);
18864 +       pgd_list_del(pgd);
18865 +       spin_unlock_irqrestore(&pgd_lock, flags);
18866 +
18867 +       pgd_test_and_unpin(pgd);
18868 +}
18869 +
18870 +pgd_t *pgd_alloc(struct mm_struct *mm)
18871 +{
18872 +       int i;
18873 +       pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
18874 +       pmd_t **pmd;
18875 +       unsigned long flags;
18876 +
18877 +       pgd_test_and_unpin(pgd);
18878 +
18879 +       if (PTRS_PER_PMD == 1 || !pgd)
18880 +               return pgd;
18881 +
18882 +       if (HAVE_SHARED_KERNEL_PMD) {
18883 +               for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
18884 +                       pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
18885 +                       if (!pmd)
18886 +                               goto out_oom;
18887 +                       set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
18888 +               }
18889 +               return pgd;
18890 +       }
18891 +
18892 +       /*
18893 +        * We can race save/restore (if we sleep during a GFP_KERNEL memory
18894 +        * allocation). We therefore store virtual addresses of pmds as they
18895 +        * do not change across save/restore, and poke the machine addresses
18896 +        * into the pgdir under the pgd_lock.
18897 +        */
18898 +       pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
18899 +       if (!pmd) {
18900 +               kmem_cache_free(pgd_cache, pgd);
18901 +               return NULL;
18902 +       }
18903 +
18904 +       /* Allocate pmds, remember virtual addresses. */
18905 +       for (i = 0; i < PTRS_PER_PGD; ++i) {
18906 +               pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
18907 +               if (!pmd[i])
18908 +                       goto out_oom;
18909 +       }
18910 +
18911 +       spin_lock_irqsave(&pgd_lock, flags);
18912 +
18913 +       /* Protect against save/restore: move below 4GB under pgd_lock. */
18914 +       if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
18915 +               int rc = xen_create_contiguous_region(
18916 +                       (unsigned long)pgd, 0, 32);
18917 +               if (rc) {
18918 +                       spin_unlock_irqrestore(&pgd_lock, flags);
18919 +                       goto out_oom;
18920 +               }
18921 +       }
18922 +
18923 +       /* Copy kernel pmd contents and write-protect the new pmds. */
18924 +       for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18925 +               unsigned long v = (unsigned long)i << PGDIR_SHIFT;
18926 +               pgd_t *kpgd = pgd_offset_k(v);
18927 +               pud_t *kpud = pud_offset(kpgd, v);
18928 +               pmd_t *kpmd = pmd_offset(kpud, v);
18929 +               memcpy(pmd[i], kpmd, PAGE_SIZE);
18930 +               make_lowmem_page_readonly(
18931 +                       pmd[i], XENFEAT_writable_page_tables);
18932 +       }
18933 +
18934 +       /* It is safe to poke machine addresses of pmds under the pmd_lock. */
18935 +       for (i = 0; i < PTRS_PER_PGD; i++)
18936 +               set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
18937 +
18938 +       /* Ensure this pgd gets picked up and pinned on save/restore. */
18939 +       pgd_list_add(pgd);
18940 +
18941 +       spin_unlock_irqrestore(&pgd_lock, flags);
18942 +
18943 +       kfree(pmd);
18944 +
18945 +       return pgd;
18946 +
18947 +out_oom:
18948 +       if (HAVE_SHARED_KERNEL_PMD) {
18949 +               for (i--; i >= 0; i--)
18950 +                       kmem_cache_free(pmd_cache,
18951 +                                       (void *)__va(pgd_val(pgd[i])-1));
18952 +       } else {
18953 +               for (i--; i >= 0; i--)
18954 +                       kmem_cache_free(pmd_cache, pmd[i]);
18955 +               kfree(pmd);
18956 +       }
18957 +       kmem_cache_free(pgd_cache, pgd);
18958 +       return NULL;
18959 +}
18960 +
18961 +void pgd_free(pgd_t *pgd)
18962 +{
18963 +       int i;
18964 +
18965 +       /*
18966 +        * After this the pgd should not be pinned for the duration of this
18967 +        * function's execution. We should never sleep and thus never race:
18968 +        *  1. User pmds will not become write-protected under our feet due
18969 +        *     to a concurrent mm_pin_all().
18970 +        *  2. The machine addresses in PGD entries will not become invalid
18971 +        *     due to a concurrent save/restore.
18972 +        */
18973 +       pgd_test_and_unpin(pgd);
18974 +
18975 +       /* in the PAE case user pgd entries are overwritten before usage */
18976 +       if (PTRS_PER_PMD > 1) {
18977 +               for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
18978 +                       pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
18979 +                       kmem_cache_free(pmd_cache, pmd);
18980 +               }
18981 +
18982 +               if (!HAVE_SHARED_KERNEL_PMD) {
18983 +                       unsigned long flags;
18984 +                       spin_lock_irqsave(&pgd_lock, flags);
18985 +                       pgd_list_del(pgd);
18986 +                       spin_unlock_irqrestore(&pgd_lock, flags);
18987 +
18988 +                       for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18989 +                               pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
18990 +                               make_lowmem_page_writable(
18991 +                                       pmd, XENFEAT_writable_page_tables);
18992 +                               memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18993 +                               kmem_cache_free(pmd_cache, pmd);
18994 +                       }
18995 +
18996 +                       if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
18997 +                               xen_destroy_contiguous_region(
18998 +                                       (unsigned long)pgd, 0);
18999 +               }
19000 +       }
19001 +
19002 +       /* in the non-PAE case, free_pgtables() clears user pgd entries */
19003 +       kmem_cache_free(pgd_cache, pgd);
19004 +}
19005 +
19006 +void make_lowmem_page_readonly(void *va, unsigned int feature)
19007 +{
19008 +       pte_t *pte;
19009 +       int rc;
19010 +
19011 +       if (xen_feature(feature))
19012 +               return;
19013 +
19014 +       pte = virt_to_ptep(va);
19015 +       rc = HYPERVISOR_update_va_mapping(
19016 +               (unsigned long)va, pte_wrprotect(*pte), 0);
19017 +       BUG_ON(rc);
19018 +}
19019 +
19020 +void make_lowmem_page_writable(void *va, unsigned int feature)
19021 +{
19022 +       pte_t *pte;
19023 +       int rc;
19024 +
19025 +       if (xen_feature(feature))
19026 +               return;
19027 +
19028 +       pte = virt_to_ptep(va);
19029 +       rc = HYPERVISOR_update_va_mapping(
19030 +               (unsigned long)va, pte_mkwrite(*pte), 0);
19031 +       BUG_ON(rc);
19032 +}
19033 +
19034 +void make_page_readonly(void *va, unsigned int feature)
19035 +{
19036 +       pte_t *pte;
19037 +       int rc;
19038 +
19039 +       if (xen_feature(feature))
19040 +               return;
19041 +
19042 +       pte = virt_to_ptep(va);
19043 +       rc = HYPERVISOR_update_va_mapping(
19044 +               (unsigned long)va, pte_wrprotect(*pte), 0);
19045 +       if (rc) /* fallback? */
19046 +               xen_l1_entry_update(pte, pte_wrprotect(*pte));
19047 +       if ((unsigned long)va >= (unsigned long)high_memory) {
19048 +               unsigned long pfn = pte_pfn(*pte);
19049 +#ifdef CONFIG_HIGHMEM
19050 +               if (pfn >= highstart_pfn)
19051 +                       kmap_flush_unused(); /* flush stale writable kmaps */
19052 +               else
19053 +#endif
19054 +                       make_lowmem_page_readonly(
19055 +                               phys_to_virt(pfn << PAGE_SHIFT), feature); 
19056 +       }
19057 +}
19058 +
19059 +void make_page_writable(void *va, unsigned int feature)
19060 +{
19061 +       pte_t *pte;
19062 +       int rc;
19063 +
19064 +       if (xen_feature(feature))
19065 +               return;
19066 +
19067 +       pte = virt_to_ptep(va);
19068 +       rc = HYPERVISOR_update_va_mapping(
19069 +               (unsigned long)va, pte_mkwrite(*pte), 0);
19070 +       if (rc) /* fallback? */
19071 +               xen_l1_entry_update(pte, pte_mkwrite(*pte));
19072 +       if ((unsigned long)va >= (unsigned long)high_memory) {
19073 +               unsigned long pfn = pte_pfn(*pte); 
19074 +#ifdef CONFIG_HIGHMEM
19075 +               if (pfn < highstart_pfn)
19076 +#endif
19077 +                       make_lowmem_page_writable(
19078 +                               phys_to_virt(pfn << PAGE_SHIFT), feature);
19079 +       }
19080 +}
19081 +
19082 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
19083 +{
19084 +       if (xen_feature(feature))
19085 +               return;
19086 +
19087 +       while (nr-- != 0) {
19088 +               make_page_readonly(va, feature);
19089 +               va = (void *)((unsigned long)va + PAGE_SIZE);
19090 +       }
19091 +}
19092 +
19093 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
19094 +{
19095 +       if (xen_feature(feature))
19096 +               return;
19097 +
19098 +       while (nr-- != 0) {
19099 +               make_page_writable(va, feature);
19100 +               va = (void *)((unsigned long)va + PAGE_SIZE);
19101 +       }
19102 +}
19103 +
19104 +static inline void pgd_walk_set_prot(void *pt, pgprot_t flags)
19105 +{
19106 +       struct page *page = virt_to_page(pt);
19107 +       unsigned long pfn = page_to_pfn(page);
19108 +
19109 +       if (PageHighMem(page))
19110 +               return;
19111 +       BUG_ON(HYPERVISOR_update_va_mapping(
19112 +               (unsigned long)__va(pfn << PAGE_SHIFT),
19113 +               pfn_pte(pfn, flags), 0));
19114 +}
19115 +
19116 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
19117 +{
19118 +       pgd_t *pgd = pgd_base;
19119 +       pud_t *pud;
19120 +       pmd_t *pmd;
19121 +       pte_t *pte;
19122 +       int    g, u, m;
19123 +
19124 +       if (xen_feature(XENFEAT_auto_translated_physmap))
19125 +               return;
19126 +
19127 +       for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
19128 +               if (pgd_none(*pgd))
19129 +                       continue;
19130 +               pud = pud_offset(pgd, 0);
19131 +               if (PTRS_PER_PUD > 1) /* not folded */
19132 +                       pgd_walk_set_prot(pud,flags);
19133 +               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
19134 +                       if (pud_none(*pud))
19135 +                               continue;
19136 +                       pmd = pmd_offset(pud, 0);
19137 +                       if (PTRS_PER_PMD > 1) /* not folded */
19138 +                               pgd_walk_set_prot(pmd,flags);
19139 +                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
19140 +                               if (pmd_none(*pmd))
19141 +                                       continue;
19142 +                               pte = pte_offset_kernel(pmd,0);
19143 +                               pgd_walk_set_prot(pte,flags);
19144 +                       }
19145 +               }
19146 +       }
19147 +
19148 +       BUG_ON(HYPERVISOR_update_va_mapping(
19149 +               (unsigned long)pgd_base,
19150 +               pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
19151 +               UVMF_TLB_FLUSH));
19152 +}
19153 +
19154 +static void __pgd_pin(pgd_t *pgd)
19155 +{
19156 +       pgd_walk(pgd, PAGE_KERNEL_RO);
19157 +       xen_pgd_pin(__pa(pgd));
19158 +       set_bit(PG_pinned, &virt_to_page(pgd)->flags);
19159 +}
19160 +
19161 +static void __pgd_unpin(pgd_t *pgd)
19162 +{
19163 +       xen_pgd_unpin(__pa(pgd));
19164 +       pgd_walk(pgd, PAGE_KERNEL);
19165 +       clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
19166 +}
19167 +
19168 +static void pgd_test_and_unpin(pgd_t *pgd)
19169 +{
19170 +       if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
19171 +               __pgd_unpin(pgd);
19172 +}
19173 +
19174 +void mm_pin(struct mm_struct *mm)
19175 +{
19176 +       if (xen_feature(XENFEAT_writable_page_tables))
19177 +               return;
19178 +       spin_lock(&mm->page_table_lock);
19179 +       __pgd_pin(mm->pgd);
19180 +       spin_unlock(&mm->page_table_lock);
19181 +}
19182 +
19183 +void mm_unpin(struct mm_struct *mm)
19184 +{
19185 +       if (xen_feature(XENFEAT_writable_page_tables))
19186 +               return;
19187 +       spin_lock(&mm->page_table_lock);
19188 +       __pgd_unpin(mm->pgd);
19189 +       spin_unlock(&mm->page_table_lock);
19190 +}
19191 +
19192 +void mm_pin_all(void)
19193 +{
19194 +       struct page *page;
19195 +       if (xen_feature(XENFEAT_writable_page_tables))
19196 +               return;
19197 +       for (page = pgd_list; page; page = (struct page *)page->index) {
19198 +               if (!test_bit(PG_pinned, &page->flags))
19199 +                       __pgd_pin((pgd_t *)page_address(page));
19200 +       }
19201 +}
19202 +
19203 +void _arch_dup_mmap(struct mm_struct *mm)
19204 +{
19205 +       if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
19206 +               mm_pin(mm);
19207 +}
19208 +
19209 +void _arch_exit_mmap(struct mm_struct *mm)
19210 +{
19211 +       struct task_struct *tsk = current;
19212 +
19213 +       task_lock(tsk);
19214 +
19215 +       /*
19216 +        * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
19217 +        * *much* faster this way, as no tlb flushes means bigger wrpt batches.
19218 +        */
19219 +       if (tsk->active_mm == mm) {
19220 +               tsk->active_mm = &init_mm;
19221 +               atomic_inc(&init_mm.mm_count);
19222 +
19223 +               switch_mm(mm, &init_mm, tsk);
19224 +
19225 +               atomic_dec(&mm->mm_count);
19226 +               BUG_ON(atomic_read(&mm->mm_count) == 0);
19227 +       }
19228 +
19229 +       task_unlock(tsk);
19230 +
19231 +       if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
19232 +           (atomic_read(&mm->mm_count) == 1))
19233 +               mm_unpin(mm);
19234 +}
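
The hunk above closes out the Xen pgtable code; its central invariant is the write-protect-then-pin ordering. Pinning must revoke guest write access to every page-table page before asking Xen to pin the PGD, and unpinning must release the pin before any of those pages is made writable again, otherwise the hypervisor would see a live, pinned page table become writable. A compressed restatement of __pgd_pin()/__pgd_unpin() as a sketch (the example_* names are illustrative and not part of the patch):

	/* Sketch only: the ordering invariant maintained above. */
	static void example_pin(pgd_t *pgd)
	{
		pgd_walk(pgd, PAGE_KERNEL_RO);	/* 1. guest mappings become read-only */
		xen_pgd_pin(__pa(pgd));		/* 2. Xen validates and pins the PGD  */
	}

	static void example_unpin(pgd_t *pgd)
	{
		xen_pgd_unpin(__pa(pgd));	/* 1. release the pin first           */
		pgd_walk(pgd, PAGE_KERNEL);	/* 2. only now restore write access   */
	}
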
19235 diff -urNp linux-2.6/arch/i386/oprofile/Makefile new/arch/i386/oprofile/Makefile
19236 --- linux-2.6/arch/i386/oprofile/Makefile       2006-07-03 14:14:15.000000000 +0200
19237 +++ new/arch/i386/oprofile/Makefile     2006-05-09 12:32:37.000000000 +0200
19238 @@ -6,7 +6,11 @@ DRIVER_OBJS = $(addprefix ../../../drive
19239                 oprofilefs.o oprofile_stats.o  \
19240                 timer_int.o )
19241  
19242 +ifdef CONFIG_XEN
19243 +oprofile-y                             := $(DRIVER_OBJS) xenoprof.o
19244 +else 
19245  oprofile-y                             := $(DRIVER_OBJS) init.o backtrace.o
19246  oprofile-$(CONFIG_X86_LOCAL_APIC)      += nmi_int.o op_model_athlon.o \
19247                                            op_model_ppro.o op_model_p4.o
19248  oprofile-$(CONFIG_X86_IO_APIC)         += nmi_timer_int.o
19249 +endif
19250 diff -urNp linux-2.6/arch/i386/oprofile/xenoprof.c new/arch/i386/oprofile/xenoprof.c
19251 --- linux-2.6/arch/i386/oprofile/xenoprof.c     1970-01-01 01:00:00.000000000 +0100
19252 +++ new/arch/i386/oprofile/xenoprof.c   2006-07-07 15:10:03.000000000 +0200
19253 @@ -0,0 +1,542 @@
19254 +/**
19255 + * @file xenoprof.c
19256 + *
19257 + * @remark Copyright 2002 OProfile authors
19258 + * @remark Read the file COPYING
19259 + *
19260 + * @author John Levon <levon@movementarian.org>
19261 + *
19262 + * Modified by Aravind Menon and Jose Renato Santos for Xen
19263 + * These modifications are:
19264 + * Copyright (C) 2005 Hewlett-Packard Co.
19265 + */
19266 +
19267 +#include <linux/init.h>
19268 +#include <linux/notifier.h>
19269 +#include <linux/smp.h>
19270 +#include <linux/oprofile.h>
19271 +#include <linux/sysdev.h>
19272 +#include <linux/slab.h>
19273 +#include <linux/interrupt.h>
19274 +#include <linux/vmalloc.h>
19275 +#include <asm/nmi.h>
19276 +#include <asm/msr.h>
19277 +#include <asm/apic.h>
19278 +#include <asm/pgtable.h>
19279 +#include <xen/evtchn.h>
19280 +#include "op_counter.h"
19281 +
19282 +#include <xen/interface/xen.h>
19283 +#include <xen/interface/xenoprof.h>
19284 +#include <../../../drivers/oprofile/cpu_buffer.h>
19285 +
19286 +static int xenoprof_start(void);
19287 +static void xenoprof_stop(void);
19288 +
19289 +void * vm_map_xen_pages(unsigned long maddr, int vm_size, pgprot_t prot);
19290 +
19291 +static int xenoprof_enabled = 0;
19292 +static unsigned int num_events = 0;
19293 +static int is_primary = 0;
19294 +static int active_defined;
19295 +
19296 +/* sample buffers shared with Xen */
19297 +xenoprof_buf_t * xenoprof_buf[MAX_VIRT_CPUS];
19298 +/* Shared buffer area */
19299 +char * shared_buffer;
19300 +/* Number of buffers in shared area (one per VCPU) */
19301 +int nbuf;
19302 +/* Mappings of VIRQ_XENOPROF to irq number (per cpu) */
19303 +int ovf_irq[NR_CPUS];
19304 +/* cpu model type string - copied from Xen memory space on XENOPROF_init command */
19305 +char cpu_type[XENOPROF_CPU_TYPE_SIZE];
19306 +
19307 +/* Passive sample buffers shared with Xen */
19308 +xenoprof_buf_t *p_xenoprof_buf[MAX_OPROF_DOMAINS][MAX_VIRT_CPUS];
19309 +/* Passive shared buffer area */
19310 +char *p_shared_buffer[MAX_OPROF_DOMAINS];
19311 +
19312 +#ifdef CONFIG_PM
19313 +
19314 +static int xenoprof_suspend(struct sys_device * dev, pm_message_t state)
19315 +{
19316 +       if (xenoprof_enabled == 1)
19317 +               xenoprof_stop();
19318 +       return 0;
19319 +}
19320 +
19321 +
19322 +static int xenoprof_resume(struct sys_device * dev)
19323 +{
19324 +       if (xenoprof_enabled == 1)
19325 +               xenoprof_start();
19326 +       return 0;
19327 +}
19328 +
19329 +
19330 +static struct sysdev_class oprofile_sysclass = {
19331 +       set_kset_name("oprofile"),
19332 +       .resume         = xenoprof_resume,
19333 +       .suspend        = xenoprof_suspend
19334 +};
19335 +
19336 +
19337 +static struct sys_device device_oprofile = {
19338 +       .id     = 0,
19339 +       .cls    = &oprofile_sysclass,
19340 +};
19341 +
19342 +
19343 +static int __init init_driverfs(void)
19344 +{
19345 +       int error;
19346 +       if (!(error = sysdev_class_register(&oprofile_sysclass)))
19347 +               error = sysdev_register(&device_oprofile);
19348 +       return error;
19349 +}
19350 +
19351 +
19352 +static void __exit exit_driverfs(void)
19353 +{
19354 +       sysdev_unregister(&device_oprofile);
19355 +       sysdev_class_unregister(&oprofile_sysclass);
19356 +}
19357 +
19358 +#else
19359 +#define init_driverfs() do { } while (0)
19360 +#define exit_driverfs() do { } while (0)
19361 +#endif /* CONFIG_PM */
19362 +
19363 +unsigned long long oprofile_samples = 0;
19364 +unsigned long long p_oprofile_samples = 0;
19365 +
19366 +unsigned int pdomains;
19367 +struct xenoprof_passive passive_domains[MAX_OPROF_DOMAINS];
19368 +
19369 +static void xenoprof_add_pc(xenoprof_buf_t *buf, int is_passive)
19370 +{
19371 +       int head, tail, size;
19372 +
19373 +       head = buf->event_head;
19374 +       tail = buf->event_tail;
19375 +       size = buf->event_size;
19376 +
19377 +       if (tail > head) {
19378 +               while (tail < size) {
19379 +                       oprofile_add_pc(buf->event_log[tail].eip,
19380 +                                       buf->event_log[tail].mode,
19381 +                                       buf->event_log[tail].event);
19382 +                       if (!is_passive)
19383 +                               oprofile_samples++;
19384 +                       else
19385 +                               p_oprofile_samples++;
19386 +                       tail++;
19387 +               }
19388 +               tail = 0;
19389 +       }
19390 +       while (tail < head) {
19391 +               oprofile_add_pc(buf->event_log[tail].eip,
19392 +                               buf->event_log[tail].mode,
19393 +                               buf->event_log[tail].event);
19394 +               if (!is_passive)
19395 +                       oprofile_samples++;
19396 +               else
19397 +                       p_oprofile_samples++;
19398 +               tail++;
19399 +       }
19400 +
19401 +       buf->event_tail = tail;
19402 +}
19403 +
19404 +static void xenoprof_handle_passive(void)
19405 +{
19406 +       int i, j;
19407 +
19408 +       for (i = 0; i < pdomains; i++)
19409 +               for (j = 0; j < passive_domains[i].nbuf; j++) {
19410 +                       xenoprof_buf_t *buf = p_xenoprof_buf[i][j];
19411 +                       if (buf->event_head == buf->event_tail)
19412 +                               continue;
19413 +                        oprofile_add_pc(IGNORED_PC, CPU_MODE_PASSIVE_START, passive_domains[i].domain_id);
19414 +                       xenoprof_add_pc(buf, 1);
19415 +                        oprofile_add_pc(IGNORED_PC, CPU_MODE_PASSIVE_STOP, passive_domains[i].domain_id);
19416 +               }                       
19417 +}
19418 +
19419 +static irqreturn_t 
19420 +xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs)
19421 +{
19422 +       struct xenoprof_buf * buf;
19423 +       int cpu;
19424 +       static unsigned long flag;
19425 +
19426 +       cpu = smp_processor_id();
19427 +       buf = xenoprof_buf[cpu];
19428 +
19429 +       xenoprof_add_pc(buf, 0);
19430 +
19431 +       if (is_primary && !test_and_set_bit(0, &flag)) {
19432 +               xenoprof_handle_passive();
19433 +               clear_bit(0, &flag);
19434 +       }
19435 +
19436 +       return IRQ_HANDLED;
19437 +}
19438 +
19439 +
19440 +static void unbind_virq(void)
19441 +{
19442 +       int i;
19443 +
19444 +       for_each_cpu(i) {
19445 +               if (ovf_irq[i] >= 0) {
19446 +                       unbind_from_irqhandler(ovf_irq[i], NULL);
19447 +                       ovf_irq[i] = -1;
19448 +               }
19449 +       }
19450 +}
19451 +
19452 +
19453 +static int bind_virq(void)
19454 +{
19455 +       int i, result;
19456 +
19457 +       for_each_cpu(i) {
19458 +               result = bind_virq_to_irqhandler(VIRQ_XENOPROF,
19459 +                                                i,
19460 +                                                xenoprof_ovf_interrupt,
19461 +                                                SA_INTERRUPT,
19462 +                                                "xenoprof",
19463 +                                                NULL);
19464 +
19465 +               if (result < 0) {
19466 +                       unbind_virq();
19467 +                       return result;
19468 +               }
19469 +
19470 +               ovf_irq[i] = result;
19471 +       }
19472 +               
19473 +       return 0;
19474 +}
19475 +
19476 +
19477 +static int xenoprof_setup(void)
19478 +{
19479 +       int ret;
19480 +       int i;
19481 +
19482 +       ret = bind_virq();
19483 +       if (ret)
19484 +               return ret;
19485 +
19486 +       if (is_primary) {
19487 +               struct xenoprof_counter counter;
19488 +
19489 +               /* Define dom0 as an active domain if not done yet */
19490 +               if (!active_defined) {
19491 +                       domid_t domid;
19492 +                       ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
19493 +                       if (ret)
19494 +                               goto err;
19495 +                       domid = 0;
19496 +                       ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
19497 +                       if (ret)
19498 +                               goto err;
19499 +                       active_defined = 1;
19500 +               }
19501 +
19502 +               ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL);
19503 +               if (ret)
19504 +                       goto err;
19505 +               for (i=0; i<num_events; i++) {
19506 +                       counter.ind       = i;
19507 +                       counter.count     = (uint64_t)counter_config[i].count;
19508 +                       counter.enabled   = (uint32_t)counter_config[i].enabled;
19509 +                       counter.event     = (uint32_t)counter_config[i].event;
19510 +                       counter.kernel    = (uint32_t)counter_config[i].kernel;
19511 +                       counter.user      = (uint32_t)counter_config[i].user;
19512 +                       counter.unit_mask = (uint64_t)counter_config[i].unit_mask;
19513 +                       HYPERVISOR_xenoprof_op(XENOPROF_counter, 
19514 +                                              &counter);
19515 +               }
19516 +               ret = HYPERVISOR_xenoprof_op(XENOPROF_setup_events, NULL);
19517 +
19518 +               if (ret)
19519 +                       goto err;
19520 +       }
19521 +
19522 +       ret = HYPERVISOR_xenoprof_op(XENOPROF_enable_virq, NULL);
19523 +       if (ret)
19524 +               goto err;
19525 +
19526 +       xenoprof_enabled = 1;
19527 +       return 0;
19528 + err:
19529 +       unbind_virq();
19530 +       return ret;
19531 +}
19532 +
19533 +
19534 +static void xenoprof_shutdown(void)
19535 +{
19536 +       xenoprof_enabled = 0;
19537 +
19538 +       HYPERVISOR_xenoprof_op(XENOPROF_disable_virq, NULL);
19539 +
19540 +       if (is_primary) {
19541 +               HYPERVISOR_xenoprof_op(XENOPROF_release_counters, NULL);
19542 +               active_defined = 0;
19543 +       }
19544 +
19545 +       unbind_virq();
19546 +
19547 +}
19548 +
19549 +
19550 +static int xenoprof_start(void)
19551 +{
19552 +       int ret = 0;
19553 +
19554 +       if (is_primary)
19555 +               ret = HYPERVISOR_xenoprof_op(XENOPROF_start, NULL);
19556 +
19557 +       return ret;
19558 +}
19559 +
19560 +
19561 +static void xenoprof_stop(void)
19562 +{
19563 +       if (is_primary)
19564 +               HYPERVISOR_xenoprof_op(XENOPROF_stop, NULL);
19565 +}
19566 +
19567 +
19568 +static int xenoprof_set_active(int * active_domains,
19569 +                              unsigned int adomains)
19570 +{
19571 +       int ret = 0;
19572 +       int i;
19573 +       int set_dom0 = 0;
19574 +       domid_t domid;
19575 +
19576 +       if (!is_primary)
19577 +               return 0;
19578 +
19579 +       if (adomains > MAX_OPROF_DOMAINS)
19580 +               return -E2BIG;
19581 +
19582 +       ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
19583 +       if (ret)
19584 +               return ret;
19585 +
19586 +       for (i=0; i<adomains; i++) {
19587 +               domid = active_domains[i];
19588 +               if (domid != active_domains[i]) {
19589 +                       ret = -EINVAL;
19590 +                       goto out;
19591 +               }
19592 +               ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
19593 +               if (ret)
19594 +                       goto out;
19595 +               if (active_domains[i] == 0)
19596 +                       set_dom0 = 1;
19597 +       }
19598 +       /* dom0 must always be active but may not be in the list */ 
19599 +       if (!set_dom0) {
19600 +               domid = 0;
19601 +               ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
19602 +       }
19603 +
19604 +out:
19605 +       if (ret)
19606 +               HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
19607 +       active_defined = !ret;
19608 +       return ret;
19609 +}
19610 +
19611 +static int xenoprof_set_passive(int * p_domains,
19612 +                                unsigned int pdoms)
19613 +{
19614 +       int ret;
19615 +       int i, j;
19616 +       int vm_size;
19617 +       int npages;
19618 +       struct xenoprof_buf *buf;
19619 +       pgprot_t prot = __pgprot(_KERNPG_TABLE);
19620 +
19621 +       if (!is_primary)
19622 +               return 0;
19623 +
19624 +       if (pdoms > MAX_OPROF_DOMAINS)
19625 +               return -E2BIG;
19626 +
19627 +       ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_passive_list, NULL);
19628 +       if (ret)
19629 +               return ret;
19630 +
19631 +       for (i = 0; i < pdoms; i++) {
19632 +               passive_domains[i].domain_id = p_domains[i];
19633 +               passive_domains[i].max_samples = 2048;
19634 +               ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, &passive_domains[i]);
19635 +               if (ret)
19636 +                       return ret;
19637 +
19638 +               npages = (passive_domains[i].bufsize * passive_domains[i].nbuf - 1) / PAGE_SIZE + 1;
19639 +               vm_size = npages * PAGE_SIZE;
19640 +
19641 +               p_shared_buffer[i] = (char *)vm_map_xen_pages(passive_domains[i].buf_maddr,
19642 +                                                             vm_size, prot);
19643 +               if (!p_shared_buffer[i]) {
19644 +                       ret = -ENOMEM;
19645 +                       goto out;
19646 +               }
19647 +
19648 +               for (j = 0; j < passive_domains[i].nbuf; j++) {
19649 +                       buf = (struct xenoprof_buf *)
19650 +                               &p_shared_buffer[i][j * passive_domains[i].bufsize];
19651 +                       BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS);
19652 +                       p_xenoprof_buf[i][buf->vcpu_id] = buf;
19653 +               }
19654 +
19655 +       }
19656 +
19657 +       pdomains = pdoms;
19658 +       return 0;
19659 +
19660 +out:
19661 +       for (j = 0; j < i; j++) {
19662 +               vunmap(p_shared_buffer[j]);
19663 +               p_shared_buffer[j] = NULL;
19664 +       }
19665 +
19666 +       return ret;
19667 +}
19668 +
19669 +struct op_counter_config counter_config[OP_MAX_COUNTER];
19670 +
19671 +static int xenoprof_create_files(struct super_block * sb, struct dentry * root)
19672 +{
19673 +       unsigned int i;
19674 +
19675 +       for (i = 0; i < num_events; ++i) {
19676 +               struct dentry * dir;
19677 +               char buf[2];
19678 +
19679 +               snprintf(buf, 2, "%d", i);
19680 +               dir = oprofilefs_mkdir(sb, root, buf);
19681 +               oprofilefs_create_ulong(sb, dir, "enabled",
19682 +                                       &counter_config[i].enabled);
19683 +               oprofilefs_create_ulong(sb, dir, "event",
19684 +                                       &counter_config[i].event);
19685 +               oprofilefs_create_ulong(sb, dir, "count",
19686 +                                       &counter_config[i].count);
19687 +               oprofilefs_create_ulong(sb, dir, "unit_mask",
19688 +                                       &counter_config[i].unit_mask);
19689 +               oprofilefs_create_ulong(sb, dir, "kernel",
19690 +                                       &counter_config[i].kernel);
19691 +               oprofilefs_create_ulong(sb, dir, "user",
19692 +                                       &counter_config[i].user);
19693 +       }
19694 +
19695 +       return 0;
19696 +}
19697 +
19698 +
19699 +struct oprofile_operations xenoprof_ops = {
19700 +       .create_files   = xenoprof_create_files,
19701 +       .set_active     = xenoprof_set_active,
19702 +       .set_passive    = xenoprof_set_passive,
19703 +       .setup          = xenoprof_setup,
19704 +       .shutdown       = xenoprof_shutdown,
19705 +       .start          = xenoprof_start,
19706 +       .stop           = xenoprof_stop
19707 +};
19708 +
19709 +
19710 +/* in order to get driverfs right */
19711 +static int using_xenoprof;
19712 +
19713 +int __init oprofile_arch_init(struct oprofile_operations * ops)
19714 +{
19715 +       struct xenoprof_init init;
19716 +       struct xenoprof_buf * buf;
19717 +       int vm_size;
19718 +       int npages;
19719 +       int ret;
19720 +       int i;
19721 +
19722 +       init.max_samples = 16;
19723 +       ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init);
19724 +
19725 +       if (!ret) {
19726 +               pgprot_t prot = __pgprot(_KERNPG_TABLE);
19727 +
19728 +               num_events = init.num_events;
19729 +               is_primary = init.is_primary;
19730 +               nbuf = init.nbuf;
19731 +
19732 +               /* just in case - make sure we do not overflow event list 
19733 +                   (i.e. counter_config list) */
19734 +               if (num_events > OP_MAX_COUNTER)
19735 +                       num_events = OP_MAX_COUNTER;
19736 +
19737 +               npages = (init.bufsize * nbuf - 1) / PAGE_SIZE + 1;
19738 +               vm_size = npages * PAGE_SIZE;
19739 +
19740 +               shared_buffer = (char *)vm_map_xen_pages(init.buf_maddr,
19741 +                                                        vm_size, prot);
19742 +               if (!shared_buffer) {
19743 +                       ret = -ENOMEM;
19744 +                       goto out;
19745 +               }
19746 +
19747 +               for (i=0; i< nbuf; i++) {
19748 +                       buf = (struct xenoprof_buf*) 
19749 +                               &shared_buffer[i * init.bufsize];
19750 +                       BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS);
19751 +                       xenoprof_buf[buf->vcpu_id] = buf;
19752 +               }
19753 +
19754 +               /*  cpu_type is detected by Xen */
19755 +               cpu_type[XENOPROF_CPU_TYPE_SIZE-1] = 0;
19756 +               strncpy(cpu_type, init.cpu_type, XENOPROF_CPU_TYPE_SIZE - 1);
19757 +               xenoprof_ops.cpu_type = cpu_type;
19758 +
19759 +               init_driverfs();
19760 +               using_xenoprof = 1;
19761 +               *ops = xenoprof_ops;
19762 +
19763 +               for (i=0; i<NR_CPUS; i++)
19764 +                       ovf_irq[i] = -1;
19765 +
19766 +               active_defined = 0;
19767 +       }
19768 + out:
19769 +       printk(KERN_INFO "oprofile_arch_init: ret %d, events %d, "
19770 +              "is_primary %d\n", ret, num_events, is_primary);
19771 +       return ret;
19772 +}
19773 +
19774 +
19775 +void __exit oprofile_arch_exit(void)
19776 +{
19777 +       int i;
19778 +
19779 +       if (using_xenoprof)
19780 +               exit_driverfs();
19781 +
19782 +       if (shared_buffer) {
19783 +               vunmap(shared_buffer);
19784 +               shared_buffer = NULL;
19785 +       }
19786 +       if (is_primary) {
19787 +               for (i = 0; i < pdomains; i++)
19788 +                       if (p_shared_buffer[i]) {
19789 +                               vunmap(p_shared_buffer[i]);
19790 +                               p_shared_buffer[i] = NULL;
19791 +                       }
19792 +               HYPERVISOR_xenoprof_op(XENOPROF_shutdown, NULL);
19793 +        }
19794 +
19795 +}
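
The core of xenoprof.c above is xenoprof_add_pc(), which drains the per-VCPU sample ring that Xen fills: when the tail index sits above a wrapped head, the consumer first runs to the end of the buffer, resets the tail to zero, then catches up to the head, and finally stores the new tail back into the shared buffer so Xen can keep producing. A standalone sketch of that wrap-around walk (struct sample and consume() are simplifications, not the shared xenoprof_buf_t ABI):

	/* Simplified model of the head/tail drain in xenoprof_add_pc(). */
	struct sample { unsigned long eip; int mode; int event; };

	/* Hypothetical sink standing in for oprofile_add_pc(). */
	static void consume(const struct sample *s) { (void)s; }

	static void drain_ring(struct sample *log, int size, int head, int *tailp)
	{
		int tail = *tailp;

		if (tail > head) {		/* producer wrapped past us         */
			while (tail < size)
				consume(&log[tail++]);
			tail = 0;
		}
		while (tail < head)		/* normal case: chase the head      */
			consume(&log[tail++]);

		*tailp = tail;			/* publish the new tail back to Xen */
	}
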
19796 diff -urNp linux-2.6/arch/i386/pci/irq-xen.c new/arch/i386/pci/irq-xen.c
19797 --- linux-2.6/arch/i386/pci/irq-xen.c   1970-01-01 01:00:00.000000000 +0100
19798 +++ new/arch/i386/pci/irq-xen.c 2006-05-23 18:37:09.000000000 +0200
19799 @@ -0,0 +1,1204 @@
19800 +/*
19801 + *     Low-Level PCI Support for PC -- Routing of Interrupts
19802 + *
19803 + *     (c) 1999--2000 Martin Mares <mj@ucw.cz>
19804 + */
19805 +
19806 +#include <linux/config.h>
19807 +#include <linux/types.h>
19808 +#include <linux/kernel.h>
19809 +#include <linux/pci.h>
19810 +#include <linux/init.h>
19811 +#include <linux/slab.h>
19812 +#include <linux/interrupt.h>
19813 +#include <linux/dmi.h>
19814 +#include <asm/io.h>
19815 +#include <asm/smp.h>
19816 +#include <asm/io_apic.h>
19817 +#include <linux/irq.h>
19818 +#include <linux/acpi.h>
19819 +
19820 +#include "pci.h"
19821 +
19822 +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
19823 +#define PIRQ_VERSION 0x0100
19824 +
19825 +static int broken_hp_bios_irq9;
19826 +static int acer_tm360_irqrouting;
19827 +
19828 +static struct irq_routing_table *pirq_table;
19829 +
19830 +static int pirq_enable_irq(struct pci_dev *dev);
19831 +
19832 +/*
19833 + * Never use: 0, 1, 2 (timer, keyboard, and cascade)
19834 + * Avoid using: 13, 14 and 15 (FP error and IDE).
19835 + * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
19836 + */
19837 +unsigned int pcibios_irq_mask = 0xfff8;
19838 +
19839 +static int pirq_penalty[16] = {
19840 +       1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
19841 +       0, 0, 0, 0, 1000, 100000, 100000, 100000
19842 +};
19843 +
19844 +struct irq_router {
19845 +       char *name;
19846 +       u16 vendor, device;
19847 +       int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
19848 +       int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
19849 +};
19850 +
19851 +struct irq_router_handler {
19852 +       u16 vendor;
19853 +       int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
19854 +};
19855 +
19856 +int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
19857 +void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
19858 +
19859 +/*
19860 + *  Check passed address for the PCI IRQ Routing Table signature
19861 + *  and perform checksum verification.
19862 + */
19863 +
19864 +static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
19865 +{
19866 +       struct irq_routing_table *rt;
19867 +       int i;
19868 +       u8 sum;
19869 +
19870 +       rt = (struct irq_routing_table *) addr;
19871 +       if (rt->signature != PIRQ_SIGNATURE ||
19872 +           rt->version != PIRQ_VERSION ||
19873 +           rt->size % 16 ||
19874 +           rt->size < sizeof(struct irq_routing_table))
19875 +               return NULL;
19876 +       sum = 0;
19877 +       for (i=0; i < rt->size; i++)
19878 +               sum += addr[i];
19879 +       if (!sum) {
19880 +               DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
19881 +               return rt;
19882 +       }
19883 +       return NULL;
19884 +}
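
pirq_check_routing_table() accepts a candidate address only if it carries the literal "$PIR" signature, reports version 0x0100, has a size that is a multiple of 16 bytes and at least as large as the 32-byte header, and sums to zero modulo 256 over its whole length (the BIOS picks the checksum byte to force this). A sketch of the same acceptance test on a raw byte buffer (looks_like_pirq_table() is hypothetical and assumes memcmp() is available):

	static int looks_like_pirq_table(const unsigned char *p, unsigned int len)
	{
		unsigned int size, i;
		unsigned char sum = 0;

		if (len < 32 || memcmp(p, "$PIR", 4) != 0)	/* rt->signature           */
			return 0;
		if (p[4] != 0x00 || p[5] != 0x01)		/* rt->version == 0x0100   */
			return 0;
		size = p[6] | (p[7] << 8);			/* rt->size, little-endian */
		if (size % 16 || size < 32 || size > len)
			return 0;
		for (i = 0; i < size; i++)
			sum += p[i];
		return sum == 0;				/* checksum byte forces 0  */
	}
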
19885 +
19886 +
19887 +
19888 +/*
19889 + *  Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
19890 + */
19891 +
19892 +static struct irq_routing_table * __init pirq_find_routing_table(void)
19893 +{
19894 +       u8 *addr;
19895 +       struct irq_routing_table *rt;
19896 +
19897 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
19898 +       if (pirq_table_addr) {
19899 +               rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
19900 +               if (rt)
19901 +                       return rt;
19902 +               printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
19903 +       }
19904 +       for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
19905 +               rt = pirq_check_routing_table(addr);
19906 +               if (rt)
19907 +                       return rt;
19908 +       }
19909 +#endif
19910 +       
19911 +       return NULL;
19912 +}
19913 +
19914 +/*
19915 + *  If we have an IRQ routing table, use it to search for peer host
19916 + *  bridges.  It's a gross hack, but since there is no other known
19917 + *  way to get a list of buses, we have to go this way.
19918 + */
19919 +
19920 +static void __init pirq_peer_trick(void)
19921 +{
19922 +       struct irq_routing_table *rt = pirq_table;
19923 +       u8 busmap[256];
19924 +       int i;
19925 +       struct irq_info *e;
19926 +
19927 +       memset(busmap, 0, sizeof(busmap));
19928 +       for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
19929 +               e = &rt->slots[i];
19930 +#ifdef DEBUG
19931 +               {
19932 +                       int j;
19933 +                       DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
19934 +                       for(j=0; j<4; j++)
19935 +                               DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
19936 +                       DBG("\n");
19937 +               }
19938 +#endif
19939 +               busmap[e->bus] = 1;
19940 +       }
19941 +       for(i = 1; i < 256; i++) {
19942 +               if (!busmap[i] || pci_find_bus(0, i))
19943 +                       continue;
19944 +               if (pci_scan_bus(i, &pci_root_ops, NULL))
19945 +                       printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i);
19946 +       }
19947 +       pcibios_last_bus = -1;
19948 +}
19949 +
19950 +/*
19951 + *  Code for querying and setting of IRQ routes on various interrupt routers.
19952 + */
19953 +
19954 +void eisa_set_level_irq(unsigned int irq)
19955 +{
19956 +       unsigned char mask = 1 << (irq & 7);
19957 +       unsigned int port = 0x4d0 + (irq >> 3);
19958 +       unsigned char val;
19959 +       static u16 eisa_irq_mask;
19960 +
19961 +       if (irq >= 16 || (1 << irq) & eisa_irq_mask)
19962 +               return;
19963 +
19964 +       eisa_irq_mask |= (1 << irq);
19965 +       printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
19966 +       val = inb(port);
19967 +       if (!(val & mask)) {
19968 +               DBG(KERN_DEBUG " -> edge");
19969 +               outb(val | mask, port);
19970 +       }
19971 +}
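
eisa_set_level_irq() programs the ELCR (edge/level control registers) at I/O ports 0x4d0/0x4d1, one bit per IRQ: the port is 0x4d0 + (irq >> 3) and the bit is 1 << (irq & 7). A worked example for IRQ 11 (illustrative, not part of the patch):

	/* Level-trigger IRQ 11: port 0x4d0 + (11 >> 3) = 0x4d1, mask 1 << (11 & 7) = 0x08. */
	static void example_level_irq11(void)
	{
		unsigned int port = 0x4d1;
		unsigned char mask = 0x08;

		outb(inb(port) | mask, port);	/* ELCR bit set => level-triggered */
	}
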
19972 +
19973 +/*
19974 + * Common IRQ routing practice: nybbles in config space,
19975 + * offset by some magic constant.
19976 + */
19977 +static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
19978 +{
19979 +       u8 x;
19980 +       unsigned reg = offset + (nr >> 1);
19981 +
19982 +       pci_read_config_byte(router, reg, &x);
19983 +       return (nr & 1) ? (x >> 4) : (x & 0xf);
19984 +}
19985 +
19986 +static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
19987 +{
19988 +       u8 x;
19989 +       unsigned reg = offset + (nr >> 1);
19990 +
19991 +       pci_read_config_byte(router, reg, &x);
19992 +       x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
19993 +       pci_write_config_byte(router, reg, x);
19994 +}
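
read_config_nybble()/write_config_nybble() pack two 4-bit link values per configuration-space byte: link nr selects the byte at offset + (nr >> 1), and nr & 1 selects the low or high nibble. A worked example with illustrative numbers (a VIA-style router keeps its nibbles at offset 0x55):

	/* write_config_nybble(router, 0x55, 3, 11):
	 *   reg    = 0x55 + (3 >> 1) = 0x56
	 *   nr & 1 = 1  ->  the value lives in bits 7:4 of register 0x56,
	 * so a register that read 0x23 becomes 0xb3, leaving the other link
	 * in the low nibble untouched. */
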
19995 +
19996 +/*
19997 + * ALI pirq entries are damn ugly, and completely undocumented.
19998 + * This has been figured out from pirq tables, and it's not a pretty
19999 + * picture.
20000 + */
20001 +static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20002 +{
20003 +       static unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
20004 +
20005 +       return irqmap[read_config_nybble(router, 0x48, pirq-1)];
20006 +}
20007 +
20008 +static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20009 +{
20010 +       static unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
20011 +       unsigned int val = irqmap[irq];
20012 +               
20013 +       if (val) {
20014 +               write_config_nybble(router, 0x48, pirq-1, val);
20015 +               return 1;
20016 +       }
20017 +       return 0;
20018 +}
20019 +
20020 +/*
20021 + * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
20022 + * just a pointer to the config space.
20023 + */
20024 +static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20025 +{
20026 +       u8 x;
20027 +
20028 +       pci_read_config_byte(router, pirq, &x);
20029 +       return (x < 16) ? x : 0;
20030 +}
20031 +
20032 +static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20033 +{
20034 +       pci_write_config_byte(router, pirq, irq);
20035 +       return 1;
20036 +}
20037 +
20038 +/*
20039 + * The VIA pirq rules are nibble-based, like ALI,
20040 + * but without the ugly irq number munging.
20041 + * However, PIRQD is in the upper instead of lower 4 bits.
20042 + */
20043 +static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20044 +{
20045 +       return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
20046 +}
20047 +
20048 +static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20049 +{
20050 +       write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
20051 +       return 1;
20052 +}
20053 +
20054 +/*
20055 + * The VIA pirq rules are nibble-based, like ALI,
20056 + * but without the ugly irq number munging.
20057 + * However, for the 82C586, the nibble map is different.
20058 + */
20059 +static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20060 +{
20061 +       static unsigned int pirqmap[4] = { 3, 2, 5, 1 };
20062 +       return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
20063 +}
20064 +
20065 +static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20066 +{
20067 +       static unsigned int pirqmap[4] = { 3, 2, 5, 1 };
20068 +       write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
20069 +       return 1;
20070 +}
20071 +
20072 +/*
20073 + * ITE 8330G pirq rules are nibble-based
20074 + * FIXME: pirqmap may be { 1, 0, 3, 2 },
20075 + *       2+3 are both mapped to irq 9 on my system
20076 + */
20077 +static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20078 +{
20079 +       static unsigned char pirqmap[4] = { 1, 0, 2, 3 };
20080 +       return read_config_nybble(router,0x43, pirqmap[pirq-1]);
20081 +}
20082 +
20083 +static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20084 +{
20085 +       static unsigned char pirqmap[4] = { 1, 0, 2, 3 };
20086 +       write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
20087 +       return 1;
20088 +}
20089 +
20090 +/*
20091 + * OPTI: high four bits are nibble pointer..
20092 + * I wonder what the low bits do?
20093 + */
20094 +static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20095 +{
20096 +       return read_config_nybble(router, 0xb8, pirq >> 4);
20097 +}
20098 +
20099 +static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20100 +{
20101 +       write_config_nybble(router, 0xb8, pirq >> 4, irq);
20102 +       return 1;
20103 +}
20104 +
20105 +/*
20106 + * Cyrix: nibble offset 0x5C
20107 + * 0x5C bits 7:4 is INTB bits 3:0 is INTA 
20108 + * 0x5D bits 7:4 is INTD bits 3:0 is INTC
20109 + */
20110 +static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20111 +{
20112 +       return read_config_nybble(router, 0x5C, (pirq-1)^1);
20113 +}
20114 +
20115 +static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20116 +{
20117 +       write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
20118 +       return 1;
20119 +}
20120 +
20121 +/*
20122 + *     PIRQ routing for SiS 85C503 router used in several SiS chipsets.
20123 + *     We have to deal with the following issues here:
20124 + *     - vendors have different ideas about the meaning of link values
20125 + *     - some onboard devices (integrated in the chipset) have special
20126 + *       links and are thus routed differently (i.e. not via PCI INTA-INTD)
20127 + *     - different revision of the router have a different layout for
20128 + *       the routing registers, particularly for the onchip devices
20129 + *
20130 + *     For all routing registers the common thing is we have one byte
20131 + *     per routeable link which is defined as:
20132 + *              bit 7      IRQ mapping enabled (0) or disabled (1)
20133 + *              bits [6:4] reserved (sometimes used for onchip devices)
20134 + *              bits [3:0] IRQ to map to
20135 + *                  allowed: 3-7, 9-12, 14-15
20136 + *                  reserved: 0, 1, 2, 8, 13
20137 + *
20138 + *     The config-space registers located at 0x41/0x42/0x43/0x44 are
20139 + *     always used to route the normal PCI INT A/B/C/D respectively.
20140 + *     Apparently there are systems implementing PCI routing table using
20141 + *     link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
20142 + *     We try our best to handle both link mappings.
20143 + *     
20144 + *     Currently (2003-05-21) it appears most SiS chipsets follow the
20145 + *     definition of routing registers from the SiS-5595 southbridge.
20146 + *     According to the SiS 5595 datasheets the revision id's of the
20147 + *     router (ISA-bridge) should be 0x01 or 0xb0.
20148 + *
20149 + *     Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
20150 + *     Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
20151 + *     They seem to work with the current routing code. However there is
20152 + *     some concern because of the two USB-OHCI HCs (original SiS 5595
20153 + *     had only one). YMMV.
20154 + *
20155 + *     Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
20156 + *
20157 + *     0x61:   IDEIRQ:
20158 + *             bits [6:5] must be written 01
20159 + *             bit 4 channel-select primary (0), secondary (1)
20160 + *
20161 + *     0x62:   USBIRQ:
20162 + *             bit 6 OHCI function disabled (0), enabled (1)
20163 + *     
20164 + *     0x6a:   ACPI/SCI IRQ: bits 4-6 reserved
20165 + *
20166 + *     0x7e:   Data Acq. Module IRQ - bits 4-6 reserved
20167 + *
20168 + *     We support USBIRQ (in addition to INTA-INTD) and keep the
20169 + *     IDE, ACPI and DAQ routing untouched as set by the BIOS.
20170 + *
20171 + *     Currently the only reported exception is the new SiS 65x chipset
20172 + *     which includes the SiS 69x southbridge. Here we have the 85C503
20173 + *     router revision 0x04 and there are changes in the register layout
20174 + *     mostly related to the different USB HCs with USB 2.0 support.
20175 + *
20176 + *     Onchip routing for router rev-id 0x04 (try-and-error observation)
20177 + *
20178 + *     0x60/0x61/0x62/0x63:    1xEHCI and 3xOHCI (companion) USB-HCs
20179 + *                             bit 6-4 are probably unused, not like 5595
20180 + */
20181 +
20182 +#define PIRQ_SIS_IRQ_MASK      0x0f
20183 +#define PIRQ_SIS_IRQ_DISABLE   0x80
20184 +#define PIRQ_SIS_USB_ENABLE    0x40
20185 +
20186 +static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20187 +{
20188 +       u8 x;
20189 +       int reg;
20190 +
20191 +       reg = pirq;
20192 +       if (reg >= 0x01 && reg <= 0x04)
20193 +               reg += 0x40;
20194 +       pci_read_config_byte(router, reg, &x);
20195 +       return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
20196 +}
20197 +
20198 +static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20199 +{
20200 +       u8 x;
20201 +       int reg;
20202 +
20203 +       reg = pirq;
20204 +       if (reg >= 0x01 && reg <= 0x04)
20205 +               reg += 0x40;
20206 +       pci_read_config_byte(router, reg, &x);
20207 +       x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
20208 +       x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
20209 +       pci_write_config_byte(router, reg, x);
20210 +       return 1;
20211 +}
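
The per-link byte format described in the comment block above (bit 7 = routing disabled, bits [3:0] = target IRQ, links 0x01-0x04 aliased onto registers 0x41-0x44) is exactly what pirq_sis_get()/pirq_sis_set() decode. A couple of worked values (illustrative only):

	/* pirq 0x02 -> register 0x42 (0x02 + 0x40).
	 *   register reads 0x0a: bit 7 clear, routed to IRQ 10 -> _get returns 10
	 *   register reads 0x8a: bit 7 set, routing disabled   -> _get returns 0
	 * pirq_sis_set(router, dev, 0x02, 0) keeps the reserved bits [6:4] and
	 * writes PIRQ_SIS_IRQ_DISABLE (0x80) in place of an IRQ number. */
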
20212 +
20213 +
20214 +/*
20215 + * VLSI: nibble offset 0x74 - educated guess due to routing table and
20216 + *       config space of VLSI 82C534 PCI-bridge/router (1004:0102)
20217 + *       Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
20218 + *       devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
20219 + *       for the busbridge to the docking station.
20220 + */
20221 +
20222 +static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20223 +{
20224 +       if (pirq > 8) {
20225 +               printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
20226 +               return 0;
20227 +       }
20228 +       return read_config_nybble(router, 0x74, pirq-1);
20229 +}
20230 +
20231 +static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20232 +{
20233 +       if (pirq > 8) {
20234 +               printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
20235 +               return 0;
20236 +       }
20237 +       write_config_nybble(router, 0x74, pirq-1, irq);
20238 +       return 1;
20239 +}
20240 +
20241 +/*
20242 + * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
20243 + * and Redirect I/O registers (0x0c00 and 0x0c01).  The Index register
20244 + * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a.  The Redirect
20245 + * register is a straight binary coding of desired PIC IRQ (low nibble).
20246 + *
20247 + * The 'link' value in the PIRQ table is already in the correct format
20248 + * for the Index register.  There are some special index values:
20249 + * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
20250 + * and 0x03 for SMBus.
20251 + */
20252 +static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20253 +{
20254 +       outb_p(pirq, 0xc00);
20255 +       return inb(0xc01) & 0xf;
20256 +}
20257 +
20258 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20259 +{
20260 +       outb_p(pirq, 0xc00);
20261 +       outb_p(irq, 0xc01);
20262 +       return 1;
20263 +}
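
The ServerWorks router is driven through an index/data pair of I/O ports rather than config space: the link value from the PIRQ table goes into the Index register at 0x0c00, and the routed IRQ is then read from or written to the low nibble of the Redirect register at 0x0c01. A short sketch (the reroute-to-IRQ-5 value is illustrative):

	/* Read the routing for PCIIRQ10 (index 0x1a), then point it at IRQ 5. */
	static unsigned int example_serverworks_reroute(void)
	{
		unsigned int old;

		outb_p(0x1a, 0xc00);	/* select the link in the Index register    */
		old = inb(0xc01) & 0xf;	/* Redirect register: low nibble is the IRQ */

		outb_p(0x1a, 0xc00);
		outb_p(5, 0xc01);	/* write the new routing                    */
		return old;
	}
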
20264 +
20265 +/* Support for AMD756 PCI IRQ Routing
20266 + * Jhon H. Caicedo <jhcaiced@osso.org.co>
20267 + * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
20268 + * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
20269 + * The AMD756 pirq rules are nibble-based
20270 + * offset 0x56 0-3 PIRQA  4-7  PIRQB
20271 + * offset 0x57 0-3 PIRQC  4-7  PIRQD
20272 + */
20273 +static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
20274 +{
20275 +       u8 irq;
20276 +       irq = 0;
20277 +       if (pirq <= 4)
20278 +       {
20279 +               irq = read_config_nybble(router, 0x56, pirq - 1);
20280 +       }
20281 +       printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
20282 +               dev->vendor, dev->device, pirq, irq);
20283 +       return irq;
20284 +}
20285 +
20286 +static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20287 +{
20288 +       printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", 
20289 +               dev->vendor, dev->device, pirq, irq);
20290 +       if (pirq <= 4)
20291 +       {
20292 +               write_config_nybble(router, 0x56, pirq - 1, irq);
20293 +       }
20294 +       return 1;
20295 +}
20296 +
20297 +#ifdef CONFIG_PCI_BIOS
20298 +
20299 +static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
20300 +{
20301 +       struct pci_dev *bridge;
20302 +       int pin = pci_get_interrupt_pin(dev, &bridge);
20303 +       return pcibios_set_irq_routing(bridge, pin, irq);
20304 +}
20305 +
20306 +#endif
20307 +
20308 +static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20309 +{
20310 +       static struct pci_device_id pirq_440gx[] = {
20311 +               { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
20312 +               { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
20313 +               { },
20314 +       };
20315 +
20316 +       /* 440GX has a proprietary PIRQ router -- don't use it */
20317 +       if (pci_dev_present(pirq_440gx))
20318 +               return 0;
20319 +
20320 +       switch(device)
20321 +       {
20322 +               case PCI_DEVICE_ID_INTEL_82371FB_0:
20323 +               case PCI_DEVICE_ID_INTEL_82371SB_0:
20324 +               case PCI_DEVICE_ID_INTEL_82371AB_0:
20325 +               case PCI_DEVICE_ID_INTEL_82371MX:
20326 +               case PCI_DEVICE_ID_INTEL_82443MX_0:
20327 +               case PCI_DEVICE_ID_INTEL_82801AA_0:
20328 +               case PCI_DEVICE_ID_INTEL_82801AB_0:
20329 +               case PCI_DEVICE_ID_INTEL_82801BA_0:
20330 +               case PCI_DEVICE_ID_INTEL_82801BA_10:
20331 +               case PCI_DEVICE_ID_INTEL_82801CA_0:
20332 +               case PCI_DEVICE_ID_INTEL_82801CA_12:
20333 +               case PCI_DEVICE_ID_INTEL_82801DB_0:
20334 +               case PCI_DEVICE_ID_INTEL_82801E_0:
20335 +               case PCI_DEVICE_ID_INTEL_82801EB_0:
20336 +               case PCI_DEVICE_ID_INTEL_ESB_1:
20337 +               case PCI_DEVICE_ID_INTEL_ICH6_0:
20338 +               case PCI_DEVICE_ID_INTEL_ICH6_1:
20339 +               case PCI_DEVICE_ID_INTEL_ICH7_0:
20340 +               case PCI_DEVICE_ID_INTEL_ICH7_1:
20341 +               case PCI_DEVICE_ID_INTEL_ICH7_30:
20342 +               case PCI_DEVICE_ID_INTEL_ICH7_31:
20343 +               case PCI_DEVICE_ID_INTEL_ESB2_0:
20344 +               case PCI_DEVICE_ID_INTEL_ICH8_0:
20345 +               case PCI_DEVICE_ID_INTEL_ICH8_1:
20346 +               case PCI_DEVICE_ID_INTEL_ICH8_2:
20347 +               case PCI_DEVICE_ID_INTEL_ICH8_3:
20348 +               case PCI_DEVICE_ID_INTEL_ICH8_4:
20349 +                       r->name = "PIIX/ICH";
20350 +                       r->get = pirq_piix_get;
20351 +                       r->set = pirq_piix_set;
20352 +                       return 1;
20353 +       }
20354 +       return 0;
20355 +}
20356 +
20357 +static __init int via_router_probe(struct irq_router *r,
20358 +                               struct pci_dev *router, u16 device)
20359 +{
20360 +       /* FIXME: We should move some of the quirk fixup stuff here */
20361 +
20362 +       /*
20363 +        * workarounds for some buggy BIOSes
20364 +        */
20365 +       if (device == PCI_DEVICE_ID_VIA_82C586_0) {
20366 +               switch(router->device) {
20367 +               case PCI_DEVICE_ID_VIA_82C686:
20368 +                       /*
20369 +                        * Asus k7m bios wrongly reports 82C686A
20370 +                        * as 586-compatible
20371 +                        */
20372 +                       device = PCI_DEVICE_ID_VIA_82C686;
20373 +                       break;
20374 +               case PCI_DEVICE_ID_VIA_8235:
20375 +                       /**
20376 +                        * Asus a7v-x bios wrongly reports 8235
20377 +                        * as 586-compatible
20378 +                        */
20379 +                       device = PCI_DEVICE_ID_VIA_8235;
20380 +                       break;
20381 +               }
20382 +       }
20383 +
20384 +       switch(device) {
20385 +       case PCI_DEVICE_ID_VIA_82C586_0:
20386 +               r->name = "VIA";
20387 +               r->get = pirq_via586_get;
20388 +               r->set = pirq_via586_set;
20389 +               return 1;
20390 +       case PCI_DEVICE_ID_VIA_82C596:
20391 +       case PCI_DEVICE_ID_VIA_82C686:
20392 +       case PCI_DEVICE_ID_VIA_8231:
20393 +       case PCI_DEVICE_ID_VIA_8233A:
20394 +       case PCI_DEVICE_ID_VIA_8235:
20395 +       case PCI_DEVICE_ID_VIA_8237:
20396 +               /* FIXME: add new ones for 8233/5 */
20397 +               r->name = "VIA";
20398 +               r->get = pirq_via_get;
20399 +               r->set = pirq_via_set;
20400 +               return 1;
20401 +       }
20402 +       return 0;
20403 +}
20404 +
20405 +static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20406 +{
20407 +       switch(device)
20408 +       {
20409 +               case PCI_DEVICE_ID_VLSI_82C534:
20410 +                       r->name = "VLSI 82C534";
20411 +                       r->get = pirq_vlsi_get;
20412 +                       r->set = pirq_vlsi_set;
20413 +                       return 1;
20414 +       }
20415 +       return 0;
20416 +}
20417 +
20418 +
20419 +static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20420 +{
20421 +       switch(device)
20422 +       {
20423 +               case PCI_DEVICE_ID_SERVERWORKS_OSB4:
20424 +               case PCI_DEVICE_ID_SERVERWORKS_CSB5:
20425 +                       r->name = "ServerWorks";
20426 +                       r->get = pirq_serverworks_get;
20427 +                       r->set = pirq_serverworks_set;
20428 +                       return 1;
20429 +       }
20430 +       return 0;
20431 +}
20432 +
20433 +static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20434 +{
20435 +       if (device != PCI_DEVICE_ID_SI_503)
20436 +               return 0;
20437 +               
20438 +       r->name = "SIS";
20439 +       r->get = pirq_sis_get;
20440 +       r->set = pirq_sis_set;
20441 +       return 1;
20442 +}
20443 +
20444 +static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20445 +{
20446 +       switch(device)
20447 +       {
20448 +               case PCI_DEVICE_ID_CYRIX_5520:
20449 +                       r->name = "NatSemi";
20450 +                       r->get = pirq_cyrix_get;
20451 +                       r->set = pirq_cyrix_set;
20452 +                       return 1;
20453 +       }
20454 +       return 0;
20455 +}
20456 +
20457 +static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20458 +{
20459 +       switch(device)
20460 +       {
20461 +               case PCI_DEVICE_ID_OPTI_82C700:
20462 +                       r->name = "OPTI";
20463 +                       r->get = pirq_opti_get;
20464 +                       r->set = pirq_opti_set;
20465 +                       return 1;
20466 +       }
20467 +       return 0;
20468 +}
20469 +
20470 +static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20471 +{
20472 +       switch(device)
20473 +       {
20474 +               case PCI_DEVICE_ID_ITE_IT8330G_0:
20475 +                       r->name = "ITE";
20476 +                       r->get = pirq_ite_get;
20477 +                       r->set = pirq_ite_set;
20478 +                       return 1;
20479 +       }
20480 +       return 0;
20481 +}
20482 +
20483 +static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20484 +{
20485 +       switch(device)
20486 +       {
20487 +       case PCI_DEVICE_ID_AL_M1533:
20488 +       case PCI_DEVICE_ID_AL_M1563:
20489 +               printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
20490 +               r->name = "ALI";
20491 +               r->get = pirq_ali_get;
20492 +               r->set = pirq_ali_set;
20493 +               return 1;
20494 +       }
20495 +       return 0;
20496 +}
20497 +
20498 +static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
20499 +{
20500 +       switch(device)
20501 +       {
20502 +               case PCI_DEVICE_ID_AMD_VIPER_740B:
20503 +                       r->name = "AMD756";
20504 +                       break;
20505 +               case PCI_DEVICE_ID_AMD_VIPER_7413:
20506 +                       r->name = "AMD766";
20507 +                       break;
20508 +               case PCI_DEVICE_ID_AMD_VIPER_7443:
20509 +                       r->name = "AMD768";
20510 +                       break;
20511 +               default:
20512 +                       return 0;
20513 +       }
20514 +       r->get = pirq_amd756_get;
20515 +       r->set = pirq_amd756_set;
20516 +       return 1;
20517 +}
20518 +               
20519 +static __initdata struct irq_router_handler pirq_routers[] = {
20520 +       { PCI_VENDOR_ID_INTEL, intel_router_probe },
20521 +       { PCI_VENDOR_ID_AL, ali_router_probe },
20522 +       { PCI_VENDOR_ID_ITE, ite_router_probe },
20523 +       { PCI_VENDOR_ID_VIA, via_router_probe },
20524 +       { PCI_VENDOR_ID_OPTI, opti_router_probe },
20525 +       { PCI_VENDOR_ID_SI, sis_router_probe },
20526 +       { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
20527 +       { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
20528 +       { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
20529 +       { PCI_VENDOR_ID_AMD, amd_router_probe },
20530 +       /* Someone with docs needs to add the ATI Radeon IGP */
20531 +       { 0, NULL }
20532 +};
20533 +static struct irq_router pirq_router;
20534 +static struct pci_dev *pirq_router_dev;
20535 +
20536 +
20537 +/*
20538 + *     FIXME: should we have an option to say "generic for
20539 + *     chipset" ?
20540 + */
20541 +
20542 +static void __init pirq_find_router(struct irq_router *r)
20543 +{
20544 +       struct irq_routing_table *rt = pirq_table;
20545 +       struct irq_router_handler *h;
20546 +
20547 +#ifdef CONFIG_PCI_BIOS
20548 +       if (!rt->signature) {
20549 +               printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
20550 +               r->set = pirq_bios_set;
20551 +               r->name = "BIOS";
20552 +               return;
20553 +       }
20554 +#endif
20555 +
20556 +       /* Default unless a driver reloads it */
20557 +       r->name = "default";
20558 +       r->get = NULL;
20559 +       r->set = NULL;
20560 +       
20561 +       DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
20562 +           rt->rtr_vendor, rt->rtr_device);
20563 +
20564 +       pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
20565 +       if (!pirq_router_dev) {
20566 +               DBG(KERN_DEBUG "PCI: Interrupt router not found at "
20567 +                       "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
20568 +               return;
20569 +       }
20570 +
20571 +       for( h = pirq_routers; h->vendor; h++) {
20572 +               /* First look for a router match */
20573 +               if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
20574 +                       break;
20575 +               /* Fall back to a device match */
20576 +               if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
20577 +                       break;
20578 +       }
20579 +       printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
20580 +               pirq_router.name,
20581 +               pirq_router_dev->vendor,
20582 +               pirq_router_dev->device,
20583 +               pci_name(pirq_router_dev));
20584 +}
20585 +
20586 +static struct irq_info *pirq_get_info(struct pci_dev *dev)
20587 +{
20588 +       struct irq_routing_table *rt = pirq_table;
20589 +       int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
20590 +       struct irq_info *info;
20591 +
20592 +       for (info = rt->slots; entries--; info++)
20593 +               if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
20594 +                       return info;
20595 +       return NULL;
20596 +}
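The slot count above falls straight out of the $PIR table's size field: the table is a fixed-size header followed by one struct irq_info per slot. A minimal standalone sketch of that arithmetic, assuming the standard $PIR layout (32-byte header, 16-byte slot entries); the sizes are illustrative, not taken from this patch:

	#include <stdio.h>

	/* Illustrative $PIR sizing: 32-byte header followed by 16-byte slot entries. */
	#define PIR_HEADER_SIZE	32
	#define PIR_SLOT_SIZE	16

	int main(void)
	{
		unsigned int table_size = 32 + 6 * 16;	/* e.g. a BIOS table describing 6 slots */
		unsigned int entries = (table_size - PIR_HEADER_SIZE) / PIR_SLOT_SIZE;

		printf("%u slot entries\n", entries);	/* prints "6 slot entries" */
		return 0;
	}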
20597 +
20598 +static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
20599 +{
20600 +       u8 pin;
20601 +       struct irq_info *info;
20602 +       int i, pirq, newirq;
20603 +       int irq = 0;
20604 +       u32 mask;
20605 +       struct irq_router *r = &pirq_router;
20606 +       struct pci_dev *dev2 = NULL;
20607 +       char *msg = NULL;
20608 +
20609 +       /* Find IRQ pin */
20610 +       pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
20611 +       if (!pin) {
20612 +               DBG(KERN_DEBUG " -> no interrupt pin\n");
20613 +               return 0;
20614 +       }
20615 +       pin = pin - 1;
20616 +
20617 +       /* Find IRQ routing entry */
20618 +
20619 +       if (!pirq_table)
20620 +               return 0;
20621 +       
20622 +       DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
20623 +       info = pirq_get_info(dev);
20624 +       if (!info) {
20625 +               DBG(" -> not found in routing table\n" KERN_DEBUG);
20626 +               return 0;
20627 +       }
20628 +       pirq = info->irq[pin].link;
20629 +       mask = info->irq[pin].bitmap;
20630 +       if (!pirq) {
20631 +               DBG(" -> not routed\n" KERN_DEBUG);
20632 +               return 0;
20633 +       }
20634 +       DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
20635 +       mask &= pcibios_irq_mask;
20636 +
20637 +       /* Work around broken HP Pavilion Notebooks which assign USB to
20638 +          IRQ 9 even though it is actually wired to IRQ 11 */
20639 +
20640 +       if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
20641 +               dev->irq = 11;
20642 +               pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
20643 +               r->set(pirq_router_dev, dev, pirq, 11);
20644 +       }
20645 +
20646 +       /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
20647 +       if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
20648 +               pirq = 0x68;
20649 +               mask = 0x400;
20650 +               dev->irq = r->get(pirq_router_dev, dev, pirq);
20651 +               pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
20652 +       }
20653 +
20654 +       /*
20655 +        * Find the best IRQ to assign: use the one
20656 +        * reported by the device if possible.
20657 +        */
20658 +       newirq = dev->irq;
20659 +       if (newirq && !((1 << newirq) & mask)) {
20660 +               if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
20661 +               else printk("\n" KERN_WARNING
20662 +                       "PCI: IRQ %i for device %s doesn't match PIRQ mask "
20663 +                       "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
20664 +                       pci_name(dev));
20665 +       }
20666 +       if (!newirq && assign) {
20667 +               for (i = 0; i < 16; i++) {
20668 +                       if (!(mask & (1 << i)))
20669 +                               continue;
20670 +                       if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, SA_SHIRQ))
20671 +                               newirq = i;
20672 +               }
20673 +       }
20674 +       DBG(" -> newirq=%d", newirq);
20675 +
20676 +       /* Check if it is hardcoded */
20677 +       if ((pirq & 0xf0) == 0xf0) {
20678 +               irq = pirq & 0xf;
20679 +               DBG(" -> hardcoded IRQ %d\n", irq);
20680 +               msg = "Hardcoded";
20681 +       } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
20682 +       ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
20683 +               DBG(" -> got IRQ %d\n", irq);
20684 +               msg = "Found";
20685 +       } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
20686 +               DBG(" -> assigning IRQ %d", newirq);
20687 +               if (r->set(pirq_router_dev, dev, pirq, newirq)) {
20688 +                       eisa_set_level_irq(newirq);
20689 +                       DBG(" ... OK\n");
20690 +                       msg = "Assigned";
20691 +                       irq = newirq;
20692 +               }
20693 +       }
20694 +
20695 +       if (!irq) {
20696 +               DBG(" ... failed\n");
20697 +               if (newirq && mask == (1 << newirq)) {
20698 +                       msg = "Guessed";
20699 +                       irq = newirq;
20700 +               } else
20701 +                       return 0;
20702 +       }
20703 +       printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
20704 +
20705 +       /* Update IRQ for all devices with the same pirq value */
20706 +       while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
20707 +               pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
20708 +               if (!pin)
20709 +                       continue;
20710 +               pin--;
20711 +               info = pirq_get_info(dev2);
20712 +               if (!info)
20713 +                       continue;
20714 +               if (info->irq[pin].link == pirq) {
20715 +                       /* We refuse to override the dev->irq information. Give a warning! */
20716 +                       if ( dev2->irq && dev2->irq != irq && \
20717 +                       (!(pci_probe & PCI_USE_PIRQ_MASK) || \
20718 +                       ((1 << dev2->irq) & mask)) ) {
20719 +#ifndef CONFIG_PCI_MSI
20720 +                               printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
20721 +                                      pci_name(dev2), dev2->irq, irq);
20722 +#endif
20723 +                               continue;
20724 +                       }
20725 +                       dev2->irq = irq;
20726 +                       pirq_penalty[irq]++;
20727 +                       if (dev != dev2)
20728 +                               printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
20729 +               }
20730 +       }
20731 +       return 1;
20732 +}
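Two details of the lookup above are worth illustrating: a link value in the 0xF0-0xFF range means the pin is hard-wired to the IRQ encoded in the low nibble, and any other candidate IRQ is only acceptable if its bit is set in the entry's bitmap. A small self-contained sketch, not kernel code, with made-up values:

	#include <stdio.h>

	/* Sketch of the checks pcibios_lookup_irq() applies to a routing entry. */
	static int decode_pirq(unsigned int link, unsigned int bitmap, int candidate_irq)
	{
		if ((link & 0xf0) == 0xf0)
			return link & 0xf;			/* hard-wired IRQ */
		if (candidate_irq && (bitmap & (1u << candidate_irq)))
			return candidate_irq;			/* candidate allowed by the mask */
		return 0;					/* needs the chipset router */
	}

	int main(void)
	{
		printf("%d\n", decode_pirq(0xfb, 0x0000, 0));	/* 11: hard-wired */
		printf("%d\n", decode_pirq(0x60, 0x0c20, 11));	/* 11: bit 11 set in the bitmap */
		printf("%d\n", decode_pirq(0x60, 0x0c20, 7));	/* 0: IRQ 7 not permitted */
		return 0;
	}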
20733 +
20734 +static void __init pcibios_fixup_irqs(void)
20735 +{
20736 +       struct pci_dev *dev = NULL;
20737 +       u8 pin;
20738 +
20739 +       DBG(KERN_DEBUG "PCI: IRQ fixup\n");
20740 +       while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
20741 +               /*
20742 +                * If the BIOS has set an out of range IRQ number, just ignore it.
20743 +                * Also keep track of which IRQ's are already in use.
20744 +                */
20745 +               if (dev->irq >= 16) {
20746 +                       DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
20747 +                       dev->irq = 0;
20748 +               }
20749 +               /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
20750 +               if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
20751 +                       pirq_penalty[dev->irq] = 0;
20752 +               pirq_penalty[dev->irq]++;
20753 +       }
20754 +
20755 +       dev = NULL;
20756 +       while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
20757 +               pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
20758 +#ifdef CONFIG_X86_IO_APIC
20759 +               /*
20760 +                * Recalculate IRQ numbers if we use the I/O APIC.
20761 +                */
20762 +               if (io_apic_assign_pci_irqs)
20763 +               {
20764 +                       int irq;
20765 +
20766 +                       if (pin) {
20767 +                               pin--;          /* interrupt pins are numbered starting from 1 */
20768 +                               irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
20769 +       /*
20770 +        * Busses behind bridges are typically not listed in the MP-table.
20771 +        * In this case we have to look up the IRQ based on the parent bus,
20772 +        * parent slot, and pin number. The SMP code detects such bridged
20773 +        * busses itself so we should get into this branch reliably.
20774 +        */
20775 +                               if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
20776 +                                       struct pci_dev * bridge = dev->bus->self;
20777 +
20778 +                                       pin = (pin + PCI_SLOT(dev->devfn)) % 4;
20779 +                                       irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 
20780 +                                                       PCI_SLOT(bridge->devfn), pin);
20781 +                                       if (irq >= 0)
20782 +                                               printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
20783 +                                                       pci_name(bridge), 'A' + pin, irq);
20784 +                               }
20785 +                               if (irq >= 0) {
20786 +                                       if (use_pci_vector() &&
20787 +                                               !platform_legacy_irq(irq))
20788 +                                               irq = IO_APIC_VECTOR(irq);
20789 +
20790 +                                       printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
20791 +                                               pci_name(dev), 'A' + pin, irq);
20792 +                                       dev->irq = irq;
20793 +                               }
20794 +                       }
20795 +               }
20796 +#endif
20797 +               /*
20798 +                * Still no IRQ? Try to lookup one...
20799 +                */
20800 +               if (pin && !dev->irq)
20801 +                       pcibios_lookup_irq(dev, 0);
20802 +       }
20803 +}
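The bridge fallback above uses the standard PCI interrupt swizzle: before asking the parent bus, the child's pin is rotated by its slot number. A standalone sketch of just that rotation, with pins counted from 0 (INTA#) as in the code above:

	#include <stdio.h>

	/* Standard PCI-to-PCI bridge interrupt swizzle: rotate the pin by the slot. */
	static int swizzle_pin(int pin, int slot)
	{
		return (pin + slot) % 4;	/* pin is 0-based, INTA# == 0 */
	}

	int main(void)
	{
		/* INTA# of a device in slot 2 behind a bridge shows up as INTC# upstream. */
		printf("upstream pin = INT%c#\n", 'A' + swizzle_pin(0, 2));
		return 0;
	}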
20804 +
20805 +/*
20806 + * Work around broken HP Pavilion Notebooks which assign USB to
20807 + * IRQ 9 even though it is actually wired to IRQ 11
20808 + */
20809 +static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d)
20810 +{
20811 +       if (!broken_hp_bios_irq9) {
20812 +               broken_hp_bios_irq9 = 1;
20813 +               printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
20814 +       }
20815 +       return 0;
20816 +}
20817 +
20818 +/*
20819 + * Work around broken Acer TravelMate 360 Notebooks which assign
20820 + * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
20821 + */
20822 +static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d)
20823 +{
20824 +       if (!acer_tm360_irqrouting) {
20825 +               acer_tm360_irqrouting = 1;
20826 +               printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
20827 +       }
20828 +       return 0;
20829 +}
20830 +
20831 +static struct dmi_system_id __initdata pciirq_dmi_table[] = {
20832 +       {
20833 +               .callback = fix_broken_hp_bios_irq9,
20834 +               .ident = "HP Pavilion N5400 Series Laptop",
20835 +               .matches = {
20836 +                       DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
20837 +                       DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
20838 +                       DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
20839 +                       DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
20840 +               },
20841 +       },
20842 +       {
20843 +               .callback = fix_acer_tm360_irqrouting,
20844 +               .ident = "Acer TravelMate 36x Laptop",
20845 +               .matches = {
20846 +                       DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
20847 +                       DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
20848 +               },
20849 +       },
20850 +       { }
20851 +};
20852 +
20853 +static int __init pcibios_irq_init(void)
20854 +{
20855 +       DBG(KERN_DEBUG "PCI: IRQ init\n");
20856 +
20857 +       if (pcibios_enable_irq || raw_pci_ops == NULL)
20858 +               return 0;
20859 +
20860 +       dmi_check_system(pciirq_dmi_table);
20861 +
20862 +       pirq_table = pirq_find_routing_table();
20863 +
20864 +#ifdef CONFIG_PCI_BIOS
20865 +       if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
20866 +               pirq_table = pcibios_get_irq_routing_table();
20867 +#endif
20868 +       if (pirq_table) {
20869 +               pirq_peer_trick();
20870 +               pirq_find_router(&pirq_router);
20871 +               if (pirq_table->exclusive_irqs) {
20872 +                       int i;
20873 +                       for (i=0; i<16; i++)
20874 +                               if (!(pirq_table->exclusive_irqs & (1 << i)))
20875 +                                       pirq_penalty[i] += 100;
20876 +               }
20877 +               /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
20878 +               if (io_apic_assign_pci_irqs)
20879 +                       pirq_table = NULL;
20880 +       }
20881 +
20882 +       pcibios_enable_irq = pirq_enable_irq;
20883 +
20884 +       pcibios_fixup_irqs();
20885 +       return 0;
20886 +}
20887 +
20888 +subsys_initcall(pcibios_irq_init);
20889 +
20890 +
20891 +static void pirq_penalize_isa_irq(int irq, int active)
20892 +{
20893 +       /*
20894 +        *  If any ISAPnP device reports an IRQ in its list of possible
20895 +        *  IRQ's, we try to avoid assigning it to PCI devices.
20896 +        */
20897 +       if (irq < 16) {
20898 +               if (active)
20899 +                       pirq_penalty[irq] += 1000;
20900 +               else
20901 +                       pirq_penalty[irq] += 100;
20902 +       }
20903 +}
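pirq_penalty[] is a simple cost table: ISA/ISAPnP usage makes an IRQ expensive, each PCI device sharing a line adds a small increment, and the assignment loop in pcibios_lookup_irq() picks the cheapest line permitted by the routing mask. A toy version of that selection with invented penalty values:

	#include <stdio.h>

	/* Pick the cheapest IRQ whose bit is set in the routing mask. */
	int main(void)
	{
		unsigned int penalty[16] = { 0 };
		unsigned int mask = (1u << 5) | (1u << 10) | (1u << 11);
		int i, best = -1;

		penalty[5]  = 1000;	/* an active ISAPnP device claimed IRQ 5 */
		penalty[10] = 1;	/* one PCI device already shares IRQ 10 */
		penalty[11] = 0;	/* free */

		for (i = 0; i < 16; i++) {
			if (!(mask & (1u << i)))
				continue;
			if (best < 0 || penalty[i] < penalty[best])
				best = i;
		}
		printf("would assign IRQ %d\n", best);	/* prints 11 */
		return 0;
	}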
20904 +
20905 +void pcibios_penalize_isa_irq(int irq, int active)
20906 +{
20907 +#ifdef CONFIG_ACPI
20908 +       if (!acpi_noirq)
20909 +               acpi_penalize_isa_irq(irq, active);
20910 +       else
20911 +#endif
20912 +               pirq_penalize_isa_irq(irq, active);
20913 +}
20914 +
20915 +static int pirq_enable_irq(struct pci_dev *dev)
20916 +{
20917 +       u8 pin;
20918 +       struct pci_dev *temp_dev;
20919 +
20920 +       pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
20921 +       if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
20922 +               char *msg = "";
20923 +
20924 +               pin--;          /* interrupt pins are numbered starting from 1 */
20925 +
20926 +               if (io_apic_assign_pci_irqs) {
20927 +                       int irq;
20928 +
20929 +                       irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
20930 +                       /*
20931 +                        * Busses behind bridges are typically not listed in the MP-table.
20932 +                        * In this case we have to look up the IRQ based on the parent bus,
20933 +                        * parent slot, and pin number. The SMP code detects such bridged
20934 +                        * busses itself so we should get into this branch reliably.
20935 +                        */
20936 +                       temp_dev = dev;
20937 +                       while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
20938 +                               struct pci_dev * bridge = dev->bus->self;
20939 +
20940 +                               pin = (pin + PCI_SLOT(dev->devfn)) % 4;
20941 +                               irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 
20942 +                                               PCI_SLOT(bridge->devfn), pin);
20943 +                               if (irq >= 0)
20944 +                                       printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
20945 +                                               pci_name(bridge), 'A' + pin, irq);
20946 +                               dev = bridge;
20947 +                       }
20948 +                       dev = temp_dev;
20949 +                       if (irq >= 0) {
20950 +#ifdef CONFIG_PCI_MSI
20951 +                               if (!platform_legacy_irq(irq))
20952 +                                       irq = IO_APIC_VECTOR(irq);
20953 +#endif
20954 +                               printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
20955 +                                       pci_name(dev), 'A' + pin, irq);
20956 +                               dev->irq = irq;
20957 +                               return 0;
20958 +                       } else
20959 +                               msg = " Probably buggy MP table.";
20960 +               } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
20961 +                       msg = "";
20962 +               else
20963 +                       msg = " Please try using pci=biosirq.";
20964 +
20965 +               /* With IDE legacy devices the IRQ lookup failure is not a problem.. */
20966 +               if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
20967 +                       return 0;
20968 +
20969 +               printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
20970 +                      'A' + pin, pci_name(dev), msg);
20971 +       }
20972 +       return 0;
20973 +}
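The IDE exception above keys off the class code's programming-interface byte: bit 0 means the primary channel is in native-PCI mode and bit 2 means the secondary channel is, so a controller with both bits clear runs in legacy mode on ISA IRQ 14/15 and does not need a PCI interrupt. A small sketch of that test; the class codes below are examples, not values from this patch:

	#include <stdio.h>

	/* Mirrors the "!(dev->class & 0x5)" test: the low byte of the class code is
	 * the IDE prog-if; bits 0 and 2 select native-PCI mode per channel. */
	static int ide_needs_pci_irq(unsigned int class_code)
	{
		return (class_code & 0x5) != 0;
	}

	int main(void)
	{
		printf("%d\n", ide_needs_pci_irq(0x01018a));	/* 0: both channels in legacy mode */
		printf("%d\n", ide_needs_pci_irq(0x01018f));	/* 1: both channels in native-PCI mode */
		return 0;
	}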
20974 +
20975 +int pci_vector_resources(int last, int nr_released)
20976 +{
20977 +       int count = nr_released;
20978 +
20979 +       int next = last;
20980 +       int offset = (last % 8);
20981 +
20982 +       while (next < FIRST_SYSTEM_VECTOR) {
20983 +               next += 8;
20984 +#ifdef CONFIG_X86_64
20985 +               if (next == IA32_SYSCALL_VECTOR)
20986 +                       continue;
20987 +#else
20988 +               if (next == SYSCALL_VECTOR)
20989 +                       continue;
20990 +#endif
20991 +               count++;
20992 +               if (next >= FIRST_SYSTEM_VECTOR) {
20993 +                       if (offset%8) {
20994 +                               next = FIRST_DEVICE_VECTOR + offset;
20995 +                               offset++;
20996 +                               continue;
20997 +                       }
20998 +                       count--;
20999 +               }
21000 +       }
21001 +
21002 +       return count;
21003 +}
21004 diff -urNp linux-2.6/arch/i386/pci/Makefile new/arch/i386/pci/Makefile
21005 --- linux-2.6/arch/i386/pci/Makefile    2006-07-03 14:14:15.000000000 +0200
21006 +++ new/arch/i386/pci/Makefile  2006-05-09 12:32:37.000000000 +0200
21007 @@ -4,6 +4,10 @@ obj-$(CONFIG_PCI_BIOS)         += pcbios.o
21008  obj-$(CONFIG_PCI_MMCONFIG)     += mmconfig.o direct.o
21009  obj-$(CONFIG_PCI_DIRECT)       += direct.o
21010  
21011 +# pcifront should be after pcbios.o, mmconfig.o, and direct.o as it should only
21012 +# take over if direct access to the PCI bus is unavailable
21013 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront.o
21014 +
21015  pci-y                          := fixup.o
21016  pci-$(CONFIG_ACPI)             += acpi.o
21017  pci-y                          += legacy.o irq.o
21018 @@ -12,3 +16,8 @@ pci-$(CONFIG_X86_VISWS)               := visws.o fixu
21019  pci-$(CONFIG_X86_NUMAQ)                := numa.o irq.o
21020  
21021  obj-y                          += $(pci-y) common.o
21022 +
21023 +ifdef CONFIG_XEN
21024 +include $(srctree)/scripts/Makefile.xen
21025 +obj-y := $(call cherrypickxen, $(obj-y))
21026 +endif
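The cherrypickxen call pulled in from scripts/Makefile.xen is used to substitute Xen-flavoured objects for the ordinary ones. The sketch below is only a conceptual model of that kind of substitution, written in C (the real helper is make syntax elsewhere in this patch, and the variant table here is invented for illustration):

	#include <stdio.h>
	#include <string.h>

	/* Pretend list of objects for which a -xen variant exists. */
	static const char *xen_variants[] = { "irq-xen.o", "fixup-xen.o", NULL };

	/* Return the -xen flavour of obj if one is available, otherwise obj itself. */
	static const char *cherrypick(const char *obj)
	{
		char want[64];
		int i;

		snprintf(want, sizeof(want), "%.*s-xen.o", (int)(strlen(obj) - 2), obj);
		for (i = 0; xen_variants[i]; i++)
			if (!strcmp(xen_variants[i], want))
				return xen_variants[i];
		return obj;
	}

	int main(void)
	{
		printf("%s\n", cherrypick("irq.o"));	/* irq-xen.o */
		printf("%s\n", cherrypick("legacy.o"));	/* legacy.o */
		return 0;
	}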
21027 diff -urNp linux-2.6/arch/i386/pci/pcifront.c new/arch/i386/pci/pcifront.c
21028 --- linux-2.6/arch/i386/pci/pcifront.c  1970-01-01 01:00:00.000000000 +0100
21029 +++ new/arch/i386/pci/pcifront.c        2006-05-09 12:32:37.000000000 +0200
21030 @@ -0,0 +1,55 @@
21031 +/*
21032 + * PCI Frontend Stub - puts some "dummy" functions into the Linux x86 PCI core
21033 + *                     to support the Xen PCI Frontend's operation
21034 + *
21035 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
21036 + */
21037 +#include <linux/module.h>
21038 +#include <linux/init.h>
21039 +#include <linux/pci.h>
21040 +#include <asm/acpi.h>
21041 +#include "pci.h"
21042 +
21043 +static int pcifront_enable_irq(struct pci_dev *dev)
21044 +{
21045 +       u8 irq;
21046 +       pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
21047 +       dev->irq = irq;
21048 +
21049 +       return 0;
21050 +}
21051 +
21052 +extern u8 pci_cache_line_size;
21053 +
21054 +static int __init pcifront_x86_stub_init(void)
21055 +{
21056 +       struct cpuinfo_x86 *c = &boot_cpu_data;
21057 +
21058 +       /* Only install our method if we haven't found real hardware already */
21059 +       if (raw_pci_ops)
21060 +               return 0;
21061 +
21062 +       printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
21063 +
21064 +       /* Copied from arch/i386/pci/common.c */
21065 +       pci_cache_line_size = 32 >> 2;
21066 +       if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
21067 +               pci_cache_line_size = 64 >> 2;  /* K7 & K8 */
21068 +       else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
21069 +               pci_cache_line_size = 128 >> 2; /* P4 */
21070 +
21071 +       /* On x86, we need to disable the normal IRQ routing table and
21072 +        * just ask the backend
21073 +        */
21074 +       pcibios_enable_irq = pcifront_enable_irq;
21075 +       pcibios_disable_irq = NULL;
21076 +
21077 +#ifdef CONFIG_ACPI
21078 +       /* Keep ACPI out of the picture */
21079 +       acpi_noirq = 1;
21080 +#endif
21081 +
21082 +       return 0;
21083 +}
21084 +
21085 +arch_initcall(pcifront_x86_stub_init);
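The cache-line values copied from common.c above are stored right-shifted by two because the PCI cache line size register counts 32-bit dwords, not bytes. A tiny illustration of that encoding:

	#include <stdio.h>

	/* PCI_CACHE_LINE_SIZE is programmed in 32-bit dwords, hence "bytes >> 2". */
	int main(void)
	{
		unsigned int bytes[] = { 32, 64, 128 };	/* default, K7/K8, P4 as above */
		unsigned int i;

		for (i = 0; i < 3; i++)
			printf("%3u-byte line -> register value %u\n", bytes[i], bytes[i] >> 2);
		return 0;
	}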
21086 diff -urNp linux-2.6/arch/i386/power/Makefile new/arch/i386/power/Makefile
21087 --- linux-2.6/arch/i386/power/Makefile  2006-07-03 14:14:15.000000000 +0200
21088 +++ new/arch/i386/power/Makefile        2006-05-09 12:32:37.000000000 +0200
21089 @@ -1,2 +1,4 @@
21090 -obj-$(CONFIG_PM)               += cpu.o
21091 +obj-$(CONFIG_PM_LEGACY)                += cpu.o
21092 +obj-$(CONFIG_SOFTWARE_SUSPEND) += cpu.o
21093 +obj-$(CONFIG_ACPI_SLEEP)       += cpu.o
21094  obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o
21095 diff -urNp linux-2.6/arch/ia64/hp/sim/Makefile new/arch/ia64/hp/sim/Makefile
21096 --- linux-2.6/arch/ia64/hp/sim/Makefile 2006-07-03 14:14:15.000000000 +0200
21097 +++ new/arch/ia64/hp/sim/Makefile       2006-05-09 12:32:37.000000000 +0200
21098 @@ -14,3 +14,5 @@ obj-$(CONFIG_HP_SIMETH)       += simeth.o
21099  obj-$(CONFIG_HP_SIMSERIAL) += simserial.o
21100  obj-$(CONFIG_HP_SIMSERIAL_CONSOLE) += hpsim_console.o
21101  obj-$(CONFIG_HP_SIMSCSI) += simscsi.o
21102 +obj-$(CONFIG_XEN) += simserial.o
21103 +obj-$(CONFIG_XEN) += hpsim_console.o
21104 diff -urNp linux-2.6/arch/ia64/Kconfig new/arch/ia64/Kconfig
21105 --- linux-2.6/arch/ia64/Kconfig 2006-07-03 14:14:15.000000000 +0200
21106 +++ new/arch/ia64/Kconfig       2006-06-28 14:32:13.000000000 +0200
21107 @@ -58,6 +58,27 @@ config GENERIC_IOMAP
21108         bool
21109         default y
21110  
21111 +config XEN
21112 +       bool "Xen hypervisor support"
21113 +       default y
21114 +       help
21115 +         Enable Xen hypervisor support.  Resulting kernel runs
21116 +         both as a guest OS on Xen and natively on hardware.
21117 +
21118 +config XEN_IA64_DOM0_VP
21119 +       bool "dom0 vp model"
21120 +       depends on XEN
21121 +       default n
21122 +       help
21123 +         dom0 virtual physical (VP) memory model
21124 +
21125 +config XEN_IA64_DOM0_NON_VP
21126 +       bool
21127 +       depends on XEN && !XEN_IA64_DOM0_VP
21128 +       default y
21129 +       help
21130 +         dom0 physical == machine (P=M) memory model
21131 +
21132  config SCHED_NO_NO_OMIT_FRAME_POINTER
21133         bool
21134         default y
21135 @@ -506,3 +527,40 @@ source "arch/ia64/Kconfig.debug"
21136  source "security/Kconfig"
21137  
21138  source "crypto/Kconfig"
21139 +
21140 +#
21141 +# override default values of drivers/xen/Kconfig
21142 +#
21143 +if XEN
21144 +config XEN_UTIL
21145 +       default n if XEN_IA64_DOM0_VP
21146 +
21147 +config HAVE_ARCH_ALLOC_SKB
21148 +       default n if !XEN_IA64_DOM0_VP
21149 +
21150 +config HAVE_ARCH_DEV_ALLOC_SKB
21151 +       default n if !XEN_IA64_DOM0_VP
21152 +
21153 +config XEN_BALLOON
21154 +       default n if !XEN_IA64_DOM0_VP
21155 +
21156 +config XEN_SKBUFF
21157 +       default n if !XEN_IA64_DOM0_VP
21158 +
21159 +config XEN_NETDEV_BACKEND
21160 +       default n if !XEN_IA64_DOM0_VP
21161 +
21162 +config XEN_NETDEV_FRONTEND
21163 +       default n if !XEN_IA64_DOM0_VP
21164 +
21165 +config XEN_DEVMEM
21166 +       default n
21167 +
21168 +config XEN_REBOOT
21169 +       default n
21170 +
21171 +config XEN_SMPBOOT
21172 +       default n
21173 +endif
21174 +
21175 +source "drivers/xen/Kconfig"
21176 diff -urNp linux-2.6/arch/ia64/kernel/entry.S new/arch/ia64/kernel/entry.S
21177 --- linux-2.6/arch/ia64/kernel/entry.S  2006-07-03 14:14:15.000000000 +0200
21178 +++ new/arch/ia64/kernel/entry.S        2006-05-09 12:32:38.000000000 +0200
21179 @@ -181,7 +181,7 @@ END(sys_clone)
21180   *     called.  The code starting at .map relies on this.  The rest of the code
21181   *     doesn't care about the interrupt masking status.
21182   */
21183 -GLOBAL_ENTRY(ia64_switch_to)
21184 +GLOBAL_ENTRY(__ia64_switch_to)
21185         .prologue
21186         alloc r16=ar.pfs,1,0,0,0
21187         DO_SAVE_SWITCH_STACK
21188 @@ -235,7 +235,7 @@ GLOBAL_ENTRY(ia64_switch_to)
21189         ;;
21190         srlz.d
21191         br.cond.sptk .done
21192 -END(ia64_switch_to)
21193 +END(__ia64_switch_to)
21194  
21195  /*
21196   * Note that interrupts are enabled during save_switch_stack and load_switch_stack.  This
21197 @@ -376,7 +376,7 @@ END(save_switch_stack)
21198   *     - b7 holds address to return to
21199   *     - must not touch r8-r11
21200   */
21201 -ENTRY(load_switch_stack)
21202 +GLOBAL_ENTRY(load_switch_stack)
21203         .prologue
21204         .altrp b7
21205  
21206 @@ -511,7 +511,7 @@ END(clone)
21207          * because some system calls (such as ia64_execve) directly
21208          * manipulate ar.pfs.
21209          */
21210 -GLOBAL_ENTRY(ia64_trace_syscall)
21211 +GLOBAL_ENTRY(__ia64_trace_syscall)
21212         PT_REGS_UNWIND_INFO(0)
21213         /*
21214          * We need to preserve the scratch registers f6-f11 in case the system
21215 @@ -583,7 +583,7 @@ strace_error:
21216  (p6)   mov r10=-1
21217  (p6)   mov r8=r9
21218         br.cond.sptk .strace_save_retval
21219 -END(ia64_trace_syscall)
21220 +END(__ia64_trace_syscall)
21221  
21222         /*
21223          * When traced and returning from sigreturn, we invoke syscall_trace but then
21224 @@ -636,8 +636,11 @@ GLOBAL_ENTRY(ia64_ret_from_syscall)
21225         adds r2=PT(R8)+16,sp                    // r2 = &pt_regs.r8
21226         mov r10=r0                              // clear error indication in r10
21227  (p7)   br.cond.spnt handle_syscall_error       // handle potential syscall failure
21228 +       ;;
21229 +       // don't fall through, ia64_leave_syscall may be #define'd
21230 +       br.cond.sptk.few ia64_leave_syscall
21231 +       ;;
21232  END(ia64_ret_from_syscall)
21233 -       // fall through
21234  /*
21235   * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
21236   *     need to switch to bank 0 and doesn't restore the scratch registers.
21237 @@ -682,7 +685,7 @@ END(ia64_ret_from_syscall)
21238   *           ar.csd: cleared
21239   *           ar.ssd: cleared
21240   */
21241 -ENTRY(ia64_leave_syscall)
21242 +GLOBAL_ENTRY(__ia64_leave_syscall)
21243         PT_REGS_UNWIND_INFO(0)
21244         /*
21245          * work.need_resched etc. mustn't get changed by this CPU before it returns to
21246 @@ -790,7 +793,7 @@ ENTRY(ia64_leave_syscall)
21247         mov.m ar.ssd=r0                 // M2   clear ar.ssd
21248         mov f11=f0                      // F    clear f11
21249         br.cond.sptk.many rbs_switch    // B
21250 -END(ia64_leave_syscall)
21251 +END(__ia64_leave_syscall)
21252  
21253  #ifdef CONFIG_IA32_SUPPORT
21254  GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
21255 @@ -802,10 +805,13 @@ GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
21256         st8.spill [r2]=r8       // store return value in slot for r8 and set unat bit
21257         .mem.offset 8,0
21258         st8.spill [r3]=r0       // clear error indication in slot for r10 and set unat bit
21259 +       ;;
21260 +       // don't fall through, ia64_leave_kernel may be #define'd
21261 +       br.cond.sptk.few ia64_leave_kernel
21262 +       ;;
21263  END(ia64_ret_from_ia32_execve)
21264 -       // fall through
21265  #endif /* CONFIG_IA32_SUPPORT */
21266 -GLOBAL_ENTRY(ia64_leave_kernel)
21267 +GLOBAL_ENTRY(__ia64_leave_kernel)
21268         PT_REGS_UNWIND_INFO(0)
21269         /*
21270          * work.need_resched etc. mustn't get changed by this CPU before it returns to
21271 @@ -1136,7 +1142,7 @@ skip_rbs_switch:
21272         ld8 r10=[r3]
21273         br.cond.sptk.many .work_processed_syscall       // re-check
21274  
21275 -END(ia64_leave_kernel)
21276 +END(__ia64_leave_kernel)
21277  
21278  ENTRY(handle_syscall_error)
21279         /*
21280 @@ -1176,7 +1182,7 @@ END(ia64_invoke_schedule_tail)
21281          * be set up by the caller.  We declare 8 input registers so the system call
21282          * args get preserved, in case we need to restart a system call.
21283          */
21284 -ENTRY(notify_resume_user)
21285 +GLOBAL_ENTRY(notify_resume_user)
21286         .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
21287         alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
21288         mov r9=ar.unat
21289 @@ -1264,7 +1270,7 @@ ENTRY(sys_rt_sigreturn)
21290         adds sp=16,sp
21291         ;;
21292         ld8 r9=[sp]                             // load new ar.unat
21293 -       mov.sptk b7=r8,ia64_leave_kernel
21294 +       mov.sptk b7=r8,__ia64_leave_kernel
21295         ;;
21296         mov ar.unat=r9
21297         br.many b7
21298 diff -urNp linux-2.6/arch/ia64/kernel/gate.lds.S new/arch/ia64/kernel/gate.lds.S
21299 --- linux-2.6/arch/ia64/kernel/gate.lds.S       2006-07-03 14:14:15.000000000 +0200
21300 +++ new/arch/ia64/kernel/gate.lds.S     1970-01-01 01:00:00.000000000 +0100
21301 @@ -1,96 +0,0 @@
21302 -/*
21303 - * Linker script for gate DSO.  The gate pages are an ELF shared object prelinked to its
21304 - * virtual address, with only one read-only segment and one execute-only segment (both fit
21305 - * in one page).  This script controls its layout.
21306 - */
21307 -
21308 -#include <linux/config.h>
21309 -
21310 -#include <asm/system.h>
21311 -
21312 -SECTIONS
21313 -{
21314 -  . = GATE_ADDR + SIZEOF_HEADERS;
21315 -
21316 -  .hash                                : { *(.hash) }                          :readable
21317 -  .dynsym                      : { *(.dynsym) }
21318 -  .dynstr                      : { *(.dynstr) }
21319 -  .gnu.version                 : { *(.gnu.version) }
21320 -  .gnu.version_d               : { *(.gnu.version_d) }
21321 -  .gnu.version_r               : { *(.gnu.version_r) }
21322 -  .dynamic                     : { *(.dynamic) }                       :readable :dynamic
21323 -
21324 -  /*
21325 -   * This linker script is used both with -r and with -shared.  For the layouts to match,
21326 -   * we need to skip more than enough space for the dynamic symbol table et al.  If this
21327 -   * amount is insufficient, ld -shared will barf.  Just increase it here.
21328 -   */
21329 -  . = GATE_ADDR + 0x500;
21330 -
21331 -  .data.patch                  : {
21332 -                                   __start_gate_mckinley_e9_patchlist = .;
21333 -                                   *(.data.patch.mckinley_e9)
21334 -                                   __end_gate_mckinley_e9_patchlist = .;
21335 -
21336 -                                   __start_gate_vtop_patchlist = .;
21337 -                                   *(.data.patch.vtop)
21338 -                                   __end_gate_vtop_patchlist = .;
21339 -
21340 -                                   __start_gate_fsyscall_patchlist = .;
21341 -                                   *(.data.patch.fsyscall_table)
21342 -                                   __end_gate_fsyscall_patchlist = .;
21343 -
21344 -                                   __start_gate_brl_fsys_bubble_down_patchlist = .;
21345 -                                   *(.data.patch.brl_fsys_bubble_down)
21346 -                                   __end_gate_brl_fsys_bubble_down_patchlist = .;
21347 -  }                                                                    :readable
21348 -  .IA_64.unwind_info           : { *(.IA_64.unwind_info*) }
21349 -  .IA_64.unwind                        : { *(.IA_64.unwind*) }                 :readable :unwind
21350 -#ifdef HAVE_BUGGY_SEGREL
21351 -  .text (GATE_ADDR + PAGE_SIZE)        : { *(.text) *(.text.*) }               :readable
21352 -#else
21353 -  . = ALIGN (PERCPU_PAGE_SIZE) + (. & (PERCPU_PAGE_SIZE - 1));
21354 -  .text                                : { *(.text) *(.text.*) }               :epc
21355 -#endif
21356 -
21357 -  /DISCARD/                    : {
21358 -       *(.got.plt) *(.got)
21359 -       *(.data .data.* .gnu.linkonce.d.*)
21360 -       *(.dynbss)
21361 -       *(.bss .bss.* .gnu.linkonce.b.*)
21362 -       *(__ex_table)
21363 -       *(__mca_table)
21364 -  }
21365 -}
21366 -
21367 -/*
21368 - * We must supply the ELF program headers explicitly to get just one
21369 - * PT_LOAD segment, and set the flags explicitly to make segments read-only.
21370 - */
21371 -PHDRS
21372 -{
21373 -  readable  PT_LOAD    FILEHDR PHDRS   FLAGS(4);       /* PF_R */
21374 -#ifndef HAVE_BUGGY_SEGREL
21375 -  epc      PT_LOAD     FILEHDR PHDRS   FLAGS(1);       /* PF_X */
21376 -#endif
21377 -  dynamic   PT_DYNAMIC                 FLAGS(4);       /* PF_R */
21378 -  unwind    0x70000001; /* PT_IA_64_UNWIND, but ld doesn't match the name */
21379 -}
21380 -
21381 -/*
21382 - * This controls what symbols we export from the DSO.
21383 - */
21384 -VERSION
21385 -{
21386 -  LINUX_2.5 {
21387 -    global:
21388 -       __kernel_syscall_via_break;
21389 -       __kernel_syscall_via_epc;
21390 -       __kernel_sigtramp;
21391 -
21392 -    local: *;
21393 -  };
21394 -}
21395 -
21396 -/* The ELF entry point can be used to set the AT_SYSINFO value.  */
21397 -ENTRY(__kernel_syscall_via_epc)
21398 diff -urNp linux-2.6/arch/ia64/kernel/gate.S new/arch/ia64/kernel/gate.S
21399 --- linux-2.6/arch/ia64/kernel/gate.S   2006-07-03 14:14:15.000000000 +0200
21400 +++ new/arch/ia64/kernel/gate.S 1970-01-01 01:00:00.000000000 +0100
21401 @@ -1,376 +0,0 @@
21402 -/*
21403 - * This file contains the code that gets mapped at the upper end of each task's text
21404 - * region.  For now, it contains the signal trampoline code only.
21405 - *
21406 - * Copyright (C) 1999-2003 Hewlett-Packard Co
21407 - *     David Mosberger-Tang <davidm@hpl.hp.com>
21408 - */
21409 -
21410 -#include <linux/config.h>
21411 -
21412 -#include <asm/asmmacro.h>
21413 -#include <asm/errno.h>
21414 -#include <asm/asm-offsets.h>
21415 -#include <asm/sigcontext.h>
21416 -#include <asm/system.h>
21417 -#include <asm/unistd.h>
21418 -
21419 -/*
21420 - * We can't easily refer to symbols inside the kernel.  To avoid full runtime relocation,
21421 - * complications with the linker (which likes to create PLT stubs for branches
21422 - * to targets outside the shared object) and to avoid multi-phase kernel builds, we
21423 - * simply create minimalistic "patch lists" in special ELF sections.
21424 - */
21425 -       .section ".data.patch.fsyscall_table", "a"
21426 -       .previous
21427 -#define LOAD_FSYSCALL_TABLE(reg)                       \
21428 -[1:]   movl reg=0;                                     \
21429 -       .xdata4 ".data.patch.fsyscall_table", 1b-.
21430 -
21431 -       .section ".data.patch.brl_fsys_bubble_down", "a"
21432 -       .previous
21433 -#define BRL_COND_FSYS_BUBBLE_DOWN(pr)                  \
21434 -[1:](pr)brl.cond.sptk 0;                               \
21435 -       .xdata4 ".data.patch.brl_fsys_bubble_down", 1b-.
21436 -
21437 -GLOBAL_ENTRY(__kernel_syscall_via_break)
21438 -       .prologue
21439 -       .altrp b6
21440 -       .body
21441 -       /*
21442 -        * Note: for (fast) syscall restart to work, the break instruction must be
21443 -        *       the first one in the bundle addressed by syscall_via_break.
21444 -        */
21445 -{ .mib
21446 -       break 0x100000
21447 -       nop.i 0
21448 -       br.ret.sptk.many b6
21449 -}
21450 -END(__kernel_syscall_via_break)
21451 -
21452 -/*
21453 - * On entry:
21454 - *     r11 = saved ar.pfs
21455 - *     r15 = system call #
21456 - *     b0  = saved return address
21457 - *     b6  = return address
21458 - * On exit:
21459 - *     r11 = saved ar.pfs
21460 - *     r15 = system call #
21461 - *     b0  = saved return address
21462 - *     all other "scratch" registers:  undefined
21463 - *     all "preserved" registers:      same as on entry
21464 - */
21465 -
21466 -GLOBAL_ENTRY(__kernel_syscall_via_epc)
21467 -       .prologue
21468 -       .altrp b6
21469 -       .body
21470 -{
21471 -       /*
21472 -        * Note: the kernel cannot assume that the first two instructions in this
21473 -        * bundle get executed.  The remaining code must be safe even if
21474 -        * they do not get executed.
21475 -        */
21476 -       adds r17=-1024,r15                      // A
21477 -       mov r10=0                               // A    default to successful syscall execution
21478 -       epc                                     // B    causes split-issue
21479 -}
21480 -       ;;
21481 -       rsm psr.be | psr.i                      // M2 (5 cyc to srlz.d)
21482 -       LOAD_FSYSCALL_TABLE(r14)                // X
21483 -       ;;
21484 -       mov r16=IA64_KR(CURRENT)                // M2 (12 cyc)
21485 -       shladd r18=r17,3,r14                    // A
21486 -       mov r19=NR_syscalls-1                   // A
21487 -       ;;
21488 -       lfetch [r18]                            // M0|1
21489 -       mov r29=psr                             // M2 (12 cyc)
21490 -       // If r17 is a NaT, p6 will be zero
21491 -       cmp.geu p6,p7=r19,r17                   // A    (sysnr > 0 && sysnr < 1024+NR_syscalls)?
21492 -       ;;
21493 -       mov r21=ar.fpsr                         // M2 (12 cyc)
21494 -       tnat.nz p10,p9=r15                      // I0
21495 -       mov.i r26=ar.pfs                        // I0 (would stall anyhow due to srlz.d...)
21496 -       ;;
21497 -       srlz.d                                  // M0 (forces split-issue) ensure PSR.BE==0
21498 -(p6)   ld8 r18=[r18]                           // M0|1
21499 -       nop.i 0
21500 -       ;;
21501 -       nop.m 0
21502 -(p6)   tbit.z.unc p8,p0=r18,0                  // I0 (dual-issues with "mov b7=r18"!)
21503 -       nop.i 0
21504 -       ;;
21505 -(p8)   ssm psr.i
21506 -(p6)   mov b7=r18                              // I0
21507 -(p8)   br.dptk.many b7                         // B
21508 -
21509 -       mov r27=ar.rsc                          // M2 (12 cyc)
21510 -/*
21511 - * brl.cond doesn't work as intended because the linker would convert this branch
21512 - * into a branch to a PLT.  Perhaps there will be a way to avoid this with some
21513 - * future version of the linker.  In the meantime, we just use an indirect branch
21514 - * instead.
21515 - */
21516 -#ifdef CONFIG_ITANIUM
21517 -(p6)   add r14=-8,r14                          // r14 <- addr of fsys_bubble_down entry
21518 -       ;;
21519 -(p6)   ld8 r14=[r14]                           // r14 <- fsys_bubble_down
21520 -       ;;
21521 -(p6)   mov b7=r14
21522 -(p6)   br.sptk.many b7
21523 -#else
21524 -       BRL_COND_FSYS_BUBBLE_DOWN(p6)
21525 -#endif
21526 -       ssm psr.i
21527 -       mov r10=-1
21528 -(p10)  mov r8=EINVAL
21529 -(p9)   mov r8=ENOSYS
21530 -       FSYS_RETURN
21531 -END(__kernel_syscall_via_epc)
21532 -
21533 -#      define ARG0_OFF         (16 + IA64_SIGFRAME_ARG0_OFFSET)
21534 -#      define ARG1_OFF         (16 + IA64_SIGFRAME_ARG1_OFFSET)
21535 -#      define ARG2_OFF         (16 + IA64_SIGFRAME_ARG2_OFFSET)
21536 -#      define SIGHANDLER_OFF   (16 + IA64_SIGFRAME_HANDLER_OFFSET)
21537 -#      define SIGCONTEXT_OFF   (16 + IA64_SIGFRAME_SIGCONTEXT_OFFSET)
21538 -
21539 -#      define FLAGS_OFF        IA64_SIGCONTEXT_FLAGS_OFFSET
21540 -#      define CFM_OFF          IA64_SIGCONTEXT_CFM_OFFSET
21541 -#      define FR6_OFF          IA64_SIGCONTEXT_FR6_OFFSET
21542 -#      define BSP_OFF          IA64_SIGCONTEXT_AR_BSP_OFFSET
21543 -#      define RNAT_OFF         IA64_SIGCONTEXT_AR_RNAT_OFFSET
21544 -#      define UNAT_OFF         IA64_SIGCONTEXT_AR_UNAT_OFFSET
21545 -#      define FPSR_OFF         IA64_SIGCONTEXT_AR_FPSR_OFFSET
21546 -#      define PR_OFF           IA64_SIGCONTEXT_PR_OFFSET
21547 -#      define RP_OFF           IA64_SIGCONTEXT_IP_OFFSET
21548 -#      define SP_OFF           IA64_SIGCONTEXT_R12_OFFSET
21549 -#      define RBS_BASE_OFF     IA64_SIGCONTEXT_RBS_BASE_OFFSET
21550 -#      define LOADRS_OFF       IA64_SIGCONTEXT_LOADRS_OFFSET
21551 -#      define base0            r2
21552 -#      define base1            r3
21553 -       /*
21554 -        * When we get here, the memory stack looks like this:
21555 -        *
21556 -        *   +===============================+
21557 -                *   |                               |
21558 -                *   //     struct sigframe          //
21559 -                *   |                               |
21560 -        *   +-------------------------------+ <-- sp+16
21561 -        *   |      16 byte of scratch       |
21562 -        *   |            space              |
21563 -        *   +-------------------------------+ <-- sp
21564 -        *
21565 -        * The register stack looks _exactly_ the way it looked at the time the signal
21566 -        * occurred.  In other words, we're treading on a potential mine-field: each
21567 -        * incoming general register may be a NaT value (including sp, in which case the
21568 -        * process ends up dying with a SIGSEGV).
21569 -        *
21570 -        * The first thing need to do is a cover to get the registers onto the backing
21571 -        * store.  Once that is done, we invoke the signal handler which may modify some
21572 -        * of the machine state.  After returning from the signal handler, we return
21573 -        * control to the previous context by executing a sigreturn system call.  A signal
21574 -        * handler may call the rt_sigreturn() function to directly return to a given
21575 -        * sigcontext.  However, the user-level sigreturn() needs to do much more than
21576 -        * calling the rt_sigreturn() system call as it needs to unwind the stack to
21577 -        * restore preserved registers that may have been saved on the signal handler's
21578 -        * call stack.
21579 -        */
21580 -
21581 -#define SIGTRAMP_SAVES                                                                         \
21582 -       .unwabi 3, 's';         /* mark this as a sigtramp handler (saves scratch regs) */      \
21583 -       .unwabi @svr4, 's'; /* backwards compatibility with old unwinders (remove in v2.7) */   \
21584 -       .savesp ar.unat, UNAT_OFF+SIGCONTEXT_OFF;                                               \
21585 -       .savesp ar.fpsr, FPSR_OFF+SIGCONTEXT_OFF;                                               \
21586 -       .savesp pr, PR_OFF+SIGCONTEXT_OFF;                                                      \
21587 -       .savesp rp, RP_OFF+SIGCONTEXT_OFF;                                                      \
21588 -       .savesp ar.pfs, CFM_OFF+SIGCONTEXT_OFF;                                                 \
21589 -       .vframesp SP_OFF+SIGCONTEXT_OFF
21590 -
21591 -GLOBAL_ENTRY(__kernel_sigtramp)
21592 -       // describe the state that is active when we get here:
21593 -       .prologue
21594 -       SIGTRAMP_SAVES
21595 -       .body
21596 -
21597 -       .label_state 1
21598 -
21599 -       adds base0=SIGHANDLER_OFF,sp
21600 -       adds base1=RBS_BASE_OFF+SIGCONTEXT_OFF,sp
21601 -       br.call.sptk.many rp=1f
21602 -1:
21603 -       ld8 r17=[base0],(ARG0_OFF-SIGHANDLER_OFF)       // get pointer to signal handler's plabel
21604 -       ld8 r15=[base1]                                 // get address of new RBS base (or NULL)
21605 -       cover                           // push args in interrupted frame onto backing store
21606 -       ;;
21607 -       cmp.ne p1,p0=r15,r0             // do we need to switch rbs? (note: pr is saved by kernel)
21608 -       mov.m r9=ar.bsp                 // fetch ar.bsp
21609 -       .spillsp.p p1, ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
21610 -(p1)   br.cond.spnt setup_rbs          // yup -> (clobbers p8, r14-r16, and r18-r20)
21611 -back_from_setup_rbs:
21612 -       alloc r8=ar.pfs,0,0,3,0
21613 -       ld8 out0=[base0],16             // load arg0 (signum)
21614 -       adds base1=(ARG1_OFF-(RBS_BASE_OFF+SIGCONTEXT_OFF)),base1
21615 -       ;;
21616 -       ld8 out1=[base1]                // load arg1 (siginfop)
21617 -       ld8 r10=[r17],8                 // get signal handler entry point
21618 -       ;;
21619 -       ld8 out2=[base0]                // load arg2 (sigcontextp)
21620 -       ld8 gp=[r17]                    // get signal handler's global pointer
21621 -       adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp
21622 -       ;;
21623 -       .spillsp ar.bsp, BSP_OFF+SIGCONTEXT_OFF
21624 -       st8 [base0]=r9                  // save sc_ar_bsp
21625 -       adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp
21626 -       adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp
21627 -       ;;
21628 -       stf.spill [base0]=f6,32
21629 -       stf.spill [base1]=f7,32
21630 -       ;;
21631 -       stf.spill [base0]=f8,32
21632 -       stf.spill [base1]=f9,32
21633 -       mov b6=r10
21634 -       ;;
21635 -       stf.spill [base0]=f10,32
21636 -       stf.spill [base1]=f11,32
21637 -       ;;
21638 -       stf.spill [base0]=f12,32
21639 -       stf.spill [base1]=f13,32
21640 -       ;;
21641 -       stf.spill [base0]=f14,32
21642 -       stf.spill [base1]=f15,32
21643 -       br.call.sptk.many rp=b6                 // call the signal handler
21644 -.ret0: adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp
21645 -       ;;
21646 -       ld8 r15=[base0]                         // fetch sc_ar_bsp
21647 -       mov r14=ar.bsp
21648 -       ;;
21649 -       cmp.ne p1,p0=r14,r15                    // do we need to restore the rbs?
21650 -(p1)   br.cond.spnt restore_rbs                // yup -> (clobbers r14-r18, f6 & f7)
21651 -       ;;
21652 -back_from_restore_rbs:
21653 -       adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp
21654 -       adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp
21655 -       ;;
21656 -       ldf.fill f6=[base0],32
21657 -       ldf.fill f7=[base1],32
21658 -       ;;
21659 -       ldf.fill f8=[base0],32
21660 -       ldf.fill f9=[base1],32
21661 -       ;;
21662 -       ldf.fill f10=[base0],32
21663 -       ldf.fill f11=[base1],32
21664 -       ;;
21665 -       ldf.fill f12=[base0],32
21666 -       ldf.fill f13=[base1],32
21667 -       ;;
21668 -       ldf.fill f14=[base0],32
21669 -       ldf.fill f15=[base1],32
21670 -       mov r15=__NR_rt_sigreturn
21671 -       .restore sp                             // pop .prologue
21672 -       break __BREAK_SYSCALL
21673 -
21674 -       .prologue
21675 -       SIGTRAMP_SAVES
21676 -setup_rbs:
21677 -       mov ar.rsc=0                            // put RSE into enforced lazy mode
21678 -       ;;
21679 -       .save ar.rnat, r19
21680 -       mov r19=ar.rnat                         // save RNaT before switching backing store area
21681 -       adds r14=(RNAT_OFF+SIGCONTEXT_OFF),sp
21682 -
21683 -       mov r18=ar.bspstore
21684 -       mov ar.bspstore=r15                     // switch over to new register backing store area
21685 -       ;;
21686 -
21687 -       .spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
21688 -       st8 [r14]=r19                           // save sc_ar_rnat
21689 -       .body
21690 -       mov.m r16=ar.bsp                        // sc_loadrs <- (new bsp - new bspstore) << 16
21691 -       adds r14=(LOADRS_OFF+SIGCONTEXT_OFF),sp
21692 -       ;;
21693 -       invala
21694 -       sub r15=r16,r15
21695 -       extr.u r20=r18,3,6
21696 -       ;;
21697 -       mov ar.rsc=0xf                          // set RSE into eager mode, pl 3
21698 -       cmp.eq p8,p0=63,r20
21699 -       shl r15=r15,16
21700 -       ;;
21701 -       st8 [r14]=r15                           // save sc_loadrs
21702 -(p8)   st8 [r18]=r19           // if bspstore points at RNaT slot, store RNaT there now
21703 -       .restore sp                             // pop .prologue
21704 -       br.cond.sptk back_from_setup_rbs
21705 -
21706 -       .prologue
21707 -       SIGTRAMP_SAVES
21708 -       .spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
21709 -       .body
21710 -restore_rbs:
21711 -       // On input:
21712 -       //      r14 = bsp1 (bsp at the time of return from signal handler)
21713 -       //      r15 = bsp0 (bsp at the time the signal occurred)
21714 -       //
21715 -       // Here, we need to calculate bspstore0, the value that ar.bspstore needs
21716 -       // to be set to, based on bsp0 and the size of the dirty partition on
21717 -       // the alternate stack (sc_loadrs >> 16).  This can be done with the
21718 -       // following algorithm:
21719 -       //
21720 -       //  bspstore0 = rse_skip_regs(bsp0, -rse_num_regs(bsp1 - (loadrs >> 19), bsp1));
21721 -       //
21722 -       // This is what the code below does.
21723 -       //
21724 -       alloc r2=ar.pfs,0,0,0,0                 // alloc null frame
21725 -       adds r16=(LOADRS_OFF+SIGCONTEXT_OFF),sp
21726 -       adds r18=(RNAT_OFF+SIGCONTEXT_OFF),sp
21727 -       ;;
21728 -       ld8 r17=[r16]
21729 -       ld8 r16=[r18]                   // get new rnat
21730 -       extr.u r18=r15,3,6      // r18 <- rse_slot_num(bsp0)
21731 -       ;;
21732 -       mov ar.rsc=r17                  // put RSE into enforced lazy mode
21733 -       shr.u r17=r17,16
21734 -       ;;
21735 -       sub r14=r14,r17         // r14 (bspstore1) <- bsp1 - (sc_loadrs >> 16)
21736 -       shr.u r17=r17,3         // r17 <- (sc_loadrs >> 19)
21737 -       ;;
21738 -       loadrs                  // restore dirty partition
21739 -       extr.u r14=r14,3,6      // r14 <- rse_slot_num(bspstore1)
21740 -       ;;
21741 -       add r14=r14,r17         // r14 <- rse_slot_num(bspstore1) + (sc_loadrs >> 19)
21742 -       ;;
21743 -       shr.u r14=r14,6         // r14 <- (rse_slot_num(bspstore1) + (sc_loadrs >> 19))/0x40
21744 -       ;;
21745 -       sub r14=r14,r17         // r14 <- -rse_num_regs(bspstore1, bsp1)
21746 -       movl r17=0x8208208208208209
21747 -       ;;
21748 -       add r18=r18,r14         // r18 (delta) <- rse_slot_num(bsp0) - rse_num_regs(bspstore1,bsp1)
21749 -       setf.sig f7=r17
21750 -       cmp.lt p7,p0=r14,r0     // p7 <- (r14 < 0)?
21751 -       ;;
21752 -(p7)   adds r18=-62,r18        // delta -= 62
21753 -       ;;
21754 -       setf.sig f6=r18
21755 -       ;;
21756 -       xmpy.h f6=f6,f7
21757 -       ;;
21758 -       getf.sig r17=f6
21759 -       ;;
21760 -       add r17=r17,r18
21761 -       shr r18=r18,63
21762 -       ;;
21763 -       shr r17=r17,5
21764 -       ;;
21765 -       sub r17=r17,r18         // r17 = delta/63
21766 -       ;;
21767 -       add r17=r14,r17         // r17 <- delta/63 - rse_num_regs(bspstore1, bsp1)
21768 -       ;;
21769 -       shladd r15=r17,3,r15    // r15 <- bsp0 + 8*(delta/63 - rse_num_regs(bspstore1, bsp1))
21770 -       ;;
21771 -       mov ar.bspstore=r15                     // switch back to old register backing store area
21772 -       ;;
21773 -       mov ar.rnat=r16                         // restore RNaT
21774 -       mov ar.rsc=0xf                          // (will be restored later on from sc_ar_rsc)
21775 -       // invala not necessary as that will happen when returning to user-mode
21776 -       br.cond.sptk back_from_restore_rbs
21777 -END(__kernel_sigtramp)
21778 diff -urNp linux-2.6/arch/ia64/kernel/head.S new/arch/ia64/kernel/head.S
21779 --- linux-2.6/arch/ia64/kernel/head.S   2006-07-03 14:14:15.000000000 +0200
21780 +++ new/arch/ia64/kernel/head.S 2006-05-09 12:32:38.000000000 +0200
21781 @@ -363,6 +363,12 @@ start_ap:
21782         ;;
21783  (isBP) st8 [r2]=r28            // save the address of the boot param area passed by the bootloader
21784  
21785 +#ifdef CONFIG_XEN
21786 +       //  Note: isBP is used by the subprogram.
21787 +       br.call.sptk.many rp=early_xen_setup
21788 +       ;;
21789 +#endif
21790 +
21791  #ifdef CONFIG_SMP
21792  (isAP) br.call.sptk.many rp=start_secondary
21793  .ret0:
21794 diff -urNp linux-2.6/arch/ia64/kernel/iosapic.c new/arch/ia64/kernel/iosapic.c
21795 --- linux-2.6/arch/ia64/kernel/iosapic.c        2006-07-03 14:14:15.000000000 +0200
21796 +++ new/arch/ia64/kernel/iosapic.c      2006-06-28 14:32:13.000000000 +0200
21797 @@ -160,6 +160,65 @@ static unsigned char pcat_compat __devin
21798  static int iosapic_kmalloc_ok;
21799  static LIST_HEAD(free_rte_list);
21800  
21801 +#ifdef CONFIG_XEN
21802 +#include <xen/interface/xen.h>
21803 +#include <xen/interface/physdev.h>
21804 +#include <asm/hypervisor.h>
21805 +static inline unsigned int xen_iosapic_read(char __iomem *iosapic, unsigned int reg)
21806 +{
21807 +       struct physdev_apic apic_op;
21808 +       int ret;
21809 +
21810 +       apic_op.apic_physbase = (unsigned long)iosapic -
21811 +                                       __IA64_UNCACHED_OFFSET;
21812 +       apic_op.reg = reg;
21813 +       ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
21814 +       if (ret)
21815 +               return ret;
21816 +       return apic_op.value;
21817 +}
21818 +
21819 +static inline void xen_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
21820 +{
21821 +       struct physdev_apic apic_op;
21822 +
21823 +       apic_op.apic_physbase = (unsigned long)iosapic - 
21824 +                                       __IA64_UNCACHED_OFFSET;
21825 +       apic_op.reg = reg;
21826 +       apic_op.value = val;
21827 +       HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
21828 +}
21829 +
21830 +static inline unsigned int iosapic_read(char __iomem *iosapic, unsigned int reg)
21831 +{
21832 +       if (!is_running_on_xen()) {
21833 +               writel(reg, iosapic + IOSAPIC_REG_SELECT);
21834 +               return readl(iosapic + IOSAPIC_WINDOW);
21835 +       } else
21836 +               return xen_iosapic_read(iosapic, reg);
21837 +}
21838 +
21839 +static inline void iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
21840 +{
21841 +       if (!is_running_on_xen()) {
21842 +               writel(reg, iosapic + IOSAPIC_REG_SELECT);
21843 +               writel(val, iosapic + IOSAPIC_WINDOW);
21844 +       } else
21845 +               xen_iosapic_write(iosapic, reg, val);
21846 +}
21847 +
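+/* Ask the hypervisor to allocate an interrupt vector for this irq. */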
21848 +int xen_assign_irq_vector(int irq)
21849 +{
21850 +       struct physdev_irq irq_op;
21851 +
21852 +       irq_op.irq = irq;
21853 +       if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
21854 +               return -ENOSPC;
21855 +
21856 +       return irq_op.vector;
21857 +}
21858 +#endif /* XEN */
21859 +
21860  /*
21861   * Find an IOSAPIC associated with a GSI
21862   */
21863 @@ -654,6 +713,9 @@ register_intr (unsigned int gsi, int vec
21864         iosapic_intr_info[vector].dmode    = delivery;
21865         iosapic_intr_info[vector].trigger  = trigger;
21866  
21867 +       if (is_running_on_xen())
21868 +               return 0;
21869 +
21870         if (trigger == IOSAPIC_EDGE)
21871                 irq_type = &irq_type_iosapic_edge;
21872         else
21873 @@ -1016,6 +1078,9 @@ iosapic_system_init (int system_pcat_com
21874         }
21875  
21876         pcat_compat = system_pcat_compat;
21877 +       if (is_running_on_xen())
21878 +               return;
21879 +
21880         if (pcat_compat) {
21881                 /*
21882                  * Disable the compatibility mode interrupts (8259 style),
21883 diff -urNp linux-2.6/arch/ia64/kernel/irq_ia64.c new/arch/ia64/kernel/irq_ia64.c
21884 --- linux-2.6/arch/ia64/kernel/irq_ia64.c       2006-07-03 14:14:15.000000000 +0200
21885 +++ new/arch/ia64/kernel/irq_ia64.c     2006-06-28 14:32:13.000000000 +0200
21886 @@ -66,6 +66,13 @@ int
21887  assign_irq_vector (int irq)
21888  {
21889         int pos, vector;
21890 +
21891 +#ifdef CONFIG_XEN
21892 +       if (is_running_on_xen()) {
21893 +               extern int xen_assign_irq_vector(int);
21894 +               return xen_assign_irq_vector(irq);
21895 +       }
21896 +#endif
21897   again:
21898         pos = find_first_zero_bit(ia64_vector_mask, IA64_NUM_DEVICE_VECTORS);
21899         vector = IA64_FIRST_DEVICE_VECTOR + pos;
21900 @@ -224,6 +231,157 @@ static struct irqaction ipi_irqaction = 
21901  };
21902  #endif
21903  
21904 +#ifdef CONFIG_XEN
21905 +#include <xen/evtchn.h>
21906 +#include <xen/interface/callback.h>
21907 +
21908 +static char timer_name[NR_CPUS][15];
21909 +static char ipi_name[NR_CPUS][15];
21910 +static char resched_name[NR_CPUS][15];
21911 +
21912 +struct saved_irq {
21913 +       unsigned int irq;
21914 +       struct irqaction *action;
21915 +};
21916 +/* 16 should be a generous upper bound, since only a few percpu irqs
21917 + * are registered early.
21918 + */
21919 +#define MAX_LATE_IRQ   16
21920 +static struct saved_irq saved_percpu_irqs[MAX_LATE_IRQ];
21921 +static unsigned short late_irq_cnt = 0;
21922 +static unsigned short saved_irq_cnt = 0;
21923 +static int xen_slab_ready = 0;
21924 +
21925 +/* Dummy stub. Though we could check for RESCHEDULE_VECTOR before __do_IRQ,
21926 + * that ends up issuing several memory accesses on percpu data and
21927 + * thus adds unnecessary traffic to other paths.
21928 + */
21929 +static irqreturn_t
21930 +handle_reschedule(int irq, void *dev_id, struct pt_regs *regs)
21931 +{
21932 +
21933 +       return IRQ_HANDLED;
21934 +}
21935 +
21936 +static struct irqaction resched_irqaction = {
21937 +       .handler =      handle_reschedule,
21938 +       .flags =        SA_INTERRUPT,
21939 +       .name =         "RESCHED"
21940 +};
21941 +
21942 +/*
21943 + * This is the Xen version of percpu irq registration, which needs to bind
21944 + * to the Xen-specific evtchn sub-system. One trick here is that the Xen
21945 + * evtchn binding interface depends on kmalloc, because the related
21946 + * port needs to be freed when the device/cpu goes down. So we cache
21947 + * registrations made on the BSP before the slab allocator is ready and
21948 + * deal with them later. Registrations that happen after slab is ready
21949 + * are hooked to the Xen evtchn immediately.
21950 + *
21951 + * FIXME: MCA is not supported so far, and thus the "nomca" boot param is
21952 + * required.
21953 + */
21954 +static void
21955 +xen_register_percpu_irq (unsigned int irq, struct irqaction *action, int save)
21956 +{
21957 +       unsigned int cpu = smp_processor_id();
21958 +       int ret = 0;
21959 +
21960 +       if (xen_slab_ready) {
21961 +               switch (irq) {
21962 +               case IA64_TIMER_VECTOR:
21963 +                       sprintf(timer_name[cpu], "%s%d", action->name, cpu);
21964 +                       ret = bind_virq_to_irqhandler(VIRQ_ITC, cpu,
21965 +                               action->handler, action->flags,
21966 +                               timer_name[cpu], action->dev_id);
21967 +                       printk(KERN_INFO "register VIRQ_ITC (%s) to xen irq (%d)\n", timer_name[cpu], ret);
21968 +                       break;
21969 +               case IA64_IPI_RESCHEDULE:
21970 +                       sprintf(resched_name[cpu], "%s%d", action->name, cpu);
21971 +                       ret = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR, cpu,
21972 +                               action->handler, action->flags,
21973 +                               resched_name[cpu], action->dev_id);
21974 +                       printk(KERN_INFO "register RESCHEDULE_VECTOR (%s) to xen irq (%d)\n", resched_name[cpu], ret);
21975 +                       break;
21976 +               case IA64_IPI_VECTOR:
21977 +                       sprintf(ipi_name[cpu], "%s%d", action->name, cpu);
21978 +                       ret = bind_ipi_to_irqhandler(IPI_VECTOR, cpu,
21979 +                               action->handler, action->flags,
21980 +                               ipi_name[cpu], action->dev_id);
21981 +                       printk(KERN_INFO "register IPI_VECTOR (%s) to xen irq (%d)\n", ipi_name[cpu], ret);
21982 +                       break;
21983 +               case IA64_SPURIOUS_INT_VECTOR:
21984 +                       break;
21985 +               default:
21986 +                       printk(KERN_WARNING "Percpu irq %d is unsupported by xen!\n", irq);
21987 +                       break;
21988 +               }
21989 +               BUG_ON(ret < 0);
21990 +       } 
21991 +
21992 +       /* For BSP, we cache registered percpu irqs, and then re-walk
21993 +        * them when initializing APs
21994 +        */
21995 +       if (!cpu && save) {
21996 +               BUG_ON(saved_irq_cnt == MAX_LATE_IRQ);
21997 +               saved_percpu_irqs[saved_irq_cnt].irq = irq;
21998 +               saved_percpu_irqs[saved_irq_cnt].action = action;
21999 +               saved_irq_cnt++;
22000 +               if (!xen_slab_ready)
22001 +                       late_irq_cnt++;
22002 +       }
22003 +}
22004 +
22005 +static void
22006 +xen_bind_early_percpu_irq (void)
22007 +{
22008 +       int i;
22009 +
22010 +       xen_slab_ready = 1;
22011 +       /* There's no race when accessing this cached array, since only
22012 +        * the BSP reaches this step, and only once early during boot.
22013 +        */
22014 +       for (i = 0; i < late_irq_cnt; i++)
22015 +               xen_register_percpu_irq(saved_percpu_irqs[i].irq,
22016 +                       saved_percpu_irqs[i].action, 0);
22017 +}
22018 +
22019 +/* FIXME: There's no obvious point at which to check whether slab is ready,
22020 + * so as a hack we piggyback on the late_time_init hook.
22021 + */
22022 +extern void (*late_time_init)(void);
22023 +extern char xen_event_callback;
22024 +extern void xen_init_IRQ(void);
22025 +
22026 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
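+/*
+ * Called from cpu_init(): a no-op on the BSP; on each AP it registers the
+ * event-channel callback for this vcpu and binds the percpu irqs that the
+ * BSP cached in saved_percpu_irqs[].
+ */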
22027 +void xen_smp_intr_init(void)
22028 +{
22029 +#ifdef CONFIG_SMP
22030 +       unsigned int cpu = smp_processor_id();
22031 +       unsigned int i = 0;
22032 +       struct callback_register event = {
22033 +               .type = CALLBACKTYPE_event,
22034 +               .address = (unsigned long)&xen_event_callback,
22035 +       };
22036 +       static cpumask_t registered_cpumask;
22037 +
22038 +       if (!cpu)
22039 +               return;
22040 +
22041 +       /* This should piggyback on vcpu guest context setup */
22042 +       BUG_ON(HYPERVISOR_callback_op(CALLBACKOP_register, &event));
22043 +
22044 +       if (!cpu_isset(cpu, registered_cpumask)) {
22045 +               cpu_set(cpu, registered_cpumask);
22046 +               for (i = 0; i < saved_irq_cnt; i++)
22047 +                       xen_register_percpu_irq(saved_percpu_irqs[i].irq,
22048 +                                               saved_percpu_irqs[i].action,
22049 +                                               0);
22050 +       }
22051 +#endif /* CONFIG_SMP */
22052 +}
22053 +#endif /* CONFIG_XEN */
22054 +
22055  void
22056  register_percpu_irq (ia64_vector vec, struct irqaction *action)
22057  {
22058 @@ -232,6 +390,10 @@ register_percpu_irq (ia64_vector vec, st
22059  
22060         for (irq = 0; irq < NR_IRQS; ++irq)
22061                 if (irq_to_vector(irq) == vec) {
22062 +#ifdef CONFIG_XEN
22063 +                       if (is_running_on_xen())
22064 +                               return xen_register_percpu_irq(vec, action, 1);
22065 +#endif
22066                         desc = irq_descp(irq);
22067                         desc->status |= IRQ_PER_CPU;
22068                         desc->handler = &irq_type_ia64_lsapic;
22069 @@ -243,6 +405,21 @@ register_percpu_irq (ia64_vector vec, st
22070  void __init
22071  init_IRQ (void)
22072  {
22073 +#ifdef CONFIG_XEN
22074 +       /* Maybe put into platform_irq_init later */
22075 +       if (is_running_on_xen()) {
22076 +               struct callback_register event = {
22077 +                       .type = CALLBACKTYPE_event,
22078 +                       .address = (unsigned long)&xen_event_callback,
22079 +               };
22080 +               xen_init_IRQ();
22081 +               BUG_ON(HYPERVISOR_callback_op(CALLBACKOP_register, &event));
22082 +               late_time_init = xen_bind_early_percpu_irq;
22083 +#ifdef CONFIG_SMP
22084 +               register_percpu_irq(IA64_IPI_RESCHEDULE, &resched_irqaction);
22085 +#endif /* CONFIG_SMP */
22086 +       }
22087 +#endif /* CONFIG_XEN */
22088         register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL);
22089  #ifdef CONFIG_SMP
22090         register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction);
22091 @@ -260,6 +437,37 @@ ia64_send_ipi (int cpu, int vector, int 
22092         unsigned long ipi_data;
22093         unsigned long phys_cpu_id;
22094  
22095 +#ifdef CONFIG_XEN
22096 +        if (is_running_on_xen()) {
22097 +               int irq = -1;
22098 +
22099 +               /* TODO: we need to call vcpu_up here */
22100 +               if (unlikely(vector == ap_wakeup_vector)) {
22101 +                       extern void xen_send_ipi (int cpu, int vec);
22102 +                       xen_send_ipi (cpu, vector);
22103 +                       //vcpu_prepare_and_up(cpu);
22104 +                       return;
22105 +               }
22106 +
22107 +               switch(vector) {
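+               /* Map the IA64 IPI vector to the event-channel irq bound for
+                * the target cpu and notify it through that channel. */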
22108 +               case IA64_IPI_VECTOR:
22109 +                       irq = per_cpu(ipi_to_irq, cpu)[IPI_VECTOR];
22110 +                       break;
22111 +               case IA64_IPI_RESCHEDULE:
22112 +                       irq = per_cpu(ipi_to_irq, cpu)[RESCHEDULE_VECTOR];
22113 +                       break;
22114 +               default:
22115 +                       printk(KERN_WARNING"Unsupported IPI type 0x%x\n", vector);
22116 +                       irq = 0;
22117 +                       break;
22118 +               }               
22119 +       
22120 +               BUG_ON(irq < 0);
22121 +               notify_remote_via_irq(irq);
22122 +               return;
22123 +        }
22124 +#endif /* CONFIG_XEN */
22125 +
22126  #ifdef CONFIG_SMP
22127         phys_cpu_id = cpu_physical_id(cpu);
22128  #else
22129 diff -urNp linux-2.6/arch/ia64/kernel/pal.S new/arch/ia64/kernel/pal.S
22130 --- linux-2.6/arch/ia64/kernel/pal.S    2006-07-03 14:14:15.000000000 +0200
22131 +++ new/arch/ia64/kernel/pal.S  2006-05-09 12:32:39.000000000 +0200
22132 @@ -16,6 +16,7 @@
22133  #include <asm/processor.h>
22134  
22135         .data
22136 +       .globl pal_entry_point
22137  pal_entry_point:
22138         data8 ia64_pal_default_handler
22139         .text
22140 @@ -53,7 +54,7 @@ END(ia64_pal_default_handler)
22141   * in4        1 ==> clear psr.ic,  0 ==> don't clear psr.ic
22142   *
22143   */
22144 -GLOBAL_ENTRY(ia64_pal_call_static)
22145 +GLOBAL_ENTRY(__ia64_pal_call_static)
22146         .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
22147         alloc loc1 = ar.pfs,5,5,0,0
22148         movl loc2 = pal_entry_point
22149 @@ -90,7 +91,7 @@ GLOBAL_ENTRY(ia64_pal_call_static)
22150         ;;
22151         srlz.d                          // seralize restoration of psr.l
22152         br.ret.sptk.many b0
22153 -END(ia64_pal_call_static)
22154 +END(__ia64_pal_call_static)
22155  
22156  /*
22157   * Make a PAL call using the stacked registers calling convention.
22158 diff -urNp linux-2.6/arch/ia64/kernel/patch.c new/arch/ia64/kernel/patch.c
22159 --- linux-2.6/arch/ia64/kernel/patch.c  2006-07-03 14:14:15.000000000 +0200
22160 +++ new/arch/ia64/kernel/patch.c        1970-01-01 01:00:00.000000000 +0100
22161 @@ -1,197 +0,0 @@
22162 -/*
22163 - * Instruction-patching support.
22164 - *
22165 - * Copyright (C) 2003 Hewlett-Packard Co
22166 - *     David Mosberger-Tang <davidm@hpl.hp.com>
22167 - */
22168 -#include <linux/init.h>
22169 -#include <linux/string.h>
22170 -
22171 -#include <asm/patch.h>
22172 -#include <asm/processor.h>
22173 -#include <asm/sections.h>
22174 -#include <asm/system.h>
22175 -#include <asm/unistd.h>
22176 -
22177 -/*
22178 - * This was adapted from code written by Tony Luck:
22179 - *
22180 - * The 64-bit value in a "movl reg=value" is scattered between the two words of the bundle
22181 - * like this:
22182 - *
22183 - * 6  6         5         4         3         2         1
22184 - * 3210987654321098765432109876543210987654321098765432109876543210
22185 - * ABBBBBBBBBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCCCDEEEEEFFFFFFFFFGGGGGGG
22186 - *
22187 - * CCCCCCCCCCCCCCCCCCxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
22188 - * xxxxAFFFFFFFFFEEEEEDxGGGGGGGxxxxxxxxxxxxxBBBBBBBBBBBBBBBBBBBBBBB
22189 - */
22190 -static u64
22191 -get_imm64 (u64 insn_addr)
22192 -{
22193 -       u64 *p = (u64 *) (insn_addr & -16);     /* mask out slot number */
22194 -
22195 -       return ( (p[1] & 0x0800000000000000UL) << 4)  | /*A*/
22196 -               ((p[1] & 0x00000000007fffffUL) << 40) | /*B*/
22197 -               ((p[0] & 0xffffc00000000000UL) >> 24) | /*C*/
22198 -               ((p[1] & 0x0000100000000000UL) >> 23) | /*D*/
22199 -               ((p[1] & 0x0003e00000000000UL) >> 29) | /*E*/
22200 -               ((p[1] & 0x07fc000000000000UL) >> 43) | /*F*/
22201 -               ((p[1] & 0x000007f000000000UL) >> 36);  /*G*/
22202 -}
22203 -
22204 -/* Patch instruction with "val" where "mask" has 1 bits. */
22205 -void
22206 -ia64_patch (u64 insn_addr, u64 mask, u64 val)
22207 -{
22208 -       u64 m0, m1, v0, v1, b0, b1, *b = (u64 *) (insn_addr & -16);
22209 -#      define insn_mask ((1UL << 41) - 1)
22210 -       unsigned long shift;
22211 -
22212 -       b0 = b[0]; b1 = b[1];
22213 -       shift = 5 + 41 * (insn_addr % 16); /* 5 bits of template, then 3 x 41-bit instructions */
22214 -       if (shift >= 64) {
22215 -               m1 = mask << (shift - 64);
22216 -               v1 = val << (shift - 64);
22217 -       } else {
22218 -               m0 = mask << shift; m1 = mask >> (64 - shift);
22219 -               v0 = val  << shift; v1 = val >> (64 - shift);
22220 -               b[0] = (b0 & ~m0) | (v0 & m0);
22221 -       }
22222 -       b[1] = (b1 & ~m1) | (v1 & m1);
22223 -}
22224 -
22225 -void
22226 -ia64_patch_imm64 (u64 insn_addr, u64 val)
22227 -{
22228 -       /* The assembler may generate offset pointing to either slot 1
22229 -          or slot 2 for a long (2-slot) instruction, occupying slots 1
22230 -          and 2.  */
22231 -       insn_addr &= -16UL;
22232 -       ia64_patch(insn_addr + 2,
22233 -                  0x01fffefe000UL, (  ((val & 0x8000000000000000UL) >> 27) /* bit 63 -> 36 */
22234 -                                    | ((val & 0x0000000000200000UL) <<  0) /* bit 21 -> 21 */
22235 -                                    | ((val & 0x00000000001f0000UL) <<  6) /* bit 16 -> 22 */
22236 -                                    | ((val & 0x000000000000ff80UL) << 20) /* bit  7 -> 27 */
22237 -                                    | ((val & 0x000000000000007fUL) << 13) /* bit  0 -> 13 */));
22238 -       ia64_patch(insn_addr + 1, 0x1ffffffffffUL, val >> 22);
22239 -}
22240 -
22241 -void
22242 -ia64_patch_imm60 (u64 insn_addr, u64 val)
22243 -{
22244 -       /* The assembler may generate offset pointing to either slot 1
22245 -          or slot 2 for a long (2-slot) instruction, occupying slots 1
22246 -          and 2.  */
22247 -       insn_addr &= -16UL;
22248 -       ia64_patch(insn_addr + 2,
22249 -                  0x011ffffe000UL, (  ((val & 0x0800000000000000UL) >> 23) /* bit 59 -> 36 */
22250 -                                    | ((val & 0x00000000000fffffUL) << 13) /* bit  0 -> 13 */));
22251 -       ia64_patch(insn_addr + 1, 0x1fffffffffcUL, val >> 18);
22252 -}
22253 -
22254 -/*
22255 - * We need sometimes to load the physical address of a kernel
22256 - * object.  Often we can convert the virtual address to physical
22257 - * at execution time, but sometimes (either for performance reasons
22258 - * or during error recovery) we cannot to this.  Patch the marked
22259 - * bundles to load the physical address.
22260 - */
22261 -void __init
22262 -ia64_patch_vtop (unsigned long start, unsigned long end)
22263 -{
22264 -       s32 *offp = (s32 *) start;
22265 -       u64 ip;
22266 -
22267 -       while (offp < (s32 *) end) {
22268 -               ip = (u64) offp + *offp;
22269 -
22270 -               /* replace virtual address with corresponding physical address: */
22271 -               ia64_patch_imm64(ip, ia64_tpa(get_imm64(ip)));
22272 -               ia64_fc((void *) ip);
22273 -               ++offp;
22274 -       }
22275 -       ia64_sync_i();
22276 -       ia64_srlz_i();
22277 -}
22278 -
22279 -void __init
22280 -ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
22281 -{
22282 -       static int first_time = 1;
22283 -       int need_workaround;
22284 -       s32 *offp = (s32 *) start;
22285 -       u64 *wp;
22286 -
22287 -       need_workaround = (local_cpu_data->family == 0x1f && local_cpu_data->model == 0);
22288 -
22289 -       if (first_time) {
22290 -               first_time = 0;
22291 -               if (need_workaround)
22292 -                       printk(KERN_INFO "Leaving McKinley Errata 9 workaround enabled\n");
22293 -               else
22294 -                       printk(KERN_INFO "McKinley Errata 9 workaround not needed; "
22295 -                              "disabling it\n");
22296 -       }
22297 -       if (need_workaround)
22298 -               return;
22299 -
22300 -       while (offp < (s32 *) end) {
22301 -               wp = (u64 *) ia64_imva((char *) offp + *offp);
22302 -               wp[0] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */
22303 -               wp[1] = 0x0004000000000200UL;
22304 -               wp[2] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */
22305 -               wp[3] = 0x0084006880000200UL;
22306 -               ia64_fc(wp); ia64_fc(wp + 2);
22307 -               ++offp;
22308 -       }
22309 -       ia64_sync_i();
22310 -       ia64_srlz_i();
22311 -}
22312 -
22313 -static void __init
22314 -patch_fsyscall_table (unsigned long start, unsigned long end)
22315 -{
22316 -       extern unsigned long fsyscall_table[NR_syscalls];
22317 -       s32 *offp = (s32 *) start;
22318 -       u64 ip;
22319 -
22320 -       while (offp < (s32 *) end) {
22321 -               ip = (u64) ia64_imva((char *) offp + *offp);
22322 -               ia64_patch_imm64(ip, (u64) fsyscall_table);
22323 -               ia64_fc((void *) ip);
22324 -               ++offp;
22325 -       }
22326 -       ia64_sync_i();
22327 -       ia64_srlz_i();
22328 -}
22329 -
22330 -static void __init
22331 -patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
22332 -{
22333 -       extern char fsys_bubble_down[];
22334 -       s32 *offp = (s32 *) start;
22335 -       u64 ip;
22336 -
22337 -       while (offp < (s32 *) end) {
22338 -               ip = (u64) offp + *offp;
22339 -               ia64_patch_imm60((u64) ia64_imva((void *) ip),
22340 -                                (u64) (fsys_bubble_down - (ip & -16)) / 16);
22341 -               ia64_fc((void *) ip);
22342 -               ++offp;
22343 -       }
22344 -       ia64_sync_i();
22345 -       ia64_srlz_i();
22346 -}
22347 -
22348 -void __init
22349 -ia64_patch_gate (void)
22350 -{
22351 -#      define START(name)      ((unsigned long) __start_gate_##name##_patchlist)
22352 -#      define END(name)        ((unsigned long)__end_gate_##name##_patchlist)
22353 -
22354 -       patch_fsyscall_table(START(fsyscall), END(fsyscall));
22355 -       patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down));
22356 -       ia64_patch_vtop(START(vtop), END(vtop));
22357 -       ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9));
22358 -}
22359 diff -urNp linux-2.6/arch/ia64/kernel/setup.c new/arch/ia64/kernel/setup.c
22360 --- linux-2.6/arch/ia64/kernel/setup.c  2006-07-03 14:14:15.000000000 +0200
22361 +++ new/arch/ia64/kernel/setup.c        2006-06-28 14:32:13.000000000 +0200
22362 @@ -61,6 +61,10 @@
22363  #include <asm/system.h>
22364  #include <asm/unistd.h>
22365  #include <asm/system.h>
22366 +#ifdef CONFIG_XEN
22367 +#include <asm/hypervisor.h>
22368 +#endif
22369 +#include <linux/dma-mapping.h>
22370  
22371  #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
22372  # error "struct cpuinfo_ia64 too big!"
22373 @@ -243,6 +247,14 @@ reserve_memory (void)
22374         rsvd_region[n].end   = (unsigned long) ia64_imva(_end);
22375         n++;
22376  
22377 +#ifdef CONFIG_XEN
22378 +       if (is_running_on_xen()) {
22379 +               rsvd_region[n].start = (unsigned long)__va((HYPERVISOR_shared_info->arch.start_info_pfn << PAGE_SHIFT));
22380 +               rsvd_region[n].end   = rsvd_region[n].start + PAGE_SIZE;
22381 +               n++;
22382 +       }
22383 +#endif
22384 +
22385  #ifdef CONFIG_BLK_DEV_INITRD
22386         if (ia64_boot_param->initrd_start) {
22387                 rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start);
22388 @@ -260,6 +272,7 @@ reserve_memory (void)
22389         n++;
22390  
22391         num_rsvd_regions = n;
22392 +       BUG_ON(IA64_MAX_RSVD_REGIONS + 1 < n);
22393  
22394         sort_regions(rsvd_region, num_rsvd_regions);
22395  }
22396 @@ -333,6 +346,16 @@ early_console_setup (char *cmdline)
22397  {
22398         int earlycons = 0;
22399  
22400 +#ifdef CONFIG_XEN
22401 +#ifndef CONFIG_IA64_HP_SIM
22402 +       if (is_running_on_xen()) {
22403 +               extern struct console hpsim_cons;
22404 +               hpsim_cons.flags |= CON_BOOT;
22405 +               register_console(&hpsim_cons);
22406 +               earlycons++;
22407 +       }
22408 +#endif
22409 +#endif
22410  #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE
22411         {
22412                 extern int sn_serial_console_early_setup(void);
22413 @@ -402,6 +425,11 @@ setup_arch (char **cmdline_p)
22414  {
22415         unw_init();
22416  
22417 +#ifdef CONFIG_XEN
22418 +       if (is_running_on_xen())
22419 +               setup_xen_features();
22420 +#endif
22421 +
22422         ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
22423  
22424         *cmdline_p = __va(ia64_boot_param->command_line);
22425 @@ -478,6 +506,29 @@ setup_arch (char **cmdline_p)
22426                         conswitchp = &vga_con;
22427  # endif
22428         }
22429 +#ifdef CONFIG_XEN
22430 +       if (is_running_on_xen()) {
22431 +               shared_info_t *s = HYPERVISOR_shared_info;
22432 +
22433 +               xen_start_info = __va(s->arch.start_info_pfn << PAGE_SHIFT);
22434 +               xen_start_info->flags = s->arch.flags;
22435 +
22436 +               printk("Running on Xen! start_info_pfn=0x%lx nr_pages=%ld "
22437 +                      "flags=0x%x\n", s->arch.start_info_pfn,
22438 +                      xen_start_info->nr_pages, xen_start_info->flags);
22439 +
22440 +               /* xen_start_info isn't set up yet; get the flags manually */
22441 +               if (s->arch.flags & SIF_INITDOMAIN) {
22442 +                       if (!(s->arch.flags & SIF_PRIVILEGED))
22443 +                               panic("Xen granted us console access "
22444 +                                     "but not privileged status");
22445 +               } else {
22446 +                       extern int console_use_vt;
22447 +                       conswitchp = NULL;
22448 +                       console_use_vt = 0;
22449 +               }
22450 +       }
22451 +#endif
22452  #endif
22453  
22454         /* enable IA-64 Machine Check Abort Handling unless disabled */
22455 @@ -486,6 +537,7 @@ setup_arch (char **cmdline_p)
22456  
22457         platform_setup(cmdline_p);
22458         paging_init();
22459 +       contiguous_bitmap_init(max_pfn);
22460  }
22461  
22462  /*
22463 @@ -870,6 +922,15 @@ cpu_init (void)
22464         /* size of physical stacked register partition plus 8 bytes: */
22465         __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
22466         platform_cpu_init();
22467 +
22468 +#ifdef CONFIG_XEN
22469 +       /* Need to be moved into platform_cpu_init later */
22470 +       if (is_running_on_xen()) {
22471 +               extern void xen_smp_intr_init(void);
22472 +               xen_smp_intr_init();
22473 +       }
22474 +#endif
22475 +
22476         pm_idle = default_idle;
22477  }
22478  
22479 diff -urNp linux-2.6/arch/ia64/Makefile new/arch/ia64/Makefile
22480 --- linux-2.6/arch/ia64/Makefile        2006-07-03 14:14:15.000000000 +0200
22481 +++ new/arch/ia64/Makefile      2006-05-09 12:32:37.000000000 +0200
22482 @@ -45,6 +45,12 @@ ifeq ($(call cc-version),0304)
22483  endif
22484  
22485  CFLAGS += $(cflags-y)
22486 +
22487 +cppflags-$(CONFIG_XEN) += \
22488 +       -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
22489 +
22490 +CPPFLAGS += $(cppflags-y)
22491 +
22492  head-y := arch/ia64/kernel/head.o arch/ia64/kernel/init_task.o
22493  
22494  libs-y                         += arch/ia64/lib/
22495 @@ -55,9 +61,15 @@ core-$(CONFIG_IA64_GENERIC)  += arch/ia6
22496  core-$(CONFIG_IA64_HP_ZX1)     += arch/ia64/dig/
22497  core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/
22498  core-$(CONFIG_IA64_SGI_SN2)    += arch/ia64/sn/
22499 +core-$(CONFIG_XEN)             += arch/ia64/xen/
22500  
22501  drivers-$(CONFIG_PCI)          += arch/ia64/pci/
22502 +ifneq ($(CONFIG_XEN),y)
22503  drivers-$(CONFIG_IA64_HP_SIM)  += arch/ia64/hp/sim/
22504 +endif
22505 +ifneq ($(CONFIG_IA64_GENERIC),y)
22506 +drivers-$(CONFIG_XEN)          += arch/ia64/hp/sim/
22507 +endif
22508  drivers-$(CONFIG_IA64_HP_ZX1)  += arch/ia64/hp/common/ arch/ia64/hp/zx1/
22509  drivers-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/hp/common/ arch/ia64/hp/zx1/
22510  drivers-$(CONFIG_IA64_GENERIC) += arch/ia64/hp/common/ arch/ia64/hp/zx1/ arch/ia64/hp/sim/ arch/ia64/sn/
22511 @@ -71,6 +83,8 @@ all: compressed unwcheck
22512  
22513  compressed: vmlinux.gz
22514  
22515 +vmlinuz: vmlinux.gz
22516 +
22517  vmlinux.gz: vmlinux
22518         $(Q)$(MAKE) $(build)=$(boot) $@
22519  
22520 @@ -85,8 +99,8 @@ CLEAN_FILES += vmlinux.gz bootloader
22521  boot:  lib/lib.a vmlinux
22522         $(Q)$(MAKE) $(build)=$(boot) $@
22523  
22524 -install: vmlinux.gz
22525 -       sh $(srctree)/arch/ia64/install.sh $(KERNELRELEASE) $< System.map "$(INSTALL_PATH)"
22526 +install:
22527 +       -yes | sh $(srctree)/arch/ia64/install.sh $(KERNELRELEASE) vmlinux.gz System.map "$(INSTALL_PATH)"
22528  
22529  define archhelp
22530    echo '* compressed   - Build compressed kernel image'
22531 diff -urNp linux-2.6/arch/ia64/mm/ioremap.c new/arch/ia64/mm/ioremap.c
22532 --- linux-2.6/arch/ia64/mm/ioremap.c    2006-07-03 14:14:16.000000000 +0200
22533 +++ new/arch/ia64/mm/ioremap.c  2006-05-09 12:32:39.000000000 +0200
22534 @@ -15,6 +15,9 @@
22535  static inline void __iomem *
22536  __ioremap (unsigned long offset, unsigned long size)
22537  {
22538 +#ifdef CONFIG_XEN
22539 +       offset = HYPERVISOR_ioremap(offset, size);
22540 +#endif
22541         return (void __iomem *) (__IA64_UNCACHED_OFFSET | offset);
22542  }
22543  
22544 diff -urNp linux-2.6/arch/ia64/xen/drivers/README new/arch/ia64/xen/drivers/README
22545 --- linux-2.6/arch/ia64/xen/drivers/README      1970-01-01 01:00:00.000000000 +0100
22546 +++ new/arch/ia64/xen/drivers/README    2006-05-09 12:32:40.000000000 +0200
22547 @@ -0,0 +1,2 @@
22548 +This is a temporary location for source/Makefiles that need to be
22549 +patched/reworked in drivers/xen to work with xenlinux/ia64.
22550 diff -urNp linux-2.6/arch/ia64/xen/hypercall.S new/arch/ia64/xen/hypercall.S
22551 --- linux-2.6/arch/ia64/xen/hypercall.S 1970-01-01 01:00:00.000000000 +0100
22552 +++ new/arch/ia64/xen/hypercall.S       2006-05-23 18:42:17.000000000 +0200
22553 @@ -0,0 +1,353 @@
22554 +/*
22555 + * Support routines for Xen hypercalls
22556 + *
22557 + * Copyright (C) 2005 Dan Magenheimer <dan.magenheimer@hp.com>
22558 + */
22559 +
22560 +#include <linux/config.h>
22561 +#include <asm/processor.h>
22562 +#include <asm/asmmacro.h>
22563 +
22564 +/* To clear vpsr.ic, vpsr.i needs to be cleared first */
22565 +#define XEN_CLEAR_PSR_IC                               \
22566 +       mov r14=1;                                      \
22567 +       movl r15=XSI_PSR_I_ADDR;                        \
22568 +       movl r2=XSI_PSR_IC;                             \
22569 +       ;;                                              \
22570 +       ld8 r15=[r15];                                  \
22571 +       ld4 r3=[r2];                                    \
22572 +       ;;                                              \
22573 +       ld1 r16=[r15];                                  \
22574 +       ;;                                              \
22575 +       st1 [r15]=r14;                                  \
22576 +       st4 [r2]=r0;                                    \
22577 +       ;;
22578 +
22579 +/* First restore vpsr.ic, and then vpsr.i */
22580 +#define XEN_RESTORE_PSR_IC                             \
22581 +       st4 [r2]=r3;                                    \
22582 +       st1 [r15]=r16;                                  \
22583 +       ;;
22584 +
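+// Most of the entry points below follow the same pattern: test
+// running_on_xen and use the native privileged instruction on bare metal;
+// otherwise clear vpsr.i/vpsr.ic, issue the corresponding XEN_HYPER_*
+// operation, and restore the psr bits before returning.
+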
22585 +GLOBAL_ENTRY(xen_get_ivr)
22586 +       movl r8=running_on_xen;;
22587 +       ld4 r8=[r8];;
22588 +       cmp.eq p7,p0=r8,r0;;
22589 +(p7)   mov r8=cr.ivr;;
22590 +(p7)   br.ret.sptk.many rp
22591 +       ;;
22592 +       XEN_CLEAR_PSR_IC
22593 +       ;;
22594 +       XEN_HYPER_GET_IVR
22595 +       ;;
22596 +       XEN_RESTORE_PSR_IC
22597 +       ;;
22598 +       br.ret.sptk.many rp
22599 +       ;;
22600 +END(xen_get_ivr)
22601 +
22602 +GLOBAL_ENTRY(xen_get_tpr)
22603 +       movl r8=running_on_xen;;
22604 +       ld4 r8=[r8];;
22605 +       cmp.eq p7,p0=r8,r0;;
22606 +(p7)   mov r8=cr.tpr;;
22607 +(p7)   br.ret.sptk.many rp
22608 +       ;;
22609 +       XEN_CLEAR_PSR_IC
22610 +       ;;
22611 +       XEN_HYPER_GET_TPR
22612 +       ;;
22613 +       XEN_RESTORE_PSR_IC
22614 +       ;;
22615 +       br.ret.sptk.many rp
22616 +       ;;
22617 +END(xen_get_tpr)
22618 +
22619 +GLOBAL_ENTRY(xen_set_tpr)
22620 +       movl r8=running_on_xen;;
22621 +       ld4 r8=[r8];;
22622 +       cmp.eq p7,p0=r8,r0;;
22623 +(p7)   mov cr.tpr=r32;;
22624 +(p7)   br.ret.sptk.many rp
22625 +       ;;
22626 +       mov r8=r32
22627 +       ;;
22628 +       XEN_CLEAR_PSR_IC
22629 +       ;;
22630 +       XEN_HYPER_SET_TPR
22631 +       ;;
22632 +       XEN_RESTORE_PSR_IC
22633 +       ;;
22634 +       br.ret.sptk.many rp
22635 +       ;;
22636 +END(xen_set_tpr)
22637 +
22638 +GLOBAL_ENTRY(xen_eoi)
22639 +       movl r8=running_on_xen;;
22640 +       ld4 r8=[r8];;
22641 +       cmp.eq p7,p0=r8,r0;;
22642 +(p7)   mov cr.eoi=r0;;
22643 +(p7)   br.ret.sptk.many rp
22644 +       ;;
22645 +       mov r8=r32
22646 +       ;;
22647 +       XEN_CLEAR_PSR_IC
22648 +       ;;
22649 +       XEN_HYPER_EOI
22650 +       ;;
22651 +       XEN_RESTORE_PSR_IC
22652 +       ;;
22653 +       br.ret.sptk.many rp
22654 +       ;;
22655 +END(xen_eoi)
22656 +
22657 +GLOBAL_ENTRY(xen_thash)
22658 +       movl r8=running_on_xen;;
22659 +       ld4 r8=[r8];;
22660 +       cmp.eq p7,p0=r8,r0;;
22661 +(p7)   thash r8=r32;;
22662 +(p7)   br.ret.sptk.many rp
22663 +       ;;
22664 +       mov r8=r32
22665 +       ;;
22666 +       XEN_CLEAR_PSR_IC
22667 +       ;;
22668 +       XEN_HYPER_THASH
22669 +       ;;
22670 +       XEN_RESTORE_PSR_IC
22671 +       ;;
22672 +       br.ret.sptk.many rp
22673 +       ;;
22674 +END(xen_thash)
22675 +
22676 +GLOBAL_ENTRY(xen_set_itm)
22677 +       movl r8=running_on_xen;;
22678 +       ld4 r8=[r8];;
22679 +       cmp.eq p7,p0=r8,r0;;
22680 +(p7)   mov cr.itm=r32;;
22681 +(p7)   br.ret.sptk.many rp
22682 +       ;;
22683 +       mov r8=r32
22684 +       ;;
22685 +       XEN_CLEAR_PSR_IC
22686 +       ;;
22687 +       XEN_HYPER_SET_ITM
22688 +       ;;
22689 +       XEN_RESTORE_PSR_IC
22690 +       ;;
22691 +       br.ret.sptk.many rp
22692 +       ;;
22693 +END(xen_set_itm)
22694 +
22695 +GLOBAL_ENTRY(xen_ptcga)
22696 +       movl r8=running_on_xen;;
22697 +       ld4 r8=[r8];;
22698 +       cmp.eq p7,p0=r8,r0;;
22699 +(p7)   ptc.ga r32,r33;;
22700 +(p7)   br.ret.sptk.many rp
22701 +       ;;
22702 +       mov r8=r32
22703 +       mov r9=r33
22704 +       ;;
22705 +       XEN_CLEAR_PSR_IC
22706 +       ;;
22707 +       XEN_HYPER_PTC_GA
22708 +       ;;
22709 +       XEN_RESTORE_PSR_IC
22710 +       ;;
22711 +       br.ret.sptk.many rp
22712 +       ;;
22713 +END(xen_ptcga)
22714 +
22715 +GLOBAL_ENTRY(xen_get_rr)
22716 +       movl r8=running_on_xen;;
22717 +       ld4 r8=[r8];;
22718 +       cmp.eq p7,p0=r8,r0;;
22719 +(p7)   mov r8=rr[r32];;
22720 +(p7)   br.ret.sptk.many rp
22721 +       ;;
22722 +       mov r8=r32
22723 +       ;;
22724 +       XEN_CLEAR_PSR_IC
22725 +       ;;
22726 +       XEN_HYPER_GET_RR
22727 +       ;;
22728 +       XEN_RESTORE_PSR_IC
22729 +       ;;
22730 +       br.ret.sptk.many rp
22731 +       ;;
22732 +END(xen_get_rr)
22733 +
22734 +GLOBAL_ENTRY(xen_set_rr)
22735 +       movl r8=running_on_xen;;
22736 +       ld4 r8=[r8];;
22737 +       cmp.eq p7,p0=r8,r0;;
22738 +(p7)   mov rr[r32]=r33;;
22739 +(p7)   br.ret.sptk.many rp
22740 +       ;;
22741 +       mov r8=r32
22742 +       mov r9=r33
22743 +       ;;
22744 +       XEN_CLEAR_PSR_IC
22745 +       ;;
22746 +       XEN_HYPER_SET_RR
22747 +       ;;
22748 +       XEN_RESTORE_PSR_IC
22749 +       ;;
22750 +       br.ret.sptk.many rp
22751 +       ;;
22752 +END(xen_set_rr)
22753 +
22754 +GLOBAL_ENTRY(xen_set_kr)
22755 +       movl r8=running_on_xen;;
22756 +       ld4 r8=[r8];;
22757 +       cmp.ne p7,p0=r8,r0;;
22758 +(p7)   br.cond.spnt.few 1f;
22759 +       ;;
22760 +       cmp.eq p7,p0=r8,r0
22761 +       adds r8=-1,r8;;
22762 +(p7)   mov ar0=r9
22763 +(p7)   br.ret.sptk.many rp;;
22764 +       cmp.eq p7,p0=r8,r0
22765 +       adds r8=-1,r8;;
22766 +(p7)   mov ar1=r9
22767 +(p7)   br.ret.sptk.many rp;;
22768 +       cmp.eq p7,p0=r8,r0
22769 +       adds r8=-1,r8;;
22770 +(p7)   mov ar2=r9
22771 +(p7)   br.ret.sptk.many rp;;
22772 +       cmp.eq p7,p0=r8,r0
22773 +       adds r8=-1,r8;;
22774 +(p7)   mov ar3=r9
22775 +(p7)   br.ret.sptk.many rp;;
22776 +       cmp.eq p7,p0=r8,r0
22777 +       adds r8=-1,r8;;
22778 +(p7)   mov ar4=r9
22779 +(p7)   br.ret.sptk.many rp;;
22780 +       cmp.eq p7,p0=r8,r0
22781 +       adds r8=-1,r8;;
22782 +(p7)   mov ar5=r9
22783 +(p7)   br.ret.sptk.many rp;;
22784 +       cmp.eq p7,p0=r8,r0
22785 +       adds r8=-1,r8;;
22786 +(p7)   mov ar6=r9
22787 +(p7)   br.ret.sptk.many rp;;
22788 +       cmp.eq p7,p0=r8,r0
22789 +       adds r8=-1,r8;;
22790 +(p7)   mov ar7=r9
22791 +(p7)   br.ret.sptk.many rp;;
22792 +
22793 +1:     mov r8=r32
22794 +       mov r9=r33
22795 +       ;;
22796 +       XEN_CLEAR_PSR_IC
22797 +       ;;
22798 +       XEN_HYPER_SET_KR
22799 +       ;;
22800 +       XEN_RESTORE_PSR_IC
22801 +       ;;
22802 +       br.ret.sptk.many rp
22803 +END(xen_set_kr)
22804 +
22805 +GLOBAL_ENTRY(xen_fc)
22806 +       movl r8=running_on_xen;;
22807 +       ld4 r8=[r8];;
22808 +       cmp.eq p7,p0=r8,r0;;
22809 +(p7)   fc r32;;
22810 +(p7)   br.ret.sptk.many rp
22811 +       ;;
22812 +       mov r8=r32
22813 +       ;;
22814 +       XEN_CLEAR_PSR_IC
22815 +       ;;
22816 +       XEN_HYPER_FC
22817 +       ;;
22818 +       XEN_RESTORE_PSR_IC
22819 +       ;;
22820 +       br.ret.sptk.many rp
22821 +END(xen_fc)
22822 +
22823 +GLOBAL_ENTRY(xen_get_cpuid)
22824 +       movl r8=running_on_xen;;
22825 +       ld4 r8=[r8];;
22826 +       cmp.eq p7,p0=r8,r0;;
22827 +(p7)   mov r8=cpuid[r32];;
22828 +(p7)   br.ret.sptk.many rp
22829 +       ;;
22830 +       mov r8=r32
22831 +       ;;
22832 +       XEN_CLEAR_PSR_IC
22833 +       ;;
22834 +       XEN_HYPER_GET_CPUID
22835 +       ;;
22836 +       XEN_RESTORE_PSR_IC
22837 +       ;;
22838 +       br.ret.sptk.many rp
22839 +END(xen_get_cpuid)
22840 +
22841 +GLOBAL_ENTRY(xen_get_pmd)
22842 +       movl r8=running_on_xen;;
22843 +       ld4 r8=[r8];;
22844 +       cmp.eq p7,p0=r8,r0;;
22845 +(p7)   mov r8=pmd[r32];;
22846 +(p7)   br.ret.sptk.many rp
22847 +       ;;
22848 +       mov r8=r32
22849 +       ;;
22850 +       XEN_CLEAR_PSR_IC
22851 +       ;;
22852 +       XEN_HYPER_GET_PMD
22853 +       ;;
22854 +       XEN_RESTORE_PSR_IC
22855 +       ;;
22856 +       br.ret.sptk.many rp
22857 +END(xen_get_pmd)
22858 +
22859 +#ifdef CONFIG_IA32_SUPPORT
22860 +GLOBAL_ENTRY(xen_get_eflag)
22861 +       movl r8=running_on_xen;;
22862 +       ld4 r8=[r8];;
22863 +       cmp.eq p7,p0=r8,r0;;
22864 +(p7)   mov r8=ar24;;
22865 +(p7)   br.ret.sptk.many rp
22866 +       ;;
22867 +       mov r8=r32
22868 +       ;;
22869 +       XEN_CLEAR_PSR_IC
22870 +       ;;
22871 +       XEN_HYPER_GET_EFLAG
22872 +       ;;
22873 +       XEN_RESTORE_PSR_IC
22874 +       ;;
22875 +       br.ret.sptk.many rp
22876 +END(xen_get_eflag)
22877 +       
22878 +// some bits aren't set if pl!=0, see SDM vol1 3.1.8
22879 +GLOBAL_ENTRY(xen_set_eflag)
22880 +       movl r8=running_on_xen;;
22881 +       ld4 r8=[r8];;
22882 +       cmp.eq p7,p0=r8,r0;;
22883 +(p7)   mov ar24=r32
22884 +(p7)   br.ret.sptk.many rp
22885 +       ;;
22886 +       mov r8=r32
22887 +       ;;
22888 +       XEN_CLEAR_PSR_IC
22889 +       ;;
22890 +       XEN_HYPER_SET_EFLAG
22891 +       ;;
22892 +       XEN_RESTORE_PSR_IC
22893 +       ;;
22894 +       br.ret.sptk.many rp
22895 +END(xen_set_eflag)
22896 +#endif
22897 +
22898 +GLOBAL_ENTRY(xen_send_ipi)
22899 +        mov r14=r32
22900 +        mov r15=r33
22901 +        mov r2=0x400
22902 +        break 0x1000
22903 +        ;;
22904 +        br.ret.sptk.many rp
22905 +        ;;
22906 +END(xen_send_ipi)
22907 diff -urNp linux-2.6/arch/ia64/xen/hypervisor.c new/arch/ia64/xen/hypervisor.c
22908 --- linux-2.6/arch/ia64/xen/hypervisor.c        1970-01-01 01:00:00.000000000 +0100
22909 +++ new/arch/ia64/xen/hypervisor.c      2006-06-28 14:32:13.000000000 +0200
22910 @@ -0,0 +1,784 @@
22911 +/******************************************************************************
22912 + * arch/ia64/xen/hypervisor.c
22913 + *
22914 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
22915 + *                    VA Linux Systems Japan K.K.
22916 + *
22917 + * This program is free software; you can redistribute it and/or modify
22918 + * it under the terms of the GNU General Public License as published by
22919 + * the Free Software Foundation; either version 2 of the License, or
22920 + * (at your option) any later version.
22921 + *
22922 + * This program is distributed in the hope that it will be useful,
22923 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
22924 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22925 + * GNU General Public License for more details.
22926 + *
22927 + * You should have received a copy of the GNU General Public License
22928 + * along with this program; if not, write to the Free Software
22929 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22930 + *
22931 + */
22932 +
22933 +//#include <linux/kernel.h>
22934 +#include <linux/spinlock.h>
22935 +#include <linux/bootmem.h>
22936 +#include <linux/module.h>
22937 +#include <linux/vmalloc.h>
22938 +#include <asm/page.h>
22939 +#include <asm/hypervisor.h>
22940 +#include <asm/hypercall.h>
22941 +#include <xen/interface/memory.h>
22942 +#include <xen/balloon.h>
22943 +
22944 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)XSI_BASE;
22945 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
22946 +
22947 +start_info_t *xen_start_info;
22948 +
22949 +int running_on_xen;
22950 +EXPORT_SYMBOL(running_on_xen);
22951 +
22952 +//XXX xen/ia64 copy_from_guest() is broken.
22953 +//    This is a temporary workaround until it is fixed.
22954 +//    used by balloon.c netfront.c
22955 +
22956 +// get_xen_guest_handle is defined only when __XEN_TOOLS__ is defined
22957 +// if the definition in arch-ia64.h is changed, this must be updated.
22958 +#define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
22959 +
22960 +int
22961 +ia64_xenmem_reservation_op(unsigned long op,
22962 +                          struct xen_memory_reservation* reservation__)
22963 +{
22964 +       struct xen_memory_reservation reservation = *reservation__;
22965 +       unsigned long* frame_list;
22966 +       unsigned long nr_extents = reservation__->nr_extents;
22967 +       int ret = 0;
22968 +       get_xen_guest_handle(frame_list, reservation__->extent_start);
22969 +
22970 +       BUG_ON(op != XENMEM_increase_reservation &&
22971 +              op != XENMEM_decrease_reservation &&
22972 +              op != XENMEM_populate_physmap);
22973 +
22974 +       while (nr_extents > 0) {
22975 +               int tmp_ret;
22976 +               volatile unsigned long dummy;
22977 +
22978 +               set_xen_guest_handle(reservation.extent_start, frame_list);
22979 +               reservation.nr_extents = nr_extents;
22980 +
22981 +               dummy = frame_list[0];// re-install tlb entry before hypercall
22982 +               tmp_ret = ____HYPERVISOR_memory_op(op, &reservation);
22983 +               if (tmp_ret < 0) {
22984 +                       if (ret == 0) {
22985 +                               ret = tmp_ret;
22986 +                       }
22987 +                       break;
22988 +               }
22989 +               if (tmp_ret == 0) {
22990 +                       //XXX dirty workaround for skbuff_ctor()
22991 +                       //    of a non-privileged domain
22992 +                       if ((op == XENMEM_increase_reservation ||
22993 +                            op == XENMEM_populate_physmap) &&
22994 +                           !(xen_start_info->flags & SIF_PRIVILEGED) &&
22995 +                           reservation.extent_order > 0)
22996 +                               return ret;
22997 +               }
22998 +               frame_list += tmp_ret;
22999 +               nr_extents -= tmp_ret;
23000 +               ret += tmp_ret;
23001 +       }
23002 +       return ret;
23003 +}
23004 +
23005 +//XXX same as i386, x86_64 contiguous_bitmap_set(), contiguous_bitmap_clear()
23006 +// move those to lib/contiguous_bitmap?
23007 +//XXX discontigmem/sparsemem
23008 +
23009 +/*
23010 + * Bitmap is indexed by page number. If bit is set, the page is part of a
23011 + * xen_create_contiguous_region() area of memory.
23012 + */
23013 +unsigned long *contiguous_bitmap;
23014 +
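+/* Allocate one bit per page frame (plus a little slack) from bootmem. */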
23015 +void
23016 +contiguous_bitmap_init(unsigned long end_pfn)
23017 +{
23018 +       unsigned long size = (end_pfn + 2 * BITS_PER_LONG) >> 3;
23019 +       contiguous_bitmap = alloc_bootmem_low_pages(size);
23020 +       BUG_ON(!contiguous_bitmap);
23021 +       memset(contiguous_bitmap, 0, size);
23022 +}
23023 +
23024 +#if 0
23025 +int
23026 +contiguous_bitmap_test(void* p)
23027 +{
23028 +       return test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap);
23029 +}
23030 +#endif
23031 +
23032 +static void contiguous_bitmap_set(
23033 +       unsigned long first_page, unsigned long nr_pages)
23034 +{
23035 +       unsigned long start_off, end_off, curr_idx, end_idx;
23036 +
23037 +       curr_idx  = first_page / BITS_PER_LONG;
23038 +       start_off = first_page & (BITS_PER_LONG-1);
23039 +       end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
23040 +       end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);
23041 +
23042 +       if (curr_idx == end_idx) {
23043 +               contiguous_bitmap[curr_idx] |=
23044 +                       ((1UL<<end_off)-1) & -(1UL<<start_off);
23045 +       } else {
23046 +               contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
23047 +               while ( ++curr_idx < end_idx )
23048 +                       contiguous_bitmap[curr_idx] = ~0UL;
23049 +               contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
23050 +       }
23051 +}
23052 +
23053 +static void contiguous_bitmap_clear(
23054 +       unsigned long first_page, unsigned long nr_pages)
23055 +{
23056 +       unsigned long start_off, end_off, curr_idx, end_idx;
23057 +
23058 +       curr_idx  = first_page / BITS_PER_LONG;
23059 +       start_off = first_page & (BITS_PER_LONG-1);
23060 +       end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
23061 +       end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);
23062 +
23063 +       if (curr_idx == end_idx) {
23064 +               contiguous_bitmap[curr_idx] &=
23065 +                       -(1UL<<end_off) | ((1UL<<start_off)-1);
23066 +       } else {
23067 +               contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
23068 +               while ( ++curr_idx != end_idx )
23069 +                       contiguous_bitmap[curr_idx] = 0;
23070 +               contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
23071 +       }
23072 +}
23073 +
23074 +static unsigned long
23075 +HYPERVISOR_populate_physmap(unsigned long gpfn, unsigned int extent_order,
23076 +                           unsigned int address_bits)
23077 +{
23078 +       unsigned long ret;
23079 +        struct xen_memory_reservation reservation = {
23080 +               .nr_extents   = 1,
23081 +                .address_bits = address_bits,
23082 +                .extent_order = extent_order,
23083 +                .domid        = DOMID_SELF
23084 +        };
23085 +       set_xen_guest_handle(reservation.extent_start, &gpfn);
23086 +       ret = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
23087 +       // it may fail on non-privileged domain with extent_order > 0.
23088 +       BUG_ON(ret != 1 &&
23089 +              !(ret == 0 && !(xen_start_info->flags & SIF_PRIVILEGED) &&
23090 +                extent_order > 0));
23091 +       if (ret != 1)
23092 +               return -EINVAL;//XXX
23093 +       return 0;
23094 +}
23095 +
23096 +static unsigned long
23097 +HYPERVISOR_remove_physmap(unsigned long gpfn, unsigned int extent_order)
23098 +{
23099 +       unsigned long ret;
23100 +       struct xen_memory_reservation reservation = {
23101 +               .nr_extents   = 1,
23102 +               .address_bits = 0,
23103 +               .extent_order = extent_order,
23104 +               .domid        = DOMID_SELF
23105 +       };
23106 +       set_xen_guest_handle(reservation.extent_start, &gpfn);
23107 +       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
23108 +       BUG_ON(ret != 1);
23109 +       return 0;
23110 +}
23111 +
23112 +/* Ensure multi-page extents are contiguous in machine memory. */
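+/*
+ * The 1 << order guest pages are removed from the physmap and re-populated
+ * as a single order-N extent so that the backing machine pages end up
+ * contiguous; on failure they are re-populated one page at a time.
+ */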
23113 +int
23114 +__xen_create_contiguous_region(unsigned long vstart,
23115 +                              unsigned int order, unsigned int address_bits)
23116 +{
23117 +       unsigned long error = 0;
23118 +       unsigned long gphys = __pa(vstart);
23119 +       unsigned long start_gpfn = gphys >> PAGE_SHIFT;
23120 +       unsigned long num_gpfn = 1 << order;
23121 +       unsigned long i;
23122 +       unsigned long flags;
23123 +
23124 +       scrub_pages(vstart, num_gpfn);
23125 +
23126 +       balloon_lock(flags);
23127 +
23128 +       error = HYPERVISOR_remove_physmap(start_gpfn, order);
23129 +       if (error) {
23130 +               goto fail;
23131 +       }
23132 +
23133 +       error = HYPERVISOR_populate_physmap(start_gpfn, order, address_bits);
23134 +       if (error) {
23135 +               goto fail;
23136 +       }
23137 +       contiguous_bitmap_set(start_gpfn, num_gpfn);
23138 +#if 0
23139 +       {
23140 +       unsigned long mfn;
23141 +       unsigned long mfn_prev = ~0UL;
23142 +       for (i = 0; i < num_gpfn; i++) {
23143 +               mfn = pfn_to_mfn_for_dma(start_gpfn + i);
23144 +               if (mfn_prev != ~0UL && mfn != mfn_prev + 1) {
23145 +                       xprintk("\n");
23146 +                       xprintk("%s:%d order %d "
23147 +                               "start 0x%lx bus 0x%lx machine 0x%lx\n",
23148 +                               __func__, __LINE__, order,
23149 +                               vstart, virt_to_bus((void*)vstart),
23150 +                               phys_to_machine_for_dma(gphys));
23151 +                       xprintk("mfn: ");
23152 +                       for (i = 0; i < num_gpfn; i++) {
23153 +                               mfn = pfn_to_mfn_for_dma(start_gpfn + i);
23154 +                               xprintk("0x%lx ", mfn);
23155 +                       }
23156 +                       xprintk("\n");
23157 +                       goto out;
23158 +               }
23159 +               mfn_prev = mfn;
23160 +       }
23161 +       }
23162 +#endif
23163 +out:
23164 +       balloon_unlock(flags);
23165 +       return error;
23166 +
23167 +fail:
23168 +       for (i = 0; i < num_gpfn; i++) {
23169 +               error = HYPERVISOR_populate_physmap(start_gpfn + i, 0, 0);
23170 +               if (error) {
23171 +                       BUG();//XXX
23172 +               }
23173 +       }
23174 +       goto out;
23175 +}
23176 +
23177 +void
23178 +__xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
23179 +{
23180 +       unsigned long flags;
23181 +       unsigned long error = 0;
23182 +       unsigned long start_gpfn = __pa(vstart) >> PAGE_SHIFT;
23183 +       unsigned long num_gpfn = 1UL << order;
23184 +       unsigned long* gpfns;
23185 +       struct xen_memory_reservation reservation;
23186 +       unsigned long i;
23187 +
23188 +       gpfns = kmalloc(sizeof(gpfns[0]) * num_gpfn,
23189 +                       GFP_KERNEL | __GFP_NOFAIL);
23190 +       for (i = 0; i < num_gpfn; i++) {
23191 +               gpfns[i] = start_gpfn + i;
23192 +       }
23193 +
23194 +       scrub_pages(vstart, num_gpfn);
23195 +
23196 +       balloon_lock(flags);
23197 +
23198 +       contiguous_bitmap_clear(start_gpfn, num_gpfn);
23199 +       error = HYPERVISOR_remove_physmap(start_gpfn, order);
23200 +       if (error) {
23201 +               goto fail;
23202 +       }
23203 +
23204 +       set_xen_guest_handle(reservation.extent_start, gpfns);
23205 +       reservation.nr_extents   = num_gpfn;
23206 +       reservation.address_bits = 0;
23207 +       reservation.extent_order = 0;
23208 +       reservation.domid        = DOMID_SELF;
23209 +       error = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
23210 +       if (error != num_gpfn) {
23211 +               error = -EFAULT;//XXX
23212 +               goto fail;
23213 +       }
23214 +       error = 0;
23215 +out:
23216 +       balloon_unlock(flags);
23217 +       kfree(gpfns);
23218 +       if (error) {
23219 +               // error can't be returned.
23220 +               BUG();//XXX
23221 +       }
23222 +       return;
23223 +
23224 +fail:
23225 +       for (i = 0; i < num_gpfn; i++) {
23226 +               int tmp_error;// don't overwrite error.
23227 +               tmp_error = HYPERVISOR_populate_physmap(start_gpfn + i, 0, 0);
23228 +               if (tmp_error) {
23229 +                       BUG();//XXX
23230 +               }
23231 +       }
23232 +       goto out;
23233 +}
23234 +
23235 +
23236 +///////////////////////////////////////////////////////////////////////////
23237 +// grant table hack
23238 +// cmd: GNTTABOP_xxx
23239 +
23240 +#include <linux/mm.h>
23241 +#include <xen/interface/xen.h>
23242 +#include <xen/gnttab.h>
23243 +
23244 +static void
23245 +gnttab_map_grant_ref_pre(struct gnttab_map_grant_ref *uop)
23246 +{
23247 +       uint32_t flags;
23248 +
23249 +       flags = uop->flags;
23250 +
23251 +       if (flags & GNTMAP_host_map) {
23252 +               if (flags & GNTMAP_application_map) {
23253 +                       xprintd("GNTMAP_application_map is not supported yet: flags 0x%x\n", flags);
23254 +                       BUG();
23255 +               }
23256 +               if (flags & GNTMAP_contains_pte) {
23257 +                       xprintd("GNTMAP_contains_pte is not supported yet flags 0x%x\n", flags);
23258 +                       BUG();
23259 +               }
23260 +       } else if (flags & GNTMAP_device_map) {
23261 +               xprintd("GNTMAP_device_map is not supported yet 0x%x\n", flags);
23262 +               BUG();//XXX not yet. actually this flag is not used.
23263 +       } else {
23264 +               BUG();
23265 +       }
23266 +}
23267 +
23268 +int
23269 +HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
23270 +{
23271 +       if (cmd == GNTTABOP_map_grant_ref) {
23272 +               unsigned int i;
23273 +               for (i = 0; i < count; i++) {
23274 +                       gnttab_map_grant_ref_pre(
23275 +                               (struct gnttab_map_grant_ref*)uop + i);
23276 +               }
23277 +       }
23278 +
23279 +       return ____HYPERVISOR_grant_table_op(cmd, uop, count);
23280 +}
23281 +
23282 +
23283 +///////////////////////////////////////////////////////////////////////////
23284 +// PageForeign(), SetPageForeign(), ClearPageForeign()
23285 +
23286 +struct address_space xen_ia64_foreign_dummy_mapping;
23287 +
23288 +///////////////////////////////////////////////////////////////////////////
23289 +// foreign mapping
23290 +#include <linux/efi.h>
23291 +#include <asm/meminit.h> // for IA64_GRANULE_SIZE, GRANULEROUND{UP,DOWN}()
23292 +
23293 +static unsigned long privcmd_resource_min = 0;
23294 +// Xen/ia64 currently can handle pseudo physical address bits up to
23295 +// (PAGE_SHIFT * 3)
23296 +static unsigned long privcmd_resource_max = GRANULEROUNDDOWN((1UL << (PAGE_SHIFT * 3)) - 1);
23297 +static unsigned long privcmd_resource_align = IA64_GRANULE_SIZE;
23298 +
23299 +static unsigned long
23300 +md_end_addr(const efi_memory_desc_t *md)
23301 +{
23302 +       return md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
23303 +}
23304 +
23305 +#define XEN_IA64_PRIVCMD_LEAST_GAP_SIZE        (1024 * 1024 * 1024UL)
23306 +static int
23307 +xen_ia64_privcmd_check_size(unsigned long start, unsigned long end)
23308 +{
23309 +       return (start < end &&
23310 +               (end - start) > XEN_IA64_PRIVCMD_LEAST_GAP_SIZE);
23311 +}
23312 +
23313 +static int __init
23314 +xen_ia64_privcmd_init(void)
23315 +{
23316 +       void *efi_map_start, *efi_map_end, *p;
23317 +       u64 efi_desc_size;
23318 +       efi_memory_desc_t *md;
23319 +       unsigned long tmp_min;
23320 +       unsigned long tmp_max;
23321 +       unsigned long gap_size;
23322 +       unsigned long prev_end;
23323 +
23324 +       if (!is_running_on_xen())
23325 +               return -1;
23326 +
23327 +       efi_map_start = __va(ia64_boot_param->efi_memmap);
23328 +       efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
23329 +       efi_desc_size = ia64_boot_param->efi_memdesc_size;
23330 +
23331 +       // First, find the highest address currently in use.
23332 +       for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
23333 +               // nothing
23334 +       }
23335 +       md = p - efi_desc_size;
23336 +       privcmd_resource_min = GRANULEROUNDUP(md_end_addr(md));
23337 +       if (xen_ia64_privcmd_check_size(privcmd_resource_min,
23338 +                                       privcmd_resource_max)) {
23339 +               goto out;
23340 +       }
23341 +
23342 +       // The highest in-use address is too high; try to find the largest gap instead.
23343 +       tmp_min = privcmd_resource_max;
23344 +       tmp_max = 0;
23345 +       gap_size = 0;
23346 +       prev_end = 0;
23347 +       for (p = efi_map_start;
23348 +            p < efi_map_end - efi_desc_size;
23349 +            p += efi_desc_size) {
23350 +               unsigned long end;
23351 +               efi_memory_desc_t* next;
23352 +               unsigned long next_start;
23353 +
23354 +               md = p;
23355 +               end = md_end_addr(md);
23356 +               if (end > privcmd_resource_max) {
23357 +                       break;
23358 +               }
23359 +               if (end < prev_end) {
23360 +                       // Work around: Xen may pass incompletely sorted
23361 +                       // memory descriptors, e.g.
23362 +                       // [x, x + length]
23363 +                       // [x, x]
23364 +                       // where the order should be reversed, so skip
23365 +                       // the out-of-order entry.
23366 +                       continue;
23367 +               }
23368 +               next = p + efi_desc_size;
23369 +               next_start = next->phys_addr;
23370 +               if (next_start > privcmd_resource_max) {
23371 +                       next_start = privcmd_resource_max;
23372 +               }
23373 +               if (end < next_start && gap_size < (next_start - end)) {
23374 +                       tmp_min = end;
23375 +                       tmp_max = next_start;
23376 +                       gap_size = tmp_max - tmp_min;
23377 +               }
23378 +               prev_end = end;
23379 +       }
23380 +
23381 +       privcmd_resource_min = GRANULEROUNDUP(tmp_min);
23382 +       if (xen_ia64_privcmd_check_size(privcmd_resource_min, tmp_max)) {
23383 +               privcmd_resource_max = tmp_max;
23384 +               goto out;
23385 +       }
23386 +
23387 +       privcmd_resource_min = tmp_min;
23388 +       privcmd_resource_max = tmp_max;
23389 +       if (!xen_ia64_privcmd_check_size(privcmd_resource_min,
23390 +                                        privcmd_resource_max)) {
23391 +               // No large enough gap was found.
23392 +               // Go ahead anyway with a warning, hoping that no large
23393 +               // region will be requested.
23394 +               printk(KERN_WARNING "xen privcmd: no large enough region for privcmd mmap was found.\n");
23395 +       }
23396 +
23397 +out:
23398 +       printk(KERN_INFO "xen privcmd uses pseudo physical addr range [0x%lx, 0x%lx] (%ldMB)\n",
23399 +              privcmd_resource_min, privcmd_resource_max, 
23400 +              (privcmd_resource_max - privcmd_resource_min) >> 20);
23401 +       BUG_ON(privcmd_resource_min >= privcmd_resource_max);
23402 +       return 0;
23403 +}
23404 +late_initcall(xen_ia64_privcmd_init);
23405 +
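For illustration of the gap search that xen_ia64_privcmd_init() performs over the EFI memory map, here is a minimal standalone C sketch (not part of the patch) running the same scan over a mocked-up descriptor array; struct mock_md and the sample data are invented for the example:

#include <stdio.h>

struct mock_md {
        unsigned long start;    /* like md->phys_addr */
        unsigned long end;      /* like md_end_addr(md) */
};

/* Find the largest gap between consecutive descriptors below 'limit',
 * skipping descriptors that end before the previous one (the same
 * out-of-order case the patch works around). */
static void find_largest_gap(const struct mock_md *md, int n,
                             unsigned long limit,
                             unsigned long *gap_min, unsigned long *gap_max)
{
        unsigned long gap_size = 0, prev_end = 0;
        int i;

        *gap_min = limit;
        *gap_max = 0;
        for (i = 0; i < n - 1; i++) {
                unsigned long end = md[i].end;
                unsigned long next_start = md[i + 1].start;

                if (end > limit)
                        break;
                if (end < prev_end)
                        continue;       /* out-of-order descriptor */
                if (next_start > limit)
                        next_start = limit;
                if (end < next_start && gap_size < next_start - end) {
                        *gap_min = end;
                        *gap_max = next_start;
                        gap_size = next_start - end;
                }
                prev_end = end;
        }
}

int main(void)
{
        /* made-up map: [0,0x100), [0x100,0x180), [0x400,0x500) */
        struct mock_md map[] = {
                { 0x000, 0x100 }, { 0x100, 0x180 }, { 0x400, 0x500 },
        };
        unsigned long lo, hi;

        find_largest_gap(map, 3, 0x1000, &lo, &hi);
        printf("largest gap: [0x%lx, 0x%lx)\n", lo, hi);   /* [0x180, 0x400) */
        return 0;
}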
23406 +struct xen_ia64_privcmd_entry {
23407 +       atomic_t        map_count;
23408 +#define INVALID_GPFN   (~0UL)
23409 +       unsigned long   gpfn;
23410 +};
23411 +
23412 +struct xen_ia64_privcmd_range {
23413 +       atomic_t                        ref_count;
23414 +       unsigned long                   pgoff; // in units of PAGE_SIZE
23415 +       struct resource*                res;
23416 +
23417 +       unsigned long                   num_entries;
23418 +       struct xen_ia64_privcmd_entry   entries[0];
23419 +};
23420 +
23421 +struct xen_ia64_privcmd_vma {
23422 +       struct xen_ia64_privcmd_range*  range;
23423 +
23424 +       unsigned long                   num_entries;
23425 +       struct xen_ia64_privcmd_entry*  entries;
23426 +};
23427 +
23428 +static void
23429 +xen_ia64_privcmd_init_entry(struct xen_ia64_privcmd_entry* entry)
23430 +{
23431 +       atomic_set(&entry->map_count, 0);
23432 +       entry->gpfn = INVALID_GPFN;
23433 +}
23434 +
23435 +static int
23436 +xen_ia64_privcmd_entry_mmap(struct vm_area_struct* vma,
23437 +                           unsigned long addr,
23438 +                           struct xen_ia64_privcmd_range* privcmd_range,
23439 +                           int i,
23440 +                           unsigned long mfn,
23441 +                           pgprot_t prot,
23442 +                           domid_t domid)
23443 +{
23444 +       int error = 0;
23445 +       struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
23446 +       unsigned long gpfn;
23447 +       unsigned long flags;
23448 +
23449 +       BUG_ON((addr & ~PAGE_MASK) != 0);
23450 +       BUG_ON(mfn == INVALID_MFN);
23451 +
23452 +       if (entry->gpfn != INVALID_GPFN) {
23453 +               error = -EBUSY;
23454 +               goto out;
23455 +       }
23456 +       gpfn = (privcmd_range->res->start >> PAGE_SHIFT) + i;
23457 +
23458 +       flags = ASSIGN_writable;
23459 +       if (pgprot_val(prot) == PROT_READ) {
23460 +               flags = ASSIGN_readonly;
23461 +       }
23462 +       error = HYPERVISOR_add_physmap(gpfn, mfn, flags, domid);
23463 +       if (error != 0) {
23464 +               goto out;
23465 +       }
23466 +
23467 +       prot = vma->vm_page_prot;
23468 +       error = remap_pfn_range(vma, addr, gpfn, 1 << PAGE_SHIFT, prot);
23469 +       if (error != 0) {
23470 +               error = HYPERVISOR_zap_physmap(gpfn, 0);
23471 +               if (error) {
23472 +                       BUG();//XXX
23473 +               }
23474 +       } else {
23475 +               atomic_inc(&entry->map_count);
23476 +               entry->gpfn = gpfn;
23477 +       }
23478 +
23479 +out:
23480 +       return error;
23481 +}
23482 +
23483 +static void
23484 +xen_ia64_privcmd_entry_munmap(struct xen_ia64_privcmd_range* privcmd_range,
23485 +                             int i)
23486 +{
23487 +       struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
23488 +       unsigned long gpfn = entry->gpfn;
23489 +       //gpfn = (privcmd_range->res->start >> PAGE_SHIFT) +
23490 +       //      (vma->vm_pgoff - privcmd_range->pgoff);
23491 +       int error;
23492 +
23493 +       error = HYPERVISOR_zap_physmap(gpfn, 0);
23494 +       if (error) {
23495 +               BUG();//XXX
23496 +       }
23497 +       entry->gpfn = INVALID_GPFN;
23498 +}
23499 +
23500 +static void
23501 +xen_ia64_privcmd_entry_open(struct xen_ia64_privcmd_range* privcmd_range,
23502 +                           int i)
23503 +{
23504 +       struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
23505 +       if (entry->gpfn != INVALID_GPFN) {
23506 +               atomic_inc(&entry->map_count);
23507 +       } else {
23508 +               BUG_ON(atomic_read(&entry->map_count) != 0);
23509 +       }
23510 +}
23511 +
23512 +static void
23513 +xen_ia64_privcmd_entry_close(struct xen_ia64_privcmd_range* privcmd_range,
23514 +                            int i)
23515 +{
23516 +       struct xen_ia64_privcmd_entry* entry = &privcmd_range->entries[i];
23517 +       if (entry->gpfn != INVALID_GPFN &&
23518 +           atomic_dec_and_test(&entry->map_count)) {
23519 +               xen_ia64_privcmd_entry_munmap(privcmd_range, i);
23520 +       }
23521 +}
23522 +
23523 +static void xen_ia64_privcmd_vma_open(struct vm_area_struct* vma);
23524 +static void xen_ia64_privcmd_vma_close(struct vm_area_struct* vma);
23525 +
23526 +struct vm_operations_struct xen_ia64_privcmd_vm_ops = {
23527 +       .open = &xen_ia64_privcmd_vma_open,
23528 +       .close = &xen_ia64_privcmd_vma_close,
23529 +};
23530 +
23531 +static void
23532 +__xen_ia64_privcmd_vma_open(struct vm_area_struct* vma,
23533 +                           struct xen_ia64_privcmd_vma* privcmd_vma,
23534 +                           struct xen_ia64_privcmd_range* privcmd_range)
23535 +{
23536 +       unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
23537 +       unsigned long num_entries = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
23538 +       unsigned long i;
23539 +
23540 +       BUG_ON(entry_offset < 0);
23541 +       BUG_ON(entry_offset + num_entries > privcmd_range->num_entries);
23542 +
23543 +       privcmd_vma->range = privcmd_range;
23544 +       privcmd_vma->num_entries = num_entries;
23545 +       privcmd_vma->entries = &privcmd_range->entries[entry_offset];
23546 +       vma->vm_private_data = privcmd_vma;
23547 +       for (i = 0; i < privcmd_vma->num_entries; i++) {
23548 +               xen_ia64_privcmd_entry_open(privcmd_range, entry_offset + i);
23549 +       }
23550 +
23551 +       vma->vm_private_data = privcmd_vma;
23552 +       vma->vm_ops = &xen_ia64_privcmd_vm_ops;
23553 +}
23554 +
23555 +static void
23556 +xen_ia64_privcmd_vma_open(struct vm_area_struct* vma)
23557 +{
23558 +       struct xen_ia64_privcmd_vma* privcmd_vma = (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
23559 +       struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
23560 +
23561 +       atomic_inc(&privcmd_range->ref_count);
23562 +       // vm_op->open() can't fail.
23563 +       privcmd_vma = kmalloc(sizeof(*privcmd_vma), GFP_KERNEL | __GFP_NOFAIL);
23564 +
23565 +       __xen_ia64_privcmd_vma_open(vma, privcmd_vma, privcmd_range);
23566 +}
23567 +
23568 +static void
23569 +xen_ia64_privcmd_vma_close(struct vm_area_struct* vma)
23570 +{
23571 +       struct xen_ia64_privcmd_vma* privcmd_vma =
23572 +               (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
23573 +       struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
23574 +       unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
23575 +       unsigned long i;
23576 +
23577 +       for (i = 0; i < privcmd_vma->num_entries; i++) {
23578 +               xen_ia64_privcmd_entry_close(privcmd_range, entry_offset + i);
23579 +       }
23580 +       vma->vm_private_data = NULL;
23581 +       kfree(privcmd_vma);
23582 +
23583 +       if (atomic_dec_and_test(&privcmd_range->ref_count)) {
23584 +#if 1
23585 +               for (i = 0; i < privcmd_range->num_entries; i++) {
23586 +                       struct xen_ia64_privcmd_entry* entry =
23587 +                               &privcmd_range->entries[i];
23588 +                       BUG_ON(atomic_read(&entry->map_count) != 0);
23589 +                       BUG_ON(entry->gpfn != INVALID_GPFN);
23590 +               }
23591 +#endif
23592 +               release_resource(privcmd_range->res);
23593 +               kfree(privcmd_range->res);
23594 +               vfree(privcmd_range);
23595 +       }
23596 +}
23597 +
23598 +int
23599 +privcmd_mmap(struct file * file, struct vm_area_struct * vma)
23600 +{
23601 +       int error;
23602 +       unsigned long size = vma->vm_end - vma->vm_start;
23603 +       unsigned long num_entries = size >> PAGE_SHIFT;
23604 +       struct xen_ia64_privcmd_range* privcmd_range = NULL;
23605 +       struct xen_ia64_privcmd_vma* privcmd_vma = NULL;
23606 +       struct resource* res = NULL;
23607 +       unsigned long i;
23608 +       BUG_ON(!is_running_on_xen());
23609 +
23610 +       BUG_ON(file->private_data != NULL);
23611 +
23612 +       error = -ENOMEM;
23613 +       privcmd_range =
23614 +               vmalloc(sizeof(*privcmd_range) +
23615 +                       sizeof(privcmd_range->entries[0]) * num_entries);
23616 +       if (privcmd_range == NULL) {
23617 +               goto out_enomem0;
23618 +       }
23619 +       privcmd_vma = kmalloc(sizeof(*privcmd_vma), GFP_KERNEL);
23620 +       if (privcmd_vma == NULL) {
23621 +               goto out_enomem1;
23622 +       }
23623 +       res = kzalloc(sizeof(*res), GFP_KERNEL);
23624 +       if (res == NULL) {
23625 +               goto out_enomem1;
23626 +       }
23627 +       res->name = "Xen privcmd mmap";
23628 +       error = allocate_resource(&iomem_resource, res, size,
23629 +                                 privcmd_resource_min, privcmd_resource_max,
23630 +                                 privcmd_resource_align, NULL, NULL);
23631 +       if (error) {
23632 +               goto out_enomem1;
23633 +       }
23634 +       privcmd_range->res = res;
23635 +
23636 +       /* DONTCOPY is essential for Xen as copy_page_range is broken. */
23637 +       vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP;
23638 +
23639 +       atomic_set(&privcmd_range->ref_count, 1);
23640 +       privcmd_range->pgoff = vma->vm_pgoff;
23641 +       privcmd_range->num_entries = num_entries;
23642 +       for (i = 0; i < privcmd_range->num_entries; i++) {
23643 +               xen_ia64_privcmd_init_entry(&privcmd_range->entries[i]);
23644 +       }
23645 +
23646 +       __xen_ia64_privcmd_vma_open(vma, privcmd_vma, privcmd_range);
23647 +       return 0;
23648 +
23649 +out_enomem1:
23650 +       kfree(res);
23651 +       kfree(privcmd_vma);
23652 +out_enomem0:
23653 +       vfree(privcmd_range);
23654 +       return error;
23655 +}
23656 +
23657 +int
23658 +direct_remap_pfn_range(struct vm_area_struct *vma,
23659 +                      unsigned long address,   // process virtual address
23660 +                      unsigned long mfn,       // mfn, mfn + 1, ... mfn + size/PAGE_SIZE
23661 +                      unsigned long size,
23662 +                      pgprot_t prot,
23663 +                      domid_t  domid)          // target domain
23664 +{
23665 +       struct xen_ia64_privcmd_vma* privcmd_vma =
23666 +               (struct xen_ia64_privcmd_vma*)vma->vm_private_data;
23667 +       struct xen_ia64_privcmd_range* privcmd_range = privcmd_vma->range;
23668 +       unsigned long entry_offset = vma->vm_pgoff - privcmd_range->pgoff;
23669 +
23670 +       unsigned long i;
23671 +       unsigned long offset;
23672 +       int error = 0;
23673 +       BUG_ON(!is_running_on_xen());
23674 +
23675 +#if 0
23676 +       if (pgprot_val(prot) != pgprot_val(vma->vm_page_prot)) {
23677 +               return -EINVAL;
23678 +       }
23679 +#endif
23680 +
23681 +       i = (address - vma->vm_start) >> PAGE_SHIFT;
23682 +       for (offset = 0; offset < size; offset += PAGE_SIZE) {
23683 +               error = xen_ia64_privcmd_entry_mmap(vma, (address + offset) & PAGE_MASK, privcmd_range, entry_offset + i, mfn, prot, domid);
23684 +               if (error != 0) {
23685 +                       break;
23686 +               }
23687 +
23688 +               i++;
23689 +               mfn++;
23690 +        }
23691 +
23692 +       return error;
23693 +}
23694 +
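The privcmd mmap code above keeps two levels of reference counting: a ref_count on the shared range, dropped when the last VMA copy is closed, and a map_count on each entry, dropped when the last mapping of that page is torn down. A minimal standalone sketch of that open/close pattern, with invented names, no Xen calls, and plain ints instead of atomic_t, might look like this:

#include <stdio.h>
#include <stdlib.h>

/* Invented analogue of xen_ia64_privcmd_range: one shared object with a
 * reference count, plus a per-entry mapping count.  Error handling and
 * locking are omitted; the kernel code uses atomic_t for both counters. */
struct range {
        int ref_count;
        int num_entries;
        int *map_count;
};

static struct range *range_create(int num_entries)
{
        struct range *r = malloc(sizeof(*r));

        r->ref_count = 1;
        r->num_entries = num_entries;
        r->map_count = calloc(num_entries, sizeof(int));
        return r;
}

/* vm_ops->open analogue: another VMA copy now references the range. */
static void range_open(struct range *r)
{
        r->ref_count++;
}

/* Per-entry map/unmap analogue of entry_open()/entry_close(). */
static void entry_map(struct range *r, int i)   { r->map_count[i]++; }
static void entry_unmap(struct range *r, int i) { r->map_count[i]--; }

/* vm_ops->close analogue: tear everything down on the last reference. */
static void range_close(struct range *r)
{
        if (--r->ref_count == 0) {
                free(r->map_count);
                free(r);
                printf("range released\n");
        }
}

int main(void)
{
        struct range *r = range_create(4);

        entry_map(r, 0);
        range_open(r);          /* e.g. the VMA was split */
        entry_unmap(r, 0);
        range_close(r);         /* one copy goes away */
        range_close(r);         /* last copy: range is freed */
        return 0;
}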
23695 diff -urNp linux-2.6/arch/ia64/xen/Makefile new/arch/ia64/xen/Makefile
23696 --- linux-2.6/arch/ia64/xen/Makefile    1970-01-01 01:00:00.000000000 +0100
23697 +++ new/arch/ia64/xen/Makefile  2006-06-07 13:15:16.000000000 +0200
23698 @@ -0,0 +1,8 @@
23699 +#
23700 +# Makefile for Xen components
23701 +#
23702 +
23703 +obj-y := hypercall.o xenivt.o xenentry.o xensetup.o xenpal.o xenhpski.o
23704 +
23705 +obj-$(CONFIG_XEN_IA64_DOM0_VP) += hypervisor.o pci-dma-xen.o util.o
23706 +pci-dma-xen-$(CONFIG_XEN_IA64_DOM0_VP) := ../../i386/kernel/pci-dma-xen.o
23707 diff -urNp linux-2.6/arch/ia64/xen/util.c new/arch/ia64/xen/util.c
23708 --- linux-2.6/arch/ia64/xen/util.c      1970-01-01 01:00:00.000000000 +0100
23709 +++ new/arch/ia64/xen/util.c    2006-06-28 14:32:13.000000000 +0200
23710 @@ -0,0 +1,130 @@
23711 +/******************************************************************************
23712 + * arch/ia64/xen/util.c
23713 + * This file is the ia64 counterpart of drivers/xen/util.c
23714 + *
23715 + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
23716 + *                    VA Linux Systems Japan K.K.
23717 + *
23718 + * This program is free software; you can redistribute it and/or modify
23719 + * it under the terms of the GNU General Public License as published by
23720 + * the Free Software Foundation; either version 2 of the License, or
23721 + * (at your option) any later version.
23722 + *
23723 + * This program is distributed in the hope that it will be useful,
23724 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
23725 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23726 + * GNU General Public License for more details.
23727 + *
23728 + * You should have received a copy of the GNU General Public License
23729 + * along with this program; if not, write to the Free Software
23730 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23731 + *
23732 + */
23733 +
23734 +#include <linux/config.h>
23735 +#include <linux/mm.h>
23736 +#include <linux/module.h>
23737 +#include <linux/slab.h>
23738 +#include <linux/vmalloc.h>
23739 +#include <asm/uaccess.h>
23740 +#include <xen/driver_util.h>
23741 +
23742 +struct vm_struct *alloc_vm_area(unsigned long size)
23743 +{
23744 +       int order;
23745 +       unsigned long virt;
23746 +       unsigned long nr_pages;
23747 +       struct vm_struct* area;
23748 +       
23749 +       order = get_order(size);
23750 +       virt = __get_free_pages(GFP_KERNEL, order);
23751 +       if (virt == 0) {
23752 +               goto err0;
23753 +       }
23754 +       nr_pages = 1 << order;
23755 +       scrub_pages(virt, nr_pages);
23756 +       
23757 +       area = kmalloc(sizeof(*area), GFP_KERNEL);
23758 +       if (area == NULL) {
23759 +               goto err1;
23760 +       }
23761 +       
23762 +       area->flags = VM_IOREMAP; // XXX
23763 +       area->addr = (void*)virt;
23764 +       area->size = size;
23765 +       area->pages = NULL; // XXX
23766 +       area->nr_pages = nr_pages;
23767 +       area->phys_addr = __pa(virt);
23768 +
23769 +       return area;
23770 +
23771 +err1:
23772 +       free_pages(virt, order);
23773 +err0:
23774 +       return NULL;
23775 +       
23776 +}
23777 +EXPORT_SYMBOL_GPL(alloc_vm_area);
23778 +
23779 +void free_vm_area(struct vm_struct *area)
23780 +{
23781 +       unsigned int order = get_order(area->size);
23782 +       unsigned long i;
23783 +
23784 +       // This area is used for foreign page mapping,
23785 +       // so the underlying machine pages may not be assigned.
23786 +       for (i = 0; i < (1 << order); i++) {
23787 +               unsigned long ret;
23788 +               unsigned long gpfn = (area->phys_addr >> PAGE_SHIFT) + i;
23789 +               struct xen_memory_reservation reservation = {
23790 +                       .nr_extents   = 1,
23791 +                       .address_bits = 0,
23792 +                       .extent_order = 0,
23793 +                       .domid        = DOMID_SELF
23794 +               };
23795 +               set_xen_guest_handle(reservation.extent_start, &gpfn);
23796 +               ret = HYPERVISOR_memory_op(XENMEM_populate_physmap,
23797 +                                          &reservation);
23798 +               BUG_ON(ret != 1);
23799 +       }
23800 +       free_pages((unsigned long)area->addr, order);
23801 +       kfree(area);
23802 +}
23803 +EXPORT_SYMBOL_GPL(free_vm_area);
23804 +
23805 +void lock_vm_area(struct vm_struct *area)
23806 +{
23807 +       // nothing
23808 +}
23809 +EXPORT_SYMBOL_GPL(lock_vm_area);
23810 +
23811 +void unlock_vm_area(struct vm_struct *area)
23812 +{
23813 +       // nothing
23814 +}
23815 +EXPORT_SYMBOL_GPL(unlock_vm_area);
23816 +
23817 +#ifndef CONFIG_XEN_IA64_DOM0_VP
23818 +/* We just need a range of legal virtual addresses here, though the
23819 + * identity-mapped range is ultimately what is used for gnttab mapping.
23820 + */
23821 +unsigned long alloc_empty_foreign_map_page_range(unsigned long pages)
23822 +{
23823 +       struct vm_struct *vma;
23824 +
23825 +       if ( (vma = get_vm_area(PAGE_SIZE * pages, VM_ALLOC)) == NULL )
23826 +               return 0;
23827 +
23828 +       return (unsigned long)vma->addr;
23829 +}
23830 +#endif
23831 +
23832 +/*
23833 + * Local variables:
23834 + *  c-file-style: "linux"
23835 + *  indent-tabs-mode: t
23836 + *  c-indent-level: 8
23837 + *  c-basic-offset: 8
23838 + *  tab-width: 8
23839 + * End:
23840 + */
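alloc_vm_area() above rounds the requested size up to a power-of-two number of pages via get_order() before calling __get_free_pages(). As a standalone illustration of that rounding (assuming 16KB pages, the ia64 default, purely as an example), a user-space equivalent could be:

#include <stdio.h>

#define PAGE_SHIFT 14                   /* example: 16KB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Smallest 'order' such that (PAGE_SIZE << order) >= size, mirroring the
 * kernel's get_order(); size must be greater than zero. */
static int order_for_size(unsigned long size)
{
        int order = 0;

        size = (size - 1) >> PAGE_SHIFT;
        while (size) {
                order++;
                size >>= 1;
        }
        return order;
}

int main(void)
{
        printf("%d\n", order_for_size(PAGE_SIZE));         /* 0 */
        printf("%d\n", order_for_size(PAGE_SIZE + 1));     /* 1 */
        printf("%d\n", order_for_size(5 * PAGE_SIZE));     /* 3 */
        return 0;
}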
23841 diff -urNp linux-2.6/arch/ia64/xen/xenentry.S new/arch/ia64/xen/xenentry.S
23842 --- linux-2.6/arch/ia64/xen/xenentry.S  1970-01-01 01:00:00.000000000 +0100
23843 +++ new/arch/ia64/xen/xenentry.S        2006-05-23 18:42:17.000000000 +0200
23844 @@ -0,0 +1,883 @@
23845 +/*
23846 + * ia64/xen/entry.S
23847 + *
23848 + * Alternate kernel routines for Xen.  Heavily leveraged from
23849 + *   ia64/kernel/entry.S
23850 + *
23851 + * Copyright (C) 2005 Hewlett-Packard Co
23852 + *     Dan Magenheimer <dan.magenheimer@hp.com>
23853 + */
23854 +
23855 +#include <linux/config.h>
23856 +
23857 +#include <asm/asmmacro.h>
23858 +#include <asm/cache.h>
23859 +#include <asm/errno.h>
23860 +#include <asm/kregs.h>
23861 +#include <asm/asm-offsets.h>
23862 +#include <asm/pgtable.h>
23863 +#include <asm/percpu.h>
23864 +#include <asm/processor.h>
23865 +#include <asm/thread_info.h>
23866 +#include <asm/unistd.h>
23867 +
23868 +#ifdef CONFIG_XEN
23869 +#include "xenminstate.h"
23870 +#else
23871 +#include "minstate.h"
23872 +#endif
23873 +
23874 +/*
23875 + * prev_task <- ia64_switch_to(struct task_struct *next)
23876 + *     With Ingo's new scheduler, interrupts are disabled when this routine gets
23877 + *     called.  The code starting at .map relies on this.  The rest of the code
23878 + *     doesn't care about the interrupt masking status.
23879 + */
23880 +#ifdef CONFIG_XEN
23881 +GLOBAL_ENTRY(xen_switch_to)
23882 +       .prologue
23883 +       alloc r16=ar.pfs,1,0,0,0
23884 +       movl r22=running_on_xen;;
23885 +       ld4 r22=[r22];;
23886 +       cmp.eq p7,p0=r22,r0
23887 +(p7)   br.cond.sptk.many __ia64_switch_to;;
23888 +#else
23889 +GLOBAL_ENTRY(ia64_switch_to)
23890 +       .prologue
23891 +       alloc r16=ar.pfs,1,0,0,0
23892 +#endif
23893 +       DO_SAVE_SWITCH_STACK
23894 +       .body
23895 +
23896 +       adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13
23897 +       movl r25=init_task
23898 +       mov r27=IA64_KR(CURRENT_STACK)
23899 +       adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0
23900 +       dep r20=0,in0,61,3              // physical address of "next"
23901 +       ;;
23902 +       st8 [r22]=sp                    // save kernel stack pointer of old task
23903 +       shr.u r26=r20,IA64_GRANULE_SHIFT
23904 +       cmp.eq p7,p6=r25,in0
23905 +       ;;
23906 +#ifdef CONFIG_XEN
23907 +       movl r8=XSI_PSR_IC
23908 +       ;;
23909 +       st4 [r8]=r0     // force psr.ic off for hyperprivop(s)
23910 +       ;;
23911 +#endif
23912 +       /*
23913 +        * If we've already mapped this task's page, we can skip doing it again.
23914 +        */
23915 +(p6)   cmp.eq p7,p6=r26,r27
23916 +(p6)   br.cond.dpnt .map
23917 +       ;;
23918 +.done:
23919 +#ifdef CONFIG_XEN
23920 +       // psr.ic already off
23921 +       // update "current" application register
23922 +       mov r8=IA64_KR_CURRENT
23923 +       mov r9=in0;;
23924 +       XEN_HYPER_SET_KR
23925 +       ld8 sp=[r21]                    // load kernel stack pointer of new task
23926 +       movl r27=XSI_PSR_IC
23927 +       mov r8=1
23928 +       ;;
23929 +       st4 [r27]=r8                    // psr.ic back on
23930 +#else
23931 +       ld8 sp=[r21]                    // load kernel stack pointer of new task
23932 +       mov IA64_KR(CURRENT)=in0        // update "current" application register
23933 +#endif
23934 +       mov r8=r13                      // return pointer to previously running task
23935 +       mov r13=in0                     // set "current" pointer
23936 +       ;;
23937 +       DO_LOAD_SWITCH_STACK
23938 +
23939 +#ifdef CONFIG_SMP
23940 +       sync.i                          // ensure "fc"s done by this CPU are visible on other CPUs
23941 +#endif
23942 +       br.ret.sptk.many rp             // boogie on out in new context
23943 +
23944 +.map:
23945 +#ifdef CONFIG_XEN
23946 +       // psr.ic already off
23947 +#else
23948 +       rsm psr.ic                      // interrupts (psr.i) are already disabled here
23949 +#endif
23950 +       movl r25=PAGE_KERNEL
23951 +       ;;
23952 +       srlz.d
23953 +       or r23=r25,r20                  // construct PA | page properties
23954 +       mov r25=IA64_GRANULE_SHIFT<<2
23955 +       ;;
23956 +#ifdef CONFIG_XEN
23957 +       movl r8=XSI_ITIR
23958 +       ;;
23959 +       st8 [r8]=r25
23960 +       ;;
23961 +       movl r8=XSI_IFA
23962 +       ;;
23963 +       st8 [r8]=in0                     // VA of next task...
23964 +       ;;
23965 +       mov r25=IA64_TR_CURRENT_STACK
23966 +       // remember last page we mapped...
23967 +       mov r8=IA64_KR_CURRENT_STACK
23968 +       mov r9=r26;;
23969 +       XEN_HYPER_SET_KR;;
23970 +#else
23971 +       mov cr.itir=r25
23972 +       mov cr.ifa=in0                  // VA of next task...
23973 +       ;;
23974 +       mov r25=IA64_TR_CURRENT_STACK
23975 +       mov IA64_KR(CURRENT_STACK)=r26  // remember last page we mapped...
23976 +#endif
23977 +       ;;
23978 +       itr.d dtr[r25]=r23              // wire in new mapping...
23979 +#ifndef CONFIG_XEN
23980 +       ssm psr.ic                      // reenable the psr.ic bit
23981 +       ;;
23982 +       srlz.d
23983 +#endif
23984 +       br.cond.sptk .done
23985 +#ifdef CONFIG_XEN
23986 +END(xen_switch_to)
23987 +#else
23988 +END(ia64_switch_to)
23989 +#endif
23990 +
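Each of these Xen entry points starts by loading the running_on_xen flag and branching to the native __ia64_* routine when it is zero, so one kernel image works both on bare metal and under Xen. In C terms the dispatch amounts to something like the following standalone sketch (names and printfs are illustrative only):

#include <stdio.h>

static int running_on_xen;      /* set at boot in the real code */

static void native_switch_to(void)
{
        printf("native context switch path\n");
}

static void xen_aware_switch_to(void)
{
        /* Mirrors the assembly: fall back to the native path when the
         * kernel is not running on top of Xen. */
        if (!running_on_xen) {
                native_switch_to();
                return;
        }
        printf("paravirtualized context switch path\n");
}

int main(void)
{
        xen_aware_switch_to();  /* bare metal: takes the native path */
        running_on_xen = 1;
        xen_aware_switch_to();  /* under Xen: takes the paravirtualized path */
        return 0;
}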
23991 +       /*
23992 +        * Invoke a system call, but do some tracing before and after the call.
23993 +        * We MUST preserve the current register frame throughout this routine
23994 +        * because some system calls (such as ia64_execve) directly
23995 +        * manipulate ar.pfs.
23996 +        */
23997 +#ifdef CONFIG_XEN
23998 +GLOBAL_ENTRY(xen_trace_syscall)
23999 +       PT_REGS_UNWIND_INFO(0)
24000 +       movl r16=running_on_xen;;
24001 +       ld4 r16=[r16];;
24002 +       cmp.eq p7,p0=r16,r0
24003 +(p7)   br.cond.sptk.many __ia64_trace_syscall;;
24004 +#else
24005 +GLOBAL_ENTRY(ia64_trace_syscall)
24006 +       PT_REGS_UNWIND_INFO(0)
24007 +#endif
24008 +       /*
24009 +        * We need to preserve the scratch registers f6-f11 in case the system
24010 +        * call is sigreturn.
24011 +        */
24012 +       adds r16=PT(F6)+16,sp
24013 +       adds r17=PT(F7)+16,sp
24014 +       ;;
24015 +       stf.spill [r16]=f6,32
24016 +       stf.spill [r17]=f7,32
24017 +       ;;
24018 +       stf.spill [r16]=f8,32
24019 +       stf.spill [r17]=f9,32
24020 +       ;;
24021 +       stf.spill [r16]=f10
24022 +       stf.spill [r17]=f11
24023 +       br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args
24024 +       adds r16=PT(F6)+16,sp
24025 +       adds r17=PT(F7)+16,sp
24026 +       ;;
24027 +       ldf.fill f6=[r16],32
24028 +       ldf.fill f7=[r17],32
24029 +       ;;
24030 +       ldf.fill f8=[r16],32
24031 +       ldf.fill f9=[r17],32
24032 +       ;;
24033 +       ldf.fill f10=[r16]
24034 +       ldf.fill f11=[r17]
24035 +       // the syscall number may have changed, so re-load it and re-calculate the
24036 +       // syscall entry-point:
24037 +       adds r15=PT(R15)+16,sp                  // r15 = &pt_regs.r15 (syscall #)
24038 +       ;;
24039 +       ld8 r15=[r15]
24040 +       mov r3=NR_syscalls - 1
24041 +       ;;
24042 +       adds r15=-1024,r15
24043 +       movl r16=sys_call_table
24044 +       ;;
24045 +       shladd r20=r15,3,r16                    // r20 = sys_call_table + 8*(syscall-1024)
24046 +       cmp.leu p6,p7=r15,r3
24047 +       ;;
24048 +(p6)   ld8 r20=[r20]                           // load address of syscall entry point
24049 +(p7)   movl r20=sys_ni_syscall
24050 +       ;;
24051 +       mov b6=r20
24052 +       br.call.sptk.many rp=b6                 // do the syscall
24053 +.strace_check_retval:
24054 +       cmp.lt p6,p0=r8,r0                      // syscall failed?
24055 +       adds r2=PT(R8)+16,sp                    // r2 = &pt_regs.r8
24056 +       adds r3=PT(R10)+16,sp                   // r3 = &pt_regs.r10
24057 +       mov r10=0
24058 +(p6)   br.cond.sptk strace_error               // syscall failed ->
24059 +       ;;                                      // avoid RAW on r10
24060 +.strace_save_retval:
24061 +.mem.offset 0,0; st8.spill [r2]=r8             // store return value in slot for r8
24062 +.mem.offset 8,0; st8.spill [r3]=r10            // clear error indication in slot for r10
24063 +       br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
24064 +.ret3:
24065 +(pUStk)        cmp.eq.unc p6,p0=r0,r0                  // p6 <- pUStk
24066 +       br.cond.sptk .work_pending_syscall_end
24067 +
24068 +strace_error:
24069 +       ld8 r3=[r2]                             // load pt_regs.r8
24070 +       sub r9=0,r8                             // negate return value to get errno value
24071 +       ;;
24072 +       cmp.ne p6,p0=r3,r0                      // is pt_regs.r8!=0?
24073 +       adds r3=16,r2                           // r3=&pt_regs.r10
24074 +       ;;
24075 +(p6)   mov r10=-1
24076 +(p6)   mov r8=r9
24077 +       br.cond.sptk .strace_save_retval
24078 +#ifdef CONFIG_XEN
24079 +END(xen_trace_syscall)
24080 +#else
24081 +END(ia64_trace_syscall)
24082 +#endif
24083 +
24084 +/*
24085 + * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
24086 + *     need to switch to bank 0 and doesn't restore the scratch registers.
24087 + *     To avoid leaking kernel bits, the scratch registers are set to
24088 + *     the following known-to-be-safe values:
24089 + *
24090 + *               r1: restored (global pointer)
24091 + *               r2: cleared
24092 + *               r3: 1 (when returning to user-level)
24093 + *           r8-r11: restored (syscall return value(s))
24094 + *              r12: restored (user-level stack pointer)
24095 + *              r13: restored (user-level thread pointer)
24096 + *              r14: set to __kernel_syscall_via_epc
24097 + *              r15: restored (syscall #)
24098 + *          r16-r17: cleared
24099 + *              r18: user-level b6
24100 + *              r19: cleared
24101 + *              r20: user-level ar.fpsr
24102 + *              r21: user-level b0
24103 + *              r22: cleared
24104 + *              r23: user-level ar.bspstore
24105 + *              r24: user-level ar.rnat
24106 + *              r25: user-level ar.unat
24107 + *              r26: user-level ar.pfs
24108 + *              r27: user-level ar.rsc
24109 + *              r28: user-level ip
24110 + *              r29: user-level psr
24111 + *              r30: user-level cfm
24112 + *              r31: user-level pr
24113 + *           f6-f11: cleared
24114 + *               pr: restored (user-level pr)
24115 + *               b0: restored (user-level rp)
24116 + *               b6: restored
24117 + *               b7: set to __kernel_syscall_via_epc
24118 + *          ar.unat: restored (user-level ar.unat)
24119 + *           ar.pfs: restored (user-level ar.pfs)
24120 + *           ar.rsc: restored (user-level ar.rsc)
24121 + *          ar.rnat: restored (user-level ar.rnat)
24122 + *      ar.bspstore: restored (user-level ar.bspstore)
24123 + *          ar.fpsr: restored (user-level ar.fpsr)
24124 + *           ar.ccv: cleared
24125 + *           ar.csd: cleared
24126 + *           ar.ssd: cleared
24127 + */
24128 +#ifdef CONFIG_XEN
24129 +GLOBAL_ENTRY(xen_leave_syscall)
24130 +       PT_REGS_UNWIND_INFO(0)
24131 +       movl r22=running_on_xen;;
24132 +       ld4 r22=[r22];;
24133 +       cmp.eq p7,p0=r22,r0
24134 +(p7)   br.cond.sptk.many __ia64_leave_syscall;;
24135 +#else
24136 +ENTRY(ia64_leave_syscall)
24137 +       PT_REGS_UNWIND_INFO(0)
24138 +#endif
24139 +       /*
24140 +        * work.need_resched etc. mustn't get changed by this CPU before it returns to
24141 +        * user- or fsys-mode, hence we disable interrupts early on.
24142 +        *
24143 +        * p6 controls whether current_thread_info()->flags needs to be checked for
24144 +        * extra work.  We always check for extra work when returning to user-level.
24145 +        * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
24146 +        * is 0.  After extra work processing has been completed, execution
24147 +        * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
24148 +        * needs to be redone.
24149 +        */
24150 +#ifdef CONFIG_PREEMPT
24151 +       rsm psr.i                               // disable interrupts
24152 +       cmp.eq pLvSys,p0=r0,r0                  // pLvSys=1: leave from syscall
24153 +(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
24154 +       ;;
24155 +       .pred.rel.mutex pUStk,pKStk
24156 +(pKStk) ld4 r21=[r20]                  // r21 <- preempt_count
24157 +(pUStk)        mov r21=0                       // r21 <- 0
24158 +       ;;
24159 +       cmp.eq p6,p0=r21,r0             // p6 <- pUStk || (preempt_count == 0)
24160 +#else /* !CONFIG_PREEMPT */
24161 +#ifdef CONFIG_XEN
24162 +       movl r2=XSI_PSR_I_ADDR
24163 +       mov r18=1
24164 +       ;;
24165 +       ld8 r2=[r2]
24166 +       ;;
24167 +(pUStk)        st1 [r2]=r18
24168 +#else
24169 +(pUStk)        rsm psr.i
24170 +#endif
24171 +       cmp.eq pLvSys,p0=r0,r0          // pLvSys=1: leave from syscall
24172 +(pUStk)        cmp.eq.unc p6,p0=r0,r0          // p6 <- pUStk
24173 +#endif
24174 +.work_processed_syscall:
24175 +       adds r2=PT(LOADRS)+16,r12
24176 +       adds r3=PT(AR_BSPSTORE)+16,r12
24177 +       adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
24178 +       ;;
24179 +(p6)   ld4 r31=[r18]                           // load current_thread_info()->flags
24180 +       ld8 r19=[r2],PT(B6)-PT(LOADRS)          // load ar.rsc value for "loadrs"
24181 +       nop.i 0
24182 +       ;;
24183 +       mov r16=ar.bsp                          // M2  get existing backing store pointer
24184 +       ld8 r18=[r2],PT(R9)-PT(B6)              // load b6
24185 +(p6)   and r15=TIF_WORK_MASK,r31               // any work other than TIF_SYSCALL_TRACE?
24186 +       ;;
24187 +       ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE)    // load ar.bspstore (may be garbage)
24188 +(p6)   cmp4.ne.unc p6,p0=r15, r0               // any special work pending?
24189 +(p6)   br.cond.spnt .work_pending_syscall
24190 +       ;;
24191 +       // start restoring the state saved on the kernel stack (struct pt_regs):
24192 +       ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
24193 +       ld8 r11=[r3],PT(CR_IIP)-PT(R11)
24194 +(pNonSys) break 0              //      bug check: we shouldn't be here if pNonSys is TRUE!
24195 +       ;;
24196 +       invala                  // M0|1 invalidate ALAT
24197 +#ifdef CONFIG_XEN
24198 +       movl r28=XSI_PSR_I_ADDR
24199 +       movl r29=XSI_PSR_IC
24200 +       ;;
24201 +       ld8 r28=[r28]
24202 +       mov r30=1
24203 +       ;;
24204 +       st1     [r28]=r30
24205 +       st4     [r29]=r0        // note: clears both vpsr.i and vpsr.ic!
24206 +       ;;
24207 +#else
24208 +       rsm psr.i | psr.ic      // M2   turn off interrupts and interruption collection
24209 +#endif
24210 +       cmp.eq p9,p0=r0,r0      // A    set p9 to indicate that we should restore cr.ifs
24211 +
24212 +       ld8 r29=[r2],16         // M0|1 load cr.ipsr
24213 +       ld8 r28=[r3],16         // M0|1 load cr.iip
24214 +       mov r22=r0              // A    clear r22
24215 +       ;;
24216 +       ld8 r30=[r2],16         // M0|1 load cr.ifs
24217 +       ld8 r25=[r3],16         // M0|1 load ar.unat
24218 +(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
24219 +       ;;
24220 +       ld8 r26=[r2],PT(B0)-PT(AR_PFS)  // M0|1 load ar.pfs
24221 +(pKStk)        mov r22=psr                     // M2   read PSR now that interrupts are disabled
24222 +       nop 0
24223 +       ;;
24224 +       ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // M0|1 load b0
24225 +       ld8 r27=[r3],PT(PR)-PT(AR_RSC)  // M0|1 load ar.rsc
24226 +       mov f6=f0                       // F    clear f6
24227 +       ;;
24228 +       ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT)    // M0|1 load ar.rnat (may be garbage)
24229 +       ld8 r31=[r3],PT(R1)-PT(PR)              // M0|1 load predicates
24230 +       mov f7=f0                               // F    clear f7
24231 +       ;;
24232 +       ld8 r20=[r2],PT(R12)-PT(AR_FPSR)        // M0|1 load ar.fpsr
24233 +       ld8.fill r1=[r3],16                     // M0|1 load r1
24234 +(pUStk) mov r17=1                              // A
24235 +       ;;
24236 +(pUStk) st1 [r14]=r17                          // M2|3
24237 +       ld8.fill r13=[r3],16                    // M0|1
24238 +       mov f8=f0                               // F    clear f8
24239 +       ;;
24240 +       ld8.fill r12=[r2]                       // M0|1 restore r12 (sp)
24241 +       ld8.fill r15=[r3]                       // M0|1 restore r15
24242 +       mov b6=r18                              // I0   restore b6
24243 +
24244 +       addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 // A
24245 +       mov f9=f0                                       // F    clear f9
24246 +(pKStk) br.cond.dpnt.many skip_rbs_switch              // B
24247 +
24248 +       srlz.d                          // M0   ensure interruption collection is off (for cover)
24249 +       shr.u r18=r19,16                // I0|1 get byte size of existing "dirty" partition
24250 +#ifdef CONFIG_XEN
24251 +       XEN_HYPER_COVER;
24252 +#else
24253 +       cover                           // B    add current frame into dirty partition & set cr.ifs
24254 +#endif
24255 +       ;;
24256 +(pUStk) ld4 r17=[r17]                  // M0|1 r17 = cpu_data->phys_stacked_size_p8
24257 +       mov r19=ar.bsp                  // M2   get new backing store pointer
24258 +       mov f10=f0                      // F    clear f10
24259 +
24260 +       nop.m 0
24261 +       movl r14=__kernel_syscall_via_epc // X
24262 +       ;;
24263 +       mov.m ar.csd=r0                 // M2   clear ar.csd
24264 +       mov.m ar.ccv=r0                 // M2   clear ar.ccv
24265 +       mov b7=r14                      // I0   clear b7 (hint with __kernel_syscall_via_epc)
24266 +
24267 +       mov.m ar.ssd=r0                 // M2   clear ar.ssd
24268 +       mov f11=f0                      // F    clear f11
24269 +       br.cond.sptk.many rbs_switch    // B
24270 +#ifdef CONFIG_XEN
24271 +END(xen_leave_syscall)
24272 +#else
24273 +END(ia64_leave_syscall)
24274 +#endif
24275 +
24276 +#ifdef CONFIG_XEN
24277 +GLOBAL_ENTRY(xen_leave_kernel)
24278 +       PT_REGS_UNWIND_INFO(0)
24279 +       movl r22=running_on_xen;;
24280 +       ld4 r22=[r22];;
24281 +       cmp.eq p7,p0=r22,r0
24282 +(p7)   br.cond.sptk.many __ia64_leave_kernel;;
24283 +#else
24284 +GLOBAL_ENTRY(ia64_leave_kernel)
24285 +       PT_REGS_UNWIND_INFO(0)
24286 +#endif
24287 +       /*
24288 +        * work.need_resched etc. mustn't get changed by this CPU before it returns to
24289 +        * user- or fsys-mode, hence we disable interrupts early on.
24290 +        *
24291 +        * p6 controls whether current_thread_info()->flags needs to be checked for
24292 +        * extra work.  We always check for extra work when returning to user-level.
24293 +        * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
24294 +        * is 0.  After extra work processing has been completed, execution
24295 +        * resumes at .work_processed_kernel with p6 set to 1 if the extra-work-check
24296 +        * needs to be redone.
24297 +        */
24298 +#ifdef CONFIG_PREEMPT
24299 +       rsm psr.i                               // disable interrupts
24300 +       cmp.eq p0,pLvSys=r0,r0                  // pLvSys=0: leave from kernel
24301 +(pKStk)        adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
24302 +       ;;
24303 +       .pred.rel.mutex pUStk,pKStk
24304 +(pKStk)        ld4 r21=[r20]                   // r21 <- preempt_count
24305 +(pUStk)        mov r21=0                       // r21 <- 0
24306 +       ;;
24307 +       cmp.eq p6,p0=r21,r0             // p6 <- pUStk || (preempt_count == 0)
24308 +#else
24309 +#ifdef CONFIG_XEN
24310 +(pUStk)        movl r17=XSI_PSR_I_ADDR
24311 +(pUStk)        mov r31=1
24312 +               ;;
24313 +(pUStk)        ld8 r17=[r17]
24314 +               ;;
24315 +(pUStk)        st1 [r17]=r31
24316 +       ;;
24317 +#else
24318 +(pUStk)        rsm psr.i
24319 +#endif
24320 +       cmp.eq p0,pLvSys=r0,r0          // pLvSys=0: leave from kernel
24321 +(pUStk)        cmp.eq.unc p6,p0=r0,r0          // p6 <- pUStk
24322 +#endif
24323 +.work_processed_kernel:
24324 +       adds r17=TI_FLAGS+IA64_TASK_SIZE,r13
24325 +       ;;
24326 +(p6)   ld4 r31=[r17]                           // load current_thread_info()->flags
24327 +       adds r21=PT(PR)+16,r12
24328 +       ;;
24329 +
24330 +       lfetch [r21],PT(CR_IPSR)-PT(PR)
24331 +       adds r2=PT(B6)+16,r12
24332 +       adds r3=PT(R16)+16,r12
24333 +       ;;
24334 +       lfetch [r21]
24335 +       ld8 r28=[r2],8          // load b6
24336 +       adds r29=PT(R24)+16,r12
24337 +
24338 +       ld8.fill r16=[r3],PT(AR_CSD)-PT(R16)
24339 +       adds r30=PT(AR_CCV)+16,r12
24340 +(p6)   and r19=TIF_WORK_MASK,r31               // any work other than TIF_SYSCALL_TRACE?
24341 +       ;;
24342 +       ld8.fill r24=[r29]
24343 +       ld8 r15=[r30]           // load ar.ccv
24344 +(p6)   cmp4.ne.unc p6,p0=r19, r0               // any special work pending?
24345 +       ;;
24346 +       ld8 r29=[r2],16         // load b7
24347 +       ld8 r30=[r3],16         // load ar.csd
24348 +(p6)   br.cond.spnt .work_pending
24349 +       ;;
24350 +       ld8 r31=[r2],16         // load ar.ssd
24351 +       ld8.fill r8=[r3],16
24352 +       ;;
24353 +       ld8.fill r9=[r2],16
24354 +       ld8.fill r10=[r3],PT(R17)-PT(R10)
24355 +       ;;
24356 +       ld8.fill r11=[r2],PT(R18)-PT(R11)
24357 +       ld8.fill r17=[r3],16
24358 +       ;;
24359 +       ld8.fill r18=[r2],16
24360 +       ld8.fill r19=[r3],16
24361 +       ;;
24362 +       ld8.fill r20=[r2],16
24363 +       ld8.fill r21=[r3],16
24364 +       mov ar.csd=r30
24365 +       mov ar.ssd=r31
24366 +       ;;
24367 +#ifdef CONFIG_XEN
24368 +       movl r23=XSI_PSR_I_ADDR
24369 +       movl r22=XSI_PSR_IC
24370 +       ;;
24371 +       ld8 r23=[r23]
24372 +       mov r25=1
24373 +       ;;
24374 +       st1 [r23]=r25
24375 +       st4 [r22]=r0            // note: clears both vpsr.i and vpsr.ic!
24376 +       ;;
24377 +#else
24378 +       rsm psr.i | psr.ic      // initiate turning off of interrupt and interruption collection
24379 +#endif
24380 +       invala                  // invalidate ALAT
24381 +       ;;
24382 +       ld8.fill r22=[r2],24
24383 +       ld8.fill r23=[r3],24
24384 +       mov b6=r28
24385 +       ;;
24386 +       ld8.fill r25=[r2],16
24387 +       ld8.fill r26=[r3],16
24388 +       mov b7=r29
24389 +       ;;
24390 +       ld8.fill r27=[r2],16
24391 +       ld8.fill r28=[r3],16
24392 +       ;;
24393 +       ld8.fill r29=[r2],16
24394 +       ld8.fill r30=[r3],24
24395 +       ;;
24396 +       ld8.fill r31=[r2],PT(F9)-PT(R31)
24397 +       adds r3=PT(F10)-PT(F6),r3
24398 +       ;;
24399 +       ldf.fill f9=[r2],PT(F6)-PT(F9)
24400 +       ldf.fill f10=[r3],PT(F8)-PT(F10)
24401 +       ;;
24402 +       ldf.fill f6=[r2],PT(F7)-PT(F6)
24403 +       ;;
24404 +       ldf.fill f7=[r2],PT(F11)-PT(F7)
24405 +       ldf.fill f8=[r3],32
24406 +       ;;
24407 +       srlz.d  // ensure that inter. collection is off (VHPT is don't care, since text is pinned)
24408 +       mov ar.ccv=r15
24409 +       ;;
24410 +       ldf.fill f11=[r2]
24411 +#ifdef CONFIG_XEN
24412 +       ;;
24413 +       // r16-r31 all now hold bank1 values
24414 +       movl r2=XSI_BANK1_R16
24415 +       movl r3=XSI_BANK1_R16+8
24416 +       ;;
24417 +.mem.offset 0,0; st8.spill [r2]=r16,16
24418 +.mem.offset 8,0; st8.spill [r3]=r17,16
24419 +       ;;
24420 +.mem.offset 0,0; st8.spill [r2]=r18,16
24421 +.mem.offset 8,0; st8.spill [r3]=r19,16
24422 +       ;;
24423 +.mem.offset 0,0; st8.spill [r2]=r20,16
24424 +.mem.offset 8,0; st8.spill [r3]=r21,16
24425 +       ;;
24426 +.mem.offset 0,0; st8.spill [r2]=r22,16
24427 +.mem.offset 8,0; st8.spill [r3]=r23,16
24428 +       ;;
24429 +.mem.offset 0,0; st8.spill [r2]=r24,16
24430 +.mem.offset 8,0; st8.spill [r3]=r25,16
24431 +       ;;
24432 +.mem.offset 0,0; st8.spill [r2]=r26,16
24433 +.mem.offset 8,0; st8.spill [r3]=r27,16
24434 +       ;;
24435 +.mem.offset 0,0; st8.spill [r2]=r28,16
24436 +.mem.offset 8,0; st8.spill [r3]=r29,16
24437 +       ;;
24438 +.mem.offset 0,0; st8.spill [r2]=r30,16
24439 +.mem.offset 8,0; st8.spill [r3]=r31,16
24440 +       ;;
24441 +       movl r2=XSI_BANKNUM;;
24442 +       st4 [r2]=r0;
24443 +#else
24444 +       bsw.0                   // switch back to bank 0 (no stop bit required beforehand...)
24445 +#endif
24446 +       ;;
24447 +(pUStk)        mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency)
24448 +       adds r16=PT(CR_IPSR)+16,r12
24449 +       adds r17=PT(CR_IIP)+16,r12
24450 +
24451 +(pKStk)        mov r22=psr             // M2 read PSR now that interrupts are disabled
24452 +       nop.i 0
24453 +       nop.i 0
24454 +       ;;
24455 +       ld8 r29=[r16],16        // load cr.ipsr
24456 +       ld8 r28=[r17],16        // load cr.iip
24457 +       ;;
24458 +       ld8 r30=[r16],16        // load cr.ifs
24459 +       ld8 r25=[r17],16        // load ar.unat
24460 +       ;;
24461 +       ld8 r26=[r16],16        // load ar.pfs
24462 +       ld8 r27=[r17],16        // load ar.rsc
24463 +       cmp.eq p9,p0=r0,r0      // set p9 to indicate that we should restore cr.ifs
24464 +       ;;
24465 +       ld8 r24=[r16],16        // load ar.rnat (may be garbage)
24466 +       ld8 r23=[r17],16        // load ar.bspstore (may be garbage)
24467 +       ;;
24468 +       ld8 r31=[r16],16        // load predicates
24469 +       ld8 r21=[r17],16        // load b0
24470 +       ;;
24471 +       ld8 r19=[r16],16        // load ar.rsc value for "loadrs"
24472 +       ld8.fill r1=[r17],16    // load r1
24473 +       ;;
24474 +       ld8.fill r12=[r16],16
24475 +       ld8.fill r13=[r17],16
24476 +(pUStk)        adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
24477 +       ;;
24478 +       ld8 r20=[r16],16        // ar.fpsr
24479 +       ld8.fill r15=[r17],16
24480 +       ;;
24481 +       ld8.fill r14=[r16],16
24482 +       ld8.fill r2=[r17]
24483 +(pUStk)        mov r17=1
24484 +       ;;
24485 +       ld8.fill r3=[r16]
24486 +(pUStk)        st1 [r18]=r17           // restore current->thread.on_ustack
24487 +       shr.u r18=r19,16        // get byte size of existing "dirty" partition
24488 +       ;;
24489 +       mov r16=ar.bsp          // get existing backing store pointer
24490 +       addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
24491 +       ;;
24492 +       ld4 r17=[r17]           // r17 = cpu_data->phys_stacked_size_p8
24493 +(pKStk)        br.cond.dpnt skip_rbs_switch
24494 +
24495 +       /*
24496 +        * Restore user backing store.
24497 +        *
24498 +        * NOTE: alloc, loadrs, and cover can't be predicated.
24499 +        */
24500 +(pNonSys) br.cond.dpnt dont_preserve_current_frame
24501 +
24502 +#ifdef CONFIG_XEN
24503 +       XEN_HYPER_COVER;
24504 +#else
24505 +       cover                           // add current frame into dirty partition and set cr.ifs
24506 +#endif
24507 +       ;;
24508 +       mov r19=ar.bsp                  // get new backing store pointer
24509 +rbs_switch:
24510 +       sub r16=r16,r18                 // krbs = old bsp - size of dirty partition
24511 +       cmp.ne p9,p0=r0,r0              // clear p9 to skip restore of cr.ifs
24512 +       ;;
24513 +       sub r19=r19,r16                 // calculate total byte size of dirty partition
24514 +       add r18=64,r18                  // don't force in0-in7 into memory...
24515 +       ;;
24516 +       shl r19=r19,16                  // shift size of dirty partition into loadrs position
24517 +       ;;
24518 +dont_preserve_current_frame:
24519 +       /*
24520 +        * To prevent leaking bits between the kernel and user-space,
24521 +        * we must clear the stacked registers in the "invalid" partition here.
24522 +        * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
24523 +        * 5 registers/cycle on McKinley).
24524 +        */
24525 +#      define pRecurse p6
24526 +#      define pReturn  p7
24527 +#ifdef CONFIG_ITANIUM
24528 +#      define Nregs    10
24529 +#else
24530 +#      define Nregs    14
24531 +#endif
24532 +       alloc loc0=ar.pfs,2,Nregs-2,2,0
24533 +       shr.u loc1=r18,9                // RNaTslots <= floor(dirtySize / (64*8))
24534 +       sub r17=r17,r18                 // r17 = (physStackedSize + 8) - dirtySize
24535 +       ;;
24536 +       mov ar.rsc=r19                  // load ar.rsc to be used for "loadrs"
24537 +       shladd in0=loc1,3,r17
24538 +       mov in1=0
24539 +       ;;
24540 +       TEXT_ALIGN(32)
24541 +rse_clear_invalid:
24542 +#ifdef CONFIG_ITANIUM
24543 +       // cycle 0
24544 + { .mii
24545 +       alloc loc0=ar.pfs,2,Nregs-2,2,0
24546 +       cmp.lt pRecurse,p0=Nregs*8,in0  // if more than Nregs regs left to clear, (re)curse
24547 +       add out0=-Nregs*8,in0
24548 +}{ .mfb
24549 +       add out1=1,in1                  // increment recursion count
24550 +       nop.f 0
24551 +       nop.b 0                         // can't do br.call here because of alloc (WAW on CFM)
24552 +       ;;
24553 +}{ .mfi        // cycle 1
24554 +       mov loc1=0
24555 +       nop.f 0
24556 +       mov loc2=0
24557 +}{ .mib
24558 +       mov loc3=0
24559 +       mov loc4=0
24560 +(pRecurse) br.call.sptk.many b0=rse_clear_invalid
24561 +
24562 +}{ .mfi        // cycle 2
24563 +       mov loc5=0
24564 +       nop.f 0
24565 +       cmp.ne pReturn,p0=r0,in1        // if recursion count != 0, we need to do a br.ret
24566 +}{ .mib
24567 +       mov loc6=0
24568 +       mov loc7=0
24569 +(pReturn) br.ret.sptk.many b0
24570 +}
24571 +#else /* !CONFIG_ITANIUM */
24572 +       alloc loc0=ar.pfs,2,Nregs-2,2,0
24573 +       cmp.lt pRecurse,p0=Nregs*8,in0  // if more than Nregs regs left to clear, (re)curse
24574 +       add out0=-Nregs*8,in0
24575 +       add out1=1,in1                  // increment recursion count
24576 +       mov loc1=0
24577 +       mov loc2=0
24578 +       ;;
24579 +       mov loc3=0
24580 +       mov loc4=0
24581 +       mov loc5=0
24582 +       mov loc6=0
24583 +       mov loc7=0
24584 +(pRecurse) br.call.dptk.few b0=rse_clear_invalid
24585 +       ;;
24586 +       mov loc8=0
24587 +       mov loc9=0
24588 +       cmp.ne pReturn,p0=r0,in1        // if recursion count != 0, we need to do a br.ret
24589 +       mov loc10=0
24590 +       mov loc11=0
24591 +(pReturn) br.ret.dptk.many b0
24592 +#endif /* !CONFIG_ITANIUM */
24593 +#      undef pRecurse
24594 +#      undef pReturn
24595 +       ;;
24596 +       alloc r17=ar.pfs,0,0,0,0        // drop current register frame
24597 +       ;;
24598 +       loadrs
24599 +       ;;
24600 +skip_rbs_switch:
24601 +       mov ar.unat=r25         // M2
24602 +(pKStk)        extr.u r22=r22,21,1     // I0 extract current value of psr.pp from r22
24603 +(pLvSys)mov r19=r0             // A  clear r19 for leave_syscall, no-op otherwise
24604 +       ;;
24605 +(pUStk)        mov ar.bspstore=r23     // M2
24606 +(pKStk)        dep r29=r22,r29,21,1    // I0 update ipsr.pp with psr.pp
24607 +(pLvSys)mov r16=r0             // A  clear r16 for leave_syscall, no-op otherwise
24608 +       ;;
24609 +#ifdef CONFIG_XEN
24610 +       movl r25=XSI_IPSR
24611 +       ;;
24612 +       st8[r25]=r29,XSI_IFS-XSI_IPSR
24613 +       ;;
24614 +#else
24615 +       mov cr.ipsr=r29         // M2
24616 +#endif
24617 +       mov ar.pfs=r26          // I0
24618 +(pLvSys)mov r17=r0             // A  clear r17 for leave_syscall, no-op otherwise
24619 +
24620 +#ifdef CONFIG_XEN
24621 +(p9)   st8 [r25]=r30
24622 +       ;;
24623 +       adds r25=XSI_IIP-XSI_IFS,r25
24624 +       ;;
24625 +#else
24626 +(p9)   mov cr.ifs=r30          // M2
24627 +#endif
24628 +       mov b0=r21              // I0
24629 +(pLvSys)mov r18=r0             // A  clear r18 for leave_syscall, no-op otherwise
24630 +
24631 +       mov ar.fpsr=r20         // M2
24632 +#ifdef CONFIG_XEN
24633 +       st8     [r25]=r28
24634 +#else
24635 +       mov cr.iip=r28          // M2
24636 +#endif
24637 +       nop 0
24638 +       ;;
24639 +(pUStk)        mov ar.rnat=r24         // M2 must happen with RSE in lazy mode
24640 +       nop 0
24641 +(pLvSys)mov r2=r0
24642 +
24643 +       mov ar.rsc=r27          // M2
24644 +       mov pr=r31,-1           // I0
24645 +#ifdef CONFIG_XEN
24646 +       ;;
24647 +       XEN_HYPER_RFI;
24648 +#else
24649 +       rfi                     // B
24650 +#endif
24651 +
24652 +       /*
24653 +        * On entry:
24654 +        *      r20 = &current->thread_info->pre_count (if CONFIG_PREEMPT)
24655 +        *      r31 = current->thread_info->flags
24656 +        * On exit:
24657 +        *      p6 = TRUE if work-pending-check needs to be redone
24658 +        */
24659 +.work_pending_syscall:
24660 +       add r2=-8,r2
24661 +       add r3=-8,r3
24662 +       ;;
24663 +       st8 [r2]=r8
24664 +       st8 [r3]=r10
24665 +.work_pending:
24666 +       tbit.nz p6,p0=r31,TIF_SIGDELAYED                // signal delayed from  MCA/INIT/NMI/PMI context?
24667 +(p6)   br.cond.sptk.few .sigdelayed
24668 +       ;;
24669 +       tbit.z p6,p0=r31,TIF_NEED_RESCHED               // current_thread_info()->need_resched==0?
24670 +(p6)   br.cond.sptk.few .notify
24671 +#ifdef CONFIG_PREEMPT
24672 +(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1
24673 +       ;;
24674 +(pKStk) st4 [r20]=r21
24675 +       ssm psr.i               // enable interrupts
24676 +#endif
24677 +       br.call.spnt.many rp=schedule
24678 +.ret9: cmp.eq p6,p0=r0,r0                              // p6 <- 1
24679 +#ifdef CONFIG_XEN
24680 +       movl r2=XSI_PSR_I_ADDR
24681 +       mov r20=1
24682 +       ;;
24683 +       ld8 r2=[r2]
24684 +       ;;
24685 +       st1 [r2]=r20
24686 +#else
24687 +       rsm psr.i               // disable interrupts
24688 +#endif
24689 +       ;;
24690 +#ifdef CONFIG_PREEMPT
24691 +(pKStk)        adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
24692 +       ;;
24693 +(pKStk)        st4 [r20]=r0            // preempt_count() <- 0
24694 +#endif
24695 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
24696 +       br.cond.sptk.many .work_processed_kernel        // re-check
24697 +
24698 +.notify:
24699 +(pUStk)        br.call.spnt.many rp=notify_resume_user
24700 +.ret10:        cmp.ne p6,p0=r0,r0                              // p6 <- 0
24701 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
24702 +       br.cond.sptk.many .work_processed_kernel        // don't re-check
24703 +
24704 +// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
24705 +// it could not be delivered.  Deliver it now.  The signal might be for us and
24706 +// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed
24707 +// signal.
24708 +
24709 +.sigdelayed:
24710 +       br.call.sptk.many rp=do_sigdelayed
24711 +       cmp.eq p6,p0=r0,r0                              // p6 <- 1, always re-check
24712 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
24713 +       br.cond.sptk.many .work_processed_kernel        // re-check
24714 +
24715 +.work_pending_syscall_end:
24716 +       adds r2=PT(R8)+16,r12
24717 +       adds r3=PT(R10)+16,r12
24718 +       ;;
24719 +       ld8 r8=[r2]
24720 +       ld8 r10=[r3]
24721 +       br.cond.sptk.many .work_processed_syscall       // re-check
24722 +
24723 +#ifdef CONFIG_XEN
24724 +END(xen_leave_kernel)
24725 +#else
24726 +END(ia64_leave_kernel)
24727 +#endif
24728 diff -urNp linux-2.6/arch/ia64/xen/xenhpski.c new/arch/ia64/xen/xenhpski.c
24729 --- linux-2.6/arch/ia64/xen/xenhpski.c  1970-01-01 01:00:00.000000000 +0100
24730 +++ new/arch/ia64/xen/xenhpski.c        2006-05-09 12:32:40.000000000 +0200
24731 @@ -0,0 +1,19 @@
24732 +
24733 +extern unsigned long xen_get_cpuid(int);
24734 +
24735 +int
24736 +running_on_sim(void)
24737 +{
24738 +       int i;
24739 +       long cpuid[6];
24740 +
24741 +       for (i = 0; i < 5; ++i)
24742 +               cpuid[i] = xen_get_cpuid(i);
24743 +       if ((cpuid[0] & 0xff) != 'H') return 0;
24744 +       if ((cpuid[3] & 0xff) != 0x4) return 0;
24745 +       if (((cpuid[3] >> 8) & 0xff) != 0x0) return 0;
24746 +       if (((cpuid[3] >> 16) & 0xff) != 0x0) return 0;
24747 +       if (((cpuid[3] >> 24) & 0x7) != 0x7) return 0;
24748 +       return 1;
24749 +}
24750 +
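running_on_sim() above decides whether it is running on the HP Ski simulator by picking individual bytes out of the CPUID words with shifts and masks. A small standalone sketch of that byte-extraction idiom (the sample value is made up):

#include <stdio.h>

/* Extract byte 'n' (0 = least significant) from a 64-bit word,
 * the same idiom running_on_sim() applies to cpuid[3]. */
static unsigned int byte_of(unsigned long word, int n)
{
        return (word >> (8 * n)) & 0xff;
}

int main(void)
{
        unsigned long word = 0x0700000400UL;    /* made-up example value */

        printf("byte0=0x%x byte1=0x%x byte4=0x%x\n",
               byte_of(word, 0), byte_of(word, 1), byte_of(word, 4));
        return 0;
}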
24751 diff -urNp linux-2.6/arch/ia64/xen/xenivt.S new/arch/ia64/xen/xenivt.S
24752 --- linux-2.6/arch/ia64/xen/xenivt.S    1970-01-01 01:00:00.000000000 +0100
24753 +++ new/arch/ia64/xen/xenivt.S  2006-06-07 13:15:16.000000000 +0200
24754 @@ -0,0 +1,2171 @@
24755 +/*
24756 + * arch/ia64/xen/ivt.S
24757 + *
24758 + * Copyright (C) 2005 Hewlett-Packard Co
24759 + *     Dan Magenheimer <dan.magenheimer@hp.com>
24760 + */
24761 +/*
24762 + * This file defines the interruption vector table used by the CPU.
24763 + * It does not include one entry per possible cause of interruption.
24764 + *
24765 + * The first 20 entries of the table contain 64 bundles each while the
24766 + * remaining 48 entries contain only 16 bundles each.
24767 + *
24768 + * The 64 bundles are used to allow inlining the whole handler for critical
24769 + * interruptions like TLB misses.
24770 + *
24771 + *  For each entry, the comment is as follows:
24772 + *
24773 + *             // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
24774 + *  entry offset ----/     /         /                  /          /
24775 + *  entry number ---------/         /                  /          /
24776 + *  size of the entry -------------/                  /          /
24777 + *  vector name -------------------------------------/          /
24778 + *  interruptions triggering this vector ----------------------/
24779 + *
24780 + * The table is 32KB in size and must be aligned on a 32KB boundary.
24781 + * (The CPU ignores the 15 lower bits of the address)
24782 + *
24783 + * Table is based upon EAS2.6 (Oct 1999)
24784 + */
24785 +
24786 +#include <linux/config.h>
24787 +
24788 +#include <asm/asmmacro.h>
24789 +#include <asm/break.h>
24790 +#include <asm/ia32.h>
24791 +#include <asm/kregs.h>
24792 +#include <asm/asm-offsets.h>
24793 +#include <asm/pgtable.h>
24794 +#include <asm/processor.h>
24795 +#include <asm/ptrace.h>
24796 +#include <asm/system.h>
24797 +#include <asm/thread_info.h>
24798 +#include <asm/unistd.h>
24799 +#include <asm/errno.h>
24800 +
24801 +#ifdef CONFIG_XEN
24802 +#define ia64_ivt xen_ivt
24803 +#endif
24804 +
24805 +#if 1
24806 +# define PSR_DEFAULT_BITS      psr.ac
24807 +#else
24808 +# define PSR_DEFAULT_BITS      0
24809 +#endif
24810 +
24811 +#if 0
24812 +  /*
24813 +   * This lets you track the last eight faults that occurred on the CPU.  Make sure ar.k2 isn't
24814 +   * needed for something else before enabling this...
24815 +   */
24816 +# define DBG_FAULT(i)  mov r16=ar.k2;; shl r16=r16,8;; add r16=(i),r16;;mov ar.k2=r16
24817 +#else
24818 +# define DBG_FAULT(i)
24819 +#endif
24820 +
24821 +#define MINSTATE_VIRT  /* needed by minstate.h */
24822 +#include "xenminstate.h"
24823 +
24824 +#define FAULT(n)                                                                       \
24825 +       mov r31=pr;                                                                     \
24826 +       mov r19=n;;                     /* prepare to save predicates */                \
24827 +       br.sptk.many dispatch_to_fault_handler
24828 +
24829 +       .section .text.ivt,"ax"
24830 +
24831 +       .align 32768    // align on 32KB boundary
24832 +       .global ia64_ivt
24833 +ia64_ivt:
24834 +/////////////////////////////////////////////////////////////////////////////////////////
24835 +// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
24836 +ENTRY(vhpt_miss)
24837 +       DBG_FAULT(0)
24838 +       /*
24839 +        * The VHPT vector is invoked when the TLB entry for the virtual page table
24840 +        * is missing.  This happens only as a result of a previous
24841 +        * (the "original") TLB miss, which may either be caused by an instruction
24842 +        * fetch or a data access (or non-access).
24843 +        *
24844 +        * What we do here is normal TLB miss handling for the _original_ miss,
24845 +        * followed by inserting the TLB entry for the virtual page table page
24846 +        * that the VHPT walker was attempting to access.  The latter gets
24847 +        * inserted as long as the page table entries above the pte level have valid
24848 +        * mappings for the faulting address.  The TLB entry for the original
24849 +        * miss gets inserted only if the pte entry indicates that the page is
24850 +        * present.
24851 +        *
24852 +        * do_page_fault gets invoked in the following cases:
24853 +        *      - the faulting virtual address uses unimplemented address bits
24854 +        *      - the faulting virtual address has no valid page table mapping
24855 +        */
24856 +#ifdef CONFIG_XEN
24857 +       movl r16=XSI_IFA
24858 +       ;;
24859 +       ld8 r16=[r16]
24860 +#ifdef CONFIG_HUGETLB_PAGE
24861 +       movl r18=PAGE_SHIFT
24862 +       movl r25=XSI_ITIR
24863 +       ;;
24864 +       ld8 r25=[r25]
24865 +#endif
24866 +       ;;
24867 +#else
24868 +       mov r16=cr.ifa                          // get address that caused the TLB miss
24869 +#ifdef CONFIG_HUGETLB_PAGE
24870 +       movl r18=PAGE_SHIFT
24871 +       mov r25=cr.itir
24872 +#endif
24873 +#endif
24874 +       ;;
24875 +#ifdef CONFIG_XEN
24876 +       XEN_HYPER_RSM_PSR_DT;
24877 +#else
24878 +       rsm psr.dt                              // use physical addressing for data
24879 +#endif
24880 +       mov r31=pr                              // save the predicate registers
24881 +       mov r19=IA64_KR(PT_BASE)                // get page table base address
24882 +       shl r21=r16,3                           // shift bit 60 into sign bit
24883 +       shr.u r17=r16,61                        // get the region number into r17
24884 +       ;;
24885 +       shr.u r22=r21,3
24886 +#ifdef CONFIG_HUGETLB_PAGE
24887 +       extr.u r26=r25,2,6
24888 +       ;;
24889 +       cmp.ne p8,p0=r18,r26
24890 +       sub r27=r26,r18
24891 +       ;;
24892 +(p8)   dep r25=r18,r25,2,6
24893 +(p8)   shr r22=r22,r27
24894 +#endif
24895 +       ;;
24896 +       cmp.eq p6,p7=5,r17                      // is IFA pointing into region 5?
24897 +       shr.u r18=r22,PGDIR_SHIFT               // get bottom portion of pgd index bit
24898 +       ;;
24899 +(p7)   dep r17=r17,r19,(PAGE_SHIFT-3),3        // put region number bits in place
24900 +
24901 +       srlz.d
24902 +       LOAD_PHYSICAL(p6, r19, swapper_pg_dir)  // region 5 is rooted at swapper_pg_dir
24903 +
24904 +       .pred.rel "mutex", p6, p7
24905 +(p6)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
24906 +(p7)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
24907 +       ;;
24908 +(p6)   dep r17=r18,r19,3,(PAGE_SHIFT-3)        // r17=pgd_offset for region 5
24909 +(p7)   dep r17=r18,r17,3,(PAGE_SHIFT-6)        // r17=pgd_offset for region[0-4]
24910 +       cmp.eq p7,p6=0,r21                      // unused address bits all zeroes?
24911 +#ifdef CONFIG_PGTABLE_4
24912 +       shr.u r28=r22,PUD_SHIFT                 // shift pud index into position
24913 +#else
24914 +       shr.u r18=r22,PMD_SHIFT                 // shift pmd index into position
24915 +#endif
24916 +       ;;
24917 +       ld8 r17=[r17]                           // get *pgd (may be 0)
24918 +       ;;
24919 +(p7)   cmp.eq p6,p7=r17,r0                     // was pgd_present(*pgd) == NULL?
24920 +#ifdef CONFIG_PGTABLE_4
24921 +       dep r28=r28,r17,3,(PAGE_SHIFT-3)        // r28=pud_offset(pgd,addr)
24922 +       ;;
24923 +       shr.u r18=r22,PMD_SHIFT                 // shift pmd index into position
24924 +(p7)   ld8 r29=[r28]                           // get *pud (may be 0)
24925 +       ;;
24926 +(p7)   cmp.eq.or.andcm p6,p7=r29,r0            // was pud_present(*pud) == NULL?
24927 +       dep r17=r18,r29,3,(PAGE_SHIFT-3)        // r17=pmd_offset(pud,addr)
24928 +#else
24929 +       dep r17=r18,r17,3,(PAGE_SHIFT-3)        // r17=pmd_offset(pgd,addr)
24930 +#endif
24931 +       ;;
24932 +(p7)   ld8 r20=[r17]                           // get *pmd (may be 0)
24933 +       shr.u r19=r22,PAGE_SHIFT                // shift pte index into position
24934 +       ;;
24935 +(p7)   cmp.eq.or.andcm p6,p7=r20,r0            // was pmd_present(*pmd) == NULL?
24936 +       dep r21=r19,r20,3,(PAGE_SHIFT-3)        // r21=pte_offset(pmd,addr)
24937 +       ;;
24938 +(p7)   ld8 r18=[r21]                           // read *pte
24939 +#ifdef CONFIG_XEN
24940 +       movl r19=XSI_ISR
24941 +       ;;
24942 +       ld8 r19=[r19]
24943 +#else
24944 +       mov r19=cr.isr                          // cr.isr bit 32 tells us if this is an insn miss
24945 +#endif
24946 +       ;;
24947 +(p7)   tbit.z p6,p7=r18,_PAGE_P_BIT            // page present bit cleared?
24948 +#ifdef CONFIG_XEN
24949 +       movl r22=XSI_IHA
24950 +       ;;
24951 +       ld8 r22=[r22]
24952 +#else
24953 +       mov r22=cr.iha                          // get the VHPT address that caused the TLB miss
24954 +#endif
24955 +       ;;                                      // avoid RAW on p7
24956 +(p7)   tbit.nz.unc p10,p11=r19,32              // is it an instruction TLB miss?
24957 +       dep r23=0,r20,0,PAGE_SHIFT              // clear low bits to get page address
24958 +       ;;
24959 +#ifdef CONFIG_XEN
24960 +       mov r24=r8
24961 +       mov r8=r18
24962 +       ;;
24963 +(p10)  XEN_HYPER_ITC_I
24964 +       ;;
24965 +(p11)  XEN_HYPER_ITC_D
24966 +       ;;
24967 +       mov r8=r24
24968 +       ;;
24969 +#else
24970 +(p10)  itc.i r18                               // insert the instruction TLB entry
24971 +(p11)  itc.d r18                               // insert the data TLB entry
24972 +#endif
24973 +(p6)   br.cond.spnt.many page_fault            // handle bad address/page not present (page fault)
24974 +#ifdef CONFIG_XEN
24975 +       movl r24=XSI_IFA
24976 +       ;;
24977 +       st8 [r24]=r22
24978 +       ;;
24979 +#else
24980 +       mov cr.ifa=r22
24981 +#endif
24982 +
24983 +#ifdef CONFIG_HUGETLB_PAGE
24984 +(p8)   mov cr.itir=r25                         // change to default page-size for VHPT
24985 +#endif
24986 +
24987 +       /*
24988 +        * Now compute and insert the TLB entry for the virtual page table.  We never
24989 +        * execute in a page table page so there is no need to set the exception deferral
24990 +        * bit.
24991 +        */
24992 +       adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23
24993 +       ;;
24994 +#ifdef CONFIG_XEN
24995 +(p7)   mov r25=r8
24996 +(p7)   mov r8=r24
24997 +       ;;
24998 +(p7)   XEN_HYPER_ITC_D
24999 +       ;;
25000 +(p7)   mov r8=r25
25001 +       ;;
25002 +#else
25003 +(p7)   itc.d r24
25004 +#endif
25005 +       ;;
25006 +#ifdef CONFIG_SMP
25007 +       /*
25008 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
25009 +        * cannot possibly affect the following loads:
25010 +        */
25011 +       dv_serialize_data
25012 +
25013 +       /*
25014 +        * Re-check the page table entries.  If they changed, we may have received a ptc.g
25015 +        * between reading the page table and the "itc".  If so, flush the entry we
25016 +        * inserted and retry.  At this point, we have:
25017 +        *
25018 +        * r28 = equivalent of pud_offset(pgd, ifa)
25019 +        * r17 = equivalent of pmd_offset(pud, ifa)
25020 +        * r21 = equivalent of pte_offset(pmd, ifa)
25021 +        *
25022 +        * r29 = *pud
25023 +        * r20 = *pmd
25024 +        * r18 = *pte
25025 +        */
25026 +       ld8 r25=[r21]                           // read *pte again
25027 +       ld8 r26=[r17]                           // read *pmd again
25028 +#ifdef CONFIG_PGTABLE_4
25029 +       ld8 r19=[r28]                           // read *pud again
25030 +#endif
25031 +       cmp.ne p6,p7=r0,r0
25032 +       ;;
25033 +       cmp.ne.or.andcm p6,p7=r26,r20           // did *pmd change
25034 +#ifdef CONFIG_PGTABLE_4
25035 +       cmp.ne.or.andcm p6,p7=r19,r29           // did *pud change
25036 +#endif
25037 +       mov r27=PAGE_SHIFT<<2
25038 +       ;;
25039 +(p6)   ptc.l r22,r27                           // purge PTE page translation
25040 +(p7)   cmp.ne.or.andcm p6,p7=r25,r18           // did *pte change
25041 +       ;;
25042 +(p6)   ptc.l r16,r27                           // purge translation
25043 +#endif
25044 +
25045 +       mov pr=r31,-1                           // restore predicate registers
25046 +#ifdef CONFIG_XEN
25047 +       XEN_HYPER_RFI
25048 +       dv_serialize_data
25049 +#else
25050 +       rfi
25051 +#endif
25052 +END(vhpt_miss)
25053 +
25054 +       .org ia64_ivt+0x400
25055 +/////////////////////////////////////////////////////////////////////////////////////////
25056 +// 0x0400 Entry 1 (size 64 bundles) ITLB (21)
25057 +ENTRY(itlb_miss)
25058 +       DBG_FAULT(1)
25059 +       /*
25060 +        * The ITLB handler accesses the PTE via the virtually mapped linear
25061 +        * page table.  If a nested TLB miss occurs, we switch into physical
25062 +        * mode, walk the page table, and then re-execute the PTE read and
25063 +        * go on normally after that.
25064 +        */
25065 +#ifdef CONFIG_XEN
25066 +       movl r16=XSI_IFA
25067 +       ;;
25068 +       ld8 r16=[r16]
25069 +#else
25070 +       mov r16=cr.ifa                          // get virtual address
25071 +#endif
25072 +       mov r29=b0                              // save b0
25073 +       mov r31=pr                              // save predicates
25074 +.itlb_fault:
25075 +#ifdef CONFIG_XEN
25076 +       movl r17=XSI_IHA
25077 +       ;;
25078 +       ld8 r17=[r17]                           // get virtual address of L3 PTE
25079 +#else
25080 +       mov r17=cr.iha                          // get virtual address of PTE
25081 +#endif
25082 +       movl r30=1f                             // load nested fault continuation point
25083 +       ;;
25084 +1:     ld8 r18=[r17]                           // read *pte
25085 +       ;;
25086 +       mov b0=r29
25087 +       tbit.z p6,p0=r18,_PAGE_P_BIT            // page present bit cleared?
25088 +(p6)   br.cond.spnt page_fault
25089 +       ;;
25090 +#ifdef CONFIG_XEN
25091 +       mov r19=r8
25092 +       mov r8=r18
25093 +       ;;
25094 +       XEN_HYPER_ITC_I
25095 +       ;;
25096 +       mov r8=r19
25097 +#else
25098 +       itc.i r18
25099 +#endif
25100 +       ;;
25101 +#ifdef CONFIG_SMP
25102 +       /*
25103 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
25104 +        * cannot possibly affect the following loads:
25105 +        */
25106 +       dv_serialize_data
25107 +
25108 +       ld8 r19=[r17]                           // read *pte again and see if same
25109 +       mov r20=PAGE_SHIFT<<2                   // setup page size for purge
25110 +       ;;
25111 +       cmp.ne p7,p0=r18,r19
25112 +       ;;
25113 +(p7)   ptc.l r16,r20
25114 +#endif
25115 +       mov pr=r31,-1
25116 +#ifdef CONFIG_XEN
25117 +       XEN_HYPER_RFI
25118 +       dv_serialize_data
25119 +#else
25120 +       rfi
25121 +#endif
25122 +END(itlb_miss)
25123 +
25124 +       .org ia64_ivt+0x0800
25125 +/////////////////////////////////////////////////////////////////////////////////////////
25126 +// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
25127 +ENTRY(dtlb_miss)
25128 +       DBG_FAULT(2)
25129 +       /*
25130 +        * The DTLB handler accesses the PTE via the virtually mapped linear
25131 +        * page table.  If a nested TLB miss occurs, we switch into physical
25132 +        * mode, walk the page table, and then re-execute the PTE read and
25133 +        * go on normally after that.
25134 +        */
25135 +#ifdef CONFIG_XEN
25136 +       movl r16=XSI_IFA
25137 +       ;;
25138 +       ld8 r16=[r16]
25139 +#else
25140 +       mov r16=cr.ifa                          // get virtual address
25141 +#endif
25142 +       mov r29=b0                              // save b0
25143 +       mov r31=pr                              // save predicates
25144 +dtlb_fault:
25145 +#ifdef CONFIG_XEN
25146 +       movl r17=XSI_IHA
25147 +       ;;
25148 +       ld8 r17=[r17]                           // get virtual address of L3 PTE
25149 +#else
25150 +       mov r17=cr.iha                          // get virtual address of PTE
25151 +#endif
25152 +       movl r30=1f                             // load nested fault continuation point
25153 +       ;;
25154 +1:     ld8 r18=[r17]                           // read *pte
25155 +       ;;
25156 +       mov b0=r29
25157 +       tbit.z p6,p0=r18,_PAGE_P_BIT            // page present bit cleared?
25158 +(p6)   br.cond.spnt page_fault
25159 +       ;;
25160 +#ifdef CONFIG_XEN
25161 +       mov r19=r8
25162 +       mov r8=r18
25163 +       ;;
25164 +       XEN_HYPER_ITC_D
25165 +       ;;
25166 +       mov r8=r19
25167 +       ;;
25168 +#else
25169 +       itc.d r18
25170 +#endif
25171 +       ;;
25172 +#ifdef CONFIG_SMP
25173 +       /*
25174 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
25175 +        * cannot possibly affect the following loads:
25176 +        */
25177 +       dv_serialize_data
25178 +
25179 +       ld8 r19=[r17]                           // read *pte again and see if same
25180 +       mov r20=PAGE_SHIFT<<2                   // setup page size for purge
25181 +       ;;
25182 +       cmp.ne p7,p0=r18,r19
25183 +       ;;
25184 +(p7)   ptc.l r16,r20
25185 +#endif
25186 +       mov pr=r31,-1
25187 +#ifdef CONFIG_XEN
25188 +       XEN_HYPER_RFI
25189 +       dv_serialize_data
25190 +#else
25191 +       rfi
25192 +#endif
25193 +END(dtlb_miss)
25194 +
25195 +       .org ia64_ivt+0x0c00
25196 +/////////////////////////////////////////////////////////////////////////////////////////
25197 +// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
25198 +ENTRY(alt_itlb_miss)
25199 +       DBG_FAULT(3)
25200 +#ifdef CONFIG_XEN
25201 +       movl r31=XSI_IPSR
25202 +       ;;
25203 +       ld8 r21=[r31],XSI_IFA-XSI_IPSR  // get ipsr, point to ifa
25204 +       movl r17=PAGE_KERNEL
25205 +       ;;
25206 +       ld8 r16=[r31]           // get ifa
25207 +#else
25208 +       mov r16=cr.ifa          // get address that caused the TLB miss
25209 +       movl r17=PAGE_KERNEL
25210 +       mov r21=cr.ipsr
25211 +#endif
25212 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
25213 +       mov r31=pr
25214 +       ;;
25215 +#ifdef CONFIG_DISABLE_VHPT
25216 +       shr.u r22=r16,61                        // get the region number into r22
25217 +       ;;
25218 +       cmp.gt p8,p0=6,r22                      // user mode
25219 +       ;;
25220 +#ifndef CONFIG_XEN
25221 +(p8)   thash r17=r16
25222 +       ;;
25223 +(p8)   mov cr.iha=r17
25224 +#endif
25225 +(p8)   mov r29=b0                              // save b0
25226 +(p8)   br.cond.dptk .itlb_fault
25227 +#endif
25228 +       extr.u r23=r21,IA64_PSR_CPL0_BIT,2      // extract psr.cpl
25229 +       and r19=r19,r16         // clear ed, reserved bits, and PTE control bits
25230 +       shr.u r18=r16,57        // move address bit 61 to bit 4
25231 +       ;;
25232 +       andcm r18=0x10,r18      // bit 4=~address-bit(61)
25233 +       cmp.ne p8,p0=r0,r23     // psr.cpl != 0?
25234 +       or r19=r17,r19          // insert PTE control bits into r19
25235 +       ;;
25236 +       or r19=r19,r18          // set bit 4 (uncached) if the access was to region 6
25237 +(p8)   br.cond.spnt page_fault
25238 +       ;;
25239 +#ifdef CONFIG_XEN
25240 +       mov r18=r8
25241 +       mov r8=r19
25242 +       ;;
25243 +       XEN_HYPER_ITC_I
25244 +       ;;
25245 +       mov r8=r18
25246 +       ;;
25247 +       mov pr=r31,-1
25248 +       ;;
25249 +       XEN_HYPER_RFI;
25250 +#else
25251 +       itc.i r19               // insert the TLB entry
25252 +       mov pr=r31,-1
25253 +       rfi
25254 +#endif
25255 +END(alt_itlb_miss)
25256 +
25257 +       .org ia64_ivt+0x1000
25258 +/////////////////////////////////////////////////////////////////////////////////////////
25259 +// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
25260 +ENTRY(alt_dtlb_miss)
25261 +       DBG_FAULT(4)
25262 +#ifdef CONFIG_XEN
25263 +       movl r31=XSI_IPSR
25264 +       ;;
25265 +       ld8 r21=[r31],XSI_ISR-XSI_IPSR  // get ipsr, point to isr
25266 +       movl r17=PAGE_KERNEL
25267 +       ;;
25268 +       ld8 r20=[r31],XSI_IFA-XSI_ISR   // get isr, point to ifa
25269 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
25270 +       ;;
25271 +       ld8 r16=[r31]           // get ifa
25272 +#else
25273 +       mov r16=cr.ifa          // get address that caused the TLB miss
25274 +       movl r17=PAGE_KERNEL
25275 +       mov r20=cr.isr
25276 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
25277 +       mov r21=cr.ipsr
25278 +#endif
25279 +       mov r31=pr
25280 +       ;;
25281 +#ifdef CONFIG_DISABLE_VHPT
25282 +       shr.u r22=r16,61                        // get the region number into r22
25283 +       ;;
25284 +       cmp.gt p8,p0=6,r22                      // access to region 0-5
25285 +       ;;
25286 +#ifndef CONFIG_XEN
25287 +(p8)   thash r17=r16
25288 +       ;;
25289 +(p8)   mov cr.iha=r17
25290 +#endif
25291 +(p8)   mov r29=b0                              // save b0
25292 +(p8)   br.cond.dptk dtlb_fault
25293 +#endif
25294 +       extr.u r23=r21,IA64_PSR_CPL0_BIT,2      // extract psr.cpl
25295 +       and r22=IA64_ISR_CODE_MASK,r20          // get the isr.code field
25296 +       tbit.nz p6,p7=r20,IA64_ISR_SP_BIT       // is speculation bit on?
25297 +       shr.u r18=r16,57                        // move address bit 61 to bit 4
25298 +       and r19=r19,r16                         // clear ed, reserved bits, and PTE control bits
25299 +       tbit.nz p9,p0=r20,IA64_ISR_NA_BIT       // is non-access bit on?
25300 +       ;;
25301 +       andcm r18=0x10,r18      // bit 4=~address-bit(61)
25302 +       cmp.ne p8,p0=r0,r23
25303 +(p9)   cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22  // check isr.code field
25304 +(p8)   br.cond.spnt page_fault
25305 +
25306 +       dep r21=-1,r21,IA64_PSR_ED_BIT,1
25307 +       or r19=r19,r17          // insert PTE control bits into r19
25308 +       ;;
25309 +       or r19=r19,r18          // set bit 4 (uncached) if the access was to region 6
25310 +(p6)   mov cr.ipsr=r21
25311 +       ;;
25312 +#ifdef CONFIG_XEN
25313 +(p7)   mov r18=r8
25314 +(p7)   mov r8=r19
25315 +       ;;
25316 +(p7)   XEN_HYPER_ITC_D
25317 +       ;;
25318 +(p7)   mov r8=r18
25319 +       ;;
25320 +       mov pr=r31,-1
25321 +       ;;
25322 +       XEN_HYPER_RFI;
25323 +#else
25324 +(p7)   itc.d r19               // insert the TLB entry
25325 +       mov pr=r31,-1
25326 +       rfi
25327 +#endif
25328 +END(alt_dtlb_miss)
25329 +
25330 +       .org ia64_ivt+0x1400
25331 +/////////////////////////////////////////////////////////////////////////////////////////
25332 +// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
25333 +ENTRY(nested_dtlb_miss)
25334 +       /*
25335 +        * In the absence of kernel bugs, we get here when the virtually mapped linear
25336 +        * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction
25337 +        * Access-bit, or Data Access-bit faults).  If the DTLB entry for the virtual page
25338 +        * table is missing, a nested TLB miss fault is triggered and control is
25339 +        * transferred to this point.  When this happens, we look up the pte for the
25340 +        * faulting address by walking the page table in physical mode and return to the
25341 +        * continuation point passed in register r30 (or call page_fault if the address is
25342 +        * not mapped).
25343 +        *
25344 +        * Input:       r16:    faulting address
25345 +        *              r29:    saved b0
25346 +        *              r30:    continuation address
25347 +        *              r31:    saved pr
25348 +        *
25349 +        * Output:      r17:    physical address of PTE of faulting address
25350 +        *              r29:    saved b0
25351 +        *              r30:    continuation address
25352 +        *              r31:    saved pr
25353 +        *
25354 +        * Clobbered:   b0, r18, r19, r21, r22, psr.dt (cleared)
25355 +        */
25356 +#ifdef CONFIG_XEN
25357 +       XEN_HYPER_RSM_PSR_DT;
25358 +#else
25359 +       rsm psr.dt                              // switch to using physical data addressing
25360 +#endif
25361 +       mov r19=IA64_KR(PT_BASE)                // get the page table base address
25362 +       shl r21=r16,3                           // shift bit 60 into sign bit
25363 +#ifdef CONFIG_XEN
25364 +       movl r18=XSI_ITIR
25365 +       ;;
25366 +       ld8 r18=[r18]
25367 +#else
25368 +       mov r18=cr.itir
25369 +#endif
25370 +       ;;
25371 +       shr.u r17=r16,61                        // get the region number into r17
25372 +       extr.u r18=r18,2,6                      // get the faulting page size
25373 +       ;;
25374 +       cmp.eq p6,p7=5,r17                      // is faulting address in region 5?
25375 +       add r22=-PAGE_SHIFT,r18                 // adjustment for hugetlb address
25376 +       add r18=PGDIR_SHIFT-PAGE_SHIFT,r18
25377 +       ;;
25378 +       shr.u r22=r16,r22
25379 +       shr.u r18=r16,r18
25380 +(p7)   dep r17=r17,r19,(PAGE_SHIFT-3),3        // put region number bits in place
25381 +
25382 +       srlz.d
25383 +       LOAD_PHYSICAL(p6, r19, swapper_pg_dir)  // region 5 is rooted at swapper_pg_dir
25384 +
25385 +       .pred.rel "mutex", p6, p7
25386 +(p6)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
25387 +(p7)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
25388 +       ;;
25389 +(p6)   dep r17=r18,r19,3,(PAGE_SHIFT-3)        // r17=pgd_offset for region 5
25390 +(p7)   dep r17=r18,r17,3,(PAGE_SHIFT-6)        // r17=pgd_offset for region[0-4]
25391 +       cmp.eq p7,p6=0,r21                      // unused address bits all zeroes?
25392 +#ifdef CONFIG_PGTABLE_4
25393 +       shr.u r18=r22,PUD_SHIFT                 // shift pud index into position
25394 +#else
25395 +       shr.u r18=r22,PMD_SHIFT                 // shift pmd index into position
25396 +#endif
25397 +       ;;
25398 +       ld8 r17=[r17]                           // get *pgd (may be 0)
25399 +       ;;
25400 +(p7)   cmp.eq p6,p7=r17,r0                     // was pgd_present(*pgd) == NULL?
25401 +       dep r17=r18,r17,3,(PAGE_SHIFT-3)        // r17=p[u|m]d_offset(pgd,addr)
25402 +       ;;
25403 +#ifdef CONFIG_PGTABLE_4
25404 +(p7)   ld8 r17=[r17]                           // get *pud (may be 0)
25405 +       shr.u r18=r22,PMD_SHIFT                 // shift pmd index into position
25406 +       ;;
25407 +(p7)   cmp.eq.or.andcm p6,p7=r17,r0            // was pud_present(*pud) == NULL?
25408 +       dep r17=r18,r17,3,(PAGE_SHIFT-3)        // r17=pmd_offset(pud,addr)
25409 +       ;;
25410 +#endif
25411 +(p7)   ld8 r17=[r17]                           // get *pmd (may be 0)
25412 +       shr.u r19=r22,PAGE_SHIFT                // shift pte index into position
25413 +       ;;
25414 +(p7)   cmp.eq.or.andcm p6,p7=r17,r0            // was pmd_present(*pmd) == NULL?
25415 +       dep r17=r19,r17,3,(PAGE_SHIFT-3)        // r17=pte_offset(pmd,addr);
25416 +(p6)   br.cond.spnt page_fault
25417 +       mov b0=r30
25418 +       br.sptk.many b0                         // return to continuation point
25419 +END(nested_dtlb_miss)
25420 +
25421 +       .org ia64_ivt+0x1800
25422 +/////////////////////////////////////////////////////////////////////////////////////////
25423 +// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
25424 +ENTRY(ikey_miss)
25425 +       DBG_FAULT(6)
25426 +       FAULT(6)
25427 +END(ikey_miss)
25428 +
25429 +       //-----------------------------------------------------------------------------------
25430 +       // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
25431 +ENTRY(page_fault)
25432 +#ifdef CONFIG_XEN
25433 +       XEN_HYPER_SSM_PSR_DT
25434 +#else
25435 +       ssm psr.dt
25436 +       ;;
25437 +       srlz.i
25438 +#endif
25439 +       ;;
25440 +       SAVE_MIN_WITH_COVER
25441 +       alloc r15=ar.pfs,0,0,3,0
25442 +#ifdef CONFIG_XEN
25443 +       movl r3=XSI_ISR
25444 +       ;;
25445 +       ld8 out1=[r3],XSI_IFA-XSI_ISR           // get vcr.isr, point to ifa
25446 +       ;;
25447 +       ld8 out0=[r3]                           // get vcr.ifa
25448 +       mov r14=1
25449 +       ;;
25450 +       add r3=XSI_PSR_IC-XSI_IFA, r3           // point to vpsr.ic
25451 +       ;;
25452 +       st4 [r3]=r14                            // vpsr.ic = 1
25453 +       adds r3=8,r2                            // set up second base pointer
25454 +       ;;
25455 +#else
25456 +       mov out0=cr.ifa
25457 +       mov out1=cr.isr
25458 +       adds r3=8,r2                            // set up second base pointer
25459 +       ;;
25460 +       ssm psr.ic | PSR_DEFAULT_BITS
25461 +       ;;
25462 +       srlz.i                                  // guarantee that interruption collection is on
25463 +       ;;
25464 +#endif
25465 +#ifdef CONFIG_XEN
25466 +       br.cond.sptk.many       xen_page_fault
25467 +       ;;
25468 +done_xen_page_fault:
25469 +#endif
25470 +(p15)  ssm psr.i                               // restore psr.i
25471 +       movl r14=ia64_leave_kernel
25472 +       ;;
25473 +       SAVE_REST
25474 +       mov rp=r14
25475 +       ;;
25476 +       adds out2=16,r12                        // out2 = pointer to pt_regs
25477 +       br.call.sptk.many b6=ia64_do_page_fault // ignore return address
25478 +END(page_fault)
25479 +
25480 +       .org ia64_ivt+0x1c00
25481 +/////////////////////////////////////////////////////////////////////////////////////////
25482 +// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
25483 +ENTRY(dkey_miss)
25484 +       DBG_FAULT(7)
25485 +       FAULT(7)
25486 +#ifdef CONFIG_XEN
25487 +       // Leaving this code inline above results in an IVT section overflow
25488 +       // There is no particular reason for this code to be here...
25489 +xen_page_fault:
25490 +(p15)  movl r3=XSI_PSR_I_ADDR
25491 +       ;;
25492 +(p15)  ld8 r3=[r3]
25493 +       ;;
25494 +(p15)  st1 [r3]=r0,XSI_PEND-XSI_PSR_I_ADDR     // if (p15) vpsr.i = 1
25495 +       mov r14=r0
25496 +       ;;
25497 +(p15)  ld4 r14=[r3]                            // if (pending_interrupts)
25498 +       adds r3=8,r2                            // re-set up second base pointer
25499 +       ;;
25500 +(p15)  cmp.ne  p15,p0=r14,r0
25501 +       ;;
25502 +       br.cond.sptk.many done_xen_page_fault
25503 +       ;;
25504 +#endif
25505 +END(dkey_miss)
25506 +
25507 +       .org ia64_ivt+0x2000
25508 +/////////////////////////////////////////////////////////////////////////////////////////
25509 +// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
25510 +ENTRY(dirty_bit)
25511 +       DBG_FAULT(8)
25512 +       /*
25513 +        * What we do here is to simply turn on the dirty bit in the PTE.  We need to
25514 +        * update both the page-table and the TLB entry.  To efficiently access the PTE,
25515 +        * we address it through the virtual page table.  Most likely, the TLB entry for
25516 +        * the relevant virtual page table page is still present in the TLB so we can
25517 +        * normally do this without additional TLB misses.  In case the necessary virtual
25518 +        * page table TLB entry isn't present, we take a nested TLB miss hit where we look
25519 +        * up the physical address of the L3 PTE and then continue at label 1 below.
25520 +        */
25521 +#ifdef CONFIG_XEN
25522 +       movl r16=XSI_IFA
25523 +       ;;
25524 +       ld8 r16=[r16]
25525 +       ;;
25526 +#else
25527 +       mov r16=cr.ifa                          // get the address that caused the fault
25528 +#endif
25529 +       movl r30=1f                             // load continuation point in case of nested fault
25530 +       ;;
25531 +#ifdef CONFIG_XEN
25532 +       mov r18=r8;
25533 +       mov r8=r16;
25534 +       XEN_HYPER_THASH;;
25535 +       mov r17=r8;
25536 +       mov r8=r18;;
25537 +#else
25538 +       thash r17=r16                           // compute virtual address of L3 PTE
25539 +#endif
25540 +       mov r29=b0                              // save b0 in case of nested fault
25541 +       mov r31=pr                              // save pr
25542 +#ifdef CONFIG_SMP
25543 +       mov r28=ar.ccv                          // save ar.ccv
25544 +       ;;
25545 +1:     ld8 r18=[r17]
25546 +       ;;                                      // avoid RAW on r18
25547 +       mov ar.ccv=r18                          // set compare value for cmpxchg
25548 +       or r25=_PAGE_D|_PAGE_A,r18              // set the dirty and accessed bits
25549 +       tbit.z p7,p6 = r18,_PAGE_P_BIT          // Check present bit
25550 +       ;;
25551 +(p6)   cmpxchg8.acq r26=[r17],r25,ar.ccv       // Only update if page is present
25552 +       mov r24=PAGE_SHIFT<<2
25553 +       ;;
25554 +(p6)   cmp.eq p6,p7=r26,r18                    // Only compare if page is present
25555 +       ;;
25556 +#ifdef CONFIG_XEN
25557 +(p6)   mov r18=r8
25558 +(p6)   mov r8=r25
25559 +       ;;
25560 +(p6)   XEN_HYPER_ITC_D
25561 +       ;;
25562 +(p6)   mov r8=r18
25563 +#else
25564 +(p6)   itc.d r25                               // install updated PTE
25565 +#endif 
25566 +       ;;
25567 +       /*
25568 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
25569 +        * cannot possibly affect the following loads:
25570 +        */
25571 +       dv_serialize_data
25572 +
25573 +       ld8 r18=[r17]                           // read PTE again
25574 +       ;;
25575 +       cmp.eq p6,p7=r18,r25                    // is it same as the newly installed
25576 +       ;;
25577 +(p7)   ptc.l r16,r24
25578 +       mov b0=r29                              // restore b0
25579 +       mov ar.ccv=r28
25580 +#else
25581 +       ;;
25582 +1:     ld8 r18=[r17]
25583 +       ;;                                      // avoid RAW on r18
25584 +       or r18=_PAGE_D|_PAGE_A,r18              // set the dirty and accessed bits
25585 +       mov b0=r29                              // restore b0
25586 +       ;;
25587 +       st8 [r17]=r18                           // store back updated PTE
25588 +       itc.d r18                               // install updated PTE
25589 +#endif
25590 +       mov pr=r31,-1                           // restore pr
25591 +#ifdef CONFIG_XEN
25592 +       XEN_HYPER_RFI
25593 +       dv_serialize_data
25594 +#else
25595 +       rfi
25596 +#endif
25597 +END(dirty_bit)
25598 +
25599 +       .org ia64_ivt+0x2400
25600 +/////////////////////////////////////////////////////////////////////////////////////////
25601 +// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
25602 +ENTRY(iaccess_bit)
25603 +       DBG_FAULT(9)
25604 +       // Like Entry 8, except for instruction access
25605 +#ifdef CONFIG_XEN
25606 +       movl r16=XSI_IFA
25607 +       ;;
25608 +       ld8 r16=[r16]
25609 +       ;;
25610 +#else
25611 +       mov r16=cr.ifa                          // get the address that caused the fault
25612 +#endif
25613 +       movl r30=1f                             // load continuation point in case of nested fault
25614 +       mov r31=pr                              // save predicates
25615 +#ifdef CONFIG_ITANIUM
25616 +       /*
25617 +        * Erratum 10 (IFA may contain incorrect address) has "NoFix" status.
25618 +        */
25619 +       mov r17=cr.ipsr
25620 +       ;;
25621 +       mov r18=cr.iip
25622 +       tbit.z p6,p0=r17,IA64_PSR_IS_BIT        // IA64 instruction set?
25623 +       ;;
25624 +(p6)   mov r16=r18                             // if so, use cr.iip instead of cr.ifa
25625 +#endif /* CONFIG_ITANIUM */
25626 +       ;;
25627 +#ifdef CONFIG_XEN
25628 +       mov r18=r8;
25629 +       mov r8=r16;
25630 +       XEN_HYPER_THASH;;
25631 +       mov r17=r8;
25632 +       mov r8=r18;;
25633 +#else
25634 +       thash r17=r16                           // compute virtual address of L3 PTE
25635 +#endif
25636 +       mov r29=b0                              // save b0 in case of nested fault
25637 +#ifdef CONFIG_SMP
25638 +       mov r28=ar.ccv                          // save ar.ccv
25639 +       ;;
25640 +1:     ld8 r18=[r17]
25641 +       ;;
25642 +       mov ar.ccv=r18                          // set compare value for cmpxchg
25643 +       or r25=_PAGE_A,r18                      // set the accessed bit
25644 +       tbit.z p7,p6 = r18,_PAGE_P_BIT          // Check present bit
25645 +       ;;
25646 +(p6)   cmpxchg8.acq r26=[r17],r25,ar.ccv       // Only if page present
25647 +       mov r24=PAGE_SHIFT<<2
25648 +       ;;
25649 +(p6)   cmp.eq p6,p7=r26,r18                    // Only if page present
25650 +       ;;
25651 +#ifdef CONFIG_XEN
25652 +       mov r26=r8
25653 +       mov r8=r25
25654 +       ;;
25655 +(p6)   XEN_HYPER_ITC_I
25656 +       ;;
25657 +       mov r8=r26
25658 +       ;;
25659 +#else
25660 +(p6)   itc.i r25                               // install updated PTE
25661 +#endif
25662 +       ;;
25663 +       /*
25664 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
25665 +        * cannot possibly affect the following loads:
25666 +        */
25667 +       dv_serialize_data
25668 +
25669 +       ld8 r18=[r17]                           // read PTE again
25670 +       ;;
25671 +       cmp.eq p6,p7=r18,r25                    // is it same as the newly installed
25672 +       ;;
25673 +(p7)   ptc.l r16,r24
25674 +       mov b0=r29                              // restore b0
25675 +       mov ar.ccv=r28
25676 +#else /* !CONFIG_SMP */
25677 +       ;;
25678 +1:     ld8 r18=[r17]
25679 +       ;;
25680 +       or r18=_PAGE_A,r18                      // set the accessed bit
25681 +       mov b0=r29                              // restore b0
25682 +       ;;
25683 +       st8 [r17]=r18                           // store back updated PTE
25684 +       itc.i r18                               // install updated PTE
25685 +#endif /* !CONFIG_SMP */
25686 +       mov pr=r31,-1
25687 +#ifdef CONFIG_XEN
25688 +       XEN_HYPER_RFI
25689 +       dv_serialize_data
25690 +#else
25691 +       rfi
25692 +#endif
25693 +END(iaccess_bit)
25694 +
25695 +       .org ia64_ivt+0x2800
25696 +/////////////////////////////////////////////////////////////////////////////////////////
25697 +// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
25698 +ENTRY(daccess_bit)
25699 +       DBG_FAULT(10)
25700 +       // Like Entry 8, except for data access
25701 +#ifdef CONFIG_XEN
25702 +       movl r16=XSI_IFA
25703 +       ;;
25704 +       ld8 r16=[r16]
25705 +       ;;
25706 +#else
25707 +       mov r16=cr.ifa                          // get the address that caused the fault
25708 +#endif
25709 +       movl r30=1f                             // load continuation point in case of nested fault
25710 +       ;;
25711 +#ifdef CONFIG_XEN
25712 +       mov r18=r8
25713 +       mov r8=r16
25714 +       XEN_HYPER_THASH
25715 +       ;;
25716 +       mov r17=r8
25717 +       mov r8=r18
25718 +       ;;
25719 +#else
25720 +       thash r17=r16                           // compute virtual address of L3 PTE
25721 +#endif
25722 +       mov r31=pr
25723 +       mov r29=b0                              // save b0 in case of nested fault
25724 +#ifdef CONFIG_SMP
25725 +       mov r28=ar.ccv                          // save ar.ccv
25726 +       ;;
25727 +1:     ld8 r18=[r17]
25728 +       ;;                                      // avoid RAW on r18
25729 +       mov ar.ccv=r18                          // set compare value for cmpxchg
25730 +       or r25=_PAGE_A,r18                      // set the accessed bit
25731 +       tbit.z p7,p6 = r18,_PAGE_P_BIT          // Check present bit
25732 +       ;;
25733 +(p6)   cmpxchg8.acq r26=[r17],r25,ar.ccv       // Only if page is present
25734 +       mov r24=PAGE_SHIFT<<2
25735 +       ;;
25736 +(p6)   cmp.eq p6,p7=r26,r18                    // Only if page is present
25737 +       ;;
25738 +#ifdef CONFIG_XEN
25739 +       mov r26=r8
25740 +       mov r8=r25
25741 +       ;;
25742 +(p6)   XEN_HYPER_ITC_D
25743 +       ;;
25744 +       mov r8=r26
25745 +       ;;
25746 +#else
25747 +(p6)   itc.d r25                               // install updated PTE
25748 +#endif
25749 +       /*
25750 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
25751 +        * cannot possibly affect the following loads:
25752 +        */
25753 +       dv_serialize_data
25754 +       ;;
25755 +       ld8 r18=[r17]                           // read PTE again
25756 +       ;;
25757 +       cmp.eq p6,p7=r18,r25                    // is it same as the newly installed
25758 +       ;;
25759 +(p7)   ptc.l r16,r24
25760 +       mov ar.ccv=r28
25761 +#else
25762 +       ;;
25763 +1:     ld8 r18=[r17]
25764 +       ;;                                      // avoid RAW on r18
25765 +       or r18=_PAGE_A,r18                      // set the accessed bit
25766 +       ;;
25767 +       st8 [r17]=r18                           // store back updated PTE
25768 +       itc.d r18                               // install updated PTE
25769 +#endif
25770 +       mov b0=r29                              // restore b0
25771 +       mov pr=r31,-1
25772 +#ifdef CONFIG_XEN
25773 +       XEN_HYPER_RFI
25774 +       dv_serialize_data
25775 +#else
25776 +       rfi
25777 +#endif
25778 +END(daccess_bit)
25779 +
25780 +       .org ia64_ivt+0x2c00
25781 +/////////////////////////////////////////////////////////////////////////////////////////
25782 +// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
25783 +ENTRY(break_fault)
25784 +       /*
25785 +        * The streamlined system call entry/exit paths only save/restore the initial part
25786 +        * of pt_regs.  This implies that the callers of system-calls must adhere to the
25787 +        * normal procedure calling conventions.
25788 +        *
25789 +        *   Registers to be saved & restored:
25790 +        *      CR registers: cr.ipsr, cr.iip, cr.ifs
25791 +        *      AR registers: ar.unat, ar.pfs, ar.rsc, ar.rnat, ar.bspstore, ar.fpsr
25792 +        *      others: pr, b0, b6, loadrs, r1, r11, r12, r13, r15
25793 +        *   Registers to be restored only:
25794 +        *      r8-r11: output value from the system call.
25795 +        *
25796 +        * During system call exit, scratch registers (including r15) are modified/cleared
25797 +        * to prevent leaking bits from kernel to user level.
25798 +        */
25799 +       DBG_FAULT(11)
25800 +       mov.m r16=IA64_KR(CURRENT)              // M2 r16 <- current task (12 cyc)
25801 +#ifdef CONFIG_XEN
25802 +       movl r22=XSI_IPSR
25803 +       ;;
25804 +       ld8 r29=[r22],XSI_IIM-XSI_IPSR          // get ipsr, point to iip
25805 +#else
25806 +       mov r29=cr.ipsr                         // M2 (12 cyc)
25807 +#endif
25808 +       mov r31=pr                              // I0 (2 cyc)
25809 +
25810 +#ifdef CONFIG_XEN
25811 +       ;;
25812 +       ld8 r17=[r22],XSI_IIP-XSI_IIM
25813 +#else
25814 +       mov r17=cr.iim                          // M2 (2 cyc)
25815 +#endif
25816 +       mov.m r27=ar.rsc                        // M2 (12 cyc)
25817 +       mov r18=__IA64_BREAK_SYSCALL            // A
25818 +
25819 +       mov.m ar.rsc=0                          // M2
25820 +       mov.m r21=ar.fpsr                       // M2 (12 cyc)
25821 +       mov r19=b6                              // I0 (2 cyc)
25822 +       ;;
25823 +       mov.m r23=ar.bspstore                   // M2 (12 cyc)
25824 +       mov.m r24=ar.rnat                       // M2 (5 cyc)
25825 +       mov.i r26=ar.pfs                        // I0 (2 cyc)
25826 +
25827 +       invala                                  // M0|1
25828 +       nop.m 0                                 // M
25829 +       mov r20=r1                              // A                    save r1
25830 +
25831 +       nop.m 0
25832 +       movl r30=sys_call_table                 // X
25833 +
25834 +#ifdef CONFIG_XEN
25835 +       ld8 r28=[r22]
25836 +#else
25837 +       mov r28=cr.iip                          // M2 (2 cyc)
25838 +#endif
25839 +       cmp.eq p0,p7=r18,r17                    // I0 is this a system call?
25840 +(p7)   br.cond.spnt non_syscall                // B  no ->
25841 +       //
25842 +       // From this point on, we are definitely on the syscall-path
25843 +       // and we can use (non-banked) scratch registers.
25844 +       //
25845 +///////////////////////////////////////////////////////////////////////
25846 +       mov r1=r16                              // A    move task-pointer to "addl"-addressable reg
25847 +       mov r2=r16                              // A    setup r2 for ia64_syscall_setup
25848 +       add r9=TI_FLAGS+IA64_TASK_SIZE,r16      // A    r9 = &current_thread_info()->flags
25849 +
25850 +       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16
25851 +       adds r15=-1024,r15                      // A    subtract 1024 from syscall number
25852 +       mov r3=NR_syscalls - 1
25853 +       ;;
25854 +       ld1.bias r17=[r16]                      // M0|1 r17 = current->thread.on_ustack flag
25855 +       ld4 r9=[r9]                             // M0|1 r9 = current_thread_info()->flags
25856 +       extr.u r8=r29,41,2                      // I0   extract ei field from cr.ipsr
25857 +
25858 +       shladd r30=r15,3,r30                    // A    r30 = sys_call_table + 8*(syscall-1024)
25859 +       addl r22=IA64_RBS_OFFSET,r1             // A    compute base of RBS
25860 +       cmp.leu p6,p7=r15,r3                    // A    syscall number in range?
25861 +       ;;
25862 +
25863 +       lfetch.fault.excl.nt1 [r22]             // M0|1 prefetch RBS
25864 +(p6)   ld8 r30=[r30]                           // M0|1 load address of syscall entry point
25865 +       tnat.nz.or p7,p0=r15                    // I0   is syscall nr a NaT?
25866 +
25867 +       mov.m ar.bspstore=r22                   // M2   switch to kernel RBS
25868 +       cmp.eq p8,p9=2,r8                       // A    ipsr.ei==2?
25869 +       ;;
25870 +
25871 +(p8)   mov r8=0                                // A    clear ei to 0
25872 +(p7)   movl r30=sys_ni_syscall                 // X
25873 +
25874 +(p8)   adds r28=16,r28                         // A    switch cr.iip to next bundle
25875 +(p9)   adds r8=1,r8                            // A    increment ei to next slot
25876 +       nop.i 0
25877 +       ;;
25878 +
25879 +       mov.m r25=ar.unat                       // M2 (5 cyc)
25880 +       dep r29=r8,r29,41,2                     // I0   insert new ei into cr.ipsr
25881 +       adds r15=1024,r15                       // A    restore original syscall number
25882 +       //
25883 +       // If any of the above loads miss in L1D, we'll stall here until
25884 +       // the data arrives.
25885 +       //
25886 +///////////////////////////////////////////////////////////////////////
25887 +       st1 [r16]=r0                            // M2|3 clear current->thread.on_ustack flag
25888 +       mov b6=r30                              // I0   setup syscall handler branch reg early
25889 +       cmp.eq pKStk,pUStk=r0,r17               // A    were we on kernel stacks already?
25890 +
25891 +       and r9=_TIF_SYSCALL_TRACEAUDIT,r9       // A    mask trace or audit
25892 +       mov r18=ar.bsp                          // M2 (12 cyc)
25893 +(pKStk)        br.cond.spnt .break_fixup               // B    we're already in kernel-mode -- fix up RBS
25894 +       ;;
25895 +.back_from_break_fixup:
25896 +(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1 // A    compute base of memory stack
25897 +       cmp.eq p14,p0=r9,r0                     // A    are syscalls being traced/audited?
25898 +       br.call.sptk.many b7=ia64_syscall_setup // B
25899 +1:
25900 +       mov ar.rsc=0x3                          // M2   set eager mode, pl 0, LE, loadrs=0
25901 +       nop 0
25902 +#ifdef CONFIG_XEN
25903 +       mov r2=b0; br.call.sptk b0=xen_bsw1;; mov b0=r2;;
25904 +#else
25905 +       bsw.1                                   // B (6 cyc) regs are saved, switch to bank 1
25906 +#endif
25907 +       ;;
25908 +
25909 +#ifdef CONFIG_XEN
25910 +       movl r16=XSI_PSR_IC
25911 +       mov r3=1
25912 +       ;;
25913 +       st4 [r16]=r3,XSI_PSR_I_ADDR-XSI_PSR_IC  // vpsr.ic = 1
25914 +#else
25915 +       ssm psr.ic | PSR_DEFAULT_BITS           // M2   now it's safe to re-enable intr.-collection
25916 +#endif
25917 +       movl r3=ia64_ret_from_syscall           // X
25918 +       ;;
25919 +
25920 +       srlz.i                                  // M0   ensure interruption collection is on
25921 +       mov rp=r3                               // I0   set the real return addr
25922 +(p10)  br.cond.spnt.many ia64_ret_from_syscall // B    return if bad call-frame or r15 is a NaT
25923 +
25924 +#ifdef CONFIG_XEN
25925 +(p15)  ld8 r16=[r16]                           // vpsr.i
25926 +       ;;
25927 +(p15)  st1 [r16]=r0,XSI_PEND-XSI_PSR_I_ADDR    // if (p15) vpsr.i = 1
25928 +       mov r2=r0
25929 +       ;;
25930 +(p15)  ld4 r2=[r16]                            // if (pending_interrupts)
25931 +       ;;
25932 +       cmp.ne  p6,p0=r2,r0
25933 +       ;;
25934 +(p6)   ssm     psr.i                           //   do a real ssm psr.i
25935 +#else
25936 +(p15)  ssm psr.i                               // M2   restore psr.i
25937 +#endif
25938 +(p14)  br.call.sptk.many b6=b6                 // B    invoke syscall-handler (ignore return addr)
25939 +       br.cond.spnt.many ia64_trace_syscall    // B    do syscall-tracing thingamagic
25940 +       // NOT REACHED
25941 +///////////////////////////////////////////////////////////////////////
25942 +       // On entry, we optimistically assumed that we're coming from user-space.
25943 +       // For the rare cases where a system-call is done from within the kernel,
25944 +       // we fix things up at this point:
25945 +.break_fixup:
25946 +       add r1=-IA64_PT_REGS_SIZE,sp            // A    allocate space for pt_regs structure
25947 +       mov ar.rnat=r24                         // M2   restore kernel's AR.RNAT
25948 +       ;;
25949 +       mov ar.bspstore=r23                     // M2   restore kernel's AR.BSPSTORE
25950 +       br.cond.sptk .back_from_break_fixup
25951 +END(break_fault)
25952 +
25953 +       .org ia64_ivt+0x3000
25954 +/////////////////////////////////////////////////////////////////////////////////////////
25955 +// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
25956 +ENTRY(interrupt)
25957 +       DBG_FAULT(12)
25958 +       mov r31=pr              // prepare to save predicates
25959 +       ;;
25960 +       SAVE_MIN_WITH_COVER     // uses r31; defines r2 and r3
25961 +#ifdef CONFIG_XEN
25962 +       movl r3=XSI_PSR_IC
25963 +       mov r14=1
25964 +       ;;
25965 +       st4 [r3]=r14
25966 +#else
25967 +       ssm psr.ic | PSR_DEFAULT_BITS
25968 +#endif
25969 +       ;;
25970 +       adds r3=8,r2            // set up second base pointer for SAVE_REST
25971 +       srlz.i                  // ensure everybody knows psr.ic is back on
25972 +       ;;
25973 +       SAVE_REST
25974 +       ;;
25975 +       alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
25976 +#ifdef CONFIG_XEN
25977 +       ;;
25978 +       br.call.sptk.many rp=xen_get_ivr
25979 +       ;;
25980 +       mov out0=r8             // pass cr.ivr as first arg
25981 +#else
25982 +       mov out0=cr.ivr         // pass cr.ivr as first arg
25983 +#endif
25984 +       add out1=16,sp          // pass pointer to pt_regs as second arg
25985 +       ;;
25986 +       srlz.d                  // make sure we see the effect of cr.ivr
25987 +       movl r14=ia64_leave_kernel
25988 +       ;;
25989 +       mov rp=r14
25990 +       br.call.sptk.many b6=ia64_handle_irq
25991 +END(interrupt)
25992 +
25993 +       .org ia64_ivt+0x3400
25994 +/////////////////////////////////////////////////////////////////////////////////////////
25995 +// 0x3400 Entry 13 (size 64 bundles) Reserved
25996 +       DBG_FAULT(13)
25997 +       FAULT(13)
25998 +
25999 +       .org ia64_ivt+0x3800
26000 +/////////////////////////////////////////////////////////////////////////////////////////
26001 +// 0x3800 Entry 14 (size 64 bundles) Reserved
26002 +       DBG_FAULT(14)
26003 +       FAULT(14)
26004 +
26005 +       /*
26006 +        * There is no particular reason for this code to be here, other than that
26007 +        * there happens to be space here that would go unused otherwise.  If this
26008 +        * fault ever gets "unreserved", simply move the following code to a more
26009 +        * suitable spot...
26010 +        *
26011 +        * ia64_syscall_setup() is a separate subroutine so that it can
26012 +        *      allocate stacked registers so it can safely demine any
26013 +        *      potential NaT values from the input registers.
26014 +        *
26015 +        * On entry:
26016 +        *      - executing on bank 0 or bank 1 register set (doesn't matter)
26017 +        *      -  r1: stack pointer
26018 +        *      -  r2: current task pointer
26019 +        *      -  r3: preserved
26020 +        *      - r11: original contents (saved ar.pfs to be saved)
26021 +        *      - r12: original contents (sp to be saved)
26022 +        *      - r13: original contents (tp to be saved)
26023 +        *      - r15: original contents (syscall # to be saved)
26024 +        *      - r18: saved bsp (after switching to kernel stack)
26025 +        *      - r19: saved b6
26026 +        *      - r20: saved r1 (gp)
26027 +        *      - r21: saved ar.fpsr
26028 +        *      - r22: kernel's register backing store base (krbs_base)
26029 +        *      - r23: saved ar.bspstore
26030 +        *      - r24: saved ar.rnat
26031 +        *      - r25: saved ar.unat
26032 +        *      - r26: saved ar.pfs
26033 +        *      - r27: saved ar.rsc
26034 +        *      - r28: saved cr.iip
26035 +        *      - r29: saved cr.ipsr
26036 +        *      - r31: saved pr
26037 +        *      -  b0: original contents (to be saved)
26038 +        * On exit:
26039 +        *      -  p10: TRUE if syscall is invoked with more than 8 out
26040 +        *              registers or r15's NaT is true
26041 +        *      -  r1: kernel's gp
26042 +        *      -  r3: preserved (same as on entry)
26043 +        *      -  r8: -EINVAL if p10 is true
26044 +        *      - r12: points to kernel stack
26045 +        *      - r13: points to current task
26046 +        *      - r14: preserved (same as on entry)
26047 +        *      - p13: preserved
26048 +        *      - p15: TRUE if interrupts need to be re-enabled
26049 +        *      - ar.fpsr: set to kernel settings
26050 +        *      -  b6: preserved (same as on entry)
26051 +        */
26052 +#ifndef CONFIG_XEN
26053 +GLOBAL_ENTRY(ia64_syscall_setup)
26054 +#if PT(B6) != 0
26055 +# error This code assumes that b6 is the first field in pt_regs.
26056 +#endif
26057 +       st8 [r1]=r19                            // save b6
26058 +       add r16=PT(CR_IPSR),r1                  // initialize first base pointer
26059 +       add r17=PT(R11),r1                      // initialize second base pointer
26060 +       ;;
26061 +       alloc r19=ar.pfs,8,0,0,0                // ensure in0-in7 are writable
26062 +       st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR)    // save cr.ipsr
26063 +       tnat.nz p8,p0=in0
26064 +
26065 +       st8.spill [r17]=r11,PT(CR_IIP)-PT(R11)  // save r11
26066 +       tnat.nz p9,p0=in1
26067 +(pKStk)        mov r18=r0                              // make sure r18 isn't NaT
26068 +       ;;
26069 +
26070 +       st8 [r16]=r26,PT(CR_IFS)-PT(AR_PFS)     // save ar.pfs
26071 +       st8 [r17]=r28,PT(AR_UNAT)-PT(CR_IIP)    // save cr.iip
26072 +       mov r28=b0                              // save b0 (2 cyc)
26073 +       ;;
26074 +
26075 +       st8 [r17]=r25,PT(AR_RSC)-PT(AR_UNAT)    // save ar.unat
26076 +       dep r19=0,r19,38,26                     // clear all bits but 0..37 [I0]
26077 +(p8)   mov in0=-1
26078 +       ;;
26079 +
26080 +       st8 [r16]=r19,PT(AR_RNAT)-PT(CR_IFS)    // store ar.pfs.pfm in cr.ifs
26081 +       extr.u r11=r19,7,7      // I0           // get sol of ar.pfs
26082 +       and r8=0x7f,r19         // A            // get sof of ar.pfs
26083 +
26084 +       st8 [r17]=r27,PT(AR_BSPSTORE)-PT(AR_RSC)// save ar.rsc
26085 +       tbit.nz p15,p0=r29,IA64_PSR_I_BIT // I0
26086 +(p9)   mov in1=-1
26087 +       ;;
26088 +
26089 +(pUStk) sub r18=r18,r22                                // r18=RSE.ndirty*8
26090 +       tnat.nz p10,p0=in2
26091 +       add r11=8,r11
26092 +       ;;
26093 +(pKStk) adds r16=PT(PR)-PT(AR_RNAT),r16                // skip over ar_rnat field
26094 +(pKStk) adds r17=PT(B0)-PT(AR_BSPSTORE),r17    // skip over ar_bspstore field
26095 +       tnat.nz p11,p0=in3
26096 +       ;;
26097 +(p10)  mov in2=-1
26098 +       tnat.nz p12,p0=in4                              // [I0]
26099 +(p11)  mov in3=-1
26100 +       ;;
26101 +(pUStk) st8 [r16]=r24,PT(PR)-PT(AR_RNAT)       // save ar.rnat
26102 +(pUStk) st8 [r17]=r23,PT(B0)-PT(AR_BSPSTORE)   // save ar.bspstore
26103 +       shl r18=r18,16                          // compute ar.rsc to be used for "loadrs"
26104 +       ;;
26105 +       st8 [r16]=r31,PT(LOADRS)-PT(PR)         // save predicates
26106 +       st8 [r17]=r28,PT(R1)-PT(B0)             // save b0
26107 +       tnat.nz p13,p0=in5                              // [I0]
26108 +       ;;
26109 +       st8 [r16]=r18,PT(R12)-PT(LOADRS)        // save ar.rsc value for "loadrs"
26110 +       st8.spill [r17]=r20,PT(R13)-PT(R1)      // save original r1
26111 +(p12)  mov in4=-1
26112 +       ;;
26113 +
26114 +.mem.offset 0,0; st8.spill [r16]=r12,PT(AR_FPSR)-PT(R12)       // save r12
26115 +.mem.offset 8,0; st8.spill [r17]=r13,PT(R15)-PT(R13)           // save r13
26116 +(p13)  mov in5=-1
26117 +       ;;
26118 +       st8 [r16]=r21,PT(R8)-PT(AR_FPSR)        // save ar.fpsr
26119 +       tnat.nz p13,p0=in6
26120 +       cmp.lt p10,p9=r11,r8    // frame size can't be more than local+8
26121 +       ;;
26122 +       mov r8=1
26123 +(p9)   tnat.nz p10,p0=r15
26124 +       adds r12=-16,r1         // switch to kernel memory stack (with 16 bytes of scratch)
26125 +
26126 +       st8.spill [r17]=r15                     // save r15
26127 +       tnat.nz p8,p0=in7
26128 +       nop.i 0
26129 +
26130 +       mov r13=r2                              // establish `current'
26131 +       movl r1=__gp                            // establish kernel global pointer
26132 +       ;;
26133 +       st8 [r16]=r8            // ensure pt_regs.r8 != 0 (see handle_syscall_error)
26134 +(p13)  mov in6=-1
26135 +(p8)   mov in7=-1
26136 +
26137 +       cmp.eq pSys,pNonSys=r0,r0               // set pSys=1, pNonSys=0
26138 +       movl r17=FPSR_DEFAULT
26139 +       ;;
26140 +       mov.m ar.fpsr=r17                       // set ar.fpsr to kernel default value
26141 +(p10)  mov r8=-EINVAL
26142 +       br.ret.sptk.many b7
26143 +END(ia64_syscall_setup)
26144 +#endif
26145 +
26146 +       .org ia64_ivt+0x3c00
26147 +/////////////////////////////////////////////////////////////////////////////////////////
26148 +// 0x3c00 Entry 15 (size 64 bundles) Reserved
26149 +       DBG_FAULT(15)
26150 +       FAULT(15)
26151 +
26152 +       /*
26153 +        * Squatting in this space ...
26154 +        *
26155 +        * This special case dispatcher for illegal operation faults allows preserved
26156 +        * registers to be modified through a callback function (asm only) that is handed
26157 +        * back from the fault handler in r8. Up to three arguments can be passed to the
26158 +        * callback function by returning an aggregate with the callback as its first
26159 +        * element, followed by the arguments.
26160 +        */
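+       //
+       // Concretely: ia64_illegal_op_fault returns the callback in r8 and its
+       // arguments in r9-r11; when r8 is non-zero the callback is invoked via
+       // b6 with rp set to ia64_leave_kernel, otherwise control goes straight
+       // to ia64_leave_kernel.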
26161 +ENTRY(dispatch_illegal_op_fault)
26162 +       .prologue
26163 +       .body
26164 +       SAVE_MIN_WITH_COVER
26165 +       ssm psr.ic | PSR_DEFAULT_BITS
26166 +       ;;
26167 +       srlz.i          // guarantee that interruption collection is on
26168 +       ;;
26169 +(p15)  ssm psr.i       // restore psr.i
26170 +       adds r3=8,r2    // set up second base pointer for SAVE_REST
26171 +       ;;
26172 +       alloc r14=ar.pfs,0,0,1,0        // must be first in insn group
26173 +       mov out0=ar.ec
26174 +       ;;
26175 +       SAVE_REST
26176 +       PT_REGS_UNWIND_INFO(0)
26177 +       ;;
26178 +       br.call.sptk.many rp=ia64_illegal_op_fault
26179 +.ret0: ;;
26180 +       alloc r14=ar.pfs,0,0,3,0        // must be first in insn group
26181 +       mov out0=r9
26182 +       mov out1=r10
26183 +       mov out2=r11
26184 +       movl r15=ia64_leave_kernel
26185 +       ;;
26186 +       mov rp=r15
26187 +       mov b6=r8
26188 +       ;;
26189 +       cmp.ne p6,p0=0,r8
26190 +(p6)   br.call.dpnt.many b6=b6         // call returns to ia64_leave_kernel
26191 +       br.sptk.many ia64_leave_kernel
26192 +END(dispatch_illegal_op_fault)
26193 +
26194 +       .org ia64_ivt+0x4000
26195 +/////////////////////////////////////////////////////////////////////////////////////////
26196 +// 0x4000 Entry 16 (size 64 bundles) Reserved
26197 +       DBG_FAULT(16)
26198 +       FAULT(16)
26199 +
26200 +       .org ia64_ivt+0x4400
26201 +/////////////////////////////////////////////////////////////////////////////////////////
26202 +// 0x4400 Entry 17 (size 64 bundles) Reserved
26203 +       DBG_FAULT(17)
26204 +       FAULT(17)
26205 +
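+
+       // non_syscall: reached for break instructions whose immediate is not a
+       // system-call number.  It rebuilds a full pt_regs frame and calls
+       // ia64_bad_break with cr.iim and a pointer to pt_regs, returning
+       // through ia64_leave_kernel.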
26206 +ENTRY(non_syscall)
26207 +       mov ar.rsc=r27                  // restore ar.rsc before SAVE_MIN_WITH_COVER
26208 +       ;;
26209 +       SAVE_MIN_WITH_COVER
26210 +
26211 +       // There is no particular reason for this code to be here, other than that
26212 +       // there happens to be space here that would go unused otherwise.  If this
26213 +       // fault ever gets "unreserved", simply move the following code to a more
26214 +       // suitable spot...
26215 +
26216 +       alloc r14=ar.pfs,0,0,2,0
26217 +       mov out0=cr.iim
26218 +       add out1=16,sp
26219 +       adds r3=8,r2                    // set up second base pointer for SAVE_REST
26220 +
26221 +       ssm psr.ic | PSR_DEFAULT_BITS
26222 +       ;;
26223 +       srlz.i                          // guarantee that interruption collection is on
26224 +       ;;
26225 +(p15)  ssm psr.i                       // restore psr.i
26226 +       movl r15=ia64_leave_kernel
26227 +       ;;
26228 +       SAVE_REST
26229 +       mov rp=r15
26230 +       ;;
26231 +       br.call.sptk.many b6=ia64_bad_break     // avoid WAW on CFM and ignore return addr
26232 +END(non_syscall)
26233 +
26234 +       .org ia64_ivt+0x4800
26235 +/////////////////////////////////////////////////////////////////////////////////////////
26236 +// 0x4800 Entry 18 (size 64 bundles) Reserved
26237 +       DBG_FAULT(18)
26238 +       FAULT(18)
26239 +
26240 +       /*
26241 +        * There is no particular reason for this code to be here, other than that
26242 +        * there happens to be space here that would go unused otherwise.  If this
26243 +        * fault ever gets "unreserved", simply move the following code to a more
26244 +        * suitable spot...
26245 +        */
26246 +
26247 +ENTRY(dispatch_unaligned_handler)
26248 +       SAVE_MIN_WITH_COVER
26249 +       ;;
26250 +       alloc r14=ar.pfs,0,0,2,0                // now it's safe (must be first in insn group!)
26251 +       mov out0=cr.ifa
26252 +       adds out1=16,sp
26253 +
26254 +       ssm psr.ic | PSR_DEFAULT_BITS
26255 +       ;;
26256 +       srlz.i                                  // guarantee that interruption collection is on
26257 +       ;;
26258 +(p15)  ssm psr.i                               // restore psr.i
26259 +       adds r3=8,r2                            // set up second base pointer
26260 +       ;;
26261 +       SAVE_REST
26262 +       movl r14=ia64_leave_kernel
26263 +       ;;
26264 +       mov rp=r14
26265 +       br.sptk.many ia64_prepare_handle_unaligned
26266 +END(dispatch_unaligned_handler)
26267 +
26268 +       .org ia64_ivt+0x4c00
26269 +/////////////////////////////////////////////////////////////////////////////////////////
26270 +// 0x4c00 Entry 19 (size 64 bundles) Reserved
26271 +       DBG_FAULT(19)
26272 +       FAULT(19)
26273 +
26274 +       /*
26275 +        * There is no particular reason for this code to be here, other than that
26276 +        * there happens to be space here that would go unused otherwise.  If this
26277 +        * fault ever gets "unreserved", simply move the following code to a more
26278 +        * suitable spot...
26279 +        */
26280 +
26281 +ENTRY(dispatch_to_fault_handler)
26282 +       /*
26283 +        * Input:
26284 +        *      psr.ic: off
26285 +        *      r19:    fault vector number (e.g., 24 for General Exception)
26286 +        *      r31:    contains saved predicates (pr)
26287 +        */
26288 +       SAVE_MIN_WITH_COVER_R19
26289 +       alloc r14=ar.pfs,0,0,5,0
26290 +       mov out0=r15
26291 +#ifdef CONFIG_XEN
26292 +       movl out1=XSI_ISR
26293 +       ;;
26294 +       adds out2=XSI_IFA-XSI_ISR,out1
26295 +       adds out3=XSI_IIM-XSI_ISR,out1
26296 +       adds out4=XSI_ITIR-XSI_ISR,out1
26297 +       ;;
26298 +       ld8 out1=[out1]
26299 +       ld8 out2=[out2]
26300 +       ld8 out3=[out3]
26301 +       ld8 out4=[out4]
26302 +       ;;
26303 +#else
26304 +       mov out1=cr.isr
26305 +       mov out2=cr.ifa
26306 +       mov out3=cr.iim
26307 +       mov out4=cr.itir
26308 +       ;;
26309 +#endif
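+       // (With CONFIG_XEN the interruption state above is fetched from the
+       //  XSI_ISR/XSI_IFA/XSI_IIM/XSI_ITIR slots of Xen's mapped register
+       //  state instead of the privileged cr.* registers.)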
26310 +       ssm psr.ic | PSR_DEFAULT_BITS
26311 +       ;;
26312 +       srlz.i                                  // guarantee that interruption collection is on
26313 +       ;;
26314 +(p15)  ssm psr.i                               // restore psr.i
26315 +       adds r3=8,r2                            // set up second base pointer for SAVE_REST
26316 +       ;;
26317 +       SAVE_REST
26318 +       movl r14=ia64_leave_kernel
26319 +       ;;
26320 +       mov rp=r14
26321 +       br.call.sptk.many b6=ia64_fault
26322 +END(dispatch_to_fault_handler)
26323 +
26324 +//
26325 +// --- End of long entries, Beginning of short entries
26326 +//
26327 +
26328 +       .org ia64_ivt+0x5000
26329 +/////////////////////////////////////////////////////////////////////////////////////////
26330 +// 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49)
26331 +ENTRY(page_not_present)
26332 +       DBG_FAULT(20)
26333 +       mov r16=cr.ifa
26334 +       rsm psr.dt
26335 +       /*
26336 +        * The Linux page fault handler doesn't expect non-present pages to be in
26337 +        * the TLB.  Flush the existing entry now, so we meet that expectation.
26338 +        */
26339 +       mov r17=PAGE_SHIFT<<2
26340 +       ;;
26341 +       ptc.l r16,r17
26342 +       ;;
26343 +       mov r31=pr
26344 +       srlz.d
26345 +       br.sptk.many page_fault
26346 +END(page_not_present)
26347 +
26348 +       .org ia64_ivt+0x5100
26349 +/////////////////////////////////////////////////////////////////////////////////////////
26350 +// 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52)
26351 +ENTRY(key_permission)
26352 +       DBG_FAULT(21)
26353 +       mov r16=cr.ifa
26354 +       rsm psr.dt
26355 +       mov r31=pr
26356 +       ;;
26357 +       srlz.d
26358 +       br.sptk.many page_fault
26359 +END(key_permission)
26360 +
26361 +       .org ia64_ivt+0x5200
26362 +/////////////////////////////////////////////////////////////////////////////////////////
26363 +// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
26364 +ENTRY(iaccess_rights)
26365 +       DBG_FAULT(22)
26366 +       mov r16=cr.ifa
26367 +       rsm psr.dt
26368 +       mov r31=pr
26369 +       ;;
26370 +       srlz.d
26371 +       br.sptk.many page_fault
26372 +END(iaccess_rights)
26373 +
26374 +       .org ia64_ivt+0x5300
26375 +/////////////////////////////////////////////////////////////////////////////////////////
26376 +// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
26377 +ENTRY(daccess_rights)
26378 +       DBG_FAULT(23)
26379 +#ifdef CONFIG_XEN
26380 +       movl r16=XSI_IFA
26381 +       ;;
26382 +       ld8 r16=[r16]
26383 +       ;;
26384 +       XEN_HYPER_RSM_PSR_DT
26385 +#else
26386 +       mov r16=cr.ifa
26387 +       rsm psr.dt
26388 +#endif
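+       // (XEN_HYPER_RSM_PSR_DT is the hyperprivop stand-in for the privileged
+       //  "rsm psr.dt" used on the native path above.)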
26389 +       mov r31=pr
26390 +       ;;
26391 +       srlz.d
26392 +       br.sptk.many page_fault
26393 +END(daccess_rights)
26394 +
26395 +       .org ia64_ivt+0x5400
26396 +/////////////////////////////////////////////////////////////////////////////////////////
26397 +// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
26398 +ENTRY(general_exception)
26399 +       DBG_FAULT(24)
26400 +       mov r16=cr.isr
26401 +       mov r31=pr
26402 +       ;;
26403 +       cmp4.eq p6,p0=0,r16
26404 +(p6)   br.sptk.many dispatch_illegal_op_fault
26405 +       ;;
26406 +       mov r19=24              // fault number
26407 +       br.sptk.many dispatch_to_fault_handler
26408 +END(general_exception)
26409 +
26410 +       .org ia64_ivt+0x5500
26411 +/////////////////////////////////////////////////////////////////////////////////////////
26412 +// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
26413 +ENTRY(disabled_fp_reg)
26414 +       DBG_FAULT(25)
26415 +       rsm psr.dfh             // ensure we can access fph
26416 +       ;;
26417 +       srlz.d
26418 +       mov r31=pr
26419 +       mov r19=25
26420 +       br.sptk.many dispatch_to_fault_handler
26421 +END(disabled_fp_reg)
26422 +
26423 +       .org ia64_ivt+0x5600
26424 +/////////////////////////////////////////////////////////////////////////////////////////
26425 +// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
26426 +ENTRY(nat_consumption)
26427 +       DBG_FAULT(26)
26428 +
26429 +       mov r16=cr.ipsr
26430 +       mov r17=cr.isr
26431 +       mov r31=pr                              // save PR
26432 +       ;;
26433 +       and r18=0xf,r17                         // r18 = cr.isr.code{3:0}
26434 +       tbit.z p6,p0=r17,IA64_ISR_NA_BIT
26435 +       ;;
26436 +       cmp.ne.or p6,p0=IA64_ISR_CODE_LFETCH,r18
26437 +       dep r16=-1,r16,IA64_PSR_ED_BIT,1
26438 +(p6)   br.cond.spnt 1f         // branch if (cr.isr.na == 0 || cr.isr.code{3:0} != LFETCH)
26439 +       ;;
26440 +       mov cr.ipsr=r16         // set cr.ipsr.na
26441 +       mov pr=r31,-1
26442 +       ;;
26443 +       rfi
26444 +
26445 +1:     mov pr=r31,-1
26446 +       ;;
26447 +       FAULT(26)
26448 +END(nat_consumption)
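+
+       // The only NaT consumption handled inline above is one raised by an
+       // lfetch (ISR.na set and ISR.code == LFETCH): ipsr.ed is set so the
+       // faulting lfetch retires harmlessly on rfi.  Every other NaT
+       // consumption falls through to FAULT(26).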
26449 +
26450 +       .org ia64_ivt+0x5700
26451 +/////////////////////////////////////////////////////////////////////////////////////////
26452 +// 0x5700 Entry 27 (size 16 bundles) Speculation (40)
26453 +ENTRY(speculation_vector)
26454 +       DBG_FAULT(27)
26455 +       /*
26456 +        * A [f]chk.[as] instruction needs to take the branch to the recovery code but
26457 +        * this part of the architecture is not implemented in hardware on some CPUs, such
26458 +        * as Itanium.  Thus, in general we need to emulate the behavior.  IIM contains
26459 +        * the relative target (not yet sign extended).  So after sign extending it we
26460 +        * simply add it to IIP.  We also need to reset the EI field of the IPSR to zero,
26461 +        * i.e., the slot to restart into.
26462 +        *
26463 +        * cr.iim contains zero_ext(imm21)
26464 +        */
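+       //
+       // Worked example: for imm21 = 0x1FFFFF (i.e. -1), "shl 43" places the
+       // sign bit at bit 63 and the arithmetic "shr 39" yields -16, that is
+       // sign_extend(imm21) * 16: the byte offset of the target bundle that
+       // gets added to cr.iip below.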
26465 +       mov r18=cr.iim
26466 +       ;;
26467 +       mov r17=cr.iip
26468 +       shl r18=r18,43                  // put sign bit in position (43=64-21)
26469 +       ;;
26470 +
26471 +       mov r16=cr.ipsr
26472 +       shr r18=r18,39                  // sign extend (39=43-4)
26473 +       ;;
26474 +
26475 +       add r17=r17,r18                 // now add the offset
26476 +       ;;
26477 +       mov cr.iip=r17
26478 +       dep r16=0,r16,41,2              // clear EI
26479 +       ;;
26480 +
26481 +       mov cr.ipsr=r16
26482 +       ;;
26483 +
26484 +#ifdef CONFIG_XEN
26485 +       XEN_HYPER_RFI;
26486 +#else
26487 +       rfi                             // and go back
26488 +#endif
26489 +END(speculation_vector)
26490 +
26491 +       .org ia64_ivt+0x5800
26492 +/////////////////////////////////////////////////////////////////////////////////////////
26493 +// 0x5800 Entry 28 (size 16 bundles) Reserved
26494 +       DBG_FAULT(28)
26495 +       FAULT(28)
26496 +
26497 +       .org ia64_ivt+0x5900
26498 +/////////////////////////////////////////////////////////////////////////////////////////
26499 +// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
26500 +ENTRY(debug_vector)
26501 +       DBG_FAULT(29)
26502 +       FAULT(29)
26503 +END(debug_vector)
26504 +
26505 +       .org ia64_ivt+0x5a00
26506 +/////////////////////////////////////////////////////////////////////////////////////////
26507 +// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
26508 +ENTRY(unaligned_access)
26509 +       DBG_FAULT(30)
26510 +       mov r31=pr              // prepare to save predicates
26511 +       ;;
26512 +       br.sptk.many dispatch_unaligned_handler
26513 +END(unaligned_access)
26514 +
26515 +       .org ia64_ivt+0x5b00
26516 +/////////////////////////////////////////////////////////////////////////////////////////
26517 +// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
26518 +ENTRY(unsupported_data_reference)
26519 +       DBG_FAULT(31)
26520 +       FAULT(31)
26521 +END(unsupported_data_reference)
26522 +
26523 +       .org ia64_ivt+0x5c00
26524 +/////////////////////////////////////////////////////////////////////////////////////////
26525 +// 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64)
26526 +ENTRY(floating_point_fault)
26527 +       DBG_FAULT(32)
26528 +       FAULT(32)
26529 +END(floating_point_fault)
26530 +
26531 +       .org ia64_ivt+0x5d00
26532 +/////////////////////////////////////////////////////////////////////////////////////////
26533 +// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
26534 +ENTRY(floating_point_trap)
26535 +       DBG_FAULT(33)
26536 +       FAULT(33)
26537 +END(floating_point_trap)
26538 +
26539 +       .org ia64_ivt+0x5e00
26540 +/////////////////////////////////////////////////////////////////////////////////////////
26541 +// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
26542 +ENTRY(lower_privilege_trap)
26543 +       DBG_FAULT(34)
26544 +       FAULT(34)
26545 +END(lower_privilege_trap)
26546 +
26547 +       .org ia64_ivt+0x5f00
26548 +/////////////////////////////////////////////////////////////////////////////////////////
26549 +// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
26550 +ENTRY(taken_branch_trap)
26551 +       DBG_FAULT(35)
26552 +       FAULT(35)
26553 +END(taken_branch_trap)
26554 +
26555 +       .org ia64_ivt+0x6000
26556 +/////////////////////////////////////////////////////////////////////////////////////////
26557 +// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
26558 +ENTRY(single_step_trap)
26559 +       DBG_FAULT(36)
26560 +       FAULT(36)
26561 +END(single_step_trap)
26562 +
26563 +       .org ia64_ivt+0x6100
26564 +/////////////////////////////////////////////////////////////////////////////////////////
26565 +// 0x6100 Entry 37 (size 16 bundles) Reserved
26566 +       DBG_FAULT(37)
26567 +       FAULT(37)
26568 +
26569 +       .org ia64_ivt+0x6200
26570 +/////////////////////////////////////////////////////////////////////////////////////////
26571 +// 0x6200 Entry 38 (size 16 bundles) Reserved
26572 +       DBG_FAULT(38)
26573 +       FAULT(38)
26574 +
26575 +       .org ia64_ivt+0x6300
26576 +/////////////////////////////////////////////////////////////////////////////////////////
26577 +// 0x6300 Entry 39 (size 16 bundles) Reserved
26578 +       DBG_FAULT(39)
26579 +       FAULT(39)
26580 +
26581 +       .org ia64_ivt+0x6400
26582 +/////////////////////////////////////////////////////////////////////////////////////////
26583 +// 0x6400 Entry 40 (size 16 bundles) Reserved
26584 +       DBG_FAULT(40)
26585 +       FAULT(40)
26586 +
26587 +       .org ia64_ivt+0x6500
26588 +/////////////////////////////////////////////////////////////////////////////////////////
26589 +// 0x6500 Entry 41 (size 16 bundles) Reserved
26590 +       DBG_FAULT(41)
26591 +       FAULT(41)
26592 +
26593 +       .org ia64_ivt+0x6600
26594 +/////////////////////////////////////////////////////////////////////////////////////////
26595 +// 0x6600 Entry 42 (size 16 bundles) Reserved
26596 +       DBG_FAULT(42)
26597 +       FAULT(42)
26598 +
26599 +       .org ia64_ivt+0x6700
26600 +/////////////////////////////////////////////////////////////////////////////////////////
26601 +// 0x6700 Entry 43 (size 16 bundles) Reserved
26602 +       DBG_FAULT(43)
26603 +       FAULT(43)
26604 +
26605 +       .org ia64_ivt+0x6800
26606 +/////////////////////////////////////////////////////////////////////////////////////////
26607 +// 0x6800 Entry 44 (size 16 bundles) Reserved
26608 +       DBG_FAULT(44)
26609 +       FAULT(44)
26610 +
26611 +       .org ia64_ivt+0x6900
26612 +/////////////////////////////////////////////////////////////////////////////////////////
26613 +// 0x6900 Entry 45 (size 16 bundles) IA-32 Exception (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
26614 +ENTRY(ia32_exception)
26615 +       DBG_FAULT(45)
26616 +       FAULT(45)
26617 +END(ia32_exception)
26618 +
26619 +       .org ia64_ivt+0x6a00
26620 +/////////////////////////////////////////////////////////////////////////////////////////
26621 +// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept  (30,31,59,70,71)
26622 +ENTRY(ia32_intercept)
26623 +       DBG_FAULT(46)
26624 +#ifdef CONFIG_IA32_SUPPORT
26625 +       mov r31=pr
26626 +       mov r16=cr.isr
26627 +       ;;
26628 +       extr.u r17=r16,16,8     // get ISR.code
26629 +       mov r18=ar.eflag
26630 +       mov r19=cr.iim          // old eflag value
26631 +       ;;
26632 +       cmp.ne p6,p0=2,r17
26633 +(p6)   br.cond.spnt 1f         // not a system flag fault
26634 +       xor r16=r18,r19
26635 +       ;;
26636 +       extr.u r17=r16,18,1     // get the eflags.ac bit
26637 +       ;;
26638 +       cmp.eq p6,p0=0,r17
26639 +(p6)   br.cond.spnt 1f         // eflags.ac bit didn't change
26640 +       ;;
26641 +       mov pr=r31,-1           // restore predicate registers
26642 +#ifdef CONFIG_XEN
26643 +       XEN_HYPER_RFI;
26644 +#else
26645 +       rfi
26646 +#endif
26647 +
26648 +1:
26649 +#endif // CONFIG_IA32_SUPPORT
26650 +       FAULT(46)
26651 +END(ia32_intercept)
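+
+       // The only IA-32 intercept resolved inline above is a System Flag
+       // intercept (ISR.code == 2) in which nothing but eflags.ac changed;
+       // the change is ignored and execution resumes via rfi (or the
+       // XEN_HYPER_RFI hyperprivop under Xen).  Everything else reaches
+       // FAULT(46).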
26652 +
26653 +       .org ia64_ivt+0x6b00
26654 +/////////////////////////////////////////////////////////////////////////////////////////
26655 +// 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt  (74)
26656 +ENTRY(ia32_interrupt)
26657 +       DBG_FAULT(47)
26658 +#ifdef CONFIG_IA32_SUPPORT
26659 +       mov r31=pr
26660 +       br.sptk.many dispatch_to_ia32_handler
26661 +#else
26662 +       FAULT(47)
26663 +#endif
26664 +END(ia32_interrupt)
26665 +
26666 +       .org ia64_ivt+0x6c00
26667 +/////////////////////////////////////////////////////////////////////////////////////////
26668 +// 0x6c00 Entry 48 (size 16 bundles) Reserved
26669 +       DBG_FAULT(48)
26670 +       FAULT(48)
26671 +
26672 +       .org ia64_ivt+0x6d00
26673 +/////////////////////////////////////////////////////////////////////////////////////////
26674 +// 0x6d00 Entry 49 (size 16 bundles) Reserved
26675 +       DBG_FAULT(49)
26676 +       FAULT(49)
26677 +
26678 +       .org ia64_ivt+0x6e00
26679 +/////////////////////////////////////////////////////////////////////////////////////////
26680 +// 0x6e00 Entry 50 (size 16 bundles) Reserved
26681 +       DBG_FAULT(50)
26682 +       FAULT(50)
26683 +
26684 +       .org ia64_ivt+0x6f00
26685 +/////////////////////////////////////////////////////////////////////////////////////////
26686 +// 0x6f00 Entry 51 (size 16 bundles) Reserved
26687 +       DBG_FAULT(51)
26688 +       FAULT(51)
26689 +
26690 +       .org ia64_ivt+0x7000
26691 +/////////////////////////////////////////////////////////////////////////////////////////
26692 +// 0x7000 Entry 52 (size 16 bundles) Reserved
26693 +       DBG_FAULT(52)
26694 +       FAULT(52)
26695 +
26696 +       .org ia64_ivt+0x7100
26697 +/////////////////////////////////////////////////////////////////////////////////////////
26698 +// 0x7100 Entry 53 (size 16 bundles) Reserved
26699 +       DBG_FAULT(53)
26700 +       FAULT(53)
26701 +
26702 +       .org ia64_ivt+0x7200
26703 +/////////////////////////////////////////////////////////////////////////////////////////
26704 +// 0x7200 Entry 54 (size 16 bundles) Reserved
26705 +       DBG_FAULT(54)
26706 +       FAULT(54)
26707 +
26708 +       .org ia64_ivt+0x7300
26709 +/////////////////////////////////////////////////////////////////////////////////////////
26710 +// 0x7300 Entry 55 (size 16 bundles) Reserved
26711 +       DBG_FAULT(55)
26712 +       FAULT(55)
26713 +
26714 +       .org ia64_ivt+0x7400
26715 +/////////////////////////////////////////////////////////////////////////////////////////
26716 +// 0x7400 Entry 56 (size 16 bundles) Reserved
26717 +       DBG_FAULT(56)
26718 +       FAULT(56)
26719 +
26720 +       .org ia64_ivt+0x7500
26721 +/////////////////////////////////////////////////////////////////////////////////////////
26722 +// 0x7500 Entry 57 (size 16 bundles) Reserved
26723 +       DBG_FAULT(57)
26724 +       FAULT(57)
26725 +
26726 +       .org ia64_ivt+0x7600
26727 +/////////////////////////////////////////////////////////////////////////////////////////
26728 +// 0x7600 Entry 58 (size 16 bundles) Reserved
26729 +       DBG_FAULT(58)
26730 +       FAULT(58)
26731 +
26732 +       .org ia64_ivt+0x7700
26733 +/////////////////////////////////////////////////////////////////////////////////////////
26734 +// 0x7700 Entry 59 (size 16 bundles) Reserved
26735 +       DBG_FAULT(59)
26736 +       FAULT(59)
26737 +
26738 +       .org ia64_ivt+0x7800
26739 +/////////////////////////////////////////////////////////////////////////////////////////
26740 +// 0x7800 Entry 60 (size 16 bundles) Reserved
26741 +       DBG_FAULT(60)
26742 +       FAULT(60)
26743 +
26744 +       .org ia64_ivt+0x7900
26745 +/////////////////////////////////////////////////////////////////////////////////////////
26746 +// 0x7900 Entry 61 (size 16 bundles) Reserved
26747 +       DBG_FAULT(61)
26748 +       FAULT(61)
26749 +
26750 +       .org ia64_ivt+0x7a00
26751 +/////////////////////////////////////////////////////////////////////////////////////////
26752 +// 0x7a00 Entry 62 (size 16 bundles) Reserved
26753 +       DBG_FAULT(62)
26754 +       FAULT(62)
26755 +
26756 +       .org ia64_ivt+0x7b00
26757 +/////////////////////////////////////////////////////////////////////////////////////////
26758 +// 0x7b00 Entry 63 (size 16 bundles) Reserved
26759 +       DBG_FAULT(63)
26760 +       FAULT(63)
26761 +
26762 +       .org ia64_ivt+0x7c00
26763 +/////////////////////////////////////////////////////////////////////////////////////////
26764 +// 0x7c00 Entry 64 (size 16 bundles) Reserved
26765 +       DBG_FAULT(64)
26766 +       FAULT(64)
26767 +
26768 +       .org ia64_ivt+0x7d00
26769 +/////////////////////////////////////////////////////////////////////////////////////////
26770 +// 0x7d00 Entry 65 (size 16 bundles) Reserved
26771 +       DBG_FAULT(65)
26772 +       FAULT(65)
26773 +
26774 +       .org ia64_ivt+0x7e00
26775 +/////////////////////////////////////////////////////////////////////////////////////////
26776 +// 0x7e00 Entry 66 (size 16 bundles) Reserved
26777 +       DBG_FAULT(66)
26778 +       FAULT(66)
26779 +
26780 +#ifdef CONFIG_XEN
26781 +       /*
26782 +        * There is no particular reason for this code to be here, other than that
26783 +        * there happens to be space here that would go unused otherwise.  If this
26784 +        * fault ever gets "unreserved", simply move the following code to a more
26785 +        * suitable spot...
26786 +        */
26787 +
26788 +GLOBAL_ENTRY(xen_bsw1)
26789 +       /* FIXME: THIS CODE IS NOT NaT SAFE! */
26790 +       movl r30=XSI_BANKNUM;
26791 +       mov r31=1;;
26792 +       st4 [r30]=r31;
26793 +       movl r30=XSI_BANK1_R16;
26794 +       movl r31=XSI_BANK1_R16+8;;
26795 +       ld8 r16=[r30],16; ld8 r17=[r31],16;;
26796 +       ld8 r18=[r30],16; ld8 r19=[r31],16;;
26797 +       ld8 r20=[r30],16; ld8 r21=[r31],16;;
26798 +       ld8 r22=[r30],16; ld8 r23=[r31],16;;
26799 +       ld8 r24=[r30],16; ld8 r25=[r31],16;;
26800 +       ld8 r26=[r30],16; ld8 r27=[r31],16;;
26801 +       ld8 r28=[r30],16; ld8 r29=[r31],16;;
26802 +       ld8 r30=[r30]; ld8 r31=[r31];;
26803 +       br.ret.sptk.many b0
26804 +END(xen_bsw1)
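+
+       // xen_bsw1 stands in for the privileged "bsw.1" bank switch: it records
+       // in XSI_BANKNUM that the guest now runs on register bank 1 and reloads
+       // r16-r31 from the bank-1 register image starting at XSI_BANK1_R16
+       // (hence the FIXME above: plain ld8 loads cannot restore NaT bits).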
26805 +#endif
26806 +
26807 +       .org ia64_ivt+0x7f00
26808 +/////////////////////////////////////////////////////////////////////////////////////////
26809 +// 0x7f00 Entry 67 (size 16 bundles) Reserved
26810 +       DBG_FAULT(67)
26811 +       FAULT(67)
26812 +
26813 +#ifdef CONFIG_IA32_SUPPORT
26814 +
26815 +       /*
26816 +        * There is no particular reason for this code to be here, other than that
26817 +        * there happens to be space here that would go unused otherwise.  If this
26818 +        * fault ever gets "unreserved", simply move the following code to a more
26819 +        * suitable spot...
26820 +        */
26821 +
26822 +       // IA32 interrupt entry point
26823 +
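+       //
+       // Interruption number 0x80 is treated as an IA-32 system call: the e*x
+       // argument registers are recovered from their pt_regs slots and the call
+       // is dispatched through ia32_syscall_table (or ia32_trace_syscall when
+       // syscall tracing/audit is enabled); any other number is handed to
+       // ia32_bad_interrupt.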
26824 +ENTRY(dispatch_to_ia32_handler)
26825 +       SAVE_MIN
26826 +       ;;
26827 +       mov r14=cr.isr
26828 +       ssm psr.ic | PSR_DEFAULT_BITS
26829 +       ;;
26830 +       srlz.i                                  // guarantee that interruption collection is on
26831 +       ;;
26832 +(p15)  ssm psr.i
26833 +       adds r3=8,r2            // Base pointer for SAVE_REST
26834 +       ;;
26835 +       SAVE_REST
26836 +       ;;
26837 +       mov r15=0x80
26838 +       shr r14=r14,16          // Get interrupt number
26839 +       ;;
26840 +       cmp.ne p6,p0=r14,r15
26841 +(p6)   br.call.dpnt.many b6=non_ia32_syscall
26842 +
26843 +       adds r14=IA64_PT_REGS_R8_OFFSET + 16,sp // 16 byte hole per SW conventions
26844 +       adds r15=IA64_PT_REGS_R1_OFFSET + 16,sp
26845 +       ;;
26846 +       cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0
26847 +       ld8 r8=[r14]            // get r8
26848 +       ;;
26849 +       st8 [r15]=r8            // save original EAX in r1 (IA32 procs don't use the GP)
26850 +       ;;
26851 +       alloc r15=ar.pfs,0,0,6,0        // must be first in an insn group
26852 +       ;;
26853 +       ld4 r8=[r14],8          // r8 == eax (syscall number)
26854 +       mov r15=IA32_NR_syscalls
26855 +       ;;
26856 +       cmp.ltu.unc p6,p7=r8,r15
26857 +       ld4 out1=[r14],8        // r9 == ecx
26858 +       ;;
26859 +       ld4 out2=[r14],8        // r10 == edx
26860 +       ;;
26861 +       ld4 out0=[r14]          // r11 == ebx
26862 +       adds r14=(IA64_PT_REGS_R13_OFFSET) + 16,sp
26863 +       ;;
26864 +       ld4 out5=[r14],PT(R14)-PT(R13)  // r13 == ebp
26865 +       ;;
26866 +       ld4 out3=[r14],PT(R15)-PT(R14)  // r14 == esi
26867 +       adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
26868 +       ;;
26869 +       ld4 out4=[r14]          // r15 == edi
26870 +       movl r16=ia32_syscall_table
26871 +       ;;
26872 +(p6)   shladd r16=r8,3,r16     // force ni_syscall if not valid syscall number
26873 +       ld4 r2=[r2]             // r2 = current_thread_info()->flags
26874 +       ;;
26875 +       ld8 r16=[r16]
26876 +       and r2=_TIF_SYSCALL_TRACEAUDIT,r2       // mask trace or audit
26877 +       ;;
26878 +       mov b6=r16
26879 +       movl r15=ia32_ret_from_syscall
26880 +       cmp.eq p8,p0=r2,r0
26881 +       ;;
26882 +       mov rp=r15
26883 +(p8)   br.call.sptk.many b6=b6
26884 +       br.cond.sptk ia32_trace_syscall
26885 +
26886 +non_ia32_syscall:
26887 +       alloc r15=ar.pfs,0,0,2,0
26888 +       mov out0=r14                            // interrupt #
26889 +       add out1=16,sp                          // pointer to pt_regs
26890 +       ;;                      // avoid WAW on CFM
26891 +       br.call.sptk.many rp=ia32_bad_interrupt
26892 +.ret1: movl r15=ia64_leave_kernel
26893 +       ;;
26894 +       mov rp=r15
26895 +       br.ret.sptk.many rp
26896 +END(dispatch_to_ia32_handler)
26897 +#endif /* CONFIG_IA32_SUPPORT */
26898 +
26899 +#ifdef CONFIG_XEN
26900 +       .section .text,"ax"
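+       // xen_event_callback is the callback Xen raises to deliver pending
+       // event-channel notifications: it builds pt_regs, re-enables the
+       // virtual psr.ic through XSI_PSR_IC, and calls evtchn_do_upcall with a
+       // pointer to pt_regs, returning through ia64_leave_kernel.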
26901 +GLOBAL_ENTRY(xen_event_callback)
26902 +       mov r31=pr              // prepare to save predicates
26903 +       ;;
26904 +       SAVE_MIN_WITH_COVER     // uses r31; defines r2 and r3
26905 +       ;;
26906 +       movl r3=XSI_PSR_IC
26907 +       mov r14=1
26908 +       ;;
26909 +       st4 [r3]=r14
26910 +       ;;
26911 +       adds r3=8,r2            // set up second base pointer for SAVE_REST
26912 +       srlz.i                  // ensure everybody knows psr.ic is back on
26913 +       ;;
26914 +       SAVE_REST
26915 +       ;;
26916 +       alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group
26917 +       add out0=16,sp          // pass pointer to pt_regs as first arg
26918 +       ;;
26919 +       srlz.d                  // make sure we see the effect of cr.ivr
26920 +       movl r14=ia64_leave_kernel
26921 +       ;;
26922 +       mov rp=r14
26923 +       br.call.sptk.many b6=evtchn_do_upcall
26924 +END(xen_event_callback)
26925 +#endif
26926 diff -urNp linux-2.6/arch/ia64/xen/xenminstate.h new/arch/ia64/xen/xenminstate.h
26927 --- linux-2.6/arch/ia64/xen/xenminstate.h       1970-01-01 01:00:00.000000000 +0100
26928 +++ new/arch/ia64/xen/xenminstate.h     2006-05-23 18:42:17.000000000 +0200
26929 @@ -0,0 +1,369 @@
26930 +#include <linux/config.h>
26931 +
26932 +#include <asm/cache.h>
26933 +
26934 +#ifdef CONFIG_XEN
26935 +#include "../kernel/entry.h"
26936 +#else
26937 +#include "entry.h"
26938 +#endif
26939 +
26940 +/*
26941 + * For ivt.s we want to access the stack virtually so we don't have to disable translation
26942 + * on interrupts.
26943 + *
26944 + *  On entry:
26945 + *     r1:     pointer to current task (ar.k6)
26946 + */
26947 +#define MINSTATE_START_SAVE_MIN_VIRT                                                           \
26948 +(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */     \
26949 +       ;;                                                                                      \
26950 +(pUStk)        mov.m r24=ar.rnat;                                                                      \
26951 +(pUStk)        addl r22=IA64_RBS_OFFSET,r1;                    /* compute base of RBS */               \
26952 +(pKStk) mov r1=sp;                                     /* get sp  */                           \
26953 +       ;;                                                                                      \
26954 +(pUStk) lfetch.fault.excl.nt1 [r22];                                                           \
26955 +(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base of memory stack */      \
26956 +(pUStk)        mov r23=ar.bspstore;                            /* save ar.bspstore */                  \
26957 +       ;;                                                                                      \
26958 +(pUStk)        mov ar.bspstore=r22;                            /* switch to kernel RBS */              \
26959 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;                 /* if in kernel mode, use sp (r12) */   \
26960 +       ;;                                                                                      \
26961 +(pUStk)        mov r18=ar.bsp;                                                                         \
26962 +(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, loadrs=0 */             \
26963 +
26964 +#define MINSTATE_END_SAVE_MIN_VIRT                                                             \
26965 +       bsw.1;                  /* switch back to bank 1 (must be last in insn group) */        \
26966 +       ;;
26967 +
26968 +/*
26969 + * For mca_asm.S we want to access the stack physically since the state is saved before we
26970 + * go virtual and don't want to destroy the iip or ipsr.
26971 + */
26972 +#define MINSTATE_START_SAVE_MIN_PHYS                                                           \
26973 +(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;                                                         \
26974 +(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;                                                   \
26975 +(pKStk) ld8 r3 = [r3];;                                                                                \
26976 +(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;                                            \
26977 +(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;                                          \
26978 +(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */     \
26979 +(pUStk)        addl r22=IA64_RBS_OFFSET,r1;            /* compute base of register backing store */    \
26980 +       ;;                                                                                      \
26981 +(pUStk)        mov r24=ar.rnat;                                                                        \
26982 +(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base of memory stack */      \
26983 +(pUStk)        mov r23=ar.bspstore;                            /* save ar.bspstore */                  \
26984 +(pUStk)        dep r22=-1,r22,61,3;                    /* compute kernel virtual addr of RBS */        \
26985 +       ;;                                                                                      \
26986 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;         /* if in kernel mode, use sp (r12) */           \
26987 +(pUStk)        mov ar.bspstore=r22;                    /* switch to kernel RBS */                      \
26988 +       ;;                                                                                      \
26989 +(pUStk)        mov r18=ar.bsp;                                                                         \
26990 +(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, loadrs=0 */             \
26991 +
26992 +#define MINSTATE_END_SAVE_MIN_PHYS                                                             \
26993 +       dep r12=-1,r12,61,3;            /* make sp a kernel virtual address */                  \
26994 +       ;;
26995 +
26996 +#ifdef MINSTATE_VIRT
26997 +# define MINSTATE_GET_CURRENT(reg)     mov reg=IA64_KR(CURRENT)
26998 +# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_VIRT
26999 +# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_VIRT
27000 +#endif
27001 +
27002 +#ifdef MINSTATE_PHYS
27003 +# define MINSTATE_GET_CURRENT(reg)     mov reg=IA64_KR(CURRENT);; tpa reg=reg
27004 +# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_PHYS
27005 +# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_PHYS
27006 +#endif
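+
+/*
+ * As with the native minstate.h, the file including this header is expected to
+ * define either MINSTATE_VIRT (ordinary interruption handlers working on the
+ * virtually mapped kernel stack) or MINSTATE_PHYS (MCA/INIT paths that must
+ * stay physical) before using the definitions below.
+ */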
27007 +
27008 +/*
27009 + * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
27010 + * the minimum state necessary that allows us to turn psr.ic back
27011 + * on.
27012 + *
27013 + * Assumed state upon entry:
27014 + *     psr.ic: off
27015 + *     r31:    contains saved predicates (pr)
27016 + *
27017 + * Upon exit, the state is as follows:
27018 + *     psr.ic: off
27019 + *      r2 = points to &pt_regs.r16
27020 + *      r8 = contents of ar.ccv
27021 + *      r9 = contents of ar.csd
27022 + *     r10 = contents of ar.ssd
27023 + *     r11 = FPSR_DEFAULT
27024 + *     r12 = kernel sp (kernel virtual address)
27025 + *     r13 = points to current task_struct (kernel virtual address)
27026 + *     p15 = TRUE if psr.i is set in cr.ipsr
27027 + *     predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
27028 + *             preserved
27029 + * CONFIG_XEN note: p6/p7 are not preserved
27030 + *
27031 + * Note that psr.ic is NOT turned on by this macro.  This is so that
27032 + * we can pass interruption state as arguments to a handler.
27033 + */
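+/*
+ * In the CONFIG_XEN variant below, the privileged cr.ipsr/cr.iip reads are
+ * replaced by loads from the XSI_IPSR/XSI_IIP slots, the possibly lazy "cover"
+ * is resolved through XSI_INCOMPL_REGFR (selecting XSI_PRECOVER_IFS or XSI_IFS
+ * as the saved cr.ifs), and the bank switch is performed by calling xen_bsw1
+ * rather than bsw.1; this is why p6/p7 are not preserved, as noted above.
+ */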
27034 +#ifdef CONFIG_XEN
27035 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                                                      \
27036 +       MINSTATE_GET_CURRENT(r16);      /* M (or M;;I) */                                       \
27037 +       mov r27=ar.rsc;                 /* M */                                                 \
27038 +       mov r20=r1;                     /* A */                                                 \
27039 +       mov r25=ar.unat;                /* M */                                                 \
27040 +       /* mov r29=cr.ipsr;             /* M */                                                 \
27041 +       movl r29=XSI_IPSR;;                                                                     \
27042 +       ld8 r29=[r29];;                                                                         \
27043 +       mov r26=ar.pfs;                 /* I */                                                 \
27044 +       /* mov r28=cr.iip;              /* M */                                                 \
27045 +       movl r28=XSI_IIP;;                                                                      \
27046 +       ld8 r28=[r28];;                                                                         \
27047 +       mov r21=ar.fpsr;                /* M */                                                 \
27048 +       COVER;                  /* B;; (or nothing) */                                  \
27049 +       ;;                                                                                      \
27050 +       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;                                         \
27051 +       ;;                                                                                      \
27052 +       ld1 r17=[r16];                          /* load current->thread.on_ustack flag */       \
27053 +       st1 [r16]=r0;                           /* clear current->thread.on_ustack flag */      \
27054 +       adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16                                          \
27055 +       /* switch from user to kernel RBS: */                                                   \
27056 +       ;;                                                                                      \
27057 +       invala;                         /* M */                                                 \
27058 +       /* SAVE_IFS; /* see xen special handling below */                                               \
27059 +       cmp.eq pKStk,pUStk=r0,r17;              /* are we in kernel mode already? */            \
27060 +       ;;                                                                                      \
27061 +       MINSTATE_START_SAVE_MIN                                                                 \
27062 +       adds r17=2*L1_CACHE_BYTES,r1;           /* really: biggest cache-line size */           \
27063 +       adds r16=PT(CR_IPSR),r1;                                                                \
27064 +       ;;                                                                                      \
27065 +       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;                                             \
27066 +       st8 [r16]=r29;          /* save cr.ipsr */                                              \
27067 +       ;;                                                                                      \
27068 +       lfetch.fault.excl.nt1 [r17];                                                            \
27069 +       tbit.nz p15,p0=r29,IA64_PSR_I_BIT;                                                      \
27070 +       mov r29=b0                                                                              \
27071 +       ;;                                                                                      \
27072 +       adds r16=PT(R8),r1;     /* initialize first base pointer */                             \
27073 +       adds r17=PT(R9),r1;     /* initialize second base pointer */                            \
27074 +(pKStk)        mov r18=r0;             /* make sure r18 isn't NaT */                                   \
27075 +       ;;                                                                                      \
27076 +.mem.offset 0,0; st8.spill [r16]=r8,16;                                                                \
27077 +.mem.offset 8,0; st8.spill [r17]=r9,16;                                                                \
27078 +        ;;                                                                                     \
27079 +.mem.offset 0,0; st8.spill [r16]=r10,24;                                                       \
27080 +.mem.offset 8,0; st8.spill [r17]=r11,24;                                                       \
27081 +        ;;                                                                                     \
27082 +       /* xen special handling for possibly lazy cover */                                      \
27083 +       movl r8=XSI_INCOMPL_REGFR;                                                              \
27084 +       ;;                                                                                      \
27085 +       ld4 r30=[r8];                                                                           \
27086 +       ;;                                                                                      \
27087 +       /* set XSI_INCOMPL_REGFR 0 */                                                           \
27088 +       st4 [r8]=r0;                                                                            \
27089 +       cmp.eq  p6,p7=r30,r0;                                                                   \
27090 +       ;; /* not sure if this stop bit is necessary */                                         \
27091 +(p6)   adds r8=XSI_PRECOVER_IFS-XSI_INCOMPL_REGFR,r8;                                          \
27092 +(p7)   adds r8=XSI_IFS-XSI_INCOMPL_REGFR,r8;                                                   \
27093 +       ;;                                                                                      \
27094 +       ld8 r30=[r8];                                                                           \
27095 +       ;;                                                                                      \
27096 +       st8 [r16]=r28,16;       /* save cr.iip */                                               \
27097 +       st8 [r17]=r30,16;       /* save cr.ifs */                                               \
27098 +(pUStk)        sub r18=r18,r22;        /* r18=RSE.ndirty*8 */                                          \
27099 +       mov r8=ar.ccv;                                                                          \
27100 +       mov r9=ar.csd;                                                                          \
27101 +       mov r10=ar.ssd;                                                                         \
27102 +       movl r11=FPSR_DEFAULT;   /* L-unit */                                                   \
27103 +       ;;                                                                                      \
27104 +       st8 [r16]=r25,16;       /* save ar.unat */                                              \
27105 +       st8 [r17]=r26,16;       /* save ar.pfs */                                               \
27106 +       shl r18=r18,16;         /* compute ar.rsc to be used for "loadrs" */                    \
27107 +       ;;                                                                                      \
27108 +       st8 [r16]=r27,16;       /* save ar.rsc */                                               \
27109 +(pUStk)        st8 [r17]=r24,16;       /* save ar.rnat */                                              \
27110 +(pKStk)        adds r17=16,r17;        /* skip over ar_rnat field */                                   \
27111 +       ;;                      /* avoid RAW on r16 & r17 */                                    \
27112 +(pUStk)        st8 [r16]=r23,16;       /* save ar.bspstore */                                          \
27113 +       st8 [r17]=r31,16;       /* save predicates */                                           \
27114 +(pKStk)        adds r16=16,r16;        /* skip over ar_bspstore field */                               \
27115 +       ;;                                                                                      \
27116 +       st8 [r16]=r29,16;       /* save b0 */                                                   \
27117 +       st8 [r17]=r18,16;       /* save ar.rsc value for "loadrs" */                            \
27118 +       cmp.eq pNonSys,pSys=r0,r0       /* initialize pSys=0, pNonSys=1 */                      \
27119 +       ;;                                                                                      \
27120 +.mem.offset 0,0; st8.spill [r16]=r20,16;       /* save original r1 */                          \
27121 +.mem.offset 8,0; st8.spill [r17]=r12,16;                                                       \
27122 +       adds r12=-16,r1;        /* switch to kernel memory stack (with 16 bytes of scratch) */  \
27123 +       ;;                                                                                      \
27124 +.mem.offset 0,0; st8.spill [r16]=r13,16;                                                       \
27125 +.mem.offset 8,0; st8.spill [r17]=r21,16;       /* save ar.fpsr */                              \
27126 +       mov r13=IA64_KR(CURRENT);       /* establish `current' */                               \
27127 +       ;;                                                                                      \
27128 +.mem.offset 0,0; st8.spill [r16]=r15,16;                                                       \
27129 +.mem.offset 8,0; st8.spill [r17]=r14,16;                                                       \
27130 +       ;;                                                                                      \
27131 +.mem.offset 0,0; st8.spill [r16]=r2,16;                                                                \
27132 +.mem.offset 8,0; st8.spill [r17]=r3,16;                                                                \
27133 +       ;;                                                                                      \
27134 +       EXTRA;                                                                                  \
27135 +       mov r2=b0; br.call.sptk b0=xen_bsw1;; mov b0=r2;                                        \
27136 +       adds r2=IA64_PT_REGS_R16_OFFSET,r1;                                                     \
27137 +       ;;                                                                                      \
27138 +       movl r1=__gp;           /* establish kernel global pointer */                           \
27139 +       ;;                                                                                      \
27140 +       /* MINSTATE_END_SAVE_MIN */
27141 +#else
27142 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                                                      \
27143 +       MINSTATE_GET_CURRENT(r16);      /* M (or M;;I) */                                       \
27144 +       mov r27=ar.rsc;                 /* M */                                                 \
27145 +       mov r20=r1;                     /* A */                                                 \
27146 +       mov r25=ar.unat;                /* M */                                                 \
27147 +       mov r29=cr.ipsr;                /* M */                                                 \
27148 +       mov r26=ar.pfs;                 /* I */                                                 \
27149 +       mov r28=cr.iip;                 /* M */                                                 \
27150 +       mov r21=ar.fpsr;                /* M */                                                 \
27151 +       COVER;                          /* B;; (or nothing) */                                  \
27152 +       ;;                                                                                      \
27153 +       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;                                         \
27154 +       ;;                                                                                      \
27155 +       ld1 r17=[r16];                          /* load current->thread.on_ustack flag */       \
27156 +       st1 [r16]=r0;                           /* clear current->thread.on_ustack flag */      \
27157 +       adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16                                          \
27158 +       /* switch from user to kernel RBS: */                                                   \
27159 +       ;;                                                                                      \
27160 +       invala;                         /* M */                                                 \
27161 +       SAVE_IFS;                                                                               \
27162 +       cmp.eq pKStk,pUStk=r0,r17;              /* are we in kernel mode already? */            \
27163 +       ;;                                                                                      \
27164 +       MINSTATE_START_SAVE_MIN                                                                 \
27165 +       adds r17=2*L1_CACHE_BYTES,r1;           /* really: biggest cache-line size */           \
27166 +       adds r16=PT(CR_IPSR),r1;                                                                \
27167 +       ;;                                                                                      \
27168 +       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;                                             \
27169 +       st8 [r16]=r29;          /* save cr.ipsr */                                              \
27170 +       ;;                                                                                      \
27171 +       lfetch.fault.excl.nt1 [r17];                                                            \
27172 +       tbit.nz p15,p0=r29,IA64_PSR_I_BIT;                                                      \
27173 +       mov r29=b0                                                                              \
27174 +       ;;                                                                                      \
27175 +       adds r16=PT(R8),r1;     /* initialize first base pointer */                             \
27176 +       adds r17=PT(R9),r1;     /* initialize second base pointer */                            \
27177 +(pKStk)        mov r18=r0;             /* make sure r18 isn't NaT */                                   \
27178 +       ;;                                                                                      \
27179 +.mem.offset 0,0; st8.spill [r16]=r8,16;                                                                \
27180 +.mem.offset 8,0; st8.spill [r17]=r9,16;                                                                \
27181 +        ;;                                                                                     \
27182 +.mem.offset 0,0; st8.spill [r16]=r10,24;                                                       \
27183 +.mem.offset 8,0; st8.spill [r17]=r11,24;                                                       \
27184 +        ;;                                                                                     \
27185 +       st8 [r16]=r28,16;       /* save cr.iip */                                               \
27186 +       st8 [r17]=r30,16;       /* save cr.ifs */                                               \
27187 +(pUStk)        sub r18=r18,r22;        /* r18=RSE.ndirty*8 */                                          \
27188 +       mov r8=ar.ccv;                                                                          \
27189 +       mov r9=ar.csd;                                                                          \
27190 +       mov r10=ar.ssd;                                                                         \
27191 +       movl r11=FPSR_DEFAULT;   /* L-unit */                                                   \
27192 +       ;;                                                                                      \
27193 +       st8 [r16]=r25,16;       /* save ar.unat */                                              \
27194 +       st8 [r17]=r26,16;       /* save ar.pfs */                                               \
27195 +       shl r18=r18,16;         /* compute ar.rsc to be used for "loadrs" */                    \
27196 +       ;;                                                                                      \
27197 +       st8 [r16]=r27,16;       /* save ar.rsc */                                               \
27198 +(pUStk)        st8 [r17]=r24,16;       /* save ar.rnat */                                              \
27199 +(pKStk)        adds r17=16,r17;        /* skip over ar_rnat field */                                   \
27200 +       ;;                      /* avoid RAW on r16 & r17 */                                    \
27201 +(pUStk)        st8 [r16]=r23,16;       /* save ar.bspstore */                                          \
27202 +       st8 [r17]=r31,16;       /* save predicates */                                           \
27203 +(pKStk)        adds r16=16,r16;        /* skip over ar_bspstore field */                               \
27204 +       ;;                                                                                      \
27205 +       st8 [r16]=r29,16;       /* save b0 */                                                   \
27206 +       st8 [r17]=r18,16;       /* save ar.rsc value for "loadrs" */                            \
27207 +       cmp.eq pNonSys,pSys=r0,r0       /* initialize pSys=0, pNonSys=1 */                      \
27208 +       ;;                                                                                      \
27209 +.mem.offset 0,0; st8.spill [r16]=r20,16;       /* save original r1 */                          \
27210 +.mem.offset 8,0; st8.spill [r17]=r12,16;                                                       \
27211 +       adds r12=-16,r1;        /* switch to kernel memory stack (with 16 bytes of scratch) */  \
27212 +       ;;                                                                                      \
27213 +.mem.offset 0,0; st8.spill [r16]=r13,16;                                                       \
27214 +.mem.offset 8,0; st8.spill [r17]=r21,16;       /* save ar.fpsr */                              \
27215 +       mov r13=IA64_KR(CURRENT);       /* establish `current' */                               \
27216 +       ;;                                                                                      \
27217 +.mem.offset 0,0; st8.spill [r16]=r15,16;                                                       \
27218 +.mem.offset 8,0; st8.spill [r17]=r14,16;                                                       \
27219 +       ;;                                                                                      \
27220 +.mem.offset 0,0; st8.spill [r16]=r2,16;                                                                \
27221 +.mem.offset 8,0; st8.spill [r17]=r3,16;                                                                \
27222 +       adds r2=IA64_PT_REGS_R16_OFFSET,r1;                                                     \
27223 +       ;;                                                                                      \
27224 +       EXTRA;                                                                                  \
27225 +       movl r1=__gp;           /* establish kernel global pointer */                           \
27226 +       ;;                                                                                      \
27227 +       MINSTATE_END_SAVE_MIN
27228 +#endif
27229 +
27230 +/*
27231 + * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
27232 + *
27233 + * Assumed state upon entry:
27234 + *     psr.ic: on
27235 + *     r2:     points to &pt_regs.r16
27236 + *     r3:     points to &pt_regs.r17
27237 + *     r8:     contents of ar.ccv
27238 + *     r9:     contents of ar.csd
27239 + *     r10:    contents of ar.ssd
27240 + *     r11:    FPSR_DEFAULT
27241 + *
27242 + * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
27243 + */
27244 +#define SAVE_REST                              \
27245 +.mem.offset 0,0; st8.spill [r2]=r16,16;                \
27246 +.mem.offset 8,0; st8.spill [r3]=r17,16;                \
27247 +       ;;                                      \
27248 +.mem.offset 0,0; st8.spill [r2]=r18,16;                \
27249 +.mem.offset 8,0; st8.spill [r3]=r19,16;                \
27250 +       ;;                                      \
27251 +.mem.offset 0,0; st8.spill [r2]=r20,16;                \
27252 +.mem.offset 8,0; st8.spill [r3]=r21,16;                \
27253 +       mov r18=b6;                             \
27254 +       ;;                                      \
27255 +.mem.offset 0,0; st8.spill [r2]=r22,16;                \
27256 +.mem.offset 8,0; st8.spill [r3]=r23,16;                \
27257 +       mov r19=b7;                             \
27258 +       ;;                                      \
27259 +.mem.offset 0,0; st8.spill [r2]=r24,16;                \
27260 +.mem.offset 8,0; st8.spill [r3]=r25,16;                \
27261 +       ;;                                      \
27262 +.mem.offset 0,0; st8.spill [r2]=r26,16;                \
27263 +.mem.offset 8,0; st8.spill [r3]=r27,16;                \
27264 +       ;;                                      \
27265 +.mem.offset 0,0; st8.spill [r2]=r28,16;                \
27266 +.mem.offset 8,0; st8.spill [r3]=r29,16;                \
27267 +       ;;                                      \
27268 +.mem.offset 0,0; st8.spill [r2]=r30,16;                \
27269 +.mem.offset 8,0; st8.spill [r3]=r31,32;                \
27270 +       ;;                                      \
27271 +       mov ar.fpsr=r11;        /* M-unit */    \
27272 +       st8 [r2]=r8,8;          /* ar.ccv */    \
27273 +       adds r24=PT(B6)-PT(F7),r3;              \
27274 +       ;;                                      \
27275 +       stf.spill [r2]=f6,32;                   \
27276 +       stf.spill [r3]=f7,32;                   \
27277 +       ;;                                      \
27278 +       stf.spill [r2]=f8,32;                   \
27279 +       stf.spill [r3]=f9,32;                   \
27280 +       ;;                                      \
27281 +       stf.spill [r2]=f10;                     \
27282 +       stf.spill [r3]=f11;                     \
27283 +       adds r25=PT(B7)-PT(F11),r3;             \
27284 +       ;;                                      \
27285 +       st8 [r24]=r18,16;       /* b6 */        \
27286 +       st8 [r25]=r19,16;       /* b7 */        \
27287 +       ;;                                      \
27288 +       st8 [r24]=r9;           /* ar.csd */    \
27289 +       st8 [r25]=r10;          /* ar.ssd */    \
27290 +       ;;
27291 +
27292 +#define SAVE_MIN_WITH_COVER    DO_SAVE_MIN(cover, mov r30=cr.ifs,)
27293 +#define SAVE_MIN_WITH_COVER_R19        DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
27294 +#ifdef CONFIG_XEN
27295 +#define SAVE_MIN               break 0;; /* FIXME: non-cover version only for ia32 support? */
27296 +#else
27297 +#define SAVE_MIN               DO_SAVE_MIN(     , mov r30=r0, )
27298 +#endif
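
A minimal C sketch of the spill pattern the macros above rely on: two base pointers advance by 16 bytes per store, so successive spills land in interleaved slots of pt_regs and can issue in parallel. The struct and names below are invented for illustration.

/* toy model of the two-pointer spill used by DO_SAVE_MIN / SAVE_REST */
#include <stdint.h>
#include <stdio.h>

struct toy_pt_regs {
	uint64_t slot[8];	/* stand-in for the r16..r31 spill area */
};

static void toy_save_rest(struct toy_pt_regs *regs, const uint64_t *src)
{
	uint64_t *even = &regs->slot[0];	/* plays the role of r2 */
	uint64_t *odd  = &regs->slot[1];	/* plays the role of r3 */

	for (int i = 0; i < 8; i += 2) {
		*even = src[i];		/* st8.spill [r2]=...,16 */
		*odd  = src[i + 1];	/* st8.spill [r3]=...,16 */
		even += 2;		/* both pointers advance by 16 bytes */
		odd  += 2;
	}
}

int main(void)
{
	uint64_t regs_in[8] = { 16, 17, 18, 19, 20, 21, 22, 23 };
	struct toy_pt_regs pt;

	toy_save_rest(&pt, regs_in);
	for (int i = 0; i < 8; i++)
		printf("slot[%d] = %llu\n", i, (unsigned long long)pt.slot[i]);
	return 0;
}
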
27299 diff -urNp linux-2.6/arch/ia64/xen/xenpal.S new/arch/ia64/xen/xenpal.S
27300 --- linux-2.6/arch/ia64/xen/xenpal.S    1970-01-01 01:00:00.000000000 +0100
27301 +++ new/arch/ia64/xen/xenpal.S  2006-05-09 12:32:40.000000000 +0200
27302 @@ -0,0 +1,76 @@
27303 +/*
27304 + * ia64/xen/xenpal.S
27305 + *
27306 + * Alternate PAL  routines for Xen.  Heavily leveraged from
27307 + *   ia64/kernel/pal.S
27308 + *
27309 + * Copyright (C) 2005 Hewlett-Packard Co
27310 + *     Dan Magenheimer <dan.magenheimer@hp.com>
27311 + */
27312 +
27313 +#include <asm/asmmacro.h>
27314 +#include <asm/processor.h>
27315 +
27316 +GLOBAL_ENTRY(xen_pal_call_static)
27317 +       .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
27318 +       alloc loc1 = ar.pfs,5,5,0,0
27319 +#ifdef CONFIG_XEN
27320 +       movl r22=running_on_xen;;
27321 +       ld4 r22=[r22];;
27322 +       cmp.eq p7,p0=r22,r0
27323 +(p7)   br.cond.spnt.many __ia64_pal_call_static;;
27324 +#endif
27325 +       movl loc2 = pal_entry_point
27326 +1:     {
27327 +         mov r28 = in0
27328 +         mov r29 = in1
27329 +         mov r8 = ip
27330 +       }
27331 +       ;;
27332 +       ld8 loc2 = [loc2]               // loc2 <- entry point
27333 +       tbit.nz p6,p7 = in4, 0
27334 +       adds r8 = 1f-1b,r8
27335 +       mov loc4=ar.rsc                 // save RSE configuration
27336 +       ;;
27337 +       mov ar.rsc=0                    // put RSE in enforced lazy, LE mode
27338 +       mov loc3 = psr
27339 +       mov loc0 = rp
27340 +       .body
27341 +       mov r30 = in2
27342 +
27343 +#ifdef CONFIG_XEN
27344 +       // this is low priority for paravirtualization, but it is called
27345 +       // from the idle loop, so it confuses privop counting
27346 +       movl r31=XSI_PSR_IC
27347 +       ;;
27348 +(p6)   st4 [r31]=r0
27349 +       ;;
27350 +(p7)   adds r31=XSI_PSR_I_ADDR-XSI_PSR_IC,r31
27351 +(p7)   mov r22=1
27352 +       ;;
27353 +(p7)   ld8 r31=[r31]
27354 +       ;;
27355 +(p7)   st1 [r31]=r22
27356 +       ;;
27357 +       mov r31 = in3
27358 +       mov b7 = loc2
27359 +       ;;
27360 +#else
27361 +(p6)   rsm psr.i | psr.ic
27362 +       mov r31 = in3
27363 +       mov b7 = loc2
27364 +
27365 +(p7)   rsm psr.i
27366 +       ;;
27367 +(p6)   srlz.i
27368 +#endif
27369 +       mov rp = r8
27370 +       br.cond.sptk.many b7
27371 +1:     mov psr.l = loc3
27372 +       mov ar.rsc = loc4               // restore RSE configuration
27373 +       mov ar.pfs = loc1
27374 +       mov rp = loc0
27375 +       ;;
27376 +       srlz.d                          // serialize restoration of psr.l
27377 +       br.ret.sptk.many b0
27378 +END(xen_pal_call_static)
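
The same run-time dispatch in C terms: xen_pal_call_static tests the running_on_xen flag and falls through to the native __ia64_pal_call_static when it is clear. A hedged sketch with invented function names:

/* toy model of the running_on_xen dispatch above */
#include <stdio.h>

static int running_on_xen;	/* set during early boot, as in xensetup.S */

static long native_pal_call(int idx) { printf("native PAL call %d\n", idx); return 0; }
static long xen_pal_call(int idx)    { printf("paravirtual PAL call %d\n", idx); return 0; }

static long pal_call_static(int idx)
{
	if (!running_on_xen)		/* cmp.eq p7,p0=r22,r0 ; (p7) br ... */
		return native_pal_call(idx);
	return xen_pal_call(idx);	/* Xen-aware path: masks PSR bits via XSI_* */
}

int main(void)
{
	pal_call_static(6);		/* bare metal */
	running_on_xen = 1;
	pal_call_static(6);		/* under Xen */
	return 0;
}
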
27379 diff -urNp linux-2.6/arch/ia64/xen/xensetup.S new/arch/ia64/xen/xensetup.S
27380 --- linux-2.6/arch/ia64/xen/xensetup.S  1970-01-01 01:00:00.000000000 +0100
27381 +++ new/arch/ia64/xen/xensetup.S        2006-05-23 18:42:17.000000000 +0200
27382 @@ -0,0 +1,24 @@
27383 +/*
27384 + * Support routines for Xen
27385 + *
27386 + * Copyright (C) 2005 Dan Magenheimer <dan.magenheimer@hp.com>
27387 + */
27388 +
27389 +#include <linux/config.h>
27390 +#include <asm/processor.h>
27391 +#include <asm/asmmacro.h>
27392 +
27393 +#define isBP   p3      // are we the Bootstrap Processor?
27394 +
27395 +       .text
27396 +GLOBAL_ENTRY(early_xen_setup)
27397 +       mov r8=ar.rsc           // Initialized in head.S
27398 +(isBP) movl r9=running_on_xen;;
27399 +       extr.u r8=r8,2,2;;      // Extract pl fields
27400 +       cmp.ne p7,p0=r8,r0;;    // p7: running on xen 
27401 +(p7)   mov r8=1                // booleanize.
27402 +(p7)   movl r10=xen_ivt;;
27403 +(isBP) st4 [r9]=r8
27404 +(p7)   mov cr.iva=r10
27405 +       br.ret.sptk.many rp;;
27406 +END(early_xen_setup)
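
A C rendering of the detection logic above, assuming (as the assembly does) that a non-zero privilege-level field in ar.rsc means the kernel was started by Xen; the constants and names are illustrative only:

#include <stdint.h>
#include <stdio.h>

static int running_on_xen;

static void toy_early_xen_setup(uint64_t ar_rsc)
{
	unsigned pl = (ar_rsc >> 2) & 0x3;	/* extr.u r8=r8,2,2 */

	running_on_xen = (pl != 0);		/* cmp.ne + "booleanize" */
	if (running_on_xen) {
		/* the real code also switches cr.iva to xen_ivt here */
	}
}

int main(void)
{
	toy_early_xen_setup(0x0);	/* bare metal: pl == 0 */
	printf("running_on_xen = %d\n", running_on_xen);
	toy_early_xen_setup(0x8);	/* pl == 2, as when started by Xen */
	printf("running_on_xen = %d\n", running_on_xen);
	return 0;
}
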
27407 diff -urNp linux-2.6/arch/um/kernel/physmem.c new/arch/um/kernel/physmem.c
27408 --- linux-2.6/arch/um/kernel/physmem.c  2006-07-03 14:14:29.000000000 +0200
27409 +++ new/arch/um/kernel/physmem.c        2006-05-09 12:33:14.000000000 +0200
27410 @@ -226,7 +226,7 @@ EXPORT_SYMBOL(physmem_forget_descriptor)
27411  EXPORT_SYMBOL(physmem_remove_mapping);
27412  EXPORT_SYMBOL(physmem_subst_mapping);
27413  
27414 -void arch_free_page(struct page *page, int order)
27415 +int arch_free_page(struct page *page, int order)
27416  {
27417         void *virt;
27418         int i;
27419 @@ -235,6 +235,8 @@ void arch_free_page(struct page *page, i
27420                 virt = __va(page_to_phys(page + i));
27421                 physmem_remove_mapping(virt);
27422         }
27423 +
27424 +       return 0;
27425  }
27426  
27427  int is_remapped(void *virt)
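
The new int return value lets an architecture hook claim a page so the generic free path can be skipped; UML simply returns 0 to keep the old behaviour. A hedged sketch of that contract (names invented):

#include <stdio.h>

struct page { int id; };

/* returns non-zero if the arch hook consumed the page */
static int toy_arch_free_page(struct page *page, int order)
{
	(void)page; (void)order;
	return 0;			/* UML: nothing special, fall through */
}

static void toy_free_page(struct page *page, int order)
{
	if (toy_arch_free_page(page, order))
		return;			/* arch hook already dealt with it */
	printf("freeing page %d (order %d) via the generic path\n",
	       page->id, order);
}

int main(void)
{
	struct page p = { .id = 42 };
	toy_free_page(&p, 0);
	return 0;
}
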
27428 diff -urNp linux-2.6/arch/x86_64/ia32/ia32entry-xen.S new/arch/x86_64/ia32/ia32entry-xen.S
27429 --- linux-2.6/arch/x86_64/ia32/ia32entry-xen.S  1970-01-01 01:00:00.000000000 +0100
27430 +++ new/arch/x86_64/ia32/ia32entry-xen.S        2006-05-23 18:37:10.000000000 +0200
27431 @@ -0,0 +1,726 @@
27432 +/*
27433 + * Compatibility mode system call entry point for x86-64. 
27434 + *             
27435 + * Copyright 2000-2002 Andi Kleen, SuSE Labs.
27436 + */             
27437 +
27438 +#include <asm/dwarf2.h>
27439 +#include <asm/calling.h>
27440 +#include <asm/asm-offsets.h>
27441 +#include <asm/current.h>
27442 +#include <asm/errno.h>
27443 +#include <asm/ia32_unistd.h>   
27444 +#include <asm/thread_info.h>   
27445 +#include <asm/segment.h>
27446 +#include <asm/vsyscall32.h>
27447 +#include <linux/linkage.h>
27448 +
27449 +#define __XEN_X86_64 1
27450 +
27451 +#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
27452 +
27453 +       .macro IA32_ARG_FIXUP noebp=0
27454 +       movl    %edi,%r8d
27455 +       .if \noebp
27456 +       .else
27457 +       movl    %ebp,%r9d
27458 +       .endif
27459 +       xchg    %ecx,%esi
27460 +       movl    %ebx,%edi
27461 +       movl    %edx,%edx       /* zero extension */
27462 +       .endm 
27463 +
27464 +       /* clobbers %eax */     
27465 +       .macro  CLEAR_RREGS
27466 +       xorl    %eax,%eax
27467 +       movq    %rax,R11(%rsp)
27468 +       movq    %rax,R10(%rsp)
27469 +       movq    %rax,R9(%rsp)
27470 +       movq    %rax,R8(%rsp)
27471 +       .endm
27472 +
27473 +#if defined (__XEN_X86_64)
27474 +#include "../kernel/xen_entry.S"
27475 +               
27476 +#define        __swapgs
27477 +#define __cli
27478 +#define __sti  
27479 +#else
27480 +/*
27481 + * Use the native instructions
27482 + */    
27483 +#define        __swapgs        swapgs
27484 +#define __cli          cli
27485 +#define __sti          sti     
27486 +#endif                 
27487 +
27488 +       .macro CFI_STARTPROC32 simple
27489 +       CFI_STARTPROC   \simple
27490 +       CFI_UNDEFINED   r8
27491 +       CFI_UNDEFINED   r9
27492 +       CFI_UNDEFINED   r10
27493 +       CFI_UNDEFINED   r11
27494 +       CFI_UNDEFINED   r12
27495 +       CFI_UNDEFINED   r13
27496 +       CFI_UNDEFINED   r14
27497 +       CFI_UNDEFINED   r15
27498 +       .endm
27499 +
27500 +/*
27501 + * 32bit SYSENTER instruction entry.
27502 + *
27503 + * Arguments:
27504 + * %eax        System call number.
27505 + * %ebx Arg1
27506 + * %ecx Arg2
27507 + * %edx Arg3
27508 + * %esi Arg4
27509 + * %edi Arg5
27510 + * %ebp user stack
27511 + * 0(%ebp) Arg6        
27512 + *     
27513 + * Interrupts off.
27514 + *     
27515 + * This is purely a fast path. For anything complicated we use the int 0x80
27516 + * path below. Set up a complete hardware stack frame to share code
27517 + * with the int 0x80 path.
27518 + */    
27519 +ENTRY(ia32_sysenter_target)
27520 +       CFI_STARTPROC32 simple
27521 +       CFI_DEF_CFA     rsp,0
27522 +       CFI_REGISTER    rsp,rbp
27523 +       __swapgs 
27524 +       movq    %gs:pda_kernelstack, %rsp
27525 +       addq    $(PDA_STACKOFFSET),%rsp
27526 +       XEN_UNBLOCK_EVENTS(%r11)        
27527 +       __sti
27528 +       movl    %ebp,%ebp               /* zero extension */
27529 +       pushq   $__USER32_DS
27530 +       CFI_ADJUST_CFA_OFFSET 8
27531 +       /*CFI_REL_OFFSET ss,0*/
27532 +       pushq   %rbp
27533 +       CFI_ADJUST_CFA_OFFSET 8
27534 +       CFI_REL_OFFSET rsp,0
27535 +       pushfq
27536 +       CFI_ADJUST_CFA_OFFSET 8
27537 +       /*CFI_REL_OFFSET rflags,0*/
27538 +       movl    $VSYSCALL32_SYSEXIT, %r10d
27539 +       CFI_REGISTER rip,r10
27540 +       pushq   $__USER32_CS
27541 +       CFI_ADJUST_CFA_OFFSET 8
27542 +       /*CFI_REL_OFFSET cs,0*/
27543 +       movl    %eax, %eax
27544 +       pushq   %r10
27545 +       CFI_ADJUST_CFA_OFFSET 8
27546 +       CFI_REL_OFFSET rip,0
27547 +       pushq   %rax
27548 +       CFI_ADJUST_CFA_OFFSET 8
27549 +       cld
27550 +       SAVE_ARGS 0,0,1
27551 +       /* no need to do an access_ok check here because rbp has been
27552 +          32bit zero extended */ 
27553 +1:     movl    (%rbp),%r9d
27554 +       .section __ex_table,"a"
27555 +       .quad 1b,ia32_badarg
27556 +       .previous       
27557 +       GET_THREAD_INFO(%r10)
27558 +       orl    $TS_COMPAT,threadinfo_status(%r10)
27559 +       testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
27560 +       CFI_REMEMBER_STATE
27561 +       jnz  sysenter_tracesys
27562 +sysenter_do_call:      
27563 +       cmpl    $(IA32_NR_syscalls-1),%eax
27564 +       ja      ia32_badsys
27565 +       IA32_ARG_FIXUP 1
27566 +       call    *ia32_sys_call_table(,%rax,8)
27567 +       movq    %rax,RAX-ARGOFFSET(%rsp)
27568 +       GET_THREAD_INFO(%r10)
27569 +       XEN_BLOCK_EVENTS(%r11)  
27570 +       __cli
27571 +       testl   $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
27572 +       jnz     int_ret_from_sys_call
27573 +       andl    $~TS_COMPAT,threadinfo_status(%r10)
27574 +       /* clear IF so that popfq doesn't enable interrupts early */
27575 +       andl  $~0x200,EFLAGS-R11(%rsp) 
27576 +       RESTORE_ARGS 1,24,1,1,1,1
27577 +       popfq
27578 +       CFI_ADJUST_CFA_OFFSET -8
27579 +       /*CFI_RESTORE rflags*/
27580 +       popq    %rcx                            /* User %esp */
27581 +       CFI_ADJUST_CFA_OFFSET -8
27582 +       CFI_REGISTER rsp,rcx
27583 +       movl    $VSYSCALL32_SYSEXIT,%edx        /* User %eip */
27584 +       CFI_REGISTER rip,rdx
27585 +       __swapgs
27586 +       XEN_UNBLOCK_EVENTS(%r11)                
27587 +       __sti           /* sti only takes effect after the next instruction */
27588 +       /* sysexit */
27589 +       .byte   0xf, 0x35  /* TBD */
27590 +
27591 +sysenter_tracesys:
27592 +       CFI_RESTORE_STATE
27593 +       SAVE_REST
27594 +       CLEAR_RREGS
27595 +       movq    $-ENOSYS,RAX(%rsp)      /* really needed? */
27596 +       movq    %rsp,%rdi        /* &pt_regs -> arg1 */
27597 +       call    syscall_trace_enter
27598 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
27599 +       RESTORE_REST
27600 +       movl    %ebp, %ebp
27601 +       /* no need to do an access_ok check here because rbp has been
27602 +          32bit zero extended */ 
27603 +1:     movl    (%rbp),%r9d
27604 +       .section __ex_table,"a"
27605 +       .quad 1b,ia32_badarg
27606 +       .previous
27607 +       jmp     sysenter_do_call
27608 +       CFI_ENDPROC
27609 +
27610 +/*
27611 + * 32bit SYSCALL instruction entry.
27612 + *
27613 + * Arguments:
27614 + * %eax        System call number.
27615 + * %ebx Arg1
27616 + * %ecx return EIP 
27617 + * %edx Arg3
27618 + * %esi Arg4
27619 + * %edi Arg5
27620 + * %ebp Arg2    [note: not saved in the stack frame, should not be touched]
27621 + * %esp user stack 
27622 + * 0(%esp) Arg6
27623 + *     
27624 + * Interrupts off.
27625 + *     
27626 + * This is purely a fast path. For anything complicated we use the int 0x80
27627 + * path below. Set up a complete hardware stack frame to share code
27628 + * with the int 0x80 path.     
27629 + */    
27630 +ENTRY(ia32_cstar_target)
27631 +       CFI_STARTPROC32 simple
27632 +       CFI_DEF_CFA     rsp,0
27633 +       CFI_REGISTER    rip,rcx
27634 +       /*CFI_REGISTER  rflags,r11*/
27635 +       __swapgs
27636 +       movl    %esp,%r8d
27637 +       CFI_REGISTER    rsp,r8
27638 +       movq    %gs:pda_kernelstack,%rsp
27639 +       XEN_UNBLOCK_EVENTS(%r11)        
27640 +       __sti
27641 +       SAVE_ARGS 8,1,1
27642 +       movl    %eax,%eax       /* zero extension */
27643 +       movq    %rax,ORIG_RAX-ARGOFFSET(%rsp)
27644 +       movq    %rcx,RIP-ARGOFFSET(%rsp)
27645 +       CFI_REL_OFFSET rip,RIP-ARGOFFSET
27646 +       movq    %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
27647 +       movl    %ebp,%ecx
27648 +       movq    $__USER32_CS,CS-ARGOFFSET(%rsp)
27649 +       movq    $__USER32_DS,SS-ARGOFFSET(%rsp)
27650 +       movq    %r11,EFLAGS-ARGOFFSET(%rsp)
27651 +       /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
27652 +       movq    %r8,RSP-ARGOFFSET(%rsp) 
27653 +       CFI_REL_OFFSET rsp,RSP-ARGOFFSET
27654 +       /* no need to do an access_ok check here because r8 has been
27655 +          32bit zero extended */ 
27656 +       /* hardware stack frame is complete now */      
27657 +1:     movl    (%r8),%r9d
27658 +       .section __ex_table,"a"
27659 +       .quad 1b,ia32_badarg
27660 +       .previous       
27661 +       GET_THREAD_INFO(%r10)
27662 +       orl   $TS_COMPAT,threadinfo_status(%r10)
27663 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
27664 +       CFI_REMEMBER_STATE
27665 +       jnz   cstar_tracesys
27666 +cstar_do_call: 
27667 +       cmpl $IA32_NR_syscalls-1,%eax
27668 +       ja  ia32_badsys
27669 +       IA32_ARG_FIXUP 1
27670 +       call *ia32_sys_call_table(,%rax,8)
27671 +       movq %rax,RAX-ARGOFFSET(%rsp)
27672 +       GET_THREAD_INFO(%r10)
27673 +       XEN_BLOCK_EVENTS(%r11)          
27674 +       __cli
27675 +       testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
27676 +       jnz  int_ret_from_sys_call
27677 +       andl $~TS_COMPAT,threadinfo_status(%r10)
27678 +       RESTORE_ARGS 1,-ARG_SKIP,1,1,1
27679 +       movl RIP-ARGOFFSET(%rsp),%ecx
27680 +       CFI_REGISTER rip,rcx
27681 +       movl EFLAGS-ARGOFFSET(%rsp),%r11d       
27682 +       /*CFI_REGISTER rflags,r11*/
27683 +       movl RSP-ARGOFFSET(%rsp),%esp
27684 +       CFI_RESTORE rsp
27685 +       __swapgs
27686 +       sysretl  /* TBD */
27687 +       
27688 +cstar_tracesys:        
27689 +       CFI_RESTORE_STATE
27690 +       SAVE_REST
27691 +       CLEAR_RREGS
27692 +       movq $-ENOSYS,RAX(%rsp) /* really needed? */
27693 +       movq %rsp,%rdi        /* &pt_regs -> arg1 */
27694 +       call syscall_trace_enter
27695 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
27696 +       RESTORE_REST
27697 +       movl RSP-ARGOFFSET(%rsp), %r8d
27698 +       /* no need to do an access_ok check here because r8 has been
27699 +          32bit zero extended */ 
27700 +1:     movl    (%r8),%r9d
27701 +       .section __ex_table,"a"
27702 +       .quad 1b,ia32_badarg
27703 +       .previous
27704 +       jmp cstar_do_call
27705 +                               
27706 +ia32_badarg:
27707 +       movq $-EFAULT,%rax
27708 +       jmp ia32_sysret
27709 +       CFI_ENDPROC
27710 +
27711 +/* 
27712 + * Emulated IA32 system calls via int 0x80. 
27713 + *
27714 + * Arguments:   
27715 + * %eax        System call number.
27716 + * %ebx Arg1
27717 + * %ecx Arg2
27718 + * %edx Arg3
27719 + * %esi Arg4
27720 + * %edi Arg5
27721 + * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
27722 + *
27723 + * Notes:
27724 + * Uses the same stack frame as the x86-64 version.    
27725 + * All registers except %eax must be saved (but ptrace may violate that)
27726 + * Arguments are zero extended. For system calls that want sign extension and
27727 + * take long arguments a wrapper is needed. Most calls can just be called
27728 + * directly.
27729 + * Assumes it is only called from user space and entered with interrupts off.  
27730 + */                            
27731 +
27732 +ENTRY(ia32_syscall)
27733 +       CFI_STARTPROC   simple
27734 +       CFI_DEF_CFA     rsp,SS+8-RIP
27735 +       /*CFI_REL_OFFSET        ss,SS-RIP*/
27736 +       CFI_REL_OFFSET  rsp,RSP-RIP
27737 +       /*CFI_REL_OFFSET        rflags,EFLAGS-RIP*/
27738 +       /*CFI_REL_OFFSET        cs,CS-RIP*/
27739 +       CFI_REL_OFFSET  rip,RIP-RIP
27740 +       __swapgs
27741 +       XEN_UNBLOCK_EVENTS(%r11)
27742 +       __sti
27743 +       movq (%rsp),%rcx
27744 +       movq 8(%rsp),%r11
27745 +        addq $0x10,%rsp /* skip rcx and r11 */
27746 +       movl %eax,%eax
27747 +       pushq %rax
27748 +       CFI_ADJUST_CFA_OFFSET 8
27749 +       cld
27750 +/* 1:  jmp 1b   */
27751 +       /* note the registers are not zero extended to the stack frame.
27752 +          this could be a problem. */
27753 +       SAVE_ARGS 0,0,1
27754 +       GET_THREAD_INFO(%r10)
27755 +       orl   $TS_COMPAT,threadinfo_status(%r10)
27756 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
27757 +       jnz ia32_tracesys
27758 +ia32_do_syscall:       
27759 +       cmpl $(IA32_NR_syscalls-1),%eax
27760 +       ja  ia32_badsys
27761 +       IA32_ARG_FIXUP
27762 +       call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
27763 +ia32_sysret:
27764 +       movq %rax,RAX-ARGOFFSET(%rsp)
27765 +       jmp int_ret_from_sys_call 
27766 +
27767 +ia32_tracesys:                  
27768 +       SAVE_REST
27769 +       movq $-ENOSYS,RAX(%rsp) /* really needed? */
27770 +       movq %rsp,%rdi        /* &pt_regs -> arg1 */
27771 +       call syscall_trace_enter
27772 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
27773 +       RESTORE_REST
27774 +       jmp ia32_do_syscall
27775 +
27776 +ia32_badsys:
27777 +       movq $0,ORIG_RAX-ARGOFFSET(%rsp)
27778 +       movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
27779 +       jmp int_ret_from_sys_call
27780 +
27781 +ni_syscall:
27782 +       movq %rax,%rdi
27783 +       jmp  sys32_ni_syscall                   
27784 +
27785 +quiet_ni_syscall:
27786 +       movq $-ENOSYS,%rax
27787 +       ret
27788 +       CFI_ENDPROC
27789 +       
27790 +       .macro PTREGSCALL label, func, arg
27791 +       .globl \label
27792 +\label:
27793 +       leaq \func(%rip),%rax
27794 +       leaq -ARGOFFSET+8(%rsp),\arg    /* 8 for return address */
27795 +       jmp  ia32_ptregs_common 
27796 +       .endm
27797 +
27798 +       CFI_STARTPROC32
27799 +
27800 +       PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
27801 +       PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
27802 +       PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
27803 +       PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
27804 +       PTREGSCALL stub32_execve, sys32_execve, %rcx
27805 +       PTREGSCALL stub32_fork, sys_fork, %rdi
27806 +       PTREGSCALL stub32_clone, sys32_clone, %rdx
27807 +       PTREGSCALL stub32_vfork, sys_vfork, %rdi
27808 +       PTREGSCALL stub32_iopl, sys_iopl, %rsi
27809 +       PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
27810 +
27811 +ENTRY(ia32_ptregs_common)
27812 +       popq %r11
27813 +       CFI_ENDPROC
27814 +       CFI_STARTPROC32 simple
27815 +       CFI_DEF_CFA     rsp,SS+8-ARGOFFSET
27816 +       CFI_REL_OFFSET  rax,RAX-ARGOFFSET
27817 +       CFI_REL_OFFSET  rcx,RCX-ARGOFFSET
27818 +       CFI_REL_OFFSET  rdx,RDX-ARGOFFSET
27819 +       CFI_REL_OFFSET  rsi,RSI-ARGOFFSET
27820 +       CFI_REL_OFFSET  rdi,RDI-ARGOFFSET
27821 +       CFI_REL_OFFSET  rip,RIP-ARGOFFSET
27822 +/*     CFI_REL_OFFSET  cs,CS-ARGOFFSET*/
27823 +/*     CFI_REL_OFFSET  rflags,EFLAGS-ARGOFFSET*/
27824 +       CFI_REL_OFFSET  rsp,RSP-ARGOFFSET
27825 +/*     CFI_REL_OFFSET  ss,SS-ARGOFFSET*/
27826 +       SAVE_REST
27827 +       call *%rax
27828 +       RESTORE_REST
27829 +       jmp  ia32_sysret        /* misbalances the return cache */
27830 +       CFI_ENDPROC
27831 +
27832 +       .section .rodata,"a"
27833 +       .align 8
27834 +       .globl ia32_sys_call_table
27835 +ia32_sys_call_table:
27836 +       .quad sys_restart_syscall
27837 +       .quad sys_exit
27838 +       .quad stub32_fork
27839 +       .quad sys_read
27840 +       .quad sys_write
27841 +       .quad compat_sys_open           /* 5 */
27842 +       .quad sys_close
27843 +       .quad sys32_waitpid
27844 +       .quad sys_creat
27845 +       .quad sys_link
27846 +       .quad sys_unlink                /* 10 */
27847 +       .quad stub32_execve
27848 +       .quad sys_chdir
27849 +       .quad compat_sys_time
27850 +       .quad sys_mknod
27851 +       .quad sys_chmod         /* 15 */
27852 +       .quad sys_lchown16
27853 +       .quad quiet_ni_syscall                  /* old break syscall holder */
27854 +       .quad sys_stat
27855 +       .quad sys32_lseek
27856 +       .quad sys_getpid                /* 20 */
27857 +       .quad compat_sys_mount  /* mount  */
27858 +       .quad sys_oldumount     /* old_umount  */
27859 +       .quad sys_setuid16
27860 +       .quad sys_getuid16
27861 +       .quad compat_sys_stime  /* stime */             /* 25 */
27862 +       .quad sys32_ptrace      /* ptrace */
27863 +       .quad sys_alarm
27864 +       .quad sys_fstat /* (old)fstat */
27865 +       .quad sys_pause
27866 +       .quad compat_sys_utime  /* 30 */
27867 +       .quad quiet_ni_syscall  /* old stty syscall holder */
27868 +       .quad quiet_ni_syscall  /* old gtty syscall holder */
27869 +       .quad sys_access
27870 +       .quad sys_nice  
27871 +       .quad quiet_ni_syscall  /* 35 */        /* old ftime syscall holder */
27872 +       .quad sys_sync
27873 +       .quad sys32_kill
27874 +       .quad sys_rename
27875 +       .quad sys_mkdir
27876 +       .quad sys_rmdir         /* 40 */
27877 +       .quad sys_dup
27878 +       .quad sys32_pipe
27879 +       .quad compat_sys_times
27880 +       .quad quiet_ni_syscall                  /* old prof syscall holder */
27881 +       .quad sys_brk           /* 45 */
27882 +       .quad sys_setgid16
27883 +       .quad sys_getgid16
27884 +       .quad sys_signal
27885 +       .quad sys_geteuid16
27886 +       .quad sys_getegid16     /* 50 */
27887 +       .quad sys_acct
27888 +       .quad sys_umount                        /* new_umount */
27889 +       .quad quiet_ni_syscall                  /* old lock syscall holder */
27890 +       .quad compat_sys_ioctl
27891 +       .quad compat_sys_fcntl64                /* 55 */
27892 +       .quad quiet_ni_syscall                  /* old mpx syscall holder */
27893 +       .quad sys_setpgid
27894 +       .quad quiet_ni_syscall                  /* old ulimit syscall holder */
27895 +       .quad sys32_olduname
27896 +       .quad sys_umask         /* 60 */
27897 +       .quad sys_chroot
27898 +       .quad sys32_ustat
27899 +       .quad sys_dup2
27900 +       .quad sys_getppid
27901 +       .quad sys_getpgrp               /* 65 */
27902 +       .quad sys_setsid
27903 +       .quad sys32_sigaction
27904 +       .quad sys_sgetmask
27905 +       .quad sys_ssetmask
27906 +       .quad sys_setreuid16    /* 70 */
27907 +       .quad sys_setregid16
27908 +       .quad stub32_sigsuspend
27909 +       .quad compat_sys_sigpending
27910 +       .quad sys_sethostname
27911 +       .quad compat_sys_setrlimit      /* 75 */
27912 +       .quad compat_sys_old_getrlimit  /* old_getrlimit */
27913 +       .quad compat_sys_getrusage
27914 +       .quad sys32_gettimeofday
27915 +       .quad sys32_settimeofday
27916 +       .quad sys_getgroups16   /* 80 */
27917 +       .quad sys_setgroups16
27918 +       .quad sys32_old_select
27919 +       .quad sys_symlink
27920 +       .quad sys_lstat
27921 +       .quad sys_readlink              /* 85 */
27922 +#ifdef CONFIG_IA32_AOUT
27923 +       .quad sys_uselib
27924 +#else
27925 +       .quad quiet_ni_syscall
27926 +#endif
27927 +       .quad sys_swapon
27928 +       .quad sys_reboot
27929 +       .quad compat_sys_old_readdir
27930 +       .quad sys32_mmap                /* 90 */
27931 +       .quad sys_munmap
27932 +       .quad sys_truncate
27933 +       .quad sys_ftruncate
27934 +       .quad sys_fchmod
27935 +       .quad sys_fchown16              /* 95 */
27936 +       .quad sys_getpriority
27937 +       .quad sys_setpriority
27938 +       .quad quiet_ni_syscall                  /* old profil syscall holder */
27939 +       .quad compat_sys_statfs
27940 +       .quad compat_sys_fstatfs                /* 100 */
27941 +       .quad sys_ioperm
27942 +       .quad compat_sys_socketcall
27943 +       .quad sys_syslog
27944 +       .quad compat_sys_setitimer
27945 +       .quad compat_sys_getitimer      /* 105 */
27946 +       .quad compat_sys_newstat
27947 +       .quad compat_sys_newlstat
27948 +       .quad compat_sys_newfstat
27949 +       .quad sys32_uname
27950 +       .quad stub32_iopl               /* 110 */
27951 +       .quad sys_vhangup
27952 +       .quad quiet_ni_syscall  /* old "idle" system call */
27953 +       .quad sys32_vm86_warning        /* vm86old */ 
27954 +       .quad compat_sys_wait4
27955 +       .quad sys_swapoff               /* 115 */
27956 +       .quad sys32_sysinfo
27957 +       .quad sys32_ipc
27958 +       .quad sys_fsync
27959 +       .quad stub32_sigreturn
27960 +       .quad stub32_clone              /* 120 */
27961 +       .quad sys_setdomainname
27962 +       .quad sys_uname
27963 +       .quad sys_modify_ldt
27964 +       .quad compat_sys_adjtimex
27965 +       .quad sys32_mprotect            /* 125 */
27966 +       .quad compat_sys_sigprocmask
27967 +       .quad quiet_ni_syscall          /* create_module */
27968 +       .quad sys_init_module
27969 +       .quad sys_delete_module
27970 +       .quad quiet_ni_syscall          /* 130  get_kernel_syms */
27971 +       .quad sys_quotactl
27972 +       .quad sys_getpgid
27973 +       .quad sys_fchdir
27974 +       .quad quiet_ni_syscall  /* bdflush */
27975 +       .quad sys_sysfs         /* 135 */
27976 +       .quad sys_personality
27977 +       .quad quiet_ni_syscall  /* for afs_syscall */
27978 +       .quad sys_setfsuid16
27979 +       .quad sys_setfsgid16
27980 +       .quad sys_llseek                /* 140 */
27981 +       .quad compat_sys_getdents
27982 +       .quad compat_sys_select
27983 +       .quad sys_flock
27984 +       .quad sys_msync
27985 +       .quad compat_sys_readv          /* 145 */
27986 +       .quad compat_sys_writev
27987 +       .quad sys_getsid
27988 +       .quad sys_fdatasync
27989 +       .quad sys32_sysctl      /* sysctl */
27990 +       .quad sys_mlock         /* 150 */
27991 +       .quad sys_munlock
27992 +       .quad sys_mlockall
27993 +       .quad sys_munlockall
27994 +       .quad sys_sched_setparam
27995 +       .quad sys_sched_getparam   /* 155 */
27996 +       .quad sys_sched_setscheduler
27997 +       .quad sys_sched_getscheduler
27998 +       .quad sys_sched_yield
27999 +       .quad sys_sched_get_priority_max
28000 +       .quad sys_sched_get_priority_min  /* 160 */
28001 +       .quad sys_sched_rr_get_interval
28002 +       .quad compat_sys_nanosleep
28003 +       .quad sys_mremap
28004 +       .quad sys_setresuid16
28005 +       .quad sys_getresuid16   /* 165 */
28006 +       .quad sys32_vm86_warning        /* vm86 */ 
28007 +       .quad quiet_ni_syscall  /* query_module */
28008 +       .quad sys_poll
28009 +       .quad compat_sys_nfsservctl
28010 +       .quad sys_setresgid16   /* 170 */
28011 +       .quad sys_getresgid16
28012 +       .quad sys_prctl
28013 +       .quad stub32_rt_sigreturn
28014 +       .quad sys32_rt_sigaction
28015 +       .quad sys32_rt_sigprocmask      /* 175 */
28016 +       .quad sys32_rt_sigpending
28017 +       .quad compat_sys_rt_sigtimedwait
28018 +       .quad sys32_rt_sigqueueinfo
28019 +       .quad stub32_rt_sigsuspend
28020 +       .quad sys32_pread               /* 180 */
28021 +       .quad sys32_pwrite
28022 +       .quad sys_chown16
28023 +       .quad sys_getcwd
28024 +       .quad sys_capget
28025 +       .quad sys_capset
28026 +       .quad stub32_sigaltstack
28027 +       .quad sys32_sendfile
28028 +       .quad quiet_ni_syscall          /* streams1 */
28029 +       .quad quiet_ni_syscall          /* streams2 */
28030 +       .quad stub32_vfork            /* 190 */
28031 +       .quad compat_sys_getrlimit
28032 +       .quad sys32_mmap2
28033 +       .quad sys32_truncate64
28034 +       .quad sys32_ftruncate64
28035 +       .quad sys32_stat64              /* 195 */
28036 +       .quad sys32_lstat64
28037 +       .quad sys32_fstat64
28038 +       .quad sys_lchown
28039 +       .quad sys_getuid
28040 +       .quad sys_getgid                /* 200 */
28041 +       .quad sys_geteuid
28042 +       .quad sys_getegid
28043 +       .quad sys_setreuid
28044 +       .quad sys_setregid
28045 +       .quad sys_getgroups     /* 205 */
28046 +       .quad sys_setgroups
28047 +       .quad sys_fchown
28048 +       .quad sys_setresuid
28049 +       .quad sys_getresuid
28050 +       .quad sys_setresgid     /* 210 */
28051 +       .quad sys_getresgid
28052 +       .quad sys_chown
28053 +       .quad sys_setuid
28054 +       .quad sys_setgid
28055 +       .quad sys_setfsuid              /* 215 */
28056 +       .quad sys_setfsgid
28057 +       .quad sys_pivot_root
28058 +       .quad sys_mincore
28059 +       .quad sys_madvise
28060 +       .quad compat_sys_getdents64     /* 220 getdents64 */
28061 +       .quad compat_sys_fcntl64        
28062 +       .quad quiet_ni_syscall          /* tux */
28063 +       .quad quiet_ni_syscall          /* security */
28064 +       .quad sys_gettid        
28065 +       .quad sys_readahead     /* 225 */
28066 +       .quad sys_setxattr
28067 +       .quad sys_lsetxattr
28068 +       .quad sys_fsetxattr
28069 +       .quad sys_getxattr
28070 +       .quad sys_lgetxattr     /* 230 */
28071 +       .quad sys_fgetxattr
28072 +       .quad sys_listxattr
28073 +       .quad sys_llistxattr
28074 +       .quad sys_flistxattr
28075 +       .quad sys_removexattr   /* 235 */
28076 +       .quad sys_lremovexattr
28077 +       .quad sys_fremovexattr
28078 +       .quad sys_tkill
28079 +       .quad sys_sendfile64 
28080 +       .quad compat_sys_futex          /* 240 */
28081 +       .quad compat_sys_sched_setaffinity
28082 +       .quad compat_sys_sched_getaffinity
28083 +       .quad sys32_set_thread_area
28084 +       .quad sys32_get_thread_area
28085 +       .quad compat_sys_io_setup       /* 245 */
28086 +       .quad sys_io_destroy
28087 +       .quad compat_sys_io_getevents
28088 +       .quad compat_sys_io_submit
28089 +       .quad sys_io_cancel
28090 +       .quad sys_fadvise64             /* 250 */
28091 +       .quad quiet_ni_syscall  /* free_huge_pages */
28092 +       .quad sys_exit_group
28093 +       .quad sys32_lookup_dcookie
28094 +       .quad sys_epoll_create
28095 +       .quad sys_epoll_ctl             /* 255 */
28096 +       .quad sys_epoll_wait
28097 +       .quad sys_remap_file_pages
28098 +       .quad sys_set_tid_address
28099 +       .quad compat_sys_timer_create
28100 +       .quad compat_sys_timer_settime  /* 260 */
28101 +       .quad compat_sys_timer_gettime
28102 +       .quad sys_timer_getoverrun
28103 +       .quad sys_timer_delete
28104 +       .quad compat_sys_clock_settime
28105 +       .quad compat_sys_clock_gettime  /* 265 */
28106 +       .quad compat_sys_clock_getres
28107 +       .quad compat_sys_clock_nanosleep
28108 +       .quad compat_sys_statfs64
28109 +       .quad compat_sys_fstatfs64
28110 +       .quad sys_tgkill                /* 270 */
28111 +       .quad compat_sys_utimes
28112 +       .quad sys32_fadvise64_64
28113 +       .quad quiet_ni_syscall  /* sys_vserver */
28114 +       .quad sys_mbind
28115 +       .quad compat_sys_get_mempolicy  /* 275 */
28116 +       .quad sys_set_mempolicy
28117 +       .quad compat_sys_mq_open
28118 +       .quad sys_mq_unlink
28119 +       .quad compat_sys_mq_timedsend
28120 +       .quad compat_sys_mq_timedreceive        /* 280 */
28121 +       .quad compat_sys_mq_notify
28122 +       .quad compat_sys_mq_getsetattr
28123 +       .quad compat_sys_kexec_load     /* reserved for kexec */
28124 +       .quad compat_sys_waitid
28125 +       .quad quiet_ni_syscall          /* 285: sys_altroot */
28126 +       .quad sys_add_key
28127 +       .quad sys_request_key
28128 +       .quad sys_keyctl
28129 +       .quad sys_ioprio_set
28130 +       .quad sys_ioprio_get            /* 290 */
28131 +       .quad sys_inotify_init
28132 +       .quad sys_inotify_add_watch
28133 +       .quad sys_inotify_rm_watch
28134 +       .quad sys_migrate_pages
28135 +       .quad compat_sys_openat         /* 295 */
28136 +       .quad sys_mkdirat
28137 +       .quad sys_mknodat
28138 +       .quad sys_fchownat
28139 +       .quad compat_sys_futimesat
28140 +       .quad sys32_fstatat             /* 300 */
28141 +       .quad sys_unlinkat
28142 +       .quad sys_renameat
28143 +       .quad sys_linkat
28144 +       .quad sys_symlinkat
28145 +       .quad sys_readlinkat            /* 305 */
28146 +       .quad sys_fchmodat
28147 +       .quad sys_faccessat
28148 +       .quad quiet_ni_syscall          /* pselect6 for now */
28149 +       .quad quiet_ni_syscall          /* ppoll for now */
28150 +       .quad sys_unshare               /* 310 */
28151 +       .quad compat_sys_set_robust_list
28152 +       .quad compat_sys_get_robust_list
28153 +       .quad sys_splice
28154 +       .quad sys_sync_file_range
28155 +       .quad sys_tee
28156 +       .quad compat_sys_vmsplice
28157 +ia32_syscall_end:              
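
All three entry stubs above funnel into the same dispatch: bound-check %eax against IA32_NR_syscalls (the table span divided by 8 bytes per entry) and call indirectly through ia32_sys_call_table, with 32-bit arguments zero-extended by IA32_ARG_FIXUP. A toy C model of that dispatch, with invented handlers:

#include <stdio.h>

typedef long (*syscall_fn)(unsigned int a1, unsigned int a2);

static long toy_sys_exit(unsigned int code, unsigned int unused)
{ (void)unused; printf("exit(%u)\n", code); return 0; }

static long toy_sys_ni_syscall(unsigned int a1, unsigned int a2)
{ (void)a1; (void)a2; return -38; /* -ENOSYS, like quiet_ni_syscall */ }

static const syscall_fn toy_table[] = {
	toy_sys_ni_syscall,	/* 0: restart_syscall stand-in */
	toy_sys_exit,		/* 1 */
};
#define TOY_NR_SYSCALLS (sizeof(toy_table) / sizeof(toy_table[0]))

static long toy_ia32_syscall(unsigned int nr, unsigned int a1, unsigned int a2)
{
	if (nr > TOY_NR_SYSCALLS - 1)	/* cmpl $(IA32_NR_syscalls-1),%eax ; ja */
		return -38;		/* ia32_badsys: -ENOSYS */
	/* 32-bit arguments arrive zero-extended, as IA32_ARG_FIXUP arranges */
	return toy_table[nr](a1, a2);
}

int main(void)
{
	toy_ia32_syscall(1, 0, 0);		/* dispatches to toy_sys_exit */
	printf("bad nr -> %ld\n", toy_ia32_syscall(999, 0, 0));
	return 0;
}
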
28158 diff -urNp linux-2.6/arch/x86_64/ia32/Makefile new/arch/x86_64/ia32/Makefile
28159 --- linux-2.6/arch/x86_64/ia32/Makefile 2006-07-03 14:14:30.000000000 +0200
28160 +++ new/arch/x86_64/ia32/Makefile       2006-05-23 18:37:10.000000000 +0200
28161 @@ -23,9 +23,25 @@ quiet_cmd_syscall = SYSCALL $@
28162                            -Wl,-soname=linux-gate.so.1 -o $@ \
28163                            -Wl,-T,$(filter-out FORCE,$^)
28164  
28165 +$(obj)/vsyscall-int80.so \
28166  $(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
28167  $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
28168         $(call if_changed,syscall)
28169  
28170 -AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
28171 -AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
28172 +AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 -Iarch/i386/kernel
28173 +AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 -Iarch/i386/kernel
28174 +
28175 +ifdef CONFIG_XEN
28176 +AFLAGS_vsyscall-int80.o = -m32 -Wa,-32 -Iarch/i386/kernel
28177 +CFLAGS_syscall32-xen.o += -DUSE_INT80
28178 +AFLAGS_syscall32_syscall-xen.o += -DUSE_INT80
28179 +
28180 +$(obj)/syscall32_syscall-xen.o: \
28181 +       $(foreach F,int80 sysenter syscall,$(obj)/vsyscall-$F.so)
28182 +
28183 +targets := $(foreach F,int80 sysenter syscall,vsyscall-$F.o vsyscall-$F.so)
28184 +
28185 +include $(srctree)/scripts/Makefile.xen
28186 +
28187 +obj-y := $(call cherrypickxen, $(obj-y))
28188 +endif
28189 diff -urNp linux-2.6/arch/x86_64/ia32/syscall32_syscall-xen.S new/arch/x86_64/ia32/syscall32_syscall-xen.S
28190 --- linux-2.6/arch/x86_64/ia32/syscall32_syscall-xen.S  1970-01-01 01:00:00.000000000 +0100
28191 +++ new/arch/x86_64/ia32/syscall32_syscall-xen.S        2006-05-09 12:33:17.000000000 +0200
28192 @@ -0,0 +1,28 @@
28193 +/* 32bit VDSOs mapped into user space. */
28194 +
28195 +       .section ".init.data","aw"
28196 +
28197 +#ifdef USE_INT80
28198 +
28199 +       .globl syscall32_int80
28200 +       .globl syscall32_int80_end
28201 +
28202 +syscall32_int80:
28203 +       .incbin "arch/x86_64/ia32/vsyscall-int80.so"
28204 +syscall32_int80_end:
28205 +
28206 +#endif
28207 +
28208 +       .globl syscall32_syscall
28209 +       .globl syscall32_syscall_end
28210 +
28211 +syscall32_syscall:
28212 +       .incbin "arch/x86_64/ia32/vsyscall-syscall.so"
28213 +syscall32_syscall_end:
28214 +
28215 +       .globl syscall32_sysenter
28216 +       .globl syscall32_sysenter_end
28217 +
28218 +syscall32_sysenter:
28219 +       .incbin "arch/x86_64/ia32/vsyscall-sysenter.so"
28220 +syscall32_sysenter_end:
28221 diff -urNp linux-2.6/arch/x86_64/ia32/syscall32-xen.c new/arch/x86_64/ia32/syscall32-xen.c
28222 --- linux-2.6/arch/x86_64/ia32/syscall32-xen.c  1970-01-01 01:00:00.000000000 +0100
28223 +++ new/arch/x86_64/ia32/syscall32-xen.c        2006-05-09 12:33:17.000000000 +0200
28224 @@ -0,0 +1,128 @@
28225 +/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
28226 +
28227 +/* vsyscall handling for 32bit processes. Map a stub page into it 
28228 +   on demand because 32bit cannot reach the kernel's fixmaps */
28229 +
28230 +#include <linux/mm.h>
28231 +#include <linux/string.h>
28232 +#include <linux/kernel.h>
28233 +#include <linux/gfp.h>
28234 +#include <linux/init.h>
28235 +#include <linux/stringify.h>
28236 +#include <linux/security.h>
28237 +#include <asm/proto.h>
28238 +#include <asm/tlbflush.h>
28239 +#include <asm/ia32_unistd.h>
28240 +
28241 +#ifdef USE_INT80
28242 +extern unsigned char syscall32_int80[], syscall32_int80_end[];
28243 +#endif
28244 +extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
28245 +extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
28246 +extern int sysctl_vsyscall32;
28247 +
28248 +char *syscall32_page; 
28249 +#ifndef USE_INT80
28250 +static int use_sysenter = -1;
28251 +#endif
28252 +
28253 +static struct page *
28254 +syscall32_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
28255 +{
28256 +       struct page *p = virt_to_page(adr - vma->vm_start + syscall32_page);
28257 +       get_page(p);
28258 +       return p;
28259 +}
28260 +
28261 +/* Prevent VMA merging */
28262 +static void syscall32_vma_close(struct vm_area_struct *vma)
28263 +{
28264 +}
28265 +
28266 +static struct vm_operations_struct syscall32_vm_ops = {
28267 +       .close = syscall32_vma_close,
28268 +       .nopage = syscall32_nopage,
28269 +};
28270 +
28271 +struct linux_binprm;
28272 +
28273 +/* Setup a VMA at program startup for the vsyscall page */
28274 +int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
28275 +{
28276 +       int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT;
28277 +       struct vm_area_struct *vma;
28278 +       struct mm_struct *mm = current->mm;
28279 +       int ret;
28280 +
28281 +       vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
28282 +       if (!vma)
28283 +               return -ENOMEM;
28284 +
28285 +       memset(vma, 0, sizeof(struct vm_area_struct));
28286 +       /* Could randomize here */
28287 +       vma->vm_start = VSYSCALL32_BASE;
28288 +       vma->vm_end = VSYSCALL32_END;
28289 +       /* MAYWRITE to allow gdb to COW and set breakpoints */
28290 +       vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
28291 +       vma->vm_flags |= mm->def_flags;
28292 +       vma->vm_page_prot = protection_map[vma->vm_flags & 7];
28293 +       vma->vm_ops = &syscall32_vm_ops;
28294 +       vma->vm_mm = mm;
28295 +
28296 +       down_write(&mm->mmap_sem);
28297 +       if ((ret = insert_vm_struct(mm, vma))) {
28298 +               up_write(&mm->mmap_sem);
28299 +               kmem_cache_free(vm_area_cachep, vma);
28300 +               return ret;
28301 +       }
28302 +       mm->total_vm += npages;
28303 +       up_write(&mm->mmap_sem);
28304 +       return 0;
28305 +}
28306 +
28307 +static int __init init_syscall32(void)
28308 +{ 
28309 +       syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); 
28310 +       if (!syscall32_page) 
28311 +               panic("Cannot allocate syscall32 page"); 
28312 +
28313 +#ifdef USE_INT80
28314 +       /*
28315 +        * At this point we use int 0x80.
28316 +        */
28317 +       memcpy(syscall32_page, syscall32_int80,
28318 +              syscall32_int80_end - syscall32_int80);
28319 +#else
28320 +       if (use_sysenter > 0) {
28321 +               memcpy(syscall32_page, syscall32_sysenter,
28322 +                      syscall32_sysenter_end - syscall32_sysenter);
28323 +       } else {
28324 +               memcpy(syscall32_page, syscall32_syscall,
28325 +                      syscall32_syscall_end - syscall32_syscall);
28326 +       }       
28327 +#endif
28328 +       return 0;
28329 +} 
28330 +
28331 +/*
28332 + * This must be done early in case we have an initrd containing 32-bit
28333 + * binaries (e.g., hotplug). This could be pushed upstream to arch/x86_64.
28334 + */    
28335 +core_initcall(init_syscall32); 
28336 +
28337 +/* May not be __init: called during resume */
28338 +void syscall32_cpu_init(void)
28339 +{
28340 +#ifndef USE_INT80
28341 +       if (use_sysenter < 0)
28342 +               use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
28343 +
28344 +       /* Load these always in case some future AMD CPU supports
28345 +          SYSENTER from compat mode too. */
28346 +       checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
28347 +       checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
28348 +       checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
28349 +
28350 +       wrmsrl(MSR_CSTAR, ia32_cstar_target);
28351 +#endif
28352 +}
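
syscall32_nopage() resolves any fault inside the fixed VSYSCALL32 VMA by offsetting into the single preallocated syscall32_page. A toy model of that translation, with an invented base address:

#include <stdint.h>
#include <stdio.h>

#define TOY_VSYSCALL_BASE 0xffffe000u	/* stand-in for VSYSCALL32_BASE */

static char toy_syscall32_page[4096];	/* stand-in for syscall32_page */

static void *toy_nopage(uint32_t fault_addr)
{
	/* offset of the fault inside the VMA selects the backing byte */
	return toy_syscall32_page + (fault_addr - TOY_VSYSCALL_BASE);
}

int main(void)
{
	toy_syscall32_page[0x10] = 0x42;
	printf("backing byte: %#x\n",
	       *(unsigned char *)toy_nopage(TOY_VSYSCALL_BASE + 0x10));
	return 0;
}
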
28353 diff -urNp linux-2.6/arch/x86_64/ia32/vsyscall-int80.S new/arch/x86_64/ia32/vsyscall-int80.S
28354 --- linux-2.6/arch/x86_64/ia32/vsyscall-int80.S 1970-01-01 01:00:00.000000000 +0100
28355 +++ new/arch/x86_64/ia32/vsyscall-int80.S       2006-05-09 12:33:17.000000000 +0200
28356 @@ -0,0 +1,58 @@
28357 +/*
28358 + * Code for the vsyscall page.  This version uses the old int $0x80 method.
28359 + *
28360 + * NOTE:
28361 + * 1) __kernel_vsyscall _must_ be first in this page.
28362 + * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
28363 + *    for details.
28364 + */
28365 +#include <asm/ia32_unistd.h>
28366 +#include <asm/asm-offsets.h>
28367 +
28368 +       .code32
28369 +       .text
28370 +       .section .text.vsyscall,"ax"
28371 +       .globl __kernel_vsyscall
28372 +       .type __kernel_vsyscall,@function
28373 +__kernel_vsyscall:
28374 +.LSTART_vsyscall:
28375 +       int $0x80
28376 +       ret
28377 +.LEND_vsyscall:
28378 +       .size __kernel_vsyscall,.-.LSTART_vsyscall
28379 +       .previous
28380 +
28381 +       .section .eh_frame,"a",@progbits
28382 +.LSTARTFRAME:
28383 +       .long .LENDCIE-.LSTARTCIE
28384 +.LSTARTCIE:
28385 +       .long 0                 /* CIE ID */
28386 +       .byte 1                 /* Version number */
28387 +       .string "zR"            /* NUL-terminated augmentation string */
28388 +       .uleb128 1              /* Code alignment factor */
28389 +       .sleb128 -4             /* Data alignment factor */
28390 +       .byte 8                 /* Return address register column */
28391 +       .uleb128 1              /* Augmentation value length */
28392 +       .byte 0x1b              /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
28393 +       .byte 0x0c              /* DW_CFA_def_cfa */
28394 +       .uleb128 4
28395 +       .uleb128 4
28396 +       .byte 0x88              /* DW_CFA_offset, column 0x8 */
28397 +       .uleb128 1
28398 +       .align 4
28399 +.LENDCIE:
28400 +
28401 +       .long .LENDFDE1-.LSTARTFDE1     /* Length FDE */
28402 +.LSTARTFDE1:
28403 +       .long .LSTARTFDE1-.LSTARTFRAME  /* CIE pointer */
28404 +       .long .LSTART_vsyscall-.        /* PC-relative start address */
28405 +       .long .LEND_vsyscall-.LSTART_vsyscall
28406 +       .uleb128 0                      /* Augmentation length */
28407 +       .align 4
28408 +.LENDFDE1:
28409 +               
28410 +/*
28411 + * Get the common code for the sigreturn entry points.
28412 + */
28413 +#define SYSCALL_ENTER_KERNEL    int $0x80
28414 +#include "vsyscall-sigreturn.S"
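
The stub above is just "int $0x80; ret", so a 32-bit process calling through the vsyscall page behaves like this direct inline-asm version (only meaningful when built as a 32-bit x86 binary; syscall number in %eax, result returned in %eax):

#include <stdio.h>

int main(void)
{
#if defined(__i386__)
	long pid;
	/* __NR_getpid is 20 in the 32-bit table (see entry 20 above) */
	__asm__ volatile ("int $0x80" : "=a" (pid) : "a" (20));
	printf("getpid via int $0x80: %ld\n", pid);
#else
	puts("build as 32-bit x86 (-m32) to exercise int $0x80");
#endif
	return 0;
}
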
28415 diff -urNp linux-2.6/arch/x86_64/ia32/vsyscall-sigreturn.S new/arch/x86_64/ia32/vsyscall-sigreturn.S
28416 --- linux-2.6/arch/x86_64/ia32/vsyscall-sigreturn.S     2006-07-03 14:14:30.000000000 +0200
28417 +++ new/arch/x86_64/ia32/vsyscall-sigreturn.S   2006-05-09 12:33:17.000000000 +0200
28418 @@ -139,5 +139,5 @@ __kernel_rt_sigreturn:
28419         .align 4
28420  .LENDFDE3:
28421  
28422 -#include "../../i386/kernel/vsyscall-note.S"
28423 +#include <vsyscall-note.S>
28424  
28425 diff -urNp linux-2.6/arch/x86_64/Kconfig new/arch/x86_64/Kconfig
28426 --- linux-2.6/arch/x86_64/Kconfig       2006-07-03 14:14:30.000000000 +0200
28427 +++ new/arch/x86_64/Kconfig     2006-05-09 12:33:17.000000000 +0200
28428 @@ -123,6 +123,22 @@ config GENERIC_CPU
28429  
28430  endchoice
28431  
28432 +config X86_64_XEN
28433 +       bool "Enable Xen compatible kernel"
28434 +       select SWIOTLB
28435 +       help
28436 +         This option will compile a kernel compatible with the Xen hypervisor.
28437 +
28438 +config X86_NO_TSS
28439 +       bool
28440 +       depends on X86_64_XEN
28441 +       default y
28442 +
28443 +config X86_NO_IDT
28444 +       bool
28445 +       depends on X86_64_XEN
28446 +       default y
28447 +
28448  #
28449  # Define implied options from the CPU selection here
28450  #
28451 @@ -143,6 +159,7 @@ config X86_INTERNODE_CACHE_BYTES
28452  
28453  config X86_TSC
28454         bool
28455 +       depends on !X86_64_XEN
28456         default y
28457  
28458  config X86_GOOD_APIC
28459 @@ -185,7 +202,7 @@ config X86_CPUID
28460  
28461  config X86_HT
28462         bool
28463 -       depends on SMP && !MK8
28464 +       depends on SMP && !MK8 && !X86_64_XEN
28465         default y
28466  
28467  config MATH_EMULATION
28468 @@ -199,14 +216,22 @@ config EISA
28469  
28470  config X86_IO_APIC
28471         bool
28472 +       depends on !XEN_UNPRIVILEGED_GUEST
28473         default y
28474  
28475 +config X86_XEN_GENAPIC
28476 +       bool
28477 +       depends on X86_64_XEN
28478 +       default XEN_PRIVILEGED_GUEST || SMP
28479 +
28480  config X86_LOCAL_APIC
28481         bool
28482 +       depends on !XEN_UNPRIVILEGED_GUEST
28483         default y
28484  
28485  config MTRR
28486         bool "MTRR (Memory Type Range Register) support"
28487 +       depends on !XEN_UNPRIVILEGED_GUEST
28488         ---help---
28489           On Intel P6 family processors (Pentium Pro, Pentium II and later)
28490           the Memory Type Range Registers (MTRRs) may be used to control
28491 @@ -247,7 +272,7 @@ config SMP
28492  
28493  config SCHED_SMT
28494         bool "SMT (Hyperthreading) scheduler support"
28495 -       depends on SMP
28496 +       depends on SMP && !X86_64_XEN
28497         default n
28498         help
28499           SMT scheduler support improves the CPU scheduler's decision making
28500 @@ -257,7 +282,7 @@ config SCHED_SMT
28501  
28502  config SCHED_MC
28503         bool "Multi-core scheduler support"
28504 -       depends on SMP
28505 +       depends on SMP && !X86_64_XEN
28506         default y
28507         help
28508           Multi-core scheduler support improves the CPU scheduler's decision
28509 @@ -268,7 +293,7 @@ source "kernel/Kconfig.preempt"
28510  
28511  config NUMA
28512         bool "Non Uniform Memory Access (NUMA) Support"
28513 -       depends on SMP
28514 +       depends on SMP && !X86_64_XEN
28515         help
28516          Enable NUMA (Non Uniform Memory Access) support. The kernel 
28517          will try to allocate memory used by a CPU on the local memory 
28518 @@ -352,6 +377,7 @@ config NR_CPUS
28519         int "Maximum number of CPUs (2-256)"
28520         range 2 255
28521         depends on SMP
28522 +       default "16" if X86_64_XEN
28523         default "8"
28524         help
28525           This allows you to specify the maximum number of CPUs which this
28526 @@ -372,6 +398,7 @@ config HOTPLUG_CPU
28527  
28528  config HPET_TIMER
28529         bool
28530 +       depends on !X86_64_XEN
28531         default y
28532         help
28533           Use the IA-PC HPET (High Precision Event Timer) to manage
28534 @@ -389,7 +416,7 @@ config GART_IOMMU
28535         bool "K8 GART IOMMU support"
28536         default y
28537         select SWIOTLB
28538 -       depends on PCI
28539 +       depends on PCI && !X86_64_XEN
28540         help
28541           Support for hardware IOMMU in AMD's Opteron/Athlon64 Processors
28542           and for the bounce buffering software IOMMU.
28543 @@ -409,6 +436,7 @@ config SWIOTLB
28544  
28545  config X86_MCE
28546         bool "Machine check support" if EMBEDDED
28547 +       depends on !X86_64_XEN
28548         default y
28549         help
28550            Include a machine check error handler to report hardware errors.
28551 @@ -434,7 +462,7 @@ config X86_MCE_AMD
28552  
28553  config KEXEC
28554         bool "kexec system call (EXPERIMENTAL)"
28555 -       depends on EXPERIMENTAL
28556 +       depends on EXPERIMENTAL && !X86_64_XEN
28557         help
28558           kexec is a system call that implements the ability to shutdown your
28559           current kernel, and to start another kernel.  It is like a reboot
28560 @@ -525,8 +553,11 @@ config GENERIC_PENDING_IRQ
28561         default y
28562  
28563  menu "Power management options"
28564 +       depends on !XEN_UNPRIVILEGED_GUEST
28565  
28566 +if !X86_64_XEN
28567  source kernel/power/Kconfig
28568 +endif
28569  
28570  source "drivers/acpi/Kconfig"
28571  
28572 @@ -549,6 +580,21 @@ config PCI_MMCONFIG
28573         bool "Support mmconfig PCI config space access"
28574         depends on PCI && ACPI
28575  
28576 +config XEN_PCIDEV_FRONTEND
28577 +       bool "Xen PCI Frontend"
28578 +       depends on PCI && X86_64_XEN
28579 +       default y
28580 +       help
28581 +         The PCI device frontend driver allows the kernel to import arbitrary
28582 +         PCI devices from a PCI backend to support PCI driver domains.
28583 +
28584 +config XEN_PCIDEV_FE_DEBUG
28585 +       bool "Xen PCI Frontend Debugging"
28586 +       depends on XEN_PCIDEV_FRONTEND
28587 +       default n
28588 +       help
28589 +         Enables some debug statements within the PCI Frontend.
28590 +
28591  source "drivers/pci/pcie/Kconfig"
28592  
28593  source "drivers/pci/Kconfig"
28594 @@ -619,4 +665,6 @@ source "security/Kconfig"
28595  
28596  source "crypto/Kconfig"
28597  
28598 +source "drivers/xen/Kconfig"
28599 +
28600  source "lib/Kconfig"
28601 diff -urNp linux-2.6/arch/x86_64/kernel/acpi/Makefile new/arch/x86_64/kernel/acpi/Makefile
28602 --- linux-2.6/arch/x86_64/kernel/acpi/Makefile  2006-07-03 14:14:30.000000000 +0200
28603 +++ new/arch/x86_64/kernel/acpi/Makefile        2006-05-09 12:33:17.000000000 +0200
28604 @@ -6,3 +6,4 @@ ifneq ($(CONFIG_ACPI_PROCESSOR),)
28605  obj-y                  += processor.o
28606  endif
28607  
28608 +boot-$(CONFIG_XEN)             := ../../../i386/kernel/acpi/boot-xen.o
28609 diff -urNp linux-2.6/arch/x86_64/kernel/apic-xen.c new/arch/x86_64/kernel/apic-xen.c
28610 --- linux-2.6/arch/x86_64/kernel/apic-xen.c     1970-01-01 01:00:00.000000000 +0100
28611 +++ new/arch/x86_64/kernel/apic-xen.c   2006-05-09 12:33:17.000000000 +0200
28612 @@ -0,0 +1,198 @@
28613 +/*
28614 + *     Local APIC handling, local APIC timers
28615 + *
28616 + *     (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
28617 + *
28618 + *     Fixes
28619 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
28620 + *                                     thanks to Eric Gilmore
28621 + *                                     and Rolf G. Tews
28622 + *                                     for testing these extensively.
28623 + *     Maciej W. Rozycki       :       Various updates and fixes.
28624 + *     Mikael Pettersson       :       Power Management for UP-APIC.
28625 + *     Pavel Machek and
28626 + *     Mikael Pettersson       :       PM converted to driver model.
28627 + */
28628 +
28629 +#include <linux/config.h>
28630 +#include <linux/init.h>
28631 +
28632 +#include <linux/mm.h>
28633 +#include <linux/delay.h>
28634 +#include <linux/bootmem.h>
28635 +#include <linux/smp_lock.h>
28636 +#include <linux/interrupt.h>
28637 +#include <linux/mc146818rtc.h>
28638 +#include <linux/kernel_stat.h>
28639 +#include <linux/sysdev.h>
28640 +#include <linux/module.h>
28641 +
28642 +#include <asm/atomic.h>
28643 +#include <asm/smp.h>
28644 +#include <asm/mtrr.h>
28645 +#include <asm/mpspec.h>
28646 +#include <asm/desc.h>
28647 +#include <asm/arch_hooks.h>
28648 +#include <asm/hpet.h>
28649 +#include <asm/idle.h>
28650 +
28651 +int apic_verbosity;
28652 +
28653 +/*
28654 + * 'what should we do if we get a hw irq event on an illegal vector'.
28655 + * each architecture has to answer this themselves.
28656 + */
28657 +void ack_bad_irq(unsigned int irq)
28658 +{
28659 +       printk("unexpected IRQ trap at vector %02x\n", irq);
28660 +       /*
28661 +        * Currently unexpected vectors happen only on SMP and APIC.
28662 +        * We _must_ ack these because every local APIC has only N
28663 +        * irq slots per priority level, and a 'hanging, unacked' IRQ
28664 +        * holds up an irq slot - in excessive cases (when multiple
28665 +        * unexpected vectors occur) that might lock up the APIC
28666 +        * completely.
28667 +        * But don't ack when the APIC is disabled. -AK
28668 +        */
28669 +       if (!disable_apic)
28670 +               ack_APIC_irq();
28671 +}
28672 +
28673 +int setup_profiling_timer(unsigned int multiplier)
28674 +{
28675 +       return -EINVAL;
28676 +}
28677 +
28678 +void smp_local_timer_interrupt(struct pt_regs *regs)
28679 +{
28680 +       profile_tick(CPU_PROFILING, regs);
28681 +#ifndef CONFIG_XEN
28682 +#ifdef CONFIG_SMP
28683 +               update_process_times(user_mode(regs));
28684 +#endif
28685 +#endif
28686 +       /*
28687 +        * We take the 'long' return path, and there every subsystem
28688 +        * grabs the appropriate locks (kernel lock/ irq lock).
28689 +        *
28690 +        * we might want to decouple profiling from the 'long path',
28691 +        * and do the profiling totally in assembly.
28692 +        *
28693 +        * Currently this isn't too much of an issue (performance wise),
28694 +        * we can take more than 100K local irqs per second on a 100 MHz P5.
28695 +        */
28696 +}
28697 +
28698 +/*
28699 + * Local APIC timer interrupt. This is the most natural way for doing
28700 + * local interrupts, but local timer interrupts can be emulated by
28701 + * broadcast interrupts too. [in case the hw doesn't support APIC timers]
28702 + *
28703 + * [ if a single-CPU system runs an SMP kernel then we call the local
28704 + *   interrupt as well. Thus we cannot inline the local irq ... ]
28705 + */
28706 +void smp_apic_timer_interrupt(struct pt_regs *regs)
28707 +{
28708 +       /*
28709 +        * the NMI deadlock-detector uses this.
28710 +        */
28711 +       add_pda(apic_timer_irqs, 1);
28712 +
28713 +       /*
28714 +        * NOTE! We'd better ACK the irq immediately,
28715 +        * because timer handling can be slow.
28716 +        */
28717 +       ack_APIC_irq();
28718 +       /*
28719 +        * update_process_times() expects us to have done irq_enter().
28720 +        * Besides, if we don't, timer interrupts ignore the global
28721 +        * interrupt lock, which is the WrongThing (tm) to do.
28722 +        */
28723 +       exit_idle();
28724 +       irq_enter();
28725 +       smp_local_timer_interrupt(regs);
28726 +       irq_exit();
28727 +}
28728 +
28729 +/*
28730 + * This interrupt should _never_ happen with our APIC/SMP architecture
28731 + */
28732 +asmlinkage void smp_spurious_interrupt(void)
28733 +{
28734 +       unsigned int v;
28735 +       exit_idle();
28736 +       irq_enter();
28737 +       /*
28738 +        * Check if this really is a spurious interrupt and ACK it
28739 +        * if it is a vectored one.  Just in case...
28740 +        * Spurious interrupts should not be ACKed.
28741 +        */
28742 +       v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
28743 +       if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
28744 +               ack_APIC_irq();
28745 +
28746 +#if 0
28747 +       static unsigned long last_warning; 
28748 +       static unsigned long skipped; 
28749 +
28750 +       /* see sw-dev-man vol 3, chapter 7.4.13.5 */
28751 +       if (time_before(last_warning+30*HZ,jiffies)) { 
28752 +               printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
28753 +                      smp_processor_id(), skipped);
28754 +               last_warning = jiffies; 
28755 +               skipped = 0;
28756 +       } else { 
28757 +               skipped++; 
28758 +       } 
28759 +#endif 
28760 +       irq_exit();
28761 +}
28762 +
28763 +/*
28764 + * This interrupt should never happen with our APIC/SMP architecture
28765 + */
28766 +
28767 +asmlinkage void smp_error_interrupt(void)
28768 +{
28769 +       unsigned int v, v1;
28770 +
28771 +       exit_idle();
28772 +       irq_enter();
28773 +       /* First tickle the hardware, only then report what went on. -- REW */
28774 +       v = apic_read(APIC_ESR);
28775 +       apic_write(APIC_ESR, 0);
28776 +       v1 = apic_read(APIC_ESR);
28777 +       ack_APIC_irq();
28778 +       atomic_inc(&irq_err_count);
28779 +
28780 +       /* Here is what the APIC error bits mean:
28781 +          0: Send CS error
28782 +          1: Receive CS error
28783 +          2: Send accept error
28784 +          3: Receive accept error
28785 +          4: Reserved
28786 +          5: Send illegal vector
28787 +          6: Received illegal vector
28788 +          7: Illegal register address
28789 +       */
28790 +       printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
28791 +               smp_processor_id(), v , v1);
28792 +       irq_exit();
28793 +}
28794 +
28795 +int disable_apic;
28796 +
28797 +/*
28798 + * This initializes the IO-APIC and APIC hardware if this is
28799 + * a UP kernel.
28800 + */
28801 +int __init APIC_init_uniprocessor (void)
28802 +{
28803 +#ifdef CONFIG_X86_IO_APIC
28804 +       if (smp_found_config)
28805 +               if (!skip_ioapic_setup && nr_ioapics)
28806 +                       setup_IO_APIC();
28807 +#endif
28808 +
28809 +       return 1;
28810 +}
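For reference on the index arithmetic used in smp_spurious_interrupt() above: the local APIC in-service register (ISR) is a 256-bit bitmap split across eight 32-bit registers spaced 0x10 bytes apart, so (vector & ~0x1f) >> 1 yields the offset of the register that holds the vector's bit and vector & 0x1f the bit position within it. A minimal standalone sketch in plain C, assuming the conventional spurious vector value 0xff:

    #include <stdio.h>

    int main(void)
    {
            unsigned vector  = 0xff;                   /* assumed spurious vector */
            unsigned reg_off = (vector & ~0x1fu) >> 1; /* (vector / 32) * 0x10 */
            unsigned bit     = vector & 0x1fu;         /* bit within that register */

            /* for 0xff this prints offset 0x70, bit 31 */
            printf("ISR register at offset 0x%x, bit %u\n", reg_off, bit);
            return 0;
    }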
28811 diff -urNp linux-2.6/arch/x86_64/kernel/asm-offsets.c new/arch/x86_64/kernel/asm-offsets.c
28812 --- linux-2.6/arch/x86_64/kernel/asm-offsets.c  2006-07-03 14:14:30.000000000 +0200
28813 +++ new/arch/x86_64/kernel/asm-offsets.c        2006-05-09 12:33:18.000000000 +0200
28814 @@ -66,7 +66,9 @@ int main(void)
28815         DEFINE(pbe_address, offsetof(struct pbe, address));
28816         DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
28817         DEFINE(pbe_next, offsetof(struct pbe, next));
28818 +#ifndef CONFIG_X86_NO_TSS
28819         BLANK();
28820         DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
28821 +#endif
28822         return 0;
28823  }
28824 diff -urNp linux-2.6/arch/x86_64/kernel/e820-xen.c new/arch/x86_64/kernel/e820-xen.c
28825 --- linux-2.6/arch/x86_64/kernel/e820-xen.c     1970-01-01 01:00:00.000000000 +0100
28826 +++ new/arch/x86_64/kernel/e820-xen.c   2006-06-28 14:32:14.000000000 +0200
28827 @@ -0,0 +1,753 @@
28828 +/* 
28829 + * Handle the memory map.
28830 + * The functions here do the job until bootmem takes over.
28831 + * $Id$
28832 + *
28833 + *  Getting sanitize_e820_map() in sync with i386 version by applying change:
28834 + *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
28835 + *     Alex Achenbach <xela@slit.de>, December 2002.
28836 + *  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
28837 + *
28838 + */
28839 +#include <linux/config.h>
28840 +#include <linux/kernel.h>
28841 +#include <linux/types.h>
28842 +#include <linux/init.h>
28843 +#include <linux/bootmem.h>
28844 +#include <linux/ioport.h>
28845 +#include <linux/string.h>
28846 +#include <linux/kexec.h>
28847 +#include <linux/module.h>
28848 +
28849 +#include <asm/page.h>
28850 +#include <asm/e820.h>
28851 +#include <asm/proto.h>
28852 +#include <asm/bootsetup.h>
28853 +#include <asm/sections.h>
28854 +#include <xen/interface/memory.h>
28855 +
28856 +/* 
28857 + * PFN of last memory page.
28858 + */
28859 +unsigned long end_pfn; 
28860 +EXPORT_SYMBOL(end_pfn);
28861 +
28862 +/* 
28863 + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
28864 + * The direct mapping extends to end_pfn_map, so that we can directly access
28865 + * apertures, ACPI and other tables without having to play with fixmaps.
28866 + */ 
28867 +unsigned long end_pfn_map; 
28868 +
28869 +/* 
28870 + * Last pfn which the user wants to use.
28871 + */
28872 +unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;  
28873 +
28874 +extern struct resource code_resource, data_resource;
28875 +
28876 +/* Check for some hardcoded bad areas that early boot is not allowed to touch */ 
28877 +static inline int bad_addr(unsigned long *addrp, unsigned long size)
28878 +{ 
28879 +       unsigned long addr = *addrp, last = addr + size; 
28880 +
28881 +#ifndef CONFIG_XEN
28882 +       /* various gunk below that is needed for SMP startup */
28883 +       if (addr < 0x8000) { 
28884 +               *addrp = 0x8000;
28885 +               return 1; 
28886 +       }
28887 +
28888 +       /* direct mapping tables of the kernel */
28889 +       if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { 
28890 +               *addrp = table_end << PAGE_SHIFT; 
28891 +               return 1;
28892 +       } 
28893 +
28894 +       /* initrd */ 
28895 +#ifdef CONFIG_BLK_DEV_INITRD
28896 +       if (LOADER_TYPE && INITRD_START && last >= INITRD_START && 
28897 +           addr < INITRD_START+INITRD_SIZE) { 
28898 +               *addrp = INITRD_START + INITRD_SIZE; 
28899 +               return 1;
28900 +       } 
28901 +#endif
28902 +       /* kernel code + 640k memory hole (later should not be needed, but 
28903 +          be paranoid for now) */
28904 +       if (last >= 640*1024 && addr < __pa_symbol(&_end)) { 
28905 +               *addrp = __pa_symbol(&_end);
28906 +               return 1;
28907 +       }
28908 +
28909 +       if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
28910 +               *addrp = ebda_addr + ebda_size;
28911 +               return 1;
28912 +       }
28913 +
28914 +       /* XXX ramdisk image here? */ 
28915 +#else
28916 +       if (last < (table_end<<PAGE_SHIFT)) {
28917 +               *addrp = table_end << PAGE_SHIFT;
28918 +               return 1;
28919 +       }
28920 +#endif
28921 +       return 0;
28922 +} 
28923 +
28924 +#ifndef CONFIG_XEN
28925 +/*
28926 + * This function checks if any part of the range <start,end> is mapped
28927 + * with type.
28928 + */
28929 +int __meminit
28930 +e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
28931 +{ 
28932 +       int i;
28933 +       for (i = 0; i < e820.nr_map; i++) { 
28934 +               struct e820entry *ei = &e820.map[i]; 
28935 +               if (type && ei->type != type) 
28936 +                       continue;
28937 +               if (ei->addr >= end || ei->addr + ei->size <= start)
28938 +                       continue; 
28939 +               return 1; 
28940 +       } 
28941 +       return 0;
28942 +}
28943 +#endif
28944 +
28945 +/*
28946 + * This function checks if the entire range <start,end> is mapped with type.
28947 + *
28948 + * Note: this function only works correctly if the e820 table is sorted and
28949 + * non-overlapping, which is the case.
28950 + */
28951 +int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
28952 +{
28953 +       int i;
28954 +       for (i = 0; i < e820.nr_map; i++) {
28955 +               struct e820entry *ei = &e820.map[i];
28956 +               if (type && ei->type != type)
28957 +                       continue;
28958 +               /* is the region (part) in overlap with the current region ?*/
28959 +               if (ei->addr >= end || ei->addr + ei->size <= start)
28960 +                       continue;
28961 +
28962 +               /* if the region is at the beginning of <start,end> we move
28963 +                * start to the end of the region since it's ok until there
28964 +                */
28965 +               if (ei->addr <= start)
28966 +                       start = ei->addr + ei->size;
28967 +               /* if start is now at or beyond end, we're done, full coverage */
28968 +               if (start >= end)
28969 +                       return 1; /* we're done */
28970 +       }
28971 +       return 0;
28972 +}
28973 +
28974 +/* 
28975 + * Find a free area in a specific range. 
28976 + */ 
28977 +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) 
28978 +{ 
28979 +       int i; 
28980 +       for (i = 0; i < e820.nr_map; i++) { 
28981 +               struct e820entry *ei = &e820.map[i]; 
28982 +               unsigned long addr = ei->addr, last; 
28983 +               if (ei->type != E820_RAM) 
28984 +                       continue; 
28985 +               if (addr < start) 
28986 +                       addr = start;
28987 +               if (addr > ei->addr + ei->size) 
28988 +                       continue; 
28989 +               while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size)
28990 +                       ;
28991 +               last = addr + size;
28992 +               if (last > ei->addr + ei->size)
28993 +                       continue;
28994 +               if (last > end) 
28995 +                       continue;
28996 +               return addr; 
28997 +       } 
28998 +       return -1UL;            
28999 +} 
29000 +
29001 +/* 
29002 + * Free bootmem based on the e820 table for a node.
29003 + */
29004 +void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
29005 +{
29006 +       int i;
29007 +       for (i = 0; i < e820.nr_map; i++) {
29008 +               struct e820entry *ei = &e820.map[i]; 
29009 +               unsigned long last, addr;
29010 +
29011 +               if (ei->type != E820_RAM || 
29012 +                   ei->addr+ei->size <= start || 
29013 +                   ei->addr >= end)
29014 +                       continue;
29015 +
29016 +               addr = round_up(ei->addr, PAGE_SIZE);
29017 +               if (addr < start) 
29018 +                       addr = start;
29019 +
29020 +               last = round_down(ei->addr + ei->size, PAGE_SIZE); 
29021 +               if (last >= end)
29022 +                       last = end; 
29023 +
29024 +               if (last > addr && last-addr >= PAGE_SIZE)
29025 +                       free_bootmem_node(pgdat, addr, last-addr);
29026 +       }
29027 +}
29028 +
29029 +/*
29030 + * Find the highest page frame number we have available
29031 + */
29032 +unsigned long __init e820_end_of_ram(void)
29033 +{
29034 +       int i;
29035 +       unsigned long end_pfn = 0;
29036 +       
29037 +       for (i = 0; i < e820.nr_map; i++) {
29038 +               struct e820entry *ei = &e820.map[i]; 
29039 +               unsigned long start, end;
29040 +
29041 +               start = round_up(ei->addr, PAGE_SIZE); 
29042 +               end = round_down(ei->addr + ei->size, PAGE_SIZE); 
29043 +               if (start >= end)
29044 +                       continue;
29045 +               if (ei->type == E820_RAM) { 
29046 +               if (end > end_pfn<<PAGE_SHIFT)
29047 +                       end_pfn = end>>PAGE_SHIFT;
29048 +               } else { 
29049 +                       if (end > end_pfn_map<<PAGE_SHIFT) 
29050 +                               end_pfn_map = end>>PAGE_SHIFT;
29051 +               } 
29052 +       }
29053 +
29054 +       if (end_pfn > end_pfn_map) 
29055 +               end_pfn_map = end_pfn;
29056 +       if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
29057 +               end_pfn_map = MAXMEM>>PAGE_SHIFT;
29058 +       if (end_pfn > end_user_pfn)
29059 +               end_pfn = end_user_pfn;
29060 +       if (end_pfn > end_pfn_map) 
29061 +               end_pfn = end_pfn_map; 
29062 +
29063 +       return end_pfn; 
29064 +}
29065 +
29066 +/* 
29067 + * Compute how much memory is missing in a range.
29068 + * Unlike the other functions in this file the arguments are in page numbers.
29069 + */
29070 +unsigned long __init
29071 +e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
29072 +{
29073 +       unsigned long ram = 0;
29074 +       unsigned long start = start_pfn << PAGE_SHIFT;
29075 +       unsigned long end = end_pfn << PAGE_SHIFT;
29076 +       int i;
29077 +       for (i = 0; i < e820.nr_map; i++) {
29078 +               struct e820entry *ei = &e820.map[i];
29079 +               unsigned long last, addr;
29080 +
29081 +               if (ei->type != E820_RAM ||
29082 +                   ei->addr+ei->size <= start ||
29083 +                   ei->addr >= end)
29084 +                       continue;
29085 +
29086 +               addr = round_up(ei->addr, PAGE_SIZE);
29087 +               if (addr < start)
29088 +                       addr = start;
29089 +
29090 +               last = round_down(ei->addr + ei->size, PAGE_SIZE);
29091 +               if (last >= end)
29092 +                       last = end;
29093 +
29094 +               if (last > addr)
29095 +                       ram += last - addr;
29096 +       }
29097 +       return ((end - start) - ram) >> PAGE_SHIFT;
29098 +}
29099 +
29100 +/*
29101 + * Mark e820 reserved areas as busy for the resource manager.
29102 + */
29103 +void __init e820_reserve_resources(struct e820entry *e820, int nr_map)
29104 +{
29105 +       int i;
29106 +       for (i = 0; i < nr_map; i++) {
29107 +               struct resource *res;
29108 +               res = alloc_bootmem_low(sizeof(struct resource));
29109 +               switch (e820[i].type) {
29110 +               case E820_RAM:  res->name = "System RAM"; break;
29111 +               case E820_ACPI: res->name = "ACPI Tables"; break;
29112 +               case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
29113 +               default:        res->name = "reserved";
29114 +               }
29115 +               res->start = e820[i].addr;
29116 +               res->end = res->start + e820[i].size - 1;
29117 +               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
29118 +               request_resource(&iomem_resource, res);
29119 +               if (e820[i].type == E820_RAM) {
29120 +                       /*
29121 +                        *  We don't know which RAM region contains kernel data,
29122 +                        *  so we try it repeatedly and let the resource manager
29123 +                        *  test it.
29124 +                        */
29125 +                       request_resource(res, &code_resource);
29126 +                       request_resource(res, &data_resource);
29127 +#ifdef CONFIG_KEXEC
29128 +                       request_resource(res, &crashk_res);
29129 +#endif
29130 +               }
29131 +       }
29132 +}
29133 +
29134 +/* 
29135 + * Add a memory region to the kernel e820 map.
29136 + */ 
29137 +void __init add_memory_region(unsigned long start, unsigned long size, int type)
29138 +{
29139 +       int x = e820.nr_map;
29140 +
29141 +       if (x == E820MAX) {
29142 +               printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
29143 +               return;
29144 +       }
29145 +
29146 +       e820.map[x].addr = start;
29147 +       e820.map[x].size = size;
29148 +       e820.map[x].type = type;
29149 +       e820.nr_map++;
29150 +}
29151 +
29152 +void __init e820_print_map(char *who)
29153 +{
29154 +       int i;
29155 +
29156 +       for (i = 0; i < e820.nr_map; i++) {
29157 +               printk(" %s: %016Lx - %016Lx ", who,
29158 +                       (unsigned long long) e820.map[i].addr,
29159 +                       (unsigned long long) (e820.map[i].addr + e820.map[i].size));
29160 +               switch (e820.map[i].type) {
29161 +               case E820_RAM:  printk("(usable)\n");
29162 +                               break;
29163 +               case E820_RESERVED:
29164 +                               printk("(reserved)\n");
29165 +                               break;
29166 +               case E820_ACPI:
29167 +                               printk("(ACPI data)\n");
29168 +                               break;
29169 +               case E820_NVS:
29170 +                               printk("(ACPI NVS)\n");
29171 +                               break;
29172 +               default:        printk("type %u\n", e820.map[i].type);
29173 +                               break;
29174 +               }
29175 +       }
29176 +}
29177 +
29178 +/*
29179 + * Sanitize the BIOS e820 map.
29180 + *
29181 + * Some e820 responses include overlapping entries.  The following 
29182 + * replaces the original e820 map with a new one, removing overlaps.
29183 + *
29184 + */
29185 +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
29186 +{
29187 +       struct change_member {
29188 +               struct e820entry *pbios; /* pointer to original bios entry */
29189 +               unsigned long long addr; /* address for this change point */
29190 +       };
29191 +       static struct change_member change_point_list[2*E820MAX] __initdata;
29192 +       static struct change_member *change_point[2*E820MAX] __initdata;
29193 +       static struct e820entry *overlap_list[E820MAX] __initdata;
29194 +       static struct e820entry new_bios[E820MAX] __initdata;
29195 +       struct change_member *change_tmp;
29196 +       unsigned long current_type, last_type;
29197 +       unsigned long long last_addr;
29198 +       int chgidx, still_changing;
29199 +       int overlap_entries;
29200 +       int new_bios_entry;
29201 +       int old_nr, new_nr, chg_nr;
29202 +       int i;
29203 +
29204 +       /*
29205 +               Visually we're performing the following (1,2,3,4 = memory types)...
29206 +
29207 +               Sample memory map (w/overlaps):
29208 +                  ____22__________________
29209 +                  ______________________4_
29210 +                  ____1111________________
29211 +                  _44_____________________
29212 +                  11111111________________
29213 +                  ____________________33__
29214 +                  ___________44___________
29215 +                  __________33333_________
29216 +                  ______________22________
29217 +                  ___________________2222_
29218 +                  _________111111111______
29219 +                  _____________________11_
29220 +                  _________________4______
29221 +
29222 +               Sanitized equivalent (no overlap):
29223 +                  1_______________________
29224 +                  _44_____________________
29225 +                  ___1____________________
29226 +                  ____22__________________
29227 +                  ______11________________
29228 +                  _________1______________
29229 +                  __________3_____________
29230 +                  ___________44___________
29231 +                  _____________33_________
29232 +                  _______________2________
29233 +                  ________________1_______
29234 +                  _________________4______
29235 +                  ___________________2____
29236 +                  ____________________33__
29237 +                  ______________________4_
29238 +       */
29239 +
29240 +       /* if there's only one memory region, don't bother */
29241 +       if (*pnr_map < 2)
29242 +               return -1;
29243 +
29244 +       old_nr = *pnr_map;
29245 +
29246 +       /* bail out if we find any unreasonable addresses in bios map */
29247 +       for (i=0; i<old_nr; i++)
29248 +               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
29249 +                       return -1;
29250 +
29251 +       /* create pointers for initial change-point information (for sorting) */
29252 +       for (i=0; i < 2*old_nr; i++)
29253 +               change_point[i] = &change_point_list[i];
29254 +
29255 +       /* record all known change-points (starting and ending addresses),
29256 +          omitting those that are for empty memory regions */
29257 +       chgidx = 0;
29258 +       for (i=0; i < old_nr; i++)      {
29259 +               if (biosmap[i].size != 0) {
29260 +                       change_point[chgidx]->addr = biosmap[i].addr;
29261 +                       change_point[chgidx++]->pbios = &biosmap[i];
29262 +                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
29263 +                       change_point[chgidx++]->pbios = &biosmap[i];
29264 +               }
29265 +       }
29266 +       chg_nr = chgidx;
29267 +
29268 +       /* sort change-point list by memory addresses (low -> high) */
29269 +       still_changing = 1;
29270 +       while (still_changing)  {
29271 +               still_changing = 0;
29272 +               for (i=1; i < chg_nr; i++)  {
29273 +                       /* if <current_addr> > <last_addr>, swap */
29274 +                       /* or, if current=<start_addr> & last=<end_addr>, swap */
29275 +                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
29276 +                               ((change_point[i]->addr == change_point[i-1]->addr) &&
29277 +                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
29278 +                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
29279 +                          )
29280 +                       {
29281 +                               change_tmp = change_point[i];
29282 +                               change_point[i] = change_point[i-1];
29283 +                               change_point[i-1] = change_tmp;
29284 +                               still_changing=1;
29285 +                       }
29286 +               }
29287 +       }
29288 +
29289 +       /* create a new bios memory map, removing overlaps */
29290 +       overlap_entries=0;       /* number of entries in the overlap table */
29291 +       new_bios_entry=0;        /* index for creating new bios map entries */
29292 +       last_type = 0;           /* start with undefined memory type */
29293 +       last_addr = 0;           /* start with 0 as last starting address */
29294 +       /* loop through change-points, determining effect on the new bios map */
29295 +       for (chgidx=0; chgidx < chg_nr; chgidx++)
29296 +       {
29297 +               /* keep track of all overlapping bios entries */
29298 +               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
29299 +               {
29300 +                       /* add map entry to overlap list (> 1 entry implies an overlap) */
29301 +                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
29302 +               }
29303 +               else
29304 +               {
29305 +                       /* remove entry from list (order independent, so swap with last) */
29306 +                       for (i=0; i<overlap_entries; i++)
29307 +                       {
29308 +                               if (overlap_list[i] == change_point[chgidx]->pbios)
29309 +                                       overlap_list[i] = overlap_list[overlap_entries-1];
29310 +                       }
29311 +                       overlap_entries--;
29312 +               }
29313 +               /* if there are overlapping entries, decide which "type" to use */
29314 +               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
29315 +               current_type = 0;
29316 +               for (i=0; i<overlap_entries; i++)
29317 +                       if (overlap_list[i]->type > current_type)
29318 +                               current_type = overlap_list[i]->type;
29319 +               /* continue building up new bios map based on this information */
29320 +               if (current_type != last_type)  {
29321 +                       if (last_type != 0)      {
29322 +                               new_bios[new_bios_entry].size =
29323 +                                       change_point[chgidx]->addr - last_addr;
29324 +                               /* move forward only if the new size was non-zero */
29325 +                               if (new_bios[new_bios_entry].size != 0)
29326 +                                       if (++new_bios_entry >= E820MAX)
29327 +                                               break;  /* no more space left for new bios entries */
29328 +                       }
29329 +                       if (current_type != 0)  {
29330 +                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
29331 +                               new_bios[new_bios_entry].type = current_type;
29332 +                               last_addr=change_point[chgidx]->addr;
29333 +                       }
29334 +                       last_type = current_type;
29335 +               }
29336 +       }
29337 +       new_nr = new_bios_entry;   /* retain count for new bios entries */
29338 +
29339 +       /* copy new bios mapping into original location */
29340 +       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
29341 +       *pnr_map = new_nr;
29342 +
29343 +       return 0;
29344 +}
29345 +
29346 +/*
29347 + * Copy the BIOS e820 map into a safe place.
29348 + *
29349 + * Sanity-check it while we're at it.
29350 + *
29351 + * If we're lucky and live on a modern system, the setup code
29352 + * will have given us a memory map that we can use to properly
29353 + * set up memory.  If we aren't, we'll fake a memory map.
29354 + *
29355 + * We check to see that the memory map contains at least 2 elements
29356 + * before we'll use it, because the detection code in setup.S may
29357 + * not be perfect and almost every PC known to man has two memory
29358 + * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
29359 + * thinkpad 560x, for example, does not cooperate with the memory
29360 + * detection code.)
29361 + */
29362 +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
29363 +{
29364 +#ifndef CONFIG_XEN
29365 +       /* Only one memory region (or negative)? Ignore it */
29366 +       if (nr_map < 2)
29367 +               return -1;
29368 +#else
29369 +       BUG_ON(nr_map < 1);
29370 +#endif
29371 +
29372 +       do {
29373 +               unsigned long start = biosmap->addr;
29374 +               unsigned long size = biosmap->size;
29375 +               unsigned long end = start + size;
29376 +               unsigned long type = biosmap->type;
29377 +
29378 +               /* Overflow in 64 bits? Ignore the memory map. */
29379 +               if (start > end)
29380 +                       return -1;
29381 +
29382 +#ifndef CONFIG_XEN
29383 +               /*
29384 +                * Some BIOSes claim RAM in the 640k - 1M region.
29385 +                * Not right. Fix it up.
29386 +                * 
29387 +                * This should be removed on Hammer which is supposed to not
29388 +                * have non e820 covered ISA mappings there, but I had some strange
29389 +                * problems so it stays for now.  -AK
29390 +                */
29391 +               if (type == E820_RAM) {
29392 +                       if (start < 0x100000ULL && end > 0xA0000ULL) {
29393 +                               if (start < 0xA0000ULL)
29394 +                                       add_memory_region(start, 0xA0000ULL-start, type);
29395 +                               if (end <= 0x100000ULL)
29396 +                                       continue;
29397 +                               start = 0x100000ULL;
29398 +                               size = end - start;
29399 +                       }
29400 +               }
29401 +#endif
29402 +
29403 +               add_memory_region(start, size, type);
29404 +       } while (biosmap++,--nr_map);
29405 +       return 0;
29406 +}
29407 +
29408 +#ifndef CONFIG_XEN
29409 +void __init setup_memory_region(void)
29410 +{
29411 +       char *who = "BIOS-e820";
29412 +
29413 +       /*
29414 +        * Try to copy the BIOS-supplied E820-map.
29415 +        *
29416 +        * Otherwise fake a memory map; one section from 0k->640k,
29417 +        * the next section from 1mb->appropriate_mem_k
29418 +        */
29419 +       sanitize_e820_map(E820_MAP, &E820_MAP_NR);
29420 +       if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
29421 +               unsigned long mem_size;
29422 +
29423 +               /* compare results from other methods and take the greater */
29424 +               if (ALT_MEM_K < EXT_MEM_K) {
29425 +                       mem_size = EXT_MEM_K;
29426 +                       who = "BIOS-88";
29427 +               } else {
29428 +                       mem_size = ALT_MEM_K;
29429 +                       who = "BIOS-e801";
29430 +               }
29431 +
29432 +               e820.nr_map = 0;
29433 +               add_memory_region(0, LOWMEMSIZE(), E820_RAM);
29434 +               add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
29435 +       }
29436 +       printk(KERN_INFO "BIOS-provided physical RAM map:\n");
29437 +       e820_print_map(who);
29438 +}
29439 +
29440 +#else  /* CONFIG_XEN */
29441 +
29442 +void __init setup_memory_region(void)
29443 +{
29444 +       int rc;
29445 +       struct xen_memory_map memmap;
29446 +       /*
29447 +        * This is rather large for a stack variable but this early in
29448 +        * the boot process we know we have plenty of slack space.
29449 +        */
29450 +       struct e820entry map[E820MAX];
29451 +
29452 +       memmap.nr_entries = E820MAX;
29453 +       set_xen_guest_handle(memmap.buffer, map);
29454 +
29455 +       rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
29456 +       if ( rc == -ENOSYS ) {
29457 +               memmap.nr_entries = 1;
29458 +               map[0].addr = 0ULL;
29459 +               map[0].size = xen_start_info->nr_pages << PAGE_SHIFT;
29460 +               /* 8MB slack (to balance backend allocations). */
29461 +               map[0].size += 8 << 20;
29462 +               map[0].type = E820_RAM;
29463 +               rc = 0;
29464 +       }
29465 +       BUG_ON(rc);
29466 +
29467 +       sanitize_e820_map(map, (char *)&memmap.nr_entries);
29468 +
29469 +       BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
29470 +
29471 +       printk(KERN_INFO "BIOS-provided physical RAM map:\n");
29472 +       e820_print_map("Xen");
29473 +}
29474 +#endif
29475 +
29476 +void __init parse_memopt(char *p, char **from) 
29477 +{ 
29478 +       int i;
29479 +       unsigned long current_end;
29480 +       unsigned long end;
29481 +
29482 +       end_user_pfn = memparse(p, from);
29483 +       end_user_pfn >>= PAGE_SHIFT;    
29484 +
29485 +       end = end_user_pfn<<PAGE_SHIFT;
29486 +       i = e820.nr_map-1;
29487 +       current_end = e820.map[i].addr + e820.map[i].size;
29488 +
29489 +       if (current_end < end) {
29490 +               /*
29491 +                 * The e820 map ends before our requested size so
29492 +                 * extend the final entry to the requested address.
29493 +                 */
29494 +               if (e820.map[i].type == E820_RAM)
29495 +                       e820.map[i].size = end - e820.map[i].addr;
29496 +               else
29497 +                       add_memory_region(current_end, end - current_end, E820_RAM);
29498 +       }
29499 +} 
29500 +
29501 +void __init parse_memmapopt(char *p, char **from)
29502 +{
29503 +       unsigned long long start_at, mem_size;
29504 +
29505 +       mem_size = memparse(p, from);
29506 +       p = *from;
29507 +       if (*p == '@') {
29508 +               start_at = memparse(p+1, from);
29509 +               add_memory_region(start_at, mem_size, E820_RAM);
29510 +       } else if (*p == '#') {
29511 +               start_at = memparse(p+1, from);
29512 +               add_memory_region(start_at, mem_size, E820_ACPI);
29513 +       } else if (*p == '$') {
29514 +               start_at = memparse(p+1, from);
29515 +               add_memory_region(start_at, mem_size, E820_RESERVED);
29516 +       } else {
29517 +               end_user_pfn = (mem_size >> PAGE_SHIFT);
29518 +       }
29519 +       p = *from;
29520 +}
29521 +
29522 +unsigned long pci_mem_start = 0xaeedbabe;
29523 +
29524 +/*
29525 + * Search for the biggest gap in the low 32 bits of the e820
29526 + * memory space.  We pass this space to PCI to assign MMIO resources
29527 + * for hotplug or unconfigured devices in.
29528 + * Hopefully the BIOS left enough space.
29529 + */
29530 +__init void e820_setup_gap(struct e820entry *e820, int nr_map)
29531 +{
29532 +       unsigned long gapstart, gapsize, round;
29533 +       unsigned long last;
29534 +       int i;
29535 +       int found = 0;
29536 +
29537 +       last = 0x100000000ull;
29538 +       gapstart = 0x10000000;
29539 +       gapsize = 0x400000;
29540 +       i = nr_map;
29541 +       while (--i >= 0) {
29542 +               unsigned long long start = e820[i].addr;
29543 +               unsigned long long end = start + e820[i].size;
29544 +
29545 +               /*
29546 +                * Since "last" is at most 4GB, we know we'll
29547 +                * fit in 32 bits if this condition is true
29548 +                */
29549 +               if (last > end) {
29550 +                       unsigned long gap = last - end;
29551 +
29552 +                       if (gap > gapsize) {
29553 +                               gapsize = gap;
29554 +                               gapstart = end;
29555 +                               found = 1;
29556 +                       }
29557 +               }
29558 +               if (start < last)
29559 +                       last = start;
29560 +       }
29561 +
29562 +       if (!found) {
29563 +               gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
29564 +               printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
29565 +                      KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
29566 +       }
29567 +
29568 +       /*
29569 +        * See how much we want to round up: start off with
29570 +        * rounding to the next 1MB area.
29571 +        */
29572 +       round = 0x100000;
29573 +       while ((gapsize >> 4) > round)
29574 +               round += round;
29575 +       /* Fun with two's complement */
29576 +       pci_mem_start = (gapstart + round) & -round;
29577 +
29578 +       printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
29579 +               pci_mem_start, gapstart, gapsize);
29580 +}
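The closing arithmetic in e820_setup_gap() above relies on round being grown to a power of two of at least gapsize/16, at which point (gapstart + round) & -round rounds the gap start up to that unit, because -round equals ~(round - 1) for a power of two. A minimal standalone sketch of the same computation, using made-up gap values:

    #include <stdio.h>

    int main(void)
    {
            unsigned long gapstart = 0xc8000000UL;   /* hypothetical gap start */
            unsigned long gapsize  = 0x38000000UL;   /* hypothetical gap size  */
            unsigned long round    = 0x100000UL;     /* start rounding at 1MB  */

            /* double the power-of-two unit until it reaches gapsize / 16 */
            while ((gapsize >> 4) > round)
                    round += round;

            /* -round == ~(round - 1), so this rounds gapstart up to the unit */
            unsigned long pci_mem_start = (gapstart + round) & -round;

            printf("round=0x%lx pci_mem_start=0x%lx\n", round, pci_mem_start);
            return 0;
    }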
29581 diff -urNp linux-2.6/arch/x86_64/kernel/early_printk-xen.c new/arch/x86_64/kernel/early_printk-xen.c
29582 --- linux-2.6/arch/x86_64/kernel/early_printk-xen.c     1970-01-01 01:00:00.000000000 +0100
29583 +++ new/arch/x86_64/kernel/early_printk-xen.c   2006-05-09 12:33:18.000000000 +0200
29584 @@ -0,0 +1,304 @@
29585 +#include <linux/config.h>
29586 +#include <linux/console.h>
29587 +#include <linux/kernel.h>
29588 +#include <linux/init.h>
29589 +#include <linux/string.h>
29590 +#include <linux/tty.h>
29591 +#include <asm/io.h>
29592 +#include <asm/processor.h>
29593 +#include <asm/fcntl.h>
29594 +
29595 +/* Simple VGA output */
29596 +
29597 +#ifdef __i386__
29598 +#include <asm/setup.h>
29599 +#define VGABASE                (__ISA_IO_base + 0xb8000)
29600 +#else
29601 +#include <asm/bootsetup.h>
29602 +#define VGABASE                ((void __iomem *)0xffffffff800b8000UL)
29603 +#endif
29604 +
29605 +static int max_ypos = 25, max_xpos = 80;
29606 +static int current_ypos = 25, current_xpos = 0;
29607 +
29608 +#ifndef CONFIG_XEN
29609 +static void early_vga_write(struct console *con, const char *str, unsigned n)
29610 +{
29611 +       char c;
29612 +       int  i, k, j;
29613 +
29614 +       while ((c = *str++) != '\0' && n-- > 0) {
29615 +               if (current_ypos >= max_ypos) {
29616 +                       /* scroll 1 line up */
29617 +                       for (k = 1, j = 0; k < max_ypos; k++, j++) {
29618 +                               for (i = 0; i < max_xpos; i++) {
29619 +                                       writew(readw(VGABASE+2*(max_xpos*k+i)),
29620 +                                              VGABASE + 2*(max_xpos*j + i));
29621 +                               }
29622 +                       }
29623 +                       for (i = 0; i < max_xpos; i++)
29624 +                               writew(0x720, VGABASE + 2*(max_xpos*j + i));
29625 +                       current_ypos = max_ypos-1;
29626 +               }
29627 +               if (c == '\n') {
29628 +                       current_xpos = 0;
29629 +                       current_ypos++;
29630 +               } else if (c != '\r')  {
29631 +                       writew(((0x7 << 8) | (unsigned short) c),
29632 +                              VGABASE + 2*(max_xpos*current_ypos +
29633 +                                               current_xpos++));
29634 +                       if (current_xpos >= max_xpos) {
29635 +                               current_xpos = 0;
29636 +                               current_ypos++;
29637 +                       }
29638 +               }
29639 +       }
29640 +}
29641 +
29642 +static struct console early_vga_console = {
29643 +       .name =         "earlyvga",
29644 +       .write =        early_vga_write,
29645 +       .flags =        CON_PRINTBUFFER,
29646 +       .index =        -1,
29647 +};
29648 +
29649 +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
29650 +
29651 +static int early_serial_base = 0x3f8;  /* ttyS0 */
29652 +
29653 +#define XMTRDY          0x20
29654 +
29655 +#define DLAB           0x80
29656 +
29657 +#define TXR             0       /*  Transmit register (WRITE) */
29658 +#define RXR             0       /*  Receive register  (READ)  */
29659 +#define IER             1       /*  Interrupt Enable          */
29660 +#define IIR             2       /*  Interrupt ID              */
29661 +#define FCR             2       /*  FIFO control              */
29662 +#define LCR             3       /*  Line control              */
29663 +#define MCR             4       /*  Modem control             */
29664 +#define LSR             5       /*  Line Status               */
29665 +#define MSR             6       /*  Modem Status              */
29666 +#define DLL             0       /*  Divisor Latch Low         */
29667 +#define DLH             1       /*  Divisor latch High        */
29668 +
29669 +static int early_serial_putc(unsigned char ch)
29670 +{
29671 +       unsigned timeout = 0xffff;
29672 +       while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
29673 +               cpu_relax();
29674 +       outb(ch, early_serial_base + TXR);
29675 +       return timeout ? 0 : -1;
29676 +}
29677 +
29678 +static void early_serial_write(struct console *con, const char *s, unsigned n)
29679 +{
29680 +       while (*s && n-- > 0) {
29681 +               early_serial_putc(*s);
29682 +               if (*s == '\n')
29683 +                       early_serial_putc('\r');
29684 +               s++;
29685 +       }
29686 +}
29687 +
29688 +#define DEFAULT_BAUD 9600
29689 +
29690 +static __init void early_serial_init(char *s)
29691 +{
29692 +       unsigned char c;
29693 +       unsigned divisor;
29694 +       unsigned baud = DEFAULT_BAUD;
29695 +       char *e;
29696 +
29697 +       if (*s == ',')
29698 +               ++s;
29699 +
29700 +       if (*s) {
29701 +               unsigned port;
29702 +               if (!strncmp(s,"0x",2)) {
29703 +                       early_serial_base = simple_strtoul(s, &e, 16);
29704 +               } else {
29705 +                       static int bases[] = { 0x3f8, 0x2f8 };
29706 +
29707 +                       if (!strncmp(s,"ttyS",4))
29708 +                               s += 4;
29709 +                       port = simple_strtoul(s, &e, 10);
29710 +                       if (port > 1 || s == e)
29711 +                               port = 0;
29712 +                       early_serial_base = bases[port];
29713 +               }
29714 +               s += strcspn(s, ",");
29715 +               if (*s == ',')
29716 +                       s++;
29717 +       }
29718 +
29719 +       outb(0x3, early_serial_base + LCR);     /* 8n1 */
29720 +       outb(0, early_serial_base + IER);       /* no interrupt */
29721 +       outb(0, early_serial_base + FCR);       /* no fifo */
29722 +       outb(0x3, early_serial_base + MCR);     /* DTR + RTS */
29723 +
29724 +       if (*s) {
29725 +               baud = simple_strtoul(s, &e, 0);
29726 +               if (baud == 0 || s == e)
29727 +                       baud = DEFAULT_BAUD;
29728 +       }
29729 +
29730 +       divisor = 115200 / baud;
29731 +       c = inb(early_serial_base + LCR);
29732 +       outb(c | DLAB, early_serial_base + LCR);
29733 +       outb(divisor & 0xff, early_serial_base + DLL);
29734 +       outb((divisor >> 8) & 0xff, early_serial_base + DLH);
29735 +       outb(c & ~DLAB, early_serial_base + LCR);
29736 +}
29737 +
29738 +#else /* CONFIG_XEN */
29739 +
29740 +#undef SCREEN_INFO
29741 +#define SCREEN_INFO screen_info
29742 +extern struct screen_info screen_info;
29743 +
29744 +static void
29745 +early_serial_write(struct console *con, const char *s, unsigned count)
29746 +{
29747 +       int n;
29748 +
29749 +       while (count > 0) {
29750 +               n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
29751 +               if (n <= 0)
29752 +                       break;
29753 +               count -= n;
29754 +               s += n;
29755 +       }
29756 +} 
29757 +
29758 +static __init void early_serial_init(char *s)
29759 +{
29760 +       current_xpos = 0;
29761 +}
29762 +
29763 +/*
29764 + * No early VGA console on Xen, as we do not have convenient ISA-space
29765 + * mappings. Someone should fix this for domain 0. For now, use fake serial.
29766 + */
29767 +#define early_vga_console early_serial_console
29768 +
29769 +#endif
29770 +
29771 +static struct console early_serial_console = {
29772 +       .name =         "earlyser",
29773 +       .write =        early_serial_write,
29774 +       .flags =        CON_PRINTBUFFER,
29775 +       .index =        -1,
29776 +};
29777 +
29778 +/* Console interface to a host file on AMD's SimNow! */
29779 +
29780 +static int simnow_fd;
29781 +
29782 +enum {
29783 +       MAGIC1 = 0xBACCD00A,
29784 +       MAGIC2 = 0xCA110000,
29785 +       XOPEN = 5,
29786 +       XWRITE = 4,
29787 +};
29788 +
29789 +static noinline long simnow(long cmd, long a, long b, long c)
29790 +{
29791 +       long ret;
29792 +       asm volatile("cpuid" :
29793 +                    "=a" (ret) :
29794 +                    "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
29795 +       return ret;
29796 +}
29797 +
29798 +void __init simnow_init(char *str)
29799 +{
29800 +       char *fn = "klog";
29801 +       if (*str == '=')
29802 +               fn = ++str;
29803 +       /* error ignored */
29804 +       simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
29805 +}
29806 +
29807 +static void simnow_write(struct console *con, const char *s, unsigned n)
29808 +{
29809 +       simnow(XWRITE, simnow_fd, (unsigned long)s, n);
29810 +}
29811 +
29812 +static struct console simnow_console = {
29813 +       .name =         "simnow",
29814 +       .write =        simnow_write,
29815 +       .flags =        CON_PRINTBUFFER,
29816 +       .index =        -1,
29817 +};
29818 +
29819 +/* Direct interface for emergencies */
29820 +struct console *early_console = &early_vga_console;
29821 +static int early_console_initialized = 0;
29822 +
29823 +void early_printk(const char *fmt, ...)
29824 +{
29825 +       char buf[512];
29826 +       int n;
29827 +       va_list ap;
29828 +
29829 +       va_start(ap,fmt);
29830 +       n = vscnprintf(buf,512,fmt,ap);
29831 +       early_console->write(early_console,buf,n);
29832 +       va_end(ap);
29833 +}
29834 +
29835 +static int __initdata keep_early;
29836 +
29837 +int __init setup_early_printk(char *opt)
29838 +{
29839 +       char *space;
29840 +       char buf[256];
29841 +
29842 +       if (early_console_initialized)
29843 +               return 1;
29844 +
29845 +       strlcpy(buf,opt,sizeof(buf));
29846 +       space = strchr(buf, ' ');
29847 +       if (space)
29848 +               *space = 0;
29849 +
29850 +       if (strstr(buf,"keep"))
29851 +               keep_early = 1;
29852 +
29853 +       if (!strncmp(buf, "serial", 6)) {
29854 +               early_serial_init(buf + 6);
29855 +               early_console = &early_serial_console;
29856 +       } else if (!strncmp(buf, "ttyS", 4)) {
29857 +               early_serial_init(buf);
29858 +               early_console = &early_serial_console;
29859 +       } else if (!strncmp(buf, "vga", 3)
29860 +                  && SCREEN_INFO.orig_video_isVGA == 1) {
29861 +               max_xpos = SCREEN_INFO.orig_video_cols;
29862 +               max_ypos = SCREEN_INFO.orig_video_lines;
29863 +               current_ypos = SCREEN_INFO.orig_y;
29864 +               early_console = &early_vga_console;
29865 +       } else if (!strncmp(buf, "simnow", 6)) {
29866 +               simnow_init(buf + 6);
29867 +               early_console = &simnow_console;
29868 +               keep_early = 1;
29869 +       }
29870 +       early_console_initialized = 1;
29871 +       register_console(early_console);
29872 +       return 0;
29873 +}
29874 +
29875 +void __init disable_early_printk(void)
29876 +{
29877 +       if (!early_console_initialized || !early_console)
29878 +               return;
29879 +       if (!keep_early) {
29880 +               printk("disabling early console\n");
29881 +               unregister_console(early_console);
29882 +               early_console_initialized = 0;
29883 +       } else {
29884 +               printk("keeping early console\n");
29885 +       }
29886 +}
29887 +
29888 +__setup("earlyprintk=", setup_early_printk);
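The divisor programming in early_serial_init() above follows the standard 16550 formula: the UART's 1.8432 MHz clock is pre-divided by 16, so divisor = 115200 / baud, split across the DLL/DLH divisor-latch registers while DLAB is set. A short sketch of that arithmetic, with 9600 baud as the assumed rate:

    #include <stdio.h>

    int main(void)
    {
            unsigned baud    = 9600;               /* assumed console rate */
            unsigned divisor = 115200 / baud;      /* 12 for 9600 baud     */

            /* low byte goes to DLL, high byte to DLH */
            printf("baud=%u divisor=%u DLL=0x%02x DLH=0x%02x\n",
                   baud, divisor, divisor & 0xff, (divisor >> 8) & 0xff);
            return 0;
    }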
29889 diff -urNp linux-2.6/arch/x86_64/kernel/entry.S new/arch/x86_64/kernel/entry.S
29890 --- linux-2.6/arch/x86_64/kernel/entry.S        2006-07-03 14:14:30.000000000 +0200
29891 +++ new/arch/x86_64/kernel/entry.S      2006-06-05 15:54:32.000000000 +0200
29892 @@ -596,7 +596,7 @@ retint_kernel:      
29893   */            
29894         .macro apicinterrupt num,func
29895         INTR_FRAME
29896 -       pushq $\num-256
29897 +       pushq $~(\num)
29898         CFI_ADJUST_CFA_OFFSET 8
29899         interrupt \func
29900         jmp ret_from_intr
29901 diff -urNp linux-2.6/arch/x86_64/kernel/entry-xen.S new/arch/x86_64/kernel/entry-xen.S
29902 --- linux-2.6/arch/x86_64/kernel/entry-xen.S    1970-01-01 01:00:00.000000000 +0100
29903 +++ new/arch/x86_64/kernel/entry-xen.S  2006-05-09 12:33:18.000000000 +0200
29904 @@ -0,0 +1,1141 @@
29905 +/*
29906 + *  linux/arch/x86_64/entry.S
29907 + *
29908 + *  Copyright (C) 1991, 1992  Linus Torvalds
29909 + *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
29910 + *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
29911 + * 
29912 + *  $Id$
29913 + *
29914 + *  Jun Nakajima <jun.nakajima@intel.com>
29915 + *  Asit Mallick <asit.k.mallick@intel.com>
29916 + *      Modified for Xen
29917 + */
29918 +
29919 +/*
29920 + * entry.S contains the system-call and fault low-level handling routines.
29921 + *
29922 + * NOTE: This code handles signal-recognition, which happens every time
29923 + * after an interrupt and after each system call.
29924 + * 
29925 + * Normal syscalls and interrupts don't save a full stack frame; this is
29926 + * only done for syscall tracing, signals, or fork/exec et al.
29927 + * 
29928 + * A note on terminology:       
29929 + * - top of stack: Architecture defined interrupt frame from SS to RIP 
29930 + * at the top of the kernel process stack.     
29931 + * - partial stack frame: partially saved registers up to R11.
29932 + * - full stack frame: Like partial stack frame, but all register saved. 
29933 + *     
29934 + * TODO:        
29935 + * - schedule it carefully for the final hardware.
29936 + */
29937 +
29938 +#define ASSEMBLY 1
29939 +#include <linux/config.h>
29940 +#ifdef CONFIG_DEBUG_INFO
29941 +#undef CONFIG_DEBUG_INFO
29942 +#endif
29943 +#include <linux/linkage.h>
29944 +#include <asm/segment.h>
29945 +#include <asm/smp.h>
29946 +#include <asm/cache.h>
29947 +#include <asm/errno.h>
29948 +#include <asm/dwarf2.h>
29949 +#include <asm/calling.h>
29950 +#include <asm/asm-offsets.h>
29951 +#include <asm/msr.h>
29952 +#include <asm/unistd.h>
29953 +#include <asm/thread_info.h>
29954 +#include <asm/hw_irq.h>
29955 +#include <asm/page.h>
29956 +#include <asm/errno.h>
29957 +#include <xen/interface/arch-x86_64.h>
29958 +#include <xen/interface/features.h>
29959 +
29960 +#include "irq_vectors.h"
29961 +
29962 +#include "xen_entry.S"
29963 +       
29964 +       .code64
29965 +
29966 +#ifndef CONFIG_PREEMPT
29967 +#define retint_kernel retint_restore_args
29968 +#endif 
29969 +
29970 +NMI_MASK = 0x80000000
29971 +       
29972 +/*
29973 + * C code is not supposed to know about undefined top of stack. Every time 
29974 + * a C function with a pt_regs argument is called from the SYSCALL based
29975 + * fast path FIXUP_TOP_OF_STACK is needed.
29976 + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
29977 + * manipulation.
29978 + */            
29979 +               
29980 +       /* %rsp:at FRAMEEND */ 
29981 +       .macro FIXUP_TOP_OF_STACK tmp
29982 +       movq    $__USER_CS,CS(%rsp)
29983 +       movq    $-1,RCX(%rsp)
29984 +       .endm
29985 +
29986 +       .macro RESTORE_TOP_OF_STACK tmp,offset=0
29987 +       .endm
29988 +
29989 +       .macro FAKE_STACK_FRAME child_rip
29990 +       /* push in order ss, rsp, eflags, cs, rip */
29991 +       xorl %eax, %eax
29992 +       pushq %rax /* ss */
29993 +       CFI_ADJUST_CFA_OFFSET   8
29994 +       /*CFI_REL_OFFSET        ss,0*/
29995 +       pushq %rax /* rsp */
29996 +       CFI_ADJUST_CFA_OFFSET   8
29997 +       CFI_REL_OFFSET  rsp,0
29998 +       pushq $(1<<9) /* eflags - interrupts on */
29999 +       CFI_ADJUST_CFA_OFFSET   8
30000 +       /*CFI_REL_OFFSET        rflags,0*/
30001 +       pushq $__KERNEL_CS /* cs */
30002 +       CFI_ADJUST_CFA_OFFSET   8
30003 +       /*CFI_REL_OFFSET        cs,0*/
30004 +       pushq \child_rip /* rip */
30005 +       CFI_ADJUST_CFA_OFFSET   8
30006 +       CFI_REL_OFFSET  rip,0
30007 +       pushq   %rax /* orig rax */
30008 +       CFI_ADJUST_CFA_OFFSET   8
30009 +       .endm
30010 +
30011 +       .macro UNFAKE_STACK_FRAME
30012 +       addq $8*6, %rsp
30013 +       CFI_ADJUST_CFA_OFFSET   -(6*8)
30014 +       .endm
30015 +
30016 +       .macro  CFI_DEFAULT_STACK start=1
30017 +       .if \start
30018 +       CFI_STARTPROC   simple
30019 +       CFI_DEF_CFA     rsp,SS+8
30020 +       .else
30021 +       CFI_DEF_CFA_OFFSET SS+8
30022 +       .endif
30023 +       CFI_REL_OFFSET  r15,R15
30024 +       CFI_REL_OFFSET  r14,R14
30025 +       CFI_REL_OFFSET  r13,R13
30026 +       CFI_REL_OFFSET  r12,R12
30027 +       CFI_REL_OFFSET  rbp,RBP
30028 +       CFI_REL_OFFSET  rbx,RBX
30029 +       CFI_REL_OFFSET  r11,R11
30030 +       CFI_REL_OFFSET  r10,R10
30031 +       CFI_REL_OFFSET  r9,R9
30032 +       CFI_REL_OFFSET  r8,R8
30033 +       CFI_REL_OFFSET  rax,RAX
30034 +       CFI_REL_OFFSET  rcx,RCX
30035 +       CFI_REL_OFFSET  rdx,RDX
30036 +       CFI_REL_OFFSET  rsi,RSI
30037 +       CFI_REL_OFFSET  rdi,RDI
30038 +       CFI_REL_OFFSET  rip,RIP
30039 +       /*CFI_REL_OFFSET        cs,CS*/
30040 +       /*CFI_REL_OFFSET        rflags,EFLAGS*/
30041 +       CFI_REL_OFFSET  rsp,RSP
30042 +       /*CFI_REL_OFFSET        ss,SS*/
30043 +       .endm
30044 +
30045 +        /*
30046 +         * Must be consistent with the definition in arch-x86_64.h:    
30047 +         *     struct iret_context {
30048 +         *        u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
30049 +         *     };
30050 +         * #define VGCF_IN_SYSCALL (1<<8) 
30051 +         */
30052 +       .macro HYPERVISOR_IRET flag
30053 +       testb $3,1*8(%rsp)
30054 +       jnz   2f
30055 +       testl $NMI_MASK,2*8(%rsp)
30056 +       jnz   2f
30057 +
30058 +       testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
30059 +       jnz   1f
30060 +
30061 +       /* Direct iret to kernel space. Correct CS and SS. */
30062 +       orb   $3,1*8(%rsp)
30063 +       orb   $3,4*8(%rsp)
30064 +1:     iretq
30065 +
30066 +2:     /* Slow iret via hypervisor. */
30067 +       andl  $~NMI_MASK, 16(%rsp)
30068 +       pushq $\flag
30069 +       jmp  hypercall_page + (__HYPERVISOR_iret * 32)
30070 +       .endm
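When HYPERVISOR_IRET runs, %rsp points at a plain x86-64 iretq frame, which is what the offset tests above index into; the slow path jumps into the hypercall page, where each hypercall stub occupies a 32-byte slot, hence the hypercall_page + (__HYPERVISOR_iret * 32) target. A rough C sketch of that frame (illustrative only, not a kernel type):

	struct iretq_frame {
		unsigned long rip;	/* 0*8(%rsp)                                  */
		unsigned long cs;	/* 1*8(%rsp): RPL bits tested and forced to 3 */
		unsigned long rflags;	/* 2*8(%rsp): the NMI_MASK bit lives here     */
		unsigned long rsp;	/* 3*8(%rsp)                                  */
		unsigned long ss;	/* 4*8(%rsp): RPL forced to 3 as well         */
	};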
30071 +
30072 +        .macro SWITCH_TO_KERNEL ssoff,adjust=0
30073 +       jc  1f
30074 +       orb  $1,\ssoff-\adjust+4(%rsp)
30075 +1:
30076 +        .endm
30077 +
30078 +/*
30079 + * A newly forked process directly context switches into this.
30080 + */    
30081 +/* rdi:        prev */ 
30082 +ENTRY(ret_from_fork)
30083 +       CFI_DEFAULT_STACK
30084 +       call schedule_tail
30085 +       GET_THREAD_INFO(%rcx)
30086 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
30087 +       jnz rff_trace
30088 +rff_action:    
30089 +       RESTORE_REST
30090 +       testl $3,CS-ARGOFFSET(%rsp)     # from kernel_thread?
30091 +       je   int_ret_from_sys_call
30092 +       testl $_TIF_IA32,threadinfo_flags(%rcx)
30093 +       jnz  int_ret_from_sys_call
30094 +       RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
30095 +       jmp ret_from_sys_call
30096 +rff_trace:
30097 +       movq %rsp,%rdi
30098 +       call syscall_trace_leave
30099 +       GET_THREAD_INFO(%rcx)   
30100 +       jmp rff_action
30101 +       CFI_ENDPROC
30102 +
30103 +/*
30104 + * System call entry. Up to 6 arguments in registers are supported.
30105 + *
30106 + * SYSCALL does not save anything on the stack and does not change the
30107 + * stack pointer.
30108 + */
30109 +               
30110 +/*
30111 + * Register setup:     
30112 + * rax  system call number
30113 + * rdi  arg0
30114 + * rcx  return address for syscall/sysret, C arg3 
30115 + * rsi  arg1
30116 + * rdx  arg2   
30117 + * r10  arg3   (--> moved to rcx for C)
30118 + * r8   arg4
30119 + * r9   arg5
30120 + * r11  eflags for syscall/sysret, temporary for C
30121 + * r12-r15,rbp,rbx saved by C code, not touched.               
30122 + * 
30123 + * Interrupts are off on entry.
30124 + * Only called from user space.
30125 + *
30126 + * XXX if we had a free scratch register we could save the RSP into the stack frame
30127 + *      and report it properly in ps. Unfortunately we have none.
30128 + *
30129 + * When the user can change the frames, always force IRET. That is because
30130 + * it deals with non-canonical addresses better. SYSRET has trouble
30131 + * with them due to bugs in both AMD and Intel CPUs.
30132 + */                                    
30133 +
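Seen from the caller's side, the convention above is the ordinary x86-64 SYSCALL ABI. A minimal user-space sketch (hypothetical helper, not part of this patch) that issues a three-argument system call with that register layout:

	/* rax = number, rdi/rsi/rdx = args 0-2; arg3 would go in r10, not rcx,
	 * because SYSCALL clobbers rcx (return address) and r11 (rflags). */
	static long raw_syscall3(long nr, long a0, long a1, long a2)
	{
		long ret;

		asm volatile ("syscall"
			      : "=a" (ret)
			      : "a" (nr), "D" (a0), "S" (a1), "d" (a2)
			      : "rcx", "r11", "memory");
		return ret;
	}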
30134 +ENTRY(system_call)
30135 +       CFI_STARTPROC   simple
30136 +       CFI_DEF_CFA     rsp,0
30137 +       CFI_REGISTER    rip,rcx
30138 +       /*CFI_REGISTER  rflags,r11*/
30139 +       SAVE_ARGS -8,0
30140 +       movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
30141 +        XEN_UNBLOCK_EVENTS(%r11)        
30142 +       GET_THREAD_INFO(%rcx)
30143 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
30144 +       CFI_REMEMBER_STATE
30145 +       jnz tracesys
30146 +       cmpq $__NR_syscall_max,%rax
30147 +       ja badsys
30148 +       movq %r10,%rcx
30149 +       call *sys_call_table(,%rax,8)  # XXX:    rip relative
30150 +       movq %rax,RAX-ARGOFFSET(%rsp)
30151 +/*
30152 + * Syscall return path ending with SYSRET (fast path)
30153 + * Has incomplete stack frame and undefined top of stack. 
30154 + */            
30155 +       .globl ret_from_sys_call
30156 +ret_from_sys_call:
30157 +       movl $_TIF_ALLWORK_MASK,%edi
30158 +       /* edi: flagmask */
30159 +sysret_check:          
30160 +       GET_THREAD_INFO(%rcx)
30161 +        XEN_BLOCK_EVENTS(%rsi)        
30162 +       movl threadinfo_flags(%rcx),%edx
30163 +       andl %edi,%edx
30164 +       CFI_REMEMBER_STATE
30165 +       jnz  sysret_careful 
30166 +        XEN_UNBLOCK_EVENTS(%rsi)                
30167 +       CFI_REGISTER    rip,rcx
30168 +       RESTORE_ARGS 0,8,0
30169 +       /*CFI_REGISTER  rflags,r11*/
30170 +        HYPERVISOR_IRET VGCF_IN_SYSCALL
30171 +
30172 +       /* Handle reschedules */
30173 +       /* edx: work, edi: workmask */  
30174 +sysret_careful:
30175 +       CFI_RESTORE_STATE
30176 +       bt $TIF_NEED_RESCHED,%edx
30177 +       jnc sysret_signal
30178 +        XEN_BLOCK_EVENTS(%rsi)        
30179 +       pushq %rdi
30180 +       CFI_ADJUST_CFA_OFFSET 8
30181 +       call schedule
30182 +       popq  %rdi
30183 +       CFI_ADJUST_CFA_OFFSET -8
30184 +       jmp sysret_check
30185 +
30186 +       /* Handle a signal */ 
30187 +sysret_signal:
30188 +/*     sti */
30189 +        XEN_UNBLOCK_EVENTS(%rsi)        
30190 +       testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
30191 +       jz    1f
30192 +
30193 +       /* Really a signal */
30194 +       /* edx: work flags (arg3) */
30195 +       leaq do_notify_resume(%rip),%rax
30196 +       leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
30197 +       xorl %esi,%esi # oldset -> arg2
30198 +       call ptregscall_common
30199 +1:     movl $_TIF_NEED_RESCHED,%edi
30200 +       /* Use IRET because the user could have changed the frame. This
30201 +          works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
30202 +       cli
30203 +       jmp int_with_check
30204 +       
30205 +badsys:
30206 +       movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
30207 +       jmp ret_from_sys_call
30208 +
30209 +       /* Do syscall tracing */
30210 +tracesys:                       
30211 +       CFI_RESTORE_STATE
30212 +       SAVE_REST
30213 +       movq $-ENOSYS,RAX(%rsp)
30214 +       FIXUP_TOP_OF_STACK %rdi
30215 +       movq %rsp,%rdi
30216 +       call syscall_trace_enter
30217 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
30218 +       RESTORE_REST
30219 +       cmpq $__NR_syscall_max,%rax
30220 +       ja  1f
30221 +       movq %r10,%rcx  /* fixup for C */
30222 +       call *sys_call_table(,%rax,8)
30223 +       movq %rax,RAX-ARGOFFSET(%rsp)
30224 +1:     SAVE_REST
30225 +       movq %rsp,%rdi
30226 +       call syscall_trace_leave
30227 +       RESTORE_TOP_OF_STACK %rbx
30228 +       RESTORE_REST
30229 +       /* Use IRET because the user could have changed the frame */
30230 +       jmp int_ret_from_sys_call
30231 +       CFI_ENDPROC
30232 +               
30233 +/* 
30234 + * Syscall return path ending with IRET.
30235 + * Has correct top of stack, but partial stack frame.
30236 + */    
30237 +ENTRY(int_ret_from_sys_call)
30238 +       CFI_STARTPROC   simple
30239 +       CFI_DEF_CFA     rsp,SS+8-ARGOFFSET
30240 +       /*CFI_REL_OFFSET        ss,SS-ARGOFFSET*/
30241 +       CFI_REL_OFFSET  rsp,RSP-ARGOFFSET
30242 +       /*CFI_REL_OFFSET        rflags,EFLAGS-ARGOFFSET*/
30243 +       /*CFI_REL_OFFSET        cs,CS-ARGOFFSET*/
30244 +       CFI_REL_OFFSET  rip,RIP-ARGOFFSET
30245 +       CFI_REL_OFFSET  rdx,RDX-ARGOFFSET
30246 +       CFI_REL_OFFSET  rcx,RCX-ARGOFFSET
30247 +       CFI_REL_OFFSET  rax,RAX-ARGOFFSET
30248 +       CFI_REL_OFFSET  rdi,RDI-ARGOFFSET
30249 +       CFI_REL_OFFSET  rsi,RSI-ARGOFFSET
30250 +       CFI_REL_OFFSET  r8,R8-ARGOFFSET
30251 +       CFI_REL_OFFSET  r9,R9-ARGOFFSET
30252 +       CFI_REL_OFFSET  r10,R10-ARGOFFSET
30253 +       CFI_REL_OFFSET  r11,R11-ARGOFFSET
30254 +        XEN_BLOCK_EVENTS(%rsi)
30255 +       testb $3,CS-ARGOFFSET(%rsp)
30256 +        jnz 1f
30257 +        /* Need to set the proper %ss (not NULL) for ring 3 iretq */
30258 +        movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
30259 +        jmp retint_restore_args   # return from ring-3 kernel
30260 +1:              
30261 +       movl $_TIF_ALLWORK_MASK,%edi
30262 +       /* edi: mask to check */
30263 +int_with_check:
30264 +       GET_THREAD_INFO(%rcx)
30265 +       movl threadinfo_flags(%rcx),%edx
30266 +       andl %edi,%edx
30267 +       jnz   int_careful
30268 +       andl    $~TS_COMPAT,threadinfo_status(%rcx)
30269 +       jmp   retint_restore_args
30270 +
30271 +       /* Either reschedule or signal or syscall exit tracking needed. */
30272 +       /* First do a reschedule test. */
30273 +       /* edx: work, edi: workmask */
30274 +int_careful:
30275 +       bt $TIF_NEED_RESCHED,%edx
30276 +       jnc  int_very_careful
30277 +/*     sti */
30278 +        XEN_UNBLOCK_EVENTS(%rsi)
30279 +       pushq %rdi
30280 +       CFI_ADJUST_CFA_OFFSET 8
30281 +       call schedule
30282 +       popq %rdi
30283 +       CFI_ADJUST_CFA_OFFSET -8
30284 +       cli
30285 +       jmp int_with_check
30286 +
30287 +       /* handle signals and tracing -- both require a full stack frame */
30288 +int_very_careful:
30289 +/*     sti */
30290 +        XEN_UNBLOCK_EVENTS(%rsi)
30291 +       SAVE_REST
30292 +       /* Check for syscall exit trace */      
30293 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
30294 +       jz int_signal
30295 +       pushq %rdi
30296 +       CFI_ADJUST_CFA_OFFSET 8
30297 +       leaq 8(%rsp),%rdi       # &ptregs -> arg1       
30298 +       call syscall_trace_leave
30299 +       popq %rdi
30300 +       CFI_ADJUST_CFA_OFFSET -8
30301 +       andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
30302 +       cli
30303 +       jmp int_restore_rest
30304 +       
30305 +int_signal:
30306 +       testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
30307 +       jz 1f
30308 +       movq %rsp,%rdi          # &ptregs -> arg1
30309 +       xorl %esi,%esi          # oldset -> arg2
30310 +       call do_notify_resume
30311 +1:     movl $_TIF_NEED_RESCHED,%edi    
30312 +int_restore_rest:
30313 +       RESTORE_REST
30314 +       cli
30315 +       jmp int_with_check
30316 +       CFI_ENDPROC
30317 +               
30318 +/* 
30319 + * Certain special system calls need to save a full stack frame.
30320 + */                                                            
30321 +       
30322 +       .macro PTREGSCALL label,func,arg
30323 +       .globl \label
30324 +\label:
30325 +       leaq    \func(%rip),%rax
30326 +       leaq    -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
30327 +       jmp     ptregscall_common
30328 +       .endm
30329 +
30330 +       CFI_STARTPROC
30331 +
30332 +       PTREGSCALL stub_clone, sys_clone, %r8
30333 +       PTREGSCALL stub_fork, sys_fork, %rdi
30334 +       PTREGSCALL stub_vfork, sys_vfork, %rdi
30335 +       PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
30336 +       PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
30337 +       PTREGSCALL stub_iopl, sys_iopl, %rsi
30338 +
30339 +ENTRY(ptregscall_common)
30340 +       popq %r11
30341 +       CFI_ADJUST_CFA_OFFSET -8
30342 +       CFI_REGISTER rip, r11
30343 +       SAVE_REST
30344 +       movq %r11, %r15
30345 +       CFI_REGISTER rip, r15
30346 +       FIXUP_TOP_OF_STACK %r11
30347 +       call *%rax
30348 +       RESTORE_TOP_OF_STACK %r11
30349 +       movq %r15, %r11
30350 +       CFI_REGISTER rip, r11
30351 +       RESTORE_REST
30352 +       pushq %r11
30353 +       CFI_ADJUST_CFA_OFFSET 8
30354 +       CFI_REL_OFFSET rip, 0
30355 +       ret
30356 +       CFI_ENDPROC
30357 +       
30358 +ENTRY(stub_execve)
30359 +       CFI_STARTPROC
30360 +       popq %r11
30361 +       CFI_ADJUST_CFA_OFFSET -8
30362 +       CFI_REGISTER rip, r11
30363 +       SAVE_REST
30364 +       FIXUP_TOP_OF_STACK %r11
30365 +       call sys_execve
30366 +       RESTORE_TOP_OF_STACK %r11
30367 +       movq %rax,RAX(%rsp)
30368 +       RESTORE_REST
30369 +       jmp int_ret_from_sys_call
30370 +       CFI_ENDPROC
30371 +       
30372 +/*
30373 + * sigreturn is special because it needs to restore all registers on return.
30374 + * This cannot be done with SYSRET, so use the IRET return path instead.
30375 + */                
30376 +ENTRY(stub_rt_sigreturn)
30377 +       CFI_STARTPROC
30378 +       addq $8, %rsp
30379 +       CFI_ADJUST_CFA_OFFSET   -8
30380 +       SAVE_REST
30381 +       movq %rsp,%rdi
30382 +       FIXUP_TOP_OF_STACK %r11
30383 +       call sys_rt_sigreturn
30384 +       movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
30385 +       RESTORE_REST
30386 +       jmp int_ret_from_sys_call
30387 +       CFI_ENDPROC
30388 +
30389 +/*
30390 + * initial frame state for interrupts and exceptions
30391 + */
30392 +       .macro _frame ref
30393 +       CFI_STARTPROC simple
30394 +       CFI_DEF_CFA rsp,SS+8-\ref
30395 +       /*CFI_REL_OFFSET ss,SS-\ref*/
30396 +       CFI_REL_OFFSET rsp,RSP-\ref
30397 +       /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
30398 +       /*CFI_REL_OFFSET cs,CS-\ref*/
30399 +       CFI_REL_OFFSET rip,RIP-\ref
30400 +       .endm
30401 +
30402 +/* initial frame state for interrupts (and exceptions without error code) */
30403 +#define INTR_FRAME _frame RIP
30404 +/* initial frame state for exceptions with error code (and interrupts with
30405 +   vector already pushed) */
30406 +#define XCPT_FRAME _frame ORIG_RAX
30407 +
30408 +/* 
30409 + * Interrupt exit.
30410 + *
30411 + */ 
30412 +
30413 +retint_check:
30414 +       movl threadinfo_flags(%rcx),%edx
30415 +       andl %edi,%edx
30416 +       CFI_REMEMBER_STATE
30417 +       jnz  retint_careful
30418 +retint_restore_args:
30419 +       movl EFLAGS-REST_SKIP(%rsp), %eax
30420 +       shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
30421 +       XEN_GET_VCPU_INFO(%rsi)
30422 +       andb evtchn_upcall_mask(%rsi),%al
30423 +       andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
30424 +       jnz restore_all_enable_events   #        != 0 => enable event delivery
30425 +       XEN_PUT_VCPU_INFO(%rsi)
30426 +               
30427 +       RESTORE_ARGS 0,8,0
30428 +       HYPERVISOR_IRET 0
30429 +       
30430 +       /* edi: workmask, edx: work */
30431 +retint_careful:
30432 +       CFI_RESTORE_STATE
30433 +       bt    $TIF_NEED_RESCHED,%edx
30434 +       jnc   retint_signal
30435 +       XEN_UNBLOCK_EVENTS(%rsi)
30436 +/*     sti */        
30437 +       pushq %rdi
30438 +       CFI_ADJUST_CFA_OFFSET   8
30439 +       call  schedule
30440 +       popq %rdi               
30441 +       CFI_ADJUST_CFA_OFFSET   -8
30442 +       XEN_BLOCK_EVENTS(%rsi)          
30443 +       GET_THREAD_INFO(%rcx)
30444 +/*     cli */
30445 +       jmp retint_check
30446 +       
30447 +retint_signal:
30448 +       testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
30449 +       jz    retint_restore_args
30450 +        XEN_UNBLOCK_EVENTS(%rsi)
30451 +       SAVE_REST
30452 +       movq $-1,ORIG_RAX(%rsp)                         
30453 +       xorl %esi,%esi          # oldset
30454 +       movq %rsp,%rdi          # &pt_regs
30455 +       call do_notify_resume
30456 +       RESTORE_REST
30457 +        XEN_BLOCK_EVENTS(%rsi)         
30458 +       movl $_TIF_NEED_RESCHED,%edi
30459 +       GET_THREAD_INFO(%rcx)
30460 +       jmp retint_check
30461 +
30462 +#ifdef CONFIG_PREEMPT
30463 +       /* Returning to kernel space. Check if we need preemption */
30464 +       /* rcx:  threadinfo. interrupts off. */
30465 +       .p2align
30466 +retint_kernel: 
30467 +       cmpl $0,threadinfo_preempt_count(%rcx)
30468 +       jnz  retint_restore_args
30469 +       bt  $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
30470 +       jnc  retint_restore_args
30471 +       bt   $9,EFLAGS-ARGOFFSET(%rsp)  /* interrupts off? */
30472 +       jnc  retint_restore_args
30473 +       call preempt_schedule_irq
30474 +       jmp retint_kernel       /* check again */
30475 +#endif 
30476 +       CFI_ENDPROC
30477 +       
30478 +/*
30479 + * APIC interrupts.
30480 + */            
30481 +       .macro apicinterrupt num,func
30482 +       INTR_FRAME
30483 +       pushq $~(\num)
30484 +       CFI_ADJUST_CFA_OFFSET 8
30485 +       interrupt \func
30486 +       jmp error_entry
30487 +       CFI_ENDPROC
30488 +       .endm
30489 +
30490 +#ifndef CONFIG_XEN
30491 +ENTRY(thermal_interrupt)
30492 +       apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
30493 +
30494 +ENTRY(threshold_interrupt)
30495 +       apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
30496 +
30497 +#ifdef CONFIG_SMP      
30498 +ENTRY(reschedule_interrupt)
30499 +       apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
30500 +
30501 +       .macro INVALIDATE_ENTRY num
30502 +ENTRY(invalidate_interrupt\num)
30503 +       apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt 
30504 +       .endm
30505 +
30506 +       INVALIDATE_ENTRY 0
30507 +       INVALIDATE_ENTRY 1
30508 +       INVALIDATE_ENTRY 2
30509 +       INVALIDATE_ENTRY 3
30510 +       INVALIDATE_ENTRY 4
30511 +       INVALIDATE_ENTRY 5
30512 +       INVALIDATE_ENTRY 6
30513 +       INVALIDATE_ENTRY 7
30514 +
30515 +ENTRY(call_function_interrupt)
30516 +       apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
30517 +#endif
30518 +
30519 +#ifdef CONFIG_X86_LOCAL_APIC   
30520 +ENTRY(apic_timer_interrupt)
30521 +       apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
30522 +
30523 +ENTRY(error_interrupt)
30524 +       apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
30525 +
30526 +ENTRY(spurious_interrupt)
30527 +       apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
30528 +#endif
30529 +#endif /* !CONFIG_XEN */
30530 +                               
30531 +/*
30532 + * Exception entry points.
30533 + */            
30534 +       .macro zeroentry sym
30535 +       INTR_FRAME
30536 +        movq (%rsp),%rcx
30537 +        movq 8(%rsp),%r11
30538 +        addq $0x10,%rsp /* skip rcx and r11 */
30539 +       pushq $0        /* push error code/oldrax */ 
30540 +       CFI_ADJUST_CFA_OFFSET 8
30541 +       pushq %rax      /* push real oldrax to the rdi slot */ 
30542 +       CFI_ADJUST_CFA_OFFSET 8
30543 +       leaq  \sym(%rip),%rax
30544 +       jmp error_entry
30545 +       CFI_ENDPROC
30546 +       .endm   
30547 +
30548 +       .macro errorentry sym
30549 +       XCPT_FRAME
30550 +        movq (%rsp),%rcx
30551 +        movq 8(%rsp),%r11
30552 +        addq $0x10,%rsp /* rsp points to the error code */
30553 +       pushq %rax
30554 +       CFI_ADJUST_CFA_OFFSET 8
30555 +       leaq  \sym(%rip),%rax
30556 +       jmp error_entry
30557 +       CFI_ENDPROC
30558 +       .endm
30559 +
30560 +#if 0 /* not XEN */
30561 +       /* error code is on the stack already */
30562 +       /* handle NMI like exceptions that can happen everywhere */
30563 +       .macro paranoidentry sym, ist=0
30564 +        movq (%rsp),%rcx
30565 +        movq 8(%rsp),%r11
30566 +        addq $0x10,%rsp /* skip rcx and r11 */        
30567 +       SAVE_ALL
30568 +       cld
30569 +#if 0 /* not XEN */
30570 +       movl $1,%ebx
30571 +       movl  $MSR_GS_BASE,%ecx
30572 +       rdmsr
30573 +       testl %edx,%edx
30574 +       js    1f
30575 +       swapgs
30576 +       xorl  %ebx,%ebx
30577 +1:
30578 +#endif
30579 +       .if \ist
30580 +       movq    %gs:pda_data_offset, %rbp
30581 +       .endif
30582 +       movq %rsp,%rdi
30583 +       movq ORIG_RAX(%rsp),%rsi
30584 +       movq $-1,ORIG_RAX(%rsp)
30585 +       .if \ist
30586 +       subq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
30587 +       .endif
30588 +       call \sym
30589 +       .if \ist
30590 +       addq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
30591 +       .endif
30592 +/*     cli */
30593 +       XEN_BLOCK_EVENTS(%rsi)          
30594 +       .endm
30595 +#endif
30596 +       
30597 +/*
30598 + * Exception entry point. This expects an error code/orig_rax on the stack
30599 + * and the exception handler in %rax.  
30600 + */                                            
30601 +ENTRY(error_entry)
30602 +       _frame RDI
30603 +       /* rdi slot contains rax, oldrax contains error code */
30604 +       cld     
30605 +       subq  $14*8,%rsp
30606 +       CFI_ADJUST_CFA_OFFSET   (14*8)
30607 +       movq %rsi,13*8(%rsp)
30608 +       CFI_REL_OFFSET  rsi,RSI
30609 +       movq 14*8(%rsp),%rsi    /* load rax from rdi slot */
30610 +       movq %rdx,12*8(%rsp)
30611 +       CFI_REL_OFFSET  rdx,RDX
30612 +       movq %rcx,11*8(%rsp)
30613 +       CFI_REL_OFFSET  rcx,RCX
30614 +       movq %rsi,10*8(%rsp)    /* store rax */ 
30615 +       CFI_REL_OFFSET  rax,RAX
30616 +       movq %r8, 9*8(%rsp)
30617 +       CFI_REL_OFFSET  r8,R8
30618 +       movq %r9, 8*8(%rsp)
30619 +       CFI_REL_OFFSET  r9,R9
30620 +       movq %r10,7*8(%rsp)
30621 +       CFI_REL_OFFSET  r10,R10
30622 +       movq %r11,6*8(%rsp)
30623 +       CFI_REL_OFFSET  r11,R11
30624 +       movq %rbx,5*8(%rsp) 
30625 +       CFI_REL_OFFSET  rbx,RBX
30626 +       movq %rbp,4*8(%rsp) 
30627 +       CFI_REL_OFFSET  rbp,RBP
30628 +       movq %r12,3*8(%rsp) 
30629 +       CFI_REL_OFFSET  r12,R12
30630 +       movq %r13,2*8(%rsp) 
30631 +       CFI_REL_OFFSET  r13,R13
30632 +       movq %r14,1*8(%rsp) 
30633 +       CFI_REL_OFFSET  r14,R14
30634 +       movq %r15,(%rsp) 
30635 +       CFI_REL_OFFSET  r15,R15
30636 +#if 0        
30637 +       cmpl $__KERNEL_CS,CS(%rsp)
30638 +       je  error_kernelspace
30639 +#endif        
30640 +error_call_handler:
30641 +       movq %rdi, RDI(%rsp)            
30642 +       movq %rsp,%rdi
30643 +       movq ORIG_RAX(%rsp),%rsi        # get error code 
30644 +       movq $-1,ORIG_RAX(%rsp)
30645 +       call *%rax
30646 +error_exit:            
30647 +       RESTORE_REST
30648 +/*     cli */
30649 +       XEN_BLOCK_EVENTS(%rsi)          
30650 +       GET_THREAD_INFO(%rcx)   
30651 +       testb $3,CS-ARGOFFSET(%rsp)
30652 +       jz retint_kernel
30653 +       movl  threadinfo_flags(%rcx),%edx
30654 +       movl  $_TIF_WORK_MASK,%edi      
30655 +       andl  %edi,%edx
30656 +       jnz   retint_careful
30657 +       jmp   retint_restore_args
30658 +
30659 +error_kernelspace:
30660 +         /*
30661 +         * We need to rewrite the logic here because we don't do iretq
30662 +         * to return to user mode. It's still possible that we get a trap/fault
30663 +         * in the kernel (when accessing buffers pointed to by system calls,
30664 +         * for example).
30665 +         *
30666 +         */           
30667 +#if 0
30668 +       incl %ebx
30669 +       /* There are two places in the kernel that can potentially fault with
30670 +          usergs. Handle them here. The exception handlers after
30671 +          iret run with kernel gs again, so don't set the user space flag.
30672 +          B stepping K8s sometimes report a truncated RIP for IRET 
30673 +          exceptions returning to compat mode. Check for these here too. */
30674 +       leaq iret_label(%rip),%rbp
30675 +       cmpq %rbp,RIP(%rsp) 
30676 +       je   error_swapgs
30677 +       movl %ebp,%ebp  /* zero extend */
30678 +       cmpq %rbp,RIP(%rsp) 
30679 +       je   error_swapgs
30680 +       cmpq $gs_change,RIP(%rsp)
30681 +        je   error_swapgs
30682 +       jmp  error_sti
30683 +#endif        
30684 +       
30685 +ENTRY(hypervisor_callback)
30686 +       zeroentry do_hypervisor_callback
30687 +        
30688 +/*
30689 + * Copied from arch/xen/i386/kernel/entry.S
30690 + */               
30691 +# A note on the "critical region" in our callback handler.
30692 +# We want to avoid stacking callback handlers due to events occurring
30693 +# during handling of the last event. To do this, we keep events disabled
30694 +# until we've done all processing. HOWEVER, we must enable events before
30695 +# popping the stack frame (can't be done atomically) and so it would still
30696 +# be possible to get enough handler activations to overflow the stack.
30697 +# Although unlikely, bugs of that kind are hard to track down, so we'd
30698 +# like to avoid the possibility.
30699 +# So, on entry to the handler we detect whether we interrupted an
30700 +# existing activation in its critical region -- if so, we pop the current
30701 +# activation and restart the handler using the previous one.
30702 +ENTRY(do_hypervisor_callback)   # do_hypervisor_callback(struct pt_regs *)
30703 +# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
30704 +# see the correct pointer to the pt_regs
30705 +       movq %rdi, %rsp            # we don't return, adjust the stack frame
30706 +11:    movq %gs:pda_irqstackptr,%rax
30707 +       incl %gs:pda_irqcount
30708 +       cmovzq %rax,%rsp
30709 +       pushq %rdi
30710 +       call evtchn_do_upcall
30711 +       popq %rsp
30712 +       decl %gs:pda_irqcount
30713 +       jmp  error_exit
30714 +
30715 +#ifdef CONFIG_X86_LOCAL_APIC
30716 +KPROBE_ENTRY(nmi)
30717 +       zeroentry do_nmi_callback
30718 +ENTRY(do_nmi_callback)
30719 +        addq $8, %rsp
30720 +        call do_nmi
30721 +        orl  $NMI_MASK,EFLAGS(%rsp)
30722 +        RESTORE_REST
30723 +        XEN_BLOCK_EVENTS(%rsi)
30724 +        GET_THREAD_INFO(%rcx)
30725 +        jmp  retint_restore_args
30726 +       .previous .text
30727 +#endif
30728 +
30729 +        ALIGN
30730 +restore_all_enable_events:  
30731 +       XEN_UNBLOCK_EVENTS(%rsi)        # %rsi is already set up...
30732 +
30733 +scrit: /**** START OF CRITICAL REGION ****/
30734 +       XEN_TEST_PENDING(%rsi)
30735 +       jnz  14f                        # process more events if necessary...
30736 +       XEN_PUT_VCPU_INFO(%rsi)
30737 +        RESTORE_ARGS 0,8,0
30738 +        HYPERVISOR_IRET 0
30739 +        
30740 +14:    XEN_LOCKED_BLOCK_EVENTS(%rsi)
30741 +       XEN_PUT_VCPU_INFO(%rsi)
30742 +       SAVE_REST
30743 +        movq %rsp,%rdi                  # set the argument again
30744 +       jmp  11b
30745 +ecrit:  /**** END OF CRITICAL REGION ****/
30746 +# At this point, unlike on x86-32, we don't do the fixup, both to simplify the
30747 +# code and because the stack frame is more complex on x86-64.
30748 +# When the kernel is interrupted in the critical section, it will simply do an
30749 +# IRET, and everything will be restored at that point, i.e. execution just
30750 +# resumes at the interrupted instruction with the same context.
30751 +
30752 +# Hypervisor uses this for application faults while it executes.
30753 +# We get here for two reasons:
30754 +#  1. Fault while reloading DS, ES, FS or GS
30755 +#  2. Fault while executing IRET
30756 +# Category 1 we do not need to fix up as Xen has already reloaded all segment
30757 +# registers that could be reloaded and zeroed the others.
30758 +# Category 2 we fix up by killing the current process. We cannot use the
30759 +# normal Linux return path in this case because if we use the IRET hypercall
30760 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
30761 +# We distinguish between categories by comparing each saved segment register
30762 +# with its current contents: any discrepancy means we are in category 1.
30763 +ENTRY(failsafe_callback)
30764 +       movw %ds,%cx
30765 +       cmpw %cx,0x10(%rsp)
30766 +       jne 1f
30767 +       movw %es,%cx
30768 +       cmpw %cx,0x18(%rsp)
30769 +       jne 1f
30770 +       movw %fs,%cx
30771 +       cmpw %cx,0x20(%rsp)
30772 +       jne 1f
30773 +       movw %gs,%cx
30774 +       cmpw %cx,0x28(%rsp)
30775 +       jne 1f
30776 +       /* All segments match their saved values => Category 2 (Bad IRET). */
30777 +       movq (%rsp),%rcx
30778 +       movq 8(%rsp),%r11
30779 +       addq $0x30,%rsp
30780 +       movq $-9999,%rdi        /* better code? */
30781 +       jmp do_exit                     
30782 +1:     /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
30783 +       movq (%rsp),%rcx
30784 +       movq 8(%rsp),%r11
30785 +       addq $0x30,%rsp
30786 +       pushq $0
30787 +       SAVE_ALL
30788 +       jmp error_exit
30789 +#if 0        
30790 +        .section __ex_table,"a"
30791 +        .align 8
30792 +        .quad gs_change,bad_gs
30793 +        .previous
30794 +        .section .fixup,"ax"
30795 +       /* running with kernelgs */
30796 +bad_gs: 
30797 +/*     swapgs          */      /* switch back to user gs */
30798 +       xorl %eax,%eax
30799 +        movl %eax,%gs
30800 +        jmp  2b
30801 +        .previous       
30802 +#endif
30803 +       
30804 +/*
30805 + * Create a kernel thread.
30806 + *
30807 + * C extern interface:
30808 + *     extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
30809 + *
30810 + * asm input arguments:
30811 + *     rdi: fn, rsi: arg, rdx: flags
30812 + */
30813 +ENTRY(kernel_thread)
30814 +       CFI_STARTPROC
30815 +       FAKE_STACK_FRAME $child_rip
30816 +       SAVE_ALL
30817 +
30818 +       # rdi: flags, rsi: usp, rdx: will be &pt_regs
30819 +       movq %rdx,%rdi
30820 +       orq  kernel_thread_flags(%rip),%rdi
30821 +       movq $-1, %rsi
30822 +       movq %rsp, %rdx
30823 +
30824 +       xorl %r8d,%r8d
30825 +       xorl %r9d,%r9d
30826 +       
30827 +       # clone now
30828 +       call do_fork
30829 +       movq %rax,RAX(%rsp)
30830 +       xorl %edi,%edi
30831 +
30832 +       /*
30833 +        * It isn't worth checking for a reschedule here,
30834 +        * so internally to the x86_64 port you can rely on kernel_thread()
30835 +        * not rescheduling the child before returning; this avoids the need
30836 +        * for hacks, for example to fork off the per-CPU idle tasks.
30837 +         * [Hopefully no generic code relies on the reschedule -AK]    
30838 +        */
30839 +       RESTORE_ALL
30840 +       UNFAKE_STACK_FRAME
30841 +       ret
30842 +       CFI_ENDPROC
30843 +
30844 +       
30845 +child_rip:
30846 +       /*
30847 +        * Here we are in the child and the registers are set as they were
30848 +        * at kernel_thread() invocation in the parent.
30849 +        */
30850 +       movq %rdi, %rax
30851 +       movq %rsi, %rdi
30852 +       call *%rax
30853 +       # exit
30854 +       xorl %edi, %edi
30855 +       call do_exit
30856 +
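A hypothetical caller of the interface documented above (sketch only, not part of this patch; my_worker and spawn_worker are made-up names, and the clone flags are merely the kind callers typically pass):

	static int my_worker(void *arg)
	{
		/* ... thread body ... */
		return 0;
	}

	static void spawn_worker(void *arg)
	{
		long pid = kernel_thread(my_worker, arg,
					 CLONE_FS | CLONE_FILES | SIGCHLD);
		if (pid < 0)
			printk(KERN_ERR "kernel_thread failed: %ld\n", pid);
	}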
30857 +/*
30858 + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
30859 + *
30860 + * C extern interface:
30861 + *      extern long execve(char *name, char **argv, char **envp)
30862 + *
30863 + * asm input arguments:
30864 + *     rdi: name, rsi: argv, rdx: envp
30865 + *
30866 + * We want to fall back into:
30867 + *     extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
30868 + *
30869 + * do_sys_execve asm fallback arguments:
30870 + *     rdi: name, rsi: argv, rdx: envp, fake frame on the stack
30871 + */
30872 +ENTRY(execve)
30873 +       CFI_STARTPROC
30874 +       FAKE_STACK_FRAME $0
30875 +       SAVE_ALL        
30876 +       call sys_execve
30877 +       movq %rax, RAX(%rsp)    
30878 +       RESTORE_REST
30879 +       testq %rax,%rax
30880 +       jne 1f
30881 +        jmp int_ret_from_sys_call
30882 +1:      RESTORE_ARGS
30883 +       UNFAKE_STACK_FRAME
30884 +       ret
30885 +       CFI_ENDPROC
30886 +
30887 +KPROBE_ENTRY(page_fault)
30888 +       errorentry do_page_fault
30889 +       .previous .text
30890 +
30891 +ENTRY(coprocessor_error)
30892 +       zeroentry do_coprocessor_error
30893 +
30894 +ENTRY(simd_coprocessor_error)
30895 +       zeroentry do_simd_coprocessor_error     
30896 +
30897 +ENTRY(device_not_available)
30898 +       zeroentry math_state_restore
30899 +
30900 +       /* runs on exception stack */
30901 +KPROBE_ENTRY(debug)
30902 +       INTR_FRAME
30903 +/*     pushq $0
30904 +       CFI_ADJUST_CFA_OFFSET 8 */
30905 +       zeroentry do_debug
30906 +/*     jmp paranoid_exit */
30907 +       CFI_ENDPROC
30908 +       .previous .text
30909 +
30910 +#if 0
30911 +       /* runs on exception stack */   
30912 +KPROBE_ENTRY(nmi)
30913 +       INTR_FRAME
30914 +       pushq $-1
30915 +       CFI_ADJUST_CFA_OFFSET 8
30916 +       paranoidentry do_nmi
30917 +       /*
30918 +        * "Paranoid" exit path from exception stack.
30919 +        * Paranoid because this is used by NMIs and cannot take
30920 +        * any kernel state for granted.
30921 +        * We don't do kernel preemption checks here, because only
30922 +        * NMI should be common and it does not enable IRQs and
30923 +        * cannot get reschedule ticks.
30924 +        */
30925 +       /* ebx: no swapgs flag */
30926 +paranoid_exit:
30927 +       testl %ebx,%ebx                         /* swapgs needed? */
30928 +       jnz paranoid_restore
30929 +       testl $3,CS(%rsp)
30930 +       jnz   paranoid_userspace
30931 +paranoid_swapgs:       
30932 +       swapgs
30933 +paranoid_restore:      
30934 +       RESTORE_ALL 8
30935 +       iretq
30936 +paranoid_userspace:    
30937 +       GET_THREAD_INFO(%rcx)
30938 +       movl threadinfo_flags(%rcx),%ebx
30939 +       andl $_TIF_WORK_MASK,%ebx
30940 +       jz paranoid_swapgs
30941 +       movq %rsp,%rdi                  /* &pt_regs */
30942 +       call sync_regs
30943 +       movq %rax,%rsp                  /* switch stack for scheduling */
30944 +       testl $_TIF_NEED_RESCHED,%ebx
30945 +       jnz paranoid_schedule
30946 +       movl %ebx,%edx                  /* arg3: thread flags */
30947 +       sti
30948 +       xorl %esi,%esi                  /* arg2: oldset */
30949 +       movq %rsp,%rdi                  /* arg1: &pt_regs */
30950 +       call do_notify_resume
30951 +       cli
30952 +       jmp paranoid_userspace
30953 +paranoid_schedule:
30954 +       sti
30955 +       call schedule
30956 +       cli
30957 +       jmp paranoid_userspace
30958 +       CFI_ENDPROC
30959 +       .previous .text
30960 +#endif        
30961 +
30962 +KPROBE_ENTRY(int3)
30963 +       INTR_FRAME
30964 +/*     pushq $0
30965 +       CFI_ADJUST_CFA_OFFSET 8 */
30966 +       zeroentry do_int3
30967 +/*     jmp paranoid_exit */
30968 +       CFI_ENDPROC
30969 +       .previous .text
30970 +
30971 +ENTRY(overflow)
30972 +       zeroentry do_overflow
30973 +
30974 +ENTRY(bounds)
30975 +       zeroentry do_bounds
30976 +
30977 +ENTRY(invalid_op)
30978 +       zeroentry do_invalid_op 
30979 +
30980 +ENTRY(coprocessor_segment_overrun)
30981 +       zeroentry do_coprocessor_segment_overrun
30982 +
30983 +ENTRY(reserved)
30984 +       zeroentry do_reserved
30985 +
30986 +#if 0
30987 +       /* runs on exception stack */
30988 +ENTRY(double_fault)
30989 +       XCPT_FRAME
30990 +       paranoidentry do_double_fault
30991 +       jmp paranoid_exit
30992 +       CFI_ENDPROC
30993 +#endif
30994 +
30995 +ENTRY(invalid_TSS)
30996 +       errorentry do_invalid_TSS
30997 +
30998 +ENTRY(segment_not_present)
30999 +       errorentry do_segment_not_present
31000 +
31001 +       /* runs on exception stack */
31002 +ENTRY(stack_segment)
31003 +       XCPT_FRAME
31004 +       errorentry do_stack_segment
31005 +       CFI_ENDPROC
31006 +
31007 +KPROBE_ENTRY(general_protection)
31008 +       errorentry do_general_protection
31009 +       .previous .text
31010 +
31011 +ENTRY(alignment_check)
31012 +       errorentry do_alignment_check
31013 +
31014 +ENTRY(divide_error)
31015 +       zeroentry do_divide_error
31016 +
31017 +ENTRY(spurious_interrupt_bug)
31018 +       zeroentry do_spurious_interrupt_bug
31019 +
31020 +#ifdef CONFIG_X86_MCE
31021 +       /* runs on exception stack */
31022 +ENTRY(machine_check)
31023 +       INTR_FRAME
31024 +       pushq $0
31025 +       CFI_ADJUST_CFA_OFFSET 8 
31026 +       paranoidentry do_machine_check
31027 +       jmp paranoid_exit
31028 +       CFI_ENDPROC
31029 +#endif
31030 +
31031 +ENTRY(call_softirq)
31032 +       CFI_STARTPROC
31033 +       movq %gs:pda_irqstackptr,%rax
31034 +       movq %rsp,%rdx
31035 +       CFI_DEF_CFA_REGISTER    rdx
31036 +       incl %gs:pda_irqcount
31037 +       cmove %rax,%rsp
31038 +       pushq %rdx
31039 +       /*todo CFI_DEF_CFA_EXPRESSION ...*/
31040 +       call __do_softirq
31041 +       popq %rsp
31042 +       CFI_DEF_CFA_REGISTER    rsp
31043 +       decl %gs:pda_irqcount
31044 +       ret
31045 +       CFI_ENDPROC
31046 diff -urNp linux-2.6/arch/x86_64/kernel/genapic_xen.c new/arch/x86_64/kernel/genapic_xen.c
31047 --- linux-2.6/arch/x86_64/kernel/genapic_xen.c  1970-01-01 01:00:00.000000000 +0100
31048 +++ new/arch/x86_64/kernel/genapic_xen.c        2006-05-09 12:33:18.000000000 +0200
31049 @@ -0,0 +1,162 @@
31050 +/*
31051 + * Copyright 2004 James Cleverdon, IBM.
31052 + * Subject to the GNU Public License, v.2
31053 + *
31054 + * Xen APIC subarch code.  Maximum 8 CPUs, logical delivery.
31055 + *
31056 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
31057 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
31058 + * James Cleverdon.
31059 + *
31060 + * Hacked to pieces for Xen by Chris Wright.
31061 + */
31062 +#include <linux/config.h>
31063 +#include <linux/threads.h>
31064 +#include <linux/cpumask.h>
31065 +#include <linux/string.h>
31066 +#include <linux/kernel.h>
31067 +#include <linux/ctype.h>
31068 +#include <linux/init.h>
31069 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
31070 +#include <asm/smp.h>
31071 +#include <asm/ipi.h>
31072 +#else
31073 +#include <asm/apic.h>
31074 +#include <asm/apicdef.h>
31075 +#include <asm/genapic.h>
31076 +#endif
31077 +#include <xen/evtchn.h>
31078 +
31079 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
31080 +
31081 +static inline void __send_IPI_one(unsigned int cpu, int vector)
31082 +{
31083 +       int irq = per_cpu(ipi_to_irq, cpu)[vector];
31084 +       BUG_ON(irq < 0);
31085 +       notify_remote_via_irq(irq);
31086 +}
31087 +
31088 +void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
31089 +{
31090 +       int cpu;
31091 +
31092 +       switch (shortcut) {
31093 +       case APIC_DEST_SELF:
31094 +               __send_IPI_one(smp_processor_id(), vector);
31095 +               break;
31096 +       case APIC_DEST_ALLBUT:
31097 +               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
31098 +                       if (cpu == smp_processor_id())
31099 +                               continue;
31100 +                       if (cpu_isset(cpu, cpu_online_map)) {
31101 +                               __send_IPI_one(cpu, vector);
31102 +                       }
31103 +               }
31104 +               break;
31105 +       case APIC_DEST_ALLINC:
31106 +               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
31107 +                       if (cpu_isset(cpu, cpu_online_map)) {
31108 +                               __send_IPI_one(cpu, vector);
31109 +                       }
31110 +               }
31111 +               break;
31112 +       default:
31113 +               printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
31114 +                      vector);
31115 +               break;
31116 +       }
31117 +}
31118 +
31119 +static cpumask_t xen_target_cpus(void)
31120 +{
31121 +       return cpu_online_map;
31122 +}
31123 +
31124 +/*
31125 + * Set up the logical destination ID.
31126 + * Do nothing, not called now.
31127 + */
31128 +static void xen_init_apic_ldr(void)
31129 +{
31130 +       Dprintk("%s\n", __FUNCTION__);
31131 +       return;
31132 +}
31133 +
31134 +static void xen_send_IPI_allbutself(int vector)
31135 +{
31136 +       /*
31137 +        * If there are no other CPUs in the system then
31138 +        * we get an APIC send error if we try to broadcast,
31139 +        * thus we have to avoid sending IPIs in this case.
31140 +        */
31141 +       Dprintk("%s\n", __FUNCTION__);
31142 +       if (num_online_cpus() > 1)
31143 +               xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
31144 +}
31145 +
31146 +static void xen_send_IPI_all(int vector)
31147 +{
31148 +       Dprintk("%s\n", __FUNCTION__);
31149 +       xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
31150 +}
31151 +
31152 +static void xen_send_IPI_mask(cpumask_t cpumask, int vector)
31153 +{
31154 +       unsigned long mask = cpus_addr(cpumask)[0];
31155 +       unsigned int cpu;
31156 +       unsigned long flags;
31157 +
31158 +       Dprintk("%s\n", __FUNCTION__);
31159 +       local_irq_save(flags);
31160 +       WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
31161 +
31162 +       for (cpu = 0; cpu < NR_CPUS; ++cpu) {
31163 +               if (cpu_isset(cpu, cpumask)) {
31164 +                       __send_IPI_one(cpu, vector);
31165 +               }
31166 +       }
31167 +       local_irq_restore(flags);
31168 +}
31169 +
31170 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
31171 +static int xen_apic_id_registered(void)
31172 +{
31173 +       /* better be set */
31174 +       Dprintk("%s\n", __FUNCTION__);
31175 +       return physid_isset(smp_processor_id(), phys_cpu_present_map);
31176 +}
31177 +#endif
31178 +
31179 +static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
31180 +{
31181 +       Dprintk("%s\n", __FUNCTION__);
31182 +       return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
31183 +}
31184 +
31185 +static unsigned int phys_pkg_id(int index_msb)
31186 +{
31187 +       u32 ebx;
31188 +
31189 +       Dprintk("%s\n", __FUNCTION__);
31190 +       ebx = cpuid_ebx(1);
31191 +       return ((ebx >> 24) & 0xFF) >> index_msb;
31192 +}
31193 +
31194 +struct genapic apic_xen =  {
31195 +       .name = "xen",
31196 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
31197 +       .int_delivery_mode = dest_LowestPrio,
31198 +#endif
31199 +       .int_dest_mode = (APIC_DEST_LOGICAL != 0),
31200 +       .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
31201 +       .target_cpus = xen_target_cpus,
31202 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
31203 +       .apic_id_registered = xen_apic_id_registered,
31204 +#endif
31205 +       .init_apic_ldr = xen_init_apic_ldr,
31206 +       .send_IPI_all = xen_send_IPI_all,
31207 +       .send_IPI_allbutself = xen_send_IPI_allbutself,
31208 +       .send_IPI_mask = xen_send_IPI_mask,
31209 +       .cpu_mask_to_apicid = xen_cpu_mask_to_apicid,
31210 +       .phys_pkg_id = phys_pkg_id,
31211 +};
31212 diff -urNp linux-2.6/arch/x86_64/kernel/genapic-xen.c new/arch/x86_64/kernel/genapic-xen.c
31213 --- linux-2.6/arch/x86_64/kernel/genapic-xen.c  1970-01-01 01:00:00.000000000 +0100
31214 +++ new/arch/x86_64/kernel/genapic-xen.c        2006-05-09 12:33:18.000000000 +0200
31215 @@ -0,0 +1,144 @@
31216 +/*
31217 + * Copyright 2004 James Cleverdon, IBM.
31218 + * Subject to the GNU Public License, v.2
31219 + *
31220 + * Generic APIC sub-arch probe layer.
31221 + *
31222 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
31223 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
31224 + * James Cleverdon.
31225 + */
31226 +#include <linux/config.h>
31227 +#include <linux/threads.h>
31228 +#include <linux/cpumask.h>
31229 +#include <linux/string.h>
31230 +#include <linux/kernel.h>
31231 +#include <linux/ctype.h>
31232 +#include <linux/init.h>
31233 +#include <linux/module.h>
31234 +
31235 +#include <asm/smp.h>
31236 +#include <asm/ipi.h>
31237 +
31238 +#if defined(CONFIG_ACPI)
31239 +#include <acpi/acpi_bus.h>
31240 +#endif
31241 +
31242 +/* which logical CPU number maps to which CPU (physical APIC ID) */
31243 +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
31244 +EXPORT_SYMBOL(x86_cpu_to_apicid);
31245 +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
31246 +
31247 +extern struct genapic apic_cluster;
31248 +extern struct genapic apic_flat;
31249 +extern struct genapic apic_physflat;
31250 +
31251 +#ifndef CONFIG_XEN
31252 +struct genapic *genapic = &apic_flat;
31253 +#else
31254 +extern struct genapic apic_xen;
31255 +struct genapic *genapic = &apic_xen;
31256 +#endif
31257 +
31258 +
31259 +/*
31260 + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
31261 + */
31262 +void __init clustered_apic_check(void)
31263 +{
31264 +#ifndef CONFIG_XEN
31265 +       long i;
31266 +       u8 clusters, max_cluster;
31267 +       u8 id;
31268 +       u8 cluster_cnt[NUM_APIC_CLUSTERS];
31269 +       int max_apic = 0;
31270 +
31271 +#if defined(CONFIG_ACPI)
31272 +       /*
31273 +        * Some x86_64 machines use physical APIC mode regardless of how many
31274 +        * procs/clusters are present (x86_64 ES7000 is an example).
31275 +        */
31276 +       if (acpi_fadt.revision > FADT2_REVISION_ID)
31277 +               if (acpi_fadt.force_apic_physical_destination_mode) {
31278 +                       genapic = &apic_cluster;
31279 +                       goto print;
31280 +               }
31281 +#endif
31282 +
31283 +       memset(cluster_cnt, 0, sizeof(cluster_cnt));
31284 +       for (i = 0; i < NR_CPUS; i++) {
31285 +               id = bios_cpu_apicid[i];
31286 +               if (id == BAD_APICID)
31287 +                       continue;
31288 +               if (id > max_apic)
31289 +                       max_apic = id;
31290 +               cluster_cnt[APIC_CLUSTERID(id)]++;
31291 +       }
31292 +
31293 +       /* Don't use clustered mode on AMD platforms. */
31294 +       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
31295 +               genapic = &apic_physflat;
31296 +#ifndef CONFIG_HOTPLUG_CPU
31297 +               /* In the CPU hotplug case we cannot use broadcast mode
31298 +                  because that opens a race when a CPU is removed.
31299 +                  Stay at physflat mode in this case.
31300 +                  It is bad to do this unconditionally though. Once
31301 +                  we have ACPI platform support for CPU hotplug
31302 +                  we should detect hotplug capability from ACPI tables and
31303 +                  only do this when really needed. -AK */
31304 +               if (max_apic <= 8)
31305 +                       genapic = &apic_flat;
31306 +#endif
31307 +               goto print;
31308 +       }
31309 +
31310 +       clusters = 0;
31311 +       max_cluster = 0;
31312 +
31313 +       for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
31314 +               if (cluster_cnt[i] > 0) {
31315 +                       ++clusters;
31316 +                       if (cluster_cnt[i] > max_cluster)
31317 +                               max_cluster = cluster_cnt[i];
31318 +               }
31319 +       }
31320 +
31321 +       /*
31322 +        * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then use flat mode;
31323 +        * else if max_cluster <= 4 and cluster_cnt[15] == 0, use clustered logical;
31324 +        * else use physical mode.
31325 +        * (We don't use lowest priority delivery + HW APIC IRQ steering, so
31326 +        * can ignore the clustered logical case and go straight to physical.)
31327 +        */
31328 +       if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
31329 +#ifdef CONFIG_HOTPLUG_CPU
31330 +               /* Don't use APIC shortcuts in CPU hotplug to avoid races */
31331 +               genapic = &apic_physflat;
31332 +#else
31333 +               genapic = &apic_flat;
31334 +#endif
31335 +       } else
31336 +               genapic = &apic_cluster;
31337 +
31338 +print:
31339 +#else
31340 +       /* hardcode to xen apic functions */
31341 +       genapic = &apic_xen;
31342 +#endif
31343 +       printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
31344 +}
31345 +
31346 +/* Same for both flat and clustered. */
31347 +
31348 +#ifdef CONFIG_XEN
31349 +extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
31350 +#endif
31351 +
31352 +void send_IPI_self(int vector)
31353 +{
31354 +#ifndef CONFIG_XEN
31355 +       __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
31356 +#else
31357 +       xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
31358 +#endif
31359 +}
31360 diff -urNp linux-2.6/arch/x86_64/kernel/head64-xen.c new/arch/x86_64/kernel/head64-xen.c
31361 --- linux-2.6/arch/x86_64/kernel/head64-xen.c   1970-01-01 01:00:00.000000000 +0100
31362 +++ new/arch/x86_64/kernel/head64-xen.c 2006-06-28 14:32:14.000000000 +0200
31363 @@ -0,0 +1,159 @@
31364 +/*
31365 + *  linux/arch/x86_64/kernel/head64.c -- prepare to run common code
31366 + *
31367 + *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
31368 + *
31369 + *  $Id$
31370 + *
31371 + *  Jun Nakajima <jun.nakajima@intel.com>
31372 + *     Modified for Xen.
31373 + */
31374 +
31375 +#include <linux/init.h>
31376 +#include <linux/linkage.h>
31377 +#include <linux/types.h>
31378 +#include <linux/kernel.h>
31379 +#include <linux/string.h>
31380 +#include <linux/percpu.h>
31381 +#include <linux/module.h>
31382 +
31383 +#include <asm/processor.h>
31384 +#include <asm/proto.h>
31385 +#include <asm/smp.h>
31386 +#include <asm/bootsetup.h>
31387 +#include <asm/setup.h>
31388 +#include <asm/desc.h>
31389 +#include <asm/pgtable.h>
31390 +#include <asm/sections.h>
31391 +
31392 +unsigned long start_pfn;
31393 +
31394 +/* Don't add a printk in there. printk relies on the PDA which is not initialized 
31395 +   yet. */
31396 +#if 0
31397 +static void __init clear_bss(void)
31398 +{
31399 +       memset(__bss_start, 0,
31400 +              (unsigned long) __bss_stop - (unsigned long) __bss_start);
31401 +}
31402 +#endif
31403 +
31404 +#define NEW_CL_POINTER         0x228   /* Relative to real mode data */
31405 +#define OLD_CL_MAGIC_ADDR      0x90020
31406 +#define OLD_CL_MAGIC            0xA33F
31407 +#define OLD_CL_BASE_ADDR        0x90000
31408 +#define OLD_CL_OFFSET           0x90022
31409 +
31410 +extern char saved_command_line[];
31411 +
31412 +static void __init copy_bootdata(char *real_mode_data)
31413 +{
31414 +#ifndef CONFIG_XEN
31415 +       int new_data;
31416 +       char * command_line;
31417 +
31418 +       memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
31419 +       new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
31420 +       if (!new_data) {
31421 +               if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
31422 +                       printk("so old bootloader that it does not support commandline?!\n");
31423 +                       return;
31424 +               }
31425 +               new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
31426 +               printk("old bootloader convention, maybe loadlin?\n");
31427 +       }
31428 +       command_line = (char *) ((u64)(new_data));
31429 +       memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
31430 +#else
31431 +       int max_cmdline;
31432 +       
31433 +       if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
31434 +               max_cmdline = COMMAND_LINE_SIZE;
31435 +       memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
31436 +       saved_command_line[max_cmdline-1] = '\0';
31437 +#endif
31438 +       printk("Bootdata ok (command line is %s)\n", saved_command_line);
31439 +}
31440 +
31441 +static void __init setup_boot_cpu_data(void)
31442 +{
31443 +       unsigned int dummy, eax;
31444 +
31445 +       /* get vendor info */
31446 +       cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
31447 +             (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
31448 +             (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
31449 +             (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
31450 +
31451 +       /* get cpu type */
31452 +       cpuid(1, &eax, &dummy, &dummy,
31453 +               (unsigned int *) &boot_cpu_data.x86_capability);
31454 +       boot_cpu_data.x86 = (eax >> 8) & 0xf;
31455 +       boot_cpu_data.x86_model = (eax >> 4) & 0xf;
31456 +       boot_cpu_data.x86_mask = eax & 0xf;
31457 +}
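The CPUID leaf-1 EAX decode above splits the signature into family, model and stepping nibbles; for example eax == 0x06f6 yields family 6, model 0xf, stepping 6 (the extended family/model bits are ignored, exactly as in the code). A self-contained sketch of the same decode (illustrative only, not a kernel function):

	static void decode_signature(unsigned int eax,
				     unsigned int *family,
				     unsigned int *model,
				     unsigned int *stepping)
	{
		*family   = (eax >> 8) & 0xf;
		*model    = (eax >> 4) & 0xf;
		*stepping = eax & 0xf;
	}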
31458 +
31459 +#include <xen/interface/memory.h>
31460 +unsigned long *machine_to_phys_mapping;
31461 +EXPORT_SYMBOL(machine_to_phys_mapping);
31462 +unsigned int machine_to_phys_order;
31463 +EXPORT_SYMBOL(machine_to_phys_order);
31464 +
31465 +void __init x86_64_start_kernel(char * real_mode_data)
31466 +{
31467 +       struct xen_machphys_mapping mapping;
31468 +       unsigned long machine_to_phys_nr_ents;
31469 +       char *s;
31470 +       int i;
31471 +
31472 +       xen_start_info = (struct start_info *)real_mode_data;
31473 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
31474 +               phys_to_machine_mapping =
31475 +                       (unsigned long *)xen_start_info->mfn_list;
31476 +               start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
31477 +                       xen_start_info->nr_pt_frames;
31478 +       }
31479 +
31480 +
31481 +       machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
31482 +       machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
31483 +       if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
31484 +               machine_to_phys_mapping = (unsigned long *)mapping.v_start;
31485 +               machine_to_phys_nr_ents = mapping.max_mfn + 1;
31486 +       }
31487 +       while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
31488 +               machine_to_phys_order++;
31489 +
31490 +#if 0
31491 +       for (i = 0; i < 256; i++)
31492 +               set_intr_gate(i, early_idt_handler);
31493 +       asm volatile("lidt %0" :: "m" (idt_descr));
31494 +#endif
31495 +
31496 +       for (i = 0; i < NR_CPUS; i++)
31497 +               cpu_pda(i) = &boot_cpu_pda[i];
31498 +
31499 +       pda_init(0);
31500 +       copy_bootdata(real_mode_data);
31501 +#ifdef CONFIG_SMP
31502 +       cpu_set(0, cpu_online_map);
31503 +#endif
31504 +       s = strstr(saved_command_line, "earlyprintk=");
31505 +       if (s != NULL)
31506 +               setup_early_printk(strchr(s, '=') + 1);
31507 +#ifdef CONFIG_NUMA
31508 +       s = strstr(saved_command_line, "numa=");
31509 +       if (s != NULL)
31510 +               numa_setup(s+5);
31511 +#endif
31512 +#ifdef CONFIG_X86_IO_APIC
31513 +       if (strstr(saved_command_line, "disableapic"))
31514 +               disable_apic = 1;
31515 +#endif
31516 +       /* You need early console to see that */
31517 +       if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
31518 +               panic("Kernel too big for kernel mapping\n");
31519 +
31520 +       setup_boot_cpu_data();
31521 +       start_kernel();
31522 +}
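The machine_to_phys_order loop in x86_64_start_kernel() computes the smallest order such that (1UL << order) covers machine_to_phys_nr_ents, i.e. the ceiling of log2 of the entry count. An equivalent stand-alone sketch (illustrative only, not a kernel function):

	static unsigned int m2p_order(unsigned long nr_ents)
	{
		unsigned int order = 0;

		while ((1UL << order) < nr_ents)
			order++;
		return order;	/* e.g. 1048576 entries (1 << 20) -> order 20 */
	}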
31523 diff -urNp linux-2.6/arch/x86_64/kernel/head-xen.S new/arch/x86_64/kernel/head-xen.S
31524 --- linux-2.6/arch/x86_64/kernel/head-xen.S     1970-01-01 01:00:00.000000000 +0100
31525 +++ new/arch/x86_64/kernel/head-xen.S   2006-05-23 18:42:18.000000000 +0200
31526 @@ -0,0 +1,176 @@
31527 +/*
31528 + *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
31529 + *
31530 + *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
31531 + *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
31532 + *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
31533 + *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
31534 + *
31535 + *  $Id$
31536 + *
31537 + *  Jun Nakajima <jun.nakajima@intel.com>
31538 + *    Modified for Xen                                
31539 + */
31540 +
31541 +
31542 +#include <linux/linkage.h>
31543 +#include <linux/threads.h>
31544 +#include <linux/init.h>
31545 +#include <asm/desc.h>
31546 +#include <asm/segment.h>
31547 +#include <asm/page.h>
31548 +#include <asm/msr.h>
31549 +#include <asm/cache.h>
31550 +
31551 +       .text
31552 +       .section .bootstrap.text
31553 +       .code64
31554 +#define VIRT_ENTRY_OFFSET 0x0
31555 +.org VIRT_ENTRY_OFFSET
31556 +       .globl startup_64
31557 +startup_64:
31558 +ENTRY(_start)
31559 +       movq $(init_thread_union+THREAD_SIZE-8),%rsp
31560 +       /* zero EFLAGS after setting rsp */
31561 +       pushq $0
31562 +       popfq
31563 +
31564 +       /* rsi is a pointer to the startup info structure;
31565 +          pass it to C */
31566 +       movq %rsi,%rdi
31567 +       jmp x86_64_start_kernel
31568 +
31569 +ENTRY(stext)
31570 +ENTRY(_stext)
31571 +
31572 +       $page = 0
31573 +#define NEXT_PAGE(name) \
31574 +       $page = $page + 1; \
31575 +       .org $page * 0x1000; \
31576 +       phys_/**/name = $page * 0x1000 + __PHYSICAL_START; \
31577 +ENTRY(name)
31578 +
31579 +NEXT_PAGE(init_level4_pgt)
31580 +       /* This gets initialized in x86_64_start_kernel */
31581 +       .fill   512,8,0
31582 +
31583 +        /*
31584 +         * We update two pgd entries to make kernel and user pgd consistent
31585 +         * at pgd_populate(). It can be used for kernel modules, so we place
31586 +         * this page here for those cases to avoid memory corruption.
31587 +         * We also use this page to establish the initial mapping for the
31588 +         * vsyscall area.
31589 +         */
31590 +NEXT_PAGE(init_level4_user_pgt)
31591 +       .fill   512,8,0
31592 +
31593 +NEXT_PAGE(level3_kernel_pgt)
31594 +       .fill   512,8,0
31595 +
31596 +        /*
31597 +         * This is used for vsyscall area mapping as we have a different
31598 +         * level4 page table for user.
31599 +         */
31600 +NEXT_PAGE(level3_user_pgt)
31601 +        .fill  512,8,0
31602 +
31603 +NEXT_PAGE(level2_kernel_pgt)
31604 +       .fill   512,8,0
31605 +
31606 +NEXT_PAGE(hypercall_page)
31607 +       .fill   512,8,0
31608 +
31609 +#undef NEXT_PAGE
31610 +
31611 +       .data
31612 +
31613 +       .align 16
31614 +       .globl cpu_gdt_descr
31615 +cpu_gdt_descr:
31616 +       .word   gdt_end-cpu_gdt_table-1
31617 +gdt:
31618 +       .quad   cpu_gdt_table
31619 +#ifdef CONFIG_SMP
31620 +       .rept   NR_CPUS-1
31621 +       .word   0
31622 +       .quad   0
31623 +       .endr
31624 +#endif
31625 +
31626 +/* We need valid kernel segments for data and code in long mode too;
31627 + * IRET will check the segment types.  kkeil 2000/10/28
31628 + * Also, sysret mandates a special GDT layout.
31629 + */
31630 +                               
31631 +       .section .data.page_aligned, "aw"
31632 +       .align PAGE_SIZE
31633 +
31634 +/* The TLS descriptors are currently at a different place compared to i386.
31635 +   Hopefully nobody expects them at a fixed place (Wine?) */
31636 +
31637 +ENTRY(cpu_gdt_table)
31638 +       .quad   0x0000000000000000      /* NULL descriptor */
31639 +       .quad   0x0                     /* unused */
31640 +       .quad   0x00af9a000000ffff      /* __KERNEL_CS */
31641 +       .quad   0x00cf92000000ffff      /* __KERNEL_DS */
31642 +       .quad   0x00cffa000000ffff      /* __USER32_CS */
31643 +       .quad   0x00cff2000000ffff      /* __USER_DS, __USER32_DS  */
31644 +       .quad   0x00affa000000ffff      /* __USER_CS */
31645 +       .quad   0x00cf9a000000ffff      /* __KERNEL32_CS */
31646 +       .quad   0,0                     /* TSS */
31647 +       .quad   0,0                     /* LDT */
31648 +       .quad   0,0,0                   /* three TLS descriptors */
31649 +       .quad   0                       /* unused */
31650 +gdt_end:
31651 +       /* asm/segment.h:GDT_ENTRIES must match this */
31652 +       /* This should be a multiple of the cache line size */
31653 +       /* GDTs of other CPUs are now dynamically allocated */
31654 +
31655 +       /* zero the remaining page */
31656 +       .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
31657 +
31658 +       .section .bss, "aw", @nobits
31659 +       .align L1_CACHE_BYTES
31660 +ENTRY(idt_table)
31661 +       .skip 256 * 16
31662 +
31663 +       .section .bss.page_aligned, "aw", @nobits
31664 +       .align PAGE_SIZE
31665 +ENTRY(empty_zero_page)
31666 +       .skip PAGE_SIZE
31667 +
31668 +/*
31669 + * __xen_guest information
31670 + */
31671 +.macro utoh value
31672 + .if (\value) < 0 || (\value) >= 0x10
31673 +       utoh (((\value)>>4)&0x0fffffffffffffff)
31674 + .endif
31675 + .if ((\value) & 0xf) < 10
31676 +  .byte '0' + ((\value) & 0xf)
31677 + .else
31678 +  .byte 'A' + ((\value) & 0xf) - 10
31679 + .endif
31680 +.endm
31681 +
31682 +.section __xen_guest
31683 +       .ascii  "GUEST_OS=linux,GUEST_VER=2.6"
31684 +       .ascii  ",XEN_VER=xen-3.0"
31685 +       .ascii  ",VIRT_BASE=0x"
31686 +               utoh __START_KERNEL_map
31687 +#ifdef CONFIG_XEN_COMPAT_030002
31688 +       .ascii  ",ELF_PADDR_OFFSET=0x"
31689 +               utoh __START_KERNEL_map
31690 +#else
31691 +       .ascii  ",ELF_PADDR_OFFSET=0x0"
31692 +#endif /* !CONFIG_XEN_COMPAT_030002 */
31693 +       .ascii  ",VIRT_ENTRY=0x"
31694 +               utoh (__START_KERNEL_map + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
31695 +       .ascii  ",HYPERCALL_PAGE=0x"
31696 +               utoh (phys_hypercall_page >> PAGE_SHIFT)
31697 +       .ascii  ",FEATURES=writable_page_tables"
31698 +       .ascii           "|writable_descriptor_tables"
31699 +       .ascii           "|auto_translated_physmap"
31700 +       .ascii           "|supervisor_mode_kernel"
31701 +       .ascii  ",LOADER=generic"
31702 +       .byte   0
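The utoh macro above turns an assemble-time constant into its uppercase hexadecimal ASCII digits, recursing on the high nibbles first so the most significant digit is emitted first; the __xen_guest section is therefore a plain NUL-terminated key=value string read by the Xen domain builder. A minimal userspace C sketch of the same conversion (the function and the test value here are illustrative only, not part of the patch):

#include <stdio.h>

/* Print the uppercase hex digits of v, most significant nibble first,
 * mirroring the recursion in the utoh assembler macro above.  The macro's
 * extra "< 0" test only exists because assembler expressions are signed;
 * with an unsigned type it is not needed. */
static void utoh(unsigned long v)
{
	if (v >= 0x10)
		utoh(v >> 4);			/* emit the higher nibbles first */
	putchar((v & 0xf) < 10 ? '0' + (int)(v & 0xf)
			       : 'A' + (int)(v & 0xf) - 10);
}

int main(void)
{
	utoh(0xffffffff80000000UL);		/* arbitrary VIRT_BASE-like value */
	putchar('\n');
	return 0;
}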
31703 diff -urNp linux-2.6/arch/x86_64/kernel/init_task.c new/arch/x86_64/kernel/init_task.c
31704 --- linux-2.6/arch/x86_64/kernel/init_task.c    2006-07-03 14:14:30.000000000 +0200
31705 +++ new/arch/x86_64/kernel/init_task.c  2006-05-09 12:33:19.000000000 +0200
31706 @@ -37,6 +37,8 @@ union thread_union init_thread_union 
31707  struct task_struct init_task = INIT_TASK(init_task);
31708  
31709  EXPORT_SYMBOL(init_task);
31710 +
31711 +#ifndef CONFIG_X86_NO_TSS
31712  /*
31713   * per-CPU TSS segments. Threads are completely 'soft' on Linux,
31714   * no more per-task TSS's. The TSS size is kept cacheline-aligned
31715 @@ -45,5 +47,6 @@ EXPORT_SYMBOL(init_task);
31716   * on exact cacheline boundaries, to eliminate cacheline ping-pong.
31717   */ 
31718  DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
31719 +#endif
31720  
31721  #define ALIGN_TO_4K __attribute__((section(".data.init_task")))
31722 diff -urNp linux-2.6/arch/x86_64/kernel/io_apic-xen.c new/arch/x86_64/kernel/io_apic-xen.c
31723 --- linux-2.6/arch/x86_64/kernel/io_apic-xen.c  1970-01-01 01:00:00.000000000 +0100
31724 +++ new/arch/x86_64/kernel/io_apic-xen.c        2006-05-23 18:37:10.000000000 +0200
31725 @@ -0,0 +1,2232 @@
31726 +/*
31727 + *     Intel IO-APIC support for multi-Pentium hosts.
31728 + *
31729 + *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
31730 + *
31731 + *     Many thanks to Stig Venaas for trying out countless experimental
31732 + *     patches and reporting/debugging problems patiently!
31733 + *
31734 + *     (c) 1999, Multiple IO-APIC support, developed by
31735 + *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
31736 + *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
31737 + *     further tested and cleaned up by Zach Brown <zab@redhat.com>
31738 + *     and Ingo Molnar <mingo@redhat.com>
31739 + *
31740 + *     Fixes
31741 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
31742 + *                                     thanks to Eric Gilmore
31743 + *                                     and Rolf G. Tews
31744 + *                                     for testing these extensively
31745 + *     Paul Diefenbaugh        :       Added full ACPI support
31746 + */
31747 +
31748 +#include <linux/mm.h>
31749 +#include <linux/interrupt.h>
31750 +#include <linux/init.h>
31751 +#include <linux/delay.h>
31752 +#include <linux/sched.h>
31753 +#include <linux/config.h>
31754 +#include <linux/smp_lock.h>
31755 +#include <linux/mc146818rtc.h>
31756 +#include <linux/acpi.h>
31757 +#include <linux/sysdev.h>
31758 +#ifdef CONFIG_ACPI
31759 +#include <acpi/acpi_bus.h>
31760 +#endif
31761 +
31762 +#include <asm/io.h>
31763 +#include <asm/smp.h>
31764 +#include <asm/desc.h>
31765 +#include <asm/proto.h>
31766 +#include <asm/mach_apic.h>
31767 +#include <asm/acpi.h>
31768 +#include <asm/dma.h>
31769 +
31770 +#define __apicdebuginit  __init
31771 +
31772 +int sis_apic_bug; /* not actually supported, dummy for compile */
31773 +
31774 +static int no_timer_check;
31775 +
31776 +int disable_timer_pin_1 __initdata;
31777 +
31778 +#ifndef CONFIG_XEN
31779 +int timer_over_8254 __initdata = 0;
31780 +
31781 +/* Where, if anywhere, the i8259 is connected in external int mode */
31782 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
31783 +#endif
31784 +
31785 +static DEFINE_SPINLOCK(ioapic_lock);
31786 +
31787 +/*
31788 + * # of IRQ routing registers
31789 + */
31790 +int nr_ioapic_registers[MAX_IO_APICS];
31791 +
31792 +/*
31793 + * A rough estimate of how many shared IRQs there are; it can
31794 + * be changed at any time.
31795 + */
31796 +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
31797 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
31798 +
31799 +/*
31800 + * This is performance-critical; we want it to be O(1).
31801 + *
31802 + * The indexing order of this array favors 1:1 mappings
31803 + * between pins and IRQs.
31804 + */
31805 +
31806 +static struct irq_pin_list {
31807 +       short apic, pin, next;
31808 +} irq_2_pin[PIN_MAP_SIZE];
31809 +
31810 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
31811 +#ifdef CONFIG_PCI_MSI
31812 +#define vector_to_irq(vector)  \
31813 +       (platform_legacy_irq(vector) ? vector : vector_irq[vector])
31814 +#else
31815 +#define vector_to_irq(vector)  (vector)
31816 +#endif
31817 +
31818 +#ifdef CONFIG_XEN
31819 +
31820 +#include <xen/interface/xen.h>
31821 +#include <xen/interface/physdev.h>
31822 +
31823 +/* Fake i8259 */
31824 +#define make_8259A_irq(_irq)     (io_apic_irqs &= ~(1UL<<(_irq)))
31825 +#define disable_8259A_irq(_irq)  ((void)0)
31826 +#define i8259A_irq_pending(_irq) (0)
31827 +
31828 +unsigned long io_apic_irqs;
31829 +
31830 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
31831 +{
31832 +       struct physdev_apic apic_op;
31833 +       int ret;
31834 +
31835 +       apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
31836 +       apic_op.reg = reg;
31837 +       ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
31838 +       if (ret)
31839 +               return ret;
31840 +       return apic_op.value;
31841 +}
31842 +
31843 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
31844 +{
31845 +       struct physdev_apic apic_op;
31846 +
31847 +       apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
31848 +       apic_op.reg = reg;
31849 +       apic_op.value = value;
31850 +       HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
31851 +}
31852 +
31853 +#define io_apic_read(a,r)    xen_io_apic_read(a,r)
31854 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
31855 +
31856 +#define clear_IO_APIC() ((void)0)
31857 +
31858 +#else
31859 +
31860 +#ifdef CONFIG_SMP
31861 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
31862 +{
31863 +       unsigned long flags;
31864 +       unsigned int dest;
31865 +       cpumask_t tmp;
31866 +
31867 +       cpus_and(tmp, mask, cpu_online_map);
31868 +       if (cpus_empty(tmp))
31869 +               tmp = TARGET_CPUS;
31870 +
31871 +       cpus_and(mask, tmp, CPU_MASK_ALL);
31872 +
31873 +       dest = cpu_mask_to_apicid(mask);
31874 +
31875 +       /*
31876 +        * Only the high 8 bits are valid.
31877 +        */
31878 +       dest = SET_APIC_LOGICAL_ID(dest);
31879 +
31880 +       spin_lock_irqsave(&ioapic_lock, flags);
31881 +       __DO_ACTION(1, = dest, )
31882 +       set_irq_info(irq, mask);
31883 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31884 +}
31885 +#endif
31886 +
31887 +#endif /* !CONFIG_XEN */
31888 +
31889 +/*
31890 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
31891 + * shared ISA-space IRQs, so we have to support them. We are super
31892 + * fast in the common case, and fast for shared ISA-space IRQs.
31893 + */
31894 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
31895 +{
31896 +       static int first_free_entry = NR_IRQS;
31897 +       struct irq_pin_list *entry = irq_2_pin + irq;
31898 +
31899 +       BUG_ON(irq >= NR_IRQS);
31900 +       while (entry->next)
31901 +               entry = irq_2_pin + entry->next;
31902 +
31903 +       if (entry->pin != -1) {
31904 +               entry->next = first_free_entry;
31905 +               entry = irq_2_pin + entry->next;
31906 +               if (++first_free_entry >= PIN_MAP_SIZE)
31907 +                       panic("io_apic.c: ran out of irq_2_pin entries!");
31908 +       }
31909 +       entry->apic = apic;
31910 +       entry->pin = pin;
31911 +}
31912 +
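add_pin_to_irq() above stores the (apic, pin) pairs reachable from one IRQ in a single static array: slots 0..NR_IRQS-1 are addressed directly by IRQ number, and further pins for the same IRQ are chained through the next field into the slots above NR_IRQS. A simplified, self-contained C model of that layout (the array sizes here are made up for the example, and the kernel's panic when the overflow pool is exhausted is omitted):

#include <stdio.h>

#define NR_IRQS		16	/* direct slots, one per IRQ (illustrative size) */
#define PIN_MAP_SIZE	32	/* direct slots plus overflow pool */

struct irq_pin_list {
	short apic, pin, next;	/* next == 0 terminates the chain */
} irq_2_pin[PIN_MAP_SIZE];

static int first_free_entry = NR_IRQS;	/* overflow slots start after the direct ones */

static void add_pin_to_irq(unsigned int irq, int apic, int pin)
{
	struct irq_pin_list *entry = &irq_2_pin[irq];

	while (entry->next)			/* walk to the end of the chain */
		entry = &irq_2_pin[entry->next];
	if (entry->pin != -1) {			/* direct slot taken: use an overflow slot */
		entry->next = first_free_entry++;
		entry = &irq_2_pin[entry->next];
	}
	entry->apic = apic;
	entry->pin = pin;
}

int main(void)
{
	int i;

	for (i = 0; i < PIN_MAP_SIZE; i++)
		irq_2_pin[i].pin = -1;

	add_pin_to_irq(9, 0, 9);		/* lands in the direct slot for IRQ 9 */
	add_pin_to_irq(9, 1, 3);		/* chains into the overflow pool */

	for (struct irq_pin_list *e = &irq_2_pin[9]; ; e = &irq_2_pin[e->next]) {
		printf("IRQ 9 -> apic %d pin %d\n", e->apic, e->pin);
		if (!e->next)
			break;
	}
	return 0;
}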
31913 +#ifndef CONFIG_XEN
31914 +#define __DO_ACTION(R, ACTION, FINAL)                                  \
31915 +                                                                       \
31916 +{                                                                      \
31917 +       int pin;                                                        \
31918 +       struct irq_pin_list *entry = irq_2_pin + irq;                   \
31919 +                                                                       \
31920 +       BUG_ON(irq >= NR_IRQS);                                         \
31921 +       for (;;) {                                                      \
31922 +               unsigned int reg;                                       \
31923 +               pin = entry->pin;                                       \
31924 +               if (pin == -1)                                          \
31925 +                       break;                                          \
31926 +               reg = io_apic_read(entry->apic, 0x10 + R + pin*2);      \
31927 +               reg ACTION;                                             \
31928 +               io_apic_modify(entry->apic, reg);                       \
31929 +               if (!entry->next)                                       \
31930 +                       break;                                          \
31931 +               entry = irq_2_pin + entry->next;                        \
31932 +       }                                                               \
31933 +       FINAL;                                                          \
31934 +}
31935 +
31936 +#define DO_ACTION(name,R,ACTION, FINAL)                                        \
31937 +                                                                       \
31938 +       static void name##_IO_APIC_irq (unsigned int irq)               \
31939 +       __DO_ACTION(R, ACTION, FINAL)
31940 +
31941 +DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
31942 +                                               /* mask = 1 */
31943 +DO_ACTION( __unmask,           0, &= 0xfffeffff, )
31944 +                                               /* mask = 0 */
31945 +
31946 +static void mask_IO_APIC_irq (unsigned int irq)
31947 +{
31948 +       unsigned long flags;
31949 +
31950 +       spin_lock_irqsave(&ioapic_lock, flags);
31951 +       __mask_IO_APIC_irq(irq);
31952 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31953 +}
31954 +
31955 +static void unmask_IO_APIC_irq (unsigned int irq)
31956 +{
31957 +       unsigned long flags;
31958 +
31959 +       spin_lock_irqsave(&ioapic_lock, flags);
31960 +       __unmask_IO_APIC_irq(irq);
31961 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31962 +}
31963 +
31964 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
31965 +{
31966 +       struct IO_APIC_route_entry entry;
31967 +       unsigned long flags;
31968 +
31969 +       /* Check delivery_mode to be sure we're not clearing an SMI pin */
31970 +       spin_lock_irqsave(&ioapic_lock, flags);
31971 +       *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
31972 +       *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
31973 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31974 +       if (entry.delivery_mode == dest_SMI)
31975 +               return;
31976 +       /*
31977 +        * Disable it in the IO-APIC irq-routing table:
31978 +        */
31979 +       memset(&entry, 0, sizeof(entry));
31980 +       entry.mask = 1;
31981 +       spin_lock_irqsave(&ioapic_lock, flags);
31982 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
31983 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
31984 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31985 +}
31986 +
31987 +static void clear_IO_APIC (void)
31988 +{
31989 +       int apic, pin;
31990 +
31991 +       for (apic = 0; apic < nr_ioapics; apic++)
31992 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
31993 +                       clear_IO_APIC_pin(apic, pin);
31994 +}
31995 +
31996 +#endif /* !CONFIG_XEN */
31997 +
31998 +static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
31999 +
32000 +/*
32001 + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
32002 + * specific CPU-side IRQs.
32003 + */
32004 +
32005 +#define MAX_PIRQS 8
32006 +static int pirq_entries [MAX_PIRQS];
32007 +static int pirqs_enabled;
32008 +int skip_ioapic_setup;
32009 +int ioapic_force;
32010 +
32011 +/* dummy parsing: see setup.c */
32012 +
32013 +static int __init disable_ioapic_setup(char *str)
32014 +{
32015 +       skip_ioapic_setup = 1;
32016 +       return 1;
32017 +}
32018 +
32019 +static int __init enable_ioapic_setup(char *str)
32020 +{
32021 +       ioapic_force = 1;
32022 +       skip_ioapic_setup = 0;
32023 +       return 1;
32024 +}
32025 +
32026 +__setup("noapic", disable_ioapic_setup);
32027 +__setup("apic", enable_ioapic_setup);
32028 +
32029 +#ifndef CONFIG_XEN
32030 +static int __init setup_disable_8254_timer(char *s)
32031 +{
32032 +       timer_over_8254 = -1;
32033 +       return 1;
32034 +}
32035 +static int __init setup_enable_8254_timer(char *s)
32036 +{
32037 +       timer_over_8254 = 2;
32038 +       return 1;
32039 +}
32040 +
32041 +__setup("disable_8254_timer", setup_disable_8254_timer);
32042 +__setup("enable_8254_timer", setup_enable_8254_timer);
32043 +#endif /* !CONFIG_XEN */
32044 +
32045 +#include <asm/pci-direct.h>
32046 +#include <linux/pci_ids.h>
32047 +#include <linux/pci.h>
32048 +
32049 +/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
32050 +   off. Check for an Nvidia or VIA PCI bridge and turn it off.
32051 +   Use pci direct infrastructure because this runs before the PCI subsystem. 
32052 +
32053 +   Can be overridden with "apic"
32054 +
32055 +   And another hack to disable the IOMMU on VIA chipsets.
32056 +
32057 +   ... and others. Really should move this somewhere else.
32058 +
32059 +   Kludge-O-Rama. */
32060 +void __init check_ioapic(void) 
32061 +{ 
32062 +       int num,slot,func; 
32063 +       /* Poor man's PCI discovery */
32064 +       for (num = 0; num < 32; num++) { 
32065 +               for (slot = 0; slot < 32; slot++) { 
32066 +                       for (func = 0; func < 8; func++) { 
32067 +                               u32 class;
32068 +                               u32 vendor;
32069 +                               u8 type;
32070 +                               class = read_pci_config(num,slot,func,
32071 +                                                       PCI_CLASS_REVISION);
32072 +                               if (class == 0xffffffff)
32073 +                                       break; 
32074 +
32075 +                               if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
32076 +                                       continue; 
32077 +
32078 +                               vendor = read_pci_config(num, slot, func, 
32079 +                                                        PCI_VENDOR_ID);
32080 +                               vendor &= 0xffff;
32081 +                               switch (vendor) { 
32082 +                               case PCI_VENDOR_ID_VIA:
32083 +#ifdef CONFIG_GART_IOMMU
32084 +                                       if ((end_pfn > MAX_DMA32_PFN ||
32085 +                                            force_iommu) &&
32086 +                                           !iommu_aperture_allowed) {
32087 +                                               printk(KERN_INFO
32088 +    "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
32089 +                                               iommu_aperture_disabled = 1;
32090 +                                       }
32091 +#endif
32092 +                                       return;
32093 +                               case PCI_VENDOR_ID_NVIDIA:
32094 +#ifdef CONFIG_ACPI
32095 +                                       /* All timer overrides on Nvidia
32096 +                                          seem to be wrong. Skip them. */
32097 +                                       acpi_skip_timer_override = 1;
32098 +                                       printk(KERN_INFO 
32099 +            "Nvidia board detected. Ignoring ACPI timer override.\n");
32100 +#endif
32101 +                                       /* RED-PEN skip them on mptables too? */
32102 +                                       return;
32103 +                               case PCI_VENDOR_ID_ATI:
32104 +
32105 +                               /* This should actually be the default, but
32106 +                                  for 2.6.16 let's do it for ATI only, where
32107 +                                  it's really needed. */
32108 +#ifndef CONFIG_XEN
32109 +                                       if (timer_over_8254 == 1) {     
32110 +                                               timer_over_8254 = 0;    
32111 +                                       printk(KERN_INFO
32112 +               "ATI board detected. Disabling timer routing over 8254.\n");
32113 +                                       }       
32114 +#endif
32115 +                                       return;
32116 +                               } 
32117 +
32118 +
32119 +                               /* No multi-function device? */
32120 +                               type = read_pci_config_byte(num,slot,func,
32121 +                                                           PCI_HEADER_TYPE);
32122 +                               if (!(type & 0x80))
32123 +                                       break;
32124 +                       } 
32125 +               }
32126 +       }
32127 +} 
32128 +
32129 +static int __init ioapic_pirq_setup(char *str)
32130 +{
32131 +       int i, max;
32132 +       int ints[MAX_PIRQS+1];
32133 +
32134 +       get_options(str, ARRAY_SIZE(ints), ints);
32135 +
32136 +       for (i = 0; i < MAX_PIRQS; i++)
32137 +               pirq_entries[i] = -1;
32138 +
32139 +       pirqs_enabled = 1;
32140 +       apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
32141 +       max = MAX_PIRQS;
32142 +       if (ints[0] < MAX_PIRQS)
32143 +               max = ints[0];
32144 +
32145 +       for (i = 0; i < max; i++) {
32146 +               apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
32147 +               /*
32148 +                * PIRQs are mapped upside down, usually.
32149 +                */
32150 +               pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
32151 +       }
32152 +       return 1;
32153 +}
32154 +
32155 +__setup("pirq=", ioapic_pirq_setup);
32156 +
32157 +/*
32158 + * Find the IRQ entry number of a certain pin.
32159 + */
32160 +static int find_irq_entry(int apic, int pin, int type)
32161 +{
32162 +       int i;
32163 +
32164 +       for (i = 0; i < mp_irq_entries; i++)
32165 +               if (mp_irqs[i].mpc_irqtype == type &&
32166 +                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
32167 +                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
32168 +                   mp_irqs[i].mpc_dstirq == pin)
32169 +                       return i;
32170 +
32171 +       return -1;
32172 +}
32173 +
32174 +#ifndef CONFIG_XEN
32175 +/*
32176 + * Find the pin to which IRQ[irq] (ISA) is connected
32177 + */
32178 +static int __init find_isa_irq_pin(int irq, int type)
32179 +{
32180 +       int i;
32181 +
32182 +       for (i = 0; i < mp_irq_entries; i++) {
32183 +               int lbus = mp_irqs[i].mpc_srcbus;
32184 +
32185 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
32186 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
32187 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
32188 +                   (mp_irqs[i].mpc_irqtype == type) &&
32189 +                   (mp_irqs[i].mpc_srcbusirq == irq))
32190 +
32191 +                       return mp_irqs[i].mpc_dstirq;
32192 +       }
32193 +       return -1;
32194 +}
32195 +
32196 +static int __init find_isa_irq_apic(int irq, int type)
32197 +{
32198 +       int i;
32199 +
32200 +       for (i = 0; i < mp_irq_entries; i++) {
32201 +               int lbus = mp_irqs[i].mpc_srcbus;
32202 +
32203 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
32204 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
32205 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
32206 +                   (mp_irqs[i].mpc_irqtype == type) &&
32207 +                   (mp_irqs[i].mpc_srcbusirq == irq))
32208 +                       break;
32209 +       }
32210 +       if (i < mp_irq_entries) {
32211 +               int apic;
32212 +               for(apic = 0; apic < nr_ioapics; apic++) {
32213 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
32214 +                               return apic;
32215 +               }
32216 +       }
32217 +
32218 +       return -1;
32219 +}
32220 +#endif
32221 +
32222 +/*
32223 + * Find a specific PCI IRQ entry.
32224 + * Not an __init, possibly needed by modules
32225 + */
32226 +static int pin_2_irq(int idx, int apic, int pin);
32227 +
32228 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
32229 +{
32230 +       int apic, i, best_guess = -1;
32231 +
32232 +       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
32233 +               bus, slot, pin);
32234 +       if (mp_bus_id_to_pci_bus[bus] == -1) {
32235 +               apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
32236 +               return -1;
32237 +       }
32238 +       for (i = 0; i < mp_irq_entries; i++) {
32239 +               int lbus = mp_irqs[i].mpc_srcbus;
32240 +
32241 +               for (apic = 0; apic < nr_ioapics; apic++)
32242 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
32243 +                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
32244 +                               break;
32245 +
32246 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
32247 +                   !mp_irqs[i].mpc_irqtype &&
32248 +                   (bus == lbus) &&
32249 +                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
32250 +                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
32251 +
32252 +                       if (!(apic || IO_APIC_IRQ(irq)))
32253 +                               continue;
32254 +
32255 +                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
32256 +                               return irq;
32257 +                       /*
32258 +                        * Use the first all-but-pin matching entry as a
32259 +                        * best-guess fuzzy result for broken mptables.
32260 +                        */
32261 +                       if (best_guess < 0)
32262 +                               best_guess = irq;
32263 +               }
32264 +       }
32265 +       BUG_ON(best_guess >= NR_IRQS);
32266 +       return best_guess;
32267 +}
32268 +
32269 +/*
32270 + * EISA Edge/Level control register, ELCR
32271 + */
32272 +static int EISA_ELCR(unsigned int irq)
32273 +{
32274 +       if (irq < 16) {
32275 +               unsigned int port = 0x4d0 + (irq >> 3);
32276 +               return (inb(port) >> (irq & 7)) & 1;
32277 +       }
32278 +       apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
32279 +       return 0;
32280 +}
32281 +
32282 +/* EISA interrupts are always polarity zero and can be edge or level
32283 + * trigger depending on the ELCR value.  If an interrupt is listed as
32284 + * EISA conforming in the MP table, that means its trigger type must
32285 + * be read in from the ELCR */
32286 +
32287 +#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
32288 +#define default_EISA_polarity(idx)     (0)
32289 +
32290 +/* ISA interrupts are always polarity zero edge triggered,
32291 + * when listed as conforming in the MP table. */
32292 +
32293 +#define default_ISA_trigger(idx)       (0)
32294 +#define default_ISA_polarity(idx)      (0)
32295 +
32296 +/* PCI interrupts are always polarity one level triggered,
32297 + * when listed as conforming in the MP table. */
32298 +
32299 +#define default_PCI_trigger(idx)       (1)
32300 +#define default_PCI_polarity(idx)      (1)
32301 +
32302 +/* MCA interrupts are always polarity zero level triggered,
32303 + * when listed as conforming in the MP table. */
32304 +
32305 +#define default_MCA_trigger(idx)       (1)
32306 +#define default_MCA_polarity(idx)      (0)
32307 +
32308 +static int __init MPBIOS_polarity(int idx)
32309 +{
32310 +       int bus = mp_irqs[idx].mpc_srcbus;
32311 +       int polarity;
32312 +
32313 +       /*
32314 +        * Determine IRQ line polarity (high active or low active):
32315 +        */
32316 +       switch (mp_irqs[idx].mpc_irqflag & 3)
32317 +       {
32318 +               case 0: /* conforms, ie. bus-type dependent polarity */
32319 +               {
32320 +                       switch (mp_bus_id_to_type[bus])
32321 +                       {
32322 +                               case MP_BUS_ISA: /* ISA pin */
32323 +                               {
32324 +                                       polarity = default_ISA_polarity(idx);
32325 +                                       break;
32326 +                               }
32327 +                               case MP_BUS_EISA: /* EISA pin */
32328 +                               {
32329 +                                       polarity = default_EISA_polarity(idx);
32330 +                                       break;
32331 +                               }
32332 +                               case MP_BUS_PCI: /* PCI pin */
32333 +                               {
32334 +                                       polarity = default_PCI_polarity(idx);
32335 +                                       break;
32336 +                               }
32337 +                               case MP_BUS_MCA: /* MCA pin */
32338 +                               {
32339 +                                       polarity = default_MCA_polarity(idx);
32340 +                                       break;
32341 +                               }
32342 +                               default:
32343 +                               {
32344 +                                       printk(KERN_WARNING "broken BIOS!!\n");
32345 +                                       polarity = 1;
32346 +                                       break;
32347 +                               }
32348 +                       }
32349 +                       break;
32350 +               }
32351 +               case 1: /* high active */
32352 +               {
32353 +                       polarity = 0;
32354 +                       break;
32355 +               }
32356 +               case 2: /* reserved */
32357 +               {
32358 +                       printk(KERN_WARNING "broken BIOS!!\n");
32359 +                       polarity = 1;
32360 +                       break;
32361 +               }
32362 +               case 3: /* low active */
32363 +               {
32364 +                       polarity = 1;
32365 +                       break;
32366 +               }
32367 +               default: /* invalid */
32368 +               {
32369 +                       printk(KERN_WARNING "broken BIOS!!\n");
32370 +                       polarity = 1;
32371 +                       break;
32372 +               }
32373 +       }
32374 +       return polarity;
32375 +}
32376 +
32377 +static int MPBIOS_trigger(int idx)
32378 +{
32379 +       int bus = mp_irqs[idx].mpc_srcbus;
32380 +       int trigger;
32381 +
32382 +       /*
32383 +        * Determine IRQ trigger mode (edge or level sensitive):
32384 +        */
32385 +       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
32386 +       {
32387 +               case 0: /* conforms, ie. bus-type dependent */
32388 +               {
32389 +                       switch (mp_bus_id_to_type[bus])
32390 +                       {
32391 +                               case MP_BUS_ISA: /* ISA pin */
32392 +                               {
32393 +                                       trigger = default_ISA_trigger(idx);
32394 +                                       break;
32395 +                               }
32396 +                               case MP_BUS_EISA: /* EISA pin */
32397 +                               {
32398 +                                       trigger = default_EISA_trigger(idx);
32399 +                                       break;
32400 +                               }
32401 +                               case MP_BUS_PCI: /* PCI pin */
32402 +                               {
32403 +                                       trigger = default_PCI_trigger(idx);
32404 +                                       break;
32405 +                               }
32406 +                               case MP_BUS_MCA: /* MCA pin */
32407 +                               {
32408 +                                       trigger = default_MCA_trigger(idx);
32409 +                                       break;
32410 +                               }
32411 +                               default:
32412 +                               {
32413 +                                       printk(KERN_WARNING "broken BIOS!!\n");
32414 +                                       trigger = 1;
32415 +                                       break;
32416 +                               }
32417 +                       }
32418 +                       break;
32419 +               }
32420 +               case 1: /* edge */
32421 +               {
32422 +                       trigger = 0;
32423 +                       break;
32424 +               }
32425 +               case 2: /* reserved */
32426 +               {
32427 +                       printk(KERN_WARNING "broken BIOS!!\n");
32428 +                       trigger = 1;
32429 +                       break;
32430 +               }
32431 +               case 3: /* level */
32432 +               {
32433 +                       trigger = 1;
32434 +                       break;
32435 +               }
32436 +               default: /* invalid */
32437 +               {
32438 +                       printk(KERN_WARNING "broken BIOS!!\n");
32439 +                       trigger = 0;
32440 +                       break;
32441 +               }
32442 +       }
32443 +       return trigger;
32444 +}
32445 +
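MPBIOS_polarity() and MPBIOS_trigger() above decode the two 2-bit fields of mpc_irqflag (bits 0-1: polarity, bits 2-3: trigger), with field value 0 meaning "conforms to the source bus", which for PCI is active low and level triggered. A small self-contained sketch of that decoding for a PCI source bus (the helper name and the sample values are illustrative only):

#include <stdio.h>

/* Decode an MP-table irqflag word the way MPBIOS_polarity()/MPBIOS_trigger()
 * do when the source bus is PCI: 0 = conforms (PCI default), 1 = high/edge,
 * 3 = low/level, 2 = reserved (treated like the broken-BIOS fallback). */
static void decode_pci_irqflag(unsigned int irqflag)
{
	unsigned int pol = irqflag & 3;
	unsigned int trig = (irqflag >> 2) & 3;
	int polarity = (pol == 1) ? 0 : 1;	/* PCI default, reserved and "low" all yield 1 */
	int trigger = (trig == 1) ? 0 : 1;	/* PCI default, reserved and "level" all yield 1 */

	printf("irqflag 0x%x -> polarity %d (%s), trigger %d (%s)\n",
	       irqflag, polarity, polarity ? "active low" : "active high",
	       trigger, trigger ? "level" : "edge");
}

int main(void)
{
	decode_pci_irqflag(0x0);	/* conforms: PCI => active low, level */
	decode_pci_irqflag(0xf);	/* explicitly low active, level triggered */
	decode_pci_irqflag(0x5);	/* explicitly high active, edge triggered */
	return 0;
}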
32446 +static inline int irq_polarity(int idx)
32447 +{
32448 +       return MPBIOS_polarity(idx);
32449 +}
32450 +
32451 +static inline int irq_trigger(int idx)
32452 +{
32453 +       return MPBIOS_trigger(idx);
32454 +}
32455 +
32456 +static int next_irq = 16;
32457 +
32458 +/*
32459 + * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
32460 + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
32461 + * from ACPI, which can reach 800 in large boxen.
32462 + *
32463 + * Compact the sparse GSI space into a sequential IRQ series and reuse
32464 + * vectors if possible.
32465 + */
32466 +int gsi_irq_sharing(int gsi)
32467 +{
32468 +       int i, tries, vector;
32469 +
32470 +       BUG_ON(gsi >= NR_IRQ_VECTORS);
32471 +
32472 +       if (platform_legacy_irq(gsi))
32473 +               return gsi;
32474 +
32475 +       if (gsi_2_irq[gsi] != 0xFF)
32476 +               return (int)gsi_2_irq[gsi];
32477 +
32478 +       tries = NR_IRQS;
32479 +  try_again:
32480 +       vector = assign_irq_vector(gsi);
32481 +
32482 +       /*
32483 +        * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
32484 +        * use of vector and if found, return that IRQ.  However, we never want
32485 +        * to share legacy IRQs, which usually have a different trigger mode
32486 +        * than PCI.
32487 +        */
32488 +       for (i = 0; i < NR_IRQS; i++)
32489 +               if (IO_APIC_VECTOR(i) == vector)
32490 +                       break;
32491 +       if (platform_legacy_irq(i)) {
32492 +               if (--tries >= 0) {
32493 +                       IO_APIC_VECTOR(i) = 0;
32494 +                       goto try_again;
32495 +               }
32496 +               panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
32497 +       }
32498 +       if (i < NR_IRQS) {
32499 +               gsi_2_irq[gsi] = i;
32500 +               printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
32501 +                               gsi, vector, i);
32502 +               return i;
32503 +       }
32504 +
32505 +       i = next_irq++;
32506 +       BUG_ON(i >= NR_IRQS);
32507 +       gsi_2_irq[gsi] = i;
32508 +       IO_APIC_VECTOR(i) = vector;
32509 +       printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
32510 +                       gsi, vector, i);
32511 +       return i;
32512 +}
32513 +
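The comment and function above compact the sparse GSI space handed out by ACPI into the much smaller Linux IRQ range, remembering each assignment in gsi_2_irq[] so a given GSI always maps to the same IRQ. A stripped-down, self-contained C model of just the compaction step (the vector allocation and sharing done through assign_irq_vector() is left out; the names and sizes below are illustrative):

#include <stdio.h>

#define NR_GSIS	1024			/* illustrative upper bound on GSI numbers */

static short gsi_2_irq[NR_GSIS];	/* 0 = not assigned yet */
static int next_irq = 16;		/* IRQs 0-15 stay 1:1 with the legacy GSIs */

static int gsi_to_irq(int gsi)
{
	if (gsi < 16)
		return gsi;		/* legacy ISA interrupts are never remapped */
	if (gsi_2_irq[gsi])
		return gsi_2_irq[gsi];	/* reuse the previous assignment */
	return gsi_2_irq[gsi] = next_irq++;
}

int main(void)
{
	printf("GSI 4   -> IRQ %d\n", gsi_to_irq(4));
	printf("GSI 40  -> IRQ %d\n", gsi_to_irq(40));
	printf("GSI 800 -> IRQ %d\n", gsi_to_irq(800));
	printf("GSI 40  -> IRQ %d\n", gsi_to_irq(40));	/* same GSI, same IRQ */
	return 0;
}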
32514 +static int pin_2_irq(int idx, int apic, int pin)
32515 +{
32516 +       int irq, i;
32517 +       int bus = mp_irqs[idx].mpc_srcbus;
32518 +
32519 +       /*
32520 +        * Debugging check, we are in big trouble if this message pops up!
32521 +        */
32522 +       if (mp_irqs[idx].mpc_dstirq != pin)
32523 +               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
32524 +
32525 +       switch (mp_bus_id_to_type[bus])
32526 +       {
32527 +               case MP_BUS_ISA: /* ISA pin */
32528 +               case MP_BUS_EISA:
32529 +               case MP_BUS_MCA:
32530 +               {
32531 +                       irq = mp_irqs[idx].mpc_srcbusirq;
32532 +                       break;
32533 +               }
32534 +               case MP_BUS_PCI: /* PCI pin */
32535 +               {
32536 +                       /*
32537 +                        * PCI IRQs are mapped in order
32538 +                        */
32539 +                       i = irq = 0;
32540 +                       while (i < apic)
32541 +                               irq += nr_ioapic_registers[i++];
32542 +                       irq += pin;
32543 +                       irq = gsi_irq_sharing(irq);
32544 +                       break;
32545 +               }
32546 +               default:
32547 +               {
32548 +                       printk(KERN_ERR "unknown bus type %d.\n",bus); 
32549 +                       irq = 0;
32550 +                       break;
32551 +               }
32552 +       }
32553 +       BUG_ON(irq >= NR_IRQS);
32554 +
32555 +       /*
32556 +        * PCI IRQ command line redirection. Yes, limits are hardcoded.
32557 +        */
32558 +       if ((pin >= 16) && (pin <= 23)) {
32559 +               if (pirq_entries[pin-16] != -1) {
32560 +                       if (!pirq_entries[pin-16]) {
32561 +                               apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
32562 +                       } else {
32563 +                               irq = pirq_entries[pin-16];
32564 +                               apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
32565 +                                               pin-16, irq);
32566 +                       }
32567 +               }
32568 +       }
32569 +       BUG_ON(irq >= NR_IRQS);
32570 +       return irq;
32571 +}
32572 +
32573 +static inline int IO_APIC_irq_trigger(int irq)
32574 +{
32575 +       int apic, idx, pin;
32576 +
32577 +       for (apic = 0; apic < nr_ioapics; apic++) {
32578 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
32579 +                       idx = find_irq_entry(apic,pin,mp_INT);
32580 +                       if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
32581 +                               return irq_trigger(idx);
32582 +               }
32583 +       }
32584 +       /*
32585 +        * Nonexistent IRQs default to edge triggering
32586 +        */
32587 +       return 0;
32588 +}
32589 +
32590 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
32591 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
32592 +
32593 +int assign_irq_vector(int irq)
32594 +{
32595 +       struct physdev_irq irq_op;
32596 +  
32597 +       BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
32598 +       if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
32599 +               return IO_APIC_VECTOR(irq);
32600 +
32601 +       irq_op.irq = irq;
32602 +       if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
32603 +               return -ENOSPC;
32604 +
32605 +       vector_irq[irq_op.vector] = irq;
32606 +       if (irq != AUTO_ASSIGN)
32607 +               IO_APIC_VECTOR(irq) = irq_op.vector;
32608 +
32609 +       return irq_op.vector;
32610 +}
32611 +
32612 +extern void (*interrupt[NR_IRQS])(void);
32613 +#ifndef CONFIG_XEN
32614 +static struct hw_interrupt_type ioapic_level_type;
32615 +static struct hw_interrupt_type ioapic_edge_type;
32616 +
32617 +#define IOAPIC_AUTO    -1
32618 +#define IOAPIC_EDGE    0
32619 +#define IOAPIC_LEVEL   1
32620 +
32621 +static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
32622 +{
32623 +       if (use_pci_vector() && !platform_legacy_irq(irq)) {
32624 +               if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
32625 +                               trigger == IOAPIC_LEVEL)
32626 +                       irq_desc[vector].handler = &ioapic_level_type;
32627 +               else
32628 +                       irq_desc[vector].handler = &ioapic_edge_type;
32629 +               set_intr_gate(vector, interrupt[vector]);
32630 +       } else  {
32631 +               if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
32632 +                               trigger == IOAPIC_LEVEL)
32633 +                       irq_desc[irq].handler = &ioapic_level_type;
32634 +               else
32635 +                       irq_desc[irq].handler = &ioapic_edge_type;
32636 +               set_intr_gate(vector, interrupt[irq]);
32637 +       }
32638 +}
32639 +#else
32640 +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
32641 +#endif /* !CONFIG_XEN */
32642 +
32643 +static void __init setup_IO_APIC_irqs(void)
32644 +{
32645 +       struct IO_APIC_route_entry entry;
32646 +       int apic, pin, idx, irq, first_notcon = 1, vector;
32647 +       unsigned long flags;
32648 +
32649 +       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
32650 +
32651 +       for (apic = 0; apic < nr_ioapics; apic++) {
32652 +       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
32653 +
32654 +               /*
32655 +                * add it to the IO-APIC irq-routing table:
32656 +                */
32657 +               memset(&entry,0,sizeof(entry));
32658 +
32659 +               entry.delivery_mode = INT_DELIVERY_MODE;
32660 +               entry.dest_mode = INT_DEST_MODE;
32661 +               entry.mask = 0;                         /* enable IRQ */
32662 +               entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
32663 +
32664 +               idx = find_irq_entry(apic,pin,mp_INT);
32665 +               if (idx == -1) {
32666 +                       if (first_notcon) {
32667 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
32668 +                               first_notcon = 0;
32669 +                       } else
32670 +                               apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
32671 +                       continue;
32672 +               }
32673 +
32674 +               entry.trigger = irq_trigger(idx);
32675 +               entry.polarity = irq_polarity(idx);
32676 +
32677 +               if (irq_trigger(idx)) {
32678 +                       entry.trigger = 1;
32679 +                       entry.mask = 1;
32680 +                       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
32681 +               }
32682 +
32683 +               irq = pin_2_irq(idx, apic, pin);
32684 +               add_pin_to_irq(irq, apic, pin);
32685 +
32686 +               if (/* !apic && */ !IO_APIC_IRQ(irq))
32687 +                       continue;
32688 +
32689 +               if (IO_APIC_IRQ(irq)) {
32690 +                       vector = assign_irq_vector(irq);
32691 +                       entry.vector = vector;
32692 +
32693 +                       ioapic_register_intr(irq, vector, IOAPIC_AUTO);
32694 +                       if (!apic && (irq < 16))
32695 +                               disable_8259A_irq(irq);
32696 +               }
32697 +               spin_lock_irqsave(&ioapic_lock, flags);
32698 +               io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
32699 +               io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
32700 +               set_native_irq_info(irq, TARGET_CPUS);
32701 +               spin_unlock_irqrestore(&ioapic_lock, flags);
32702 +       }
32703 +       }
32704 +
32705 +       if (!first_notcon)
32706 +               apic_printk(APIC_VERBOSE," not connected.\n");
32707 +}
32708 +
32709 +#ifndef CONFIG_XEN
32710 +/*
32711 + * Set up the 8259A-master output pin as broadcast to all
32712 + * CPUs.
32713 + */
32714 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
32715 +{
32716 +       struct IO_APIC_route_entry entry;
32717 +       unsigned long flags;
32718 +
32719 +       memset(&entry,0,sizeof(entry));
32720 +
32721 +       disable_8259A_irq(0);
32722 +
32723 +       /* mask LVT0 */
32724 +       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
32725 +
32726 +       /*
32727 +        * We use logical delivery to get the timer IRQ
32728 +        * to the first CPU.
32729 +        */
32730 +       entry.dest_mode = INT_DEST_MODE;
32731 +       entry.mask = 0;                                 /* unmask IRQ now */
32732 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
32733 +       entry.delivery_mode = INT_DELIVERY_MODE;
32734 +       entry.polarity = 0;
32735 +       entry.trigger = 0;
32736 +       entry.vector = vector;
32737 +
32738 +       /*
32739 +        * The timer IRQ doesn't have to know that behind the
32740 +        * scenes we have an 8259A-master in AEOI mode ...
32741 +        */
32742 +       irq_desc[0].handler = &ioapic_edge_type;
32743 +
32744 +       /*
32745 +        * Add it to the IO-APIC irq-routing table:
32746 +        */
32747 +       spin_lock_irqsave(&ioapic_lock, flags);
32748 +       io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
32749 +       io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
32750 +       spin_unlock_irqrestore(&ioapic_lock, flags);
32751 +
32752 +       enable_8259A_irq(0);
32753 +}
32754 +
32755 +void __init UNEXPECTED_IO_APIC(void)
32756 +{
32757 +}
32758 +
32759 +void __apicdebuginit print_IO_APIC(void)
32760 +{
32761 +       int apic, i;
32762 +       union IO_APIC_reg_00 reg_00;
32763 +       union IO_APIC_reg_01 reg_01;
32764 +       union IO_APIC_reg_02 reg_02;
32765 +       unsigned long flags;
32766 +
32767 +       if (apic_verbosity == APIC_QUIET)
32768 +               return;
32769 +
32770 +       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
32771 +       for (i = 0; i < nr_ioapics; i++)
32772 +               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
32773 +                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
32774 +
32775 +       /*
32776 +        * We are a bit conservative about what we expect.  We have to
32777 +        * know about every hardware change ASAP.
32778 +        */
32779 +       printk(KERN_INFO "testing the IO APIC.......................\n");
32780 +
32781 +       for (apic = 0; apic < nr_ioapics; apic++) {
32782 +
32783 +       spin_lock_irqsave(&ioapic_lock, flags);
32784 +       reg_00.raw = io_apic_read(apic, 0);
32785 +       reg_01.raw = io_apic_read(apic, 1);
32786 +       if (reg_01.bits.version >= 0x10)
32787 +               reg_02.raw = io_apic_read(apic, 2);
32788 +       spin_unlock_irqrestore(&ioapic_lock, flags);
32789 +
32790 +       printk("\n");
32791 +       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
32792 +       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
32793 +       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
32794 +       if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
32795 +               UNEXPECTED_IO_APIC();
32796 +
32797 +       printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
32798 +       printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
32799 +       if (    (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
32800 +               (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
32801 +               (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
32802 +               (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
32803 +               (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
32804 +               (reg_01.bits.entries != 0x2E) &&
32805 +               (reg_01.bits.entries != 0x3F) &&
32806 +               (reg_01.bits.entries != 0x03) 
32807 +       )
32808 +               UNEXPECTED_IO_APIC();
32809 +
32810 +       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
32811 +       printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
32812 +       if (    (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
32813 +               (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
32814 +               (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
32815 +               (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
32816 +               (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
32817 +               (reg_01.bits.version != 0x20)    /* Intel P64H (82806 AA) */
32818 +       )
32819 +               UNEXPECTED_IO_APIC();
32820 +       if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
32821 +               UNEXPECTED_IO_APIC();
32822 +
32823 +       if (reg_01.bits.version >= 0x10) {
32824 +               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
32825 +               printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
32826 +               if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
32827 +                       UNEXPECTED_IO_APIC();
32828 +       }
32829 +
32830 +       printk(KERN_DEBUG ".... IRQ redirection table:\n");
32831 +
32832 +       printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
32833 +                         " Stat Dest Deli Vect:   \n");
32834 +
32835 +       for (i = 0; i <= reg_01.bits.entries; i++) {
32836 +               struct IO_APIC_route_entry entry;
32837 +
32838 +               spin_lock_irqsave(&ioapic_lock, flags);
32839 +               *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
32840 +               *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
32841 +               spin_unlock_irqrestore(&ioapic_lock, flags);
32842 +
32843 +               printk(KERN_DEBUG " %02x %03X %02X  ",
32844 +                       i,
32845 +                       entry.dest.logical.logical_dest,
32846 +                       entry.dest.physical.physical_dest
32847 +               );
32848 +
32849 +               printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
32850 +                       entry.mask,
32851 +                       entry.trigger,
32852 +                       entry.irr,
32853 +                       entry.polarity,
32854 +                       entry.delivery_status,
32855 +                       entry.dest_mode,
32856 +                       entry.delivery_mode,
32857 +                       entry.vector
32858 +               );
32859 +       }
32860 +       }
32861 +       if (use_pci_vector())
32862 +               printk(KERN_INFO "Using vector-based indexing\n");
32863 +       printk(KERN_DEBUG "IRQ to pin mappings:\n");
32864 +       for (i = 0; i < NR_IRQS; i++) {
32865 +               struct irq_pin_list *entry = irq_2_pin + i;
32866 +               if (entry->pin < 0)
32867 +                       continue;
32868 +               if (use_pci_vector() && !platform_legacy_irq(i))
32869 +                       printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
32870 +               else
32871 +                       printk(KERN_DEBUG "IRQ%d ", i);
32872 +               for (;;) {
32873 +                       printk("-> %d:%d", entry->apic, entry->pin);
32874 +                       if (!entry->next)
32875 +                               break;
32876 +                       entry = irq_2_pin + entry->next;
32877 +               }
32878 +               printk("\n");
32879 +       }
32880 +
32881 +       printk(KERN_INFO ".................................... done.\n");
32882 +
32883 +       return;
32884 +}
32885 +
32886 +#if 0
32887 +
32888 +static __apicdebuginit void print_APIC_bitfield (int base)
32889 +{
32890 +       unsigned int v;
32891 +       int i, j;
32892 +
32893 +       if (apic_verbosity == APIC_QUIET)
32894 +               return;
32895 +
32896 +       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
32897 +       for (i = 0; i < 8; i++) {
32898 +               v = apic_read(base + i*0x10);
32899 +               for (j = 0; j < 32; j++) {
32900 +                       if (v & (1<<j))
32901 +                               printk("1");
32902 +                       else
32903 +                               printk("0");
32904 +               }
32905 +               printk("\n");
32906 +       }
32907 +}
32908 +
32909 +void __apicdebuginit print_local_APIC(void * dummy)
32910 +{
32911 +       unsigned int v, ver, maxlvt;
32912 +
32913 +       if (apic_verbosity == APIC_QUIET)
32914 +               return;
32915 +
32916 +       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
32917 +               smp_processor_id(), hard_smp_processor_id());
32918 +       v = apic_read(APIC_ID);
32919 +       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
32920 +       v = apic_read(APIC_LVR);
32921 +       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
32922 +       ver = GET_APIC_VERSION(v);
32923 +       maxlvt = get_maxlvt();
32924 +
32925 +       v = apic_read(APIC_TASKPRI);
32926 +       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
32927 +
32928 +       v = apic_read(APIC_ARBPRI);
32929 +       printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
32930 +               v & APIC_ARBPRI_MASK);
32931 +       v = apic_read(APIC_PROCPRI);
32932 +       printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
32933 +
32934 +       v = apic_read(APIC_EOI);
32935 +       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
32936 +       v = apic_read(APIC_RRR);
32937 +       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
32938 +       v = apic_read(APIC_LDR);
32939 +       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
32940 +       v = apic_read(APIC_DFR);
32941 +       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
32942 +       v = apic_read(APIC_SPIV);
32943 +       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
32944 +
32945 +       printk(KERN_DEBUG "... APIC ISR field:\n");
32946 +       print_APIC_bitfield(APIC_ISR);
32947 +       printk(KERN_DEBUG "... APIC TMR field:\n");
32948 +       print_APIC_bitfield(APIC_TMR);
32949 +       printk(KERN_DEBUG "... APIC IRR field:\n");
32950 +       print_APIC_bitfield(APIC_IRR);
32951 +
32952 +       v = apic_read(APIC_ESR);
32953 +       printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
32954 +
32955 +       v = apic_read(APIC_ICR);
32956 +       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
32957 +       v = apic_read(APIC_ICR2);
32958 +       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
32959 +
32960 +       v = apic_read(APIC_LVTT);
32961 +       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
32962 +
32963 +       if (maxlvt > 3) {                       /* PC is LVT#4. */
32964 +               v = apic_read(APIC_LVTPC);
32965 +               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
32966 +       }
32967 +       v = apic_read(APIC_LVT0);
32968 +       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
32969 +       v = apic_read(APIC_LVT1);
32970 +       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
32971 +
32972 +       if (maxlvt > 2) {                       /* ERR is LVT#3. */
32973 +               v = apic_read(APIC_LVTERR);
32974 +               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
32975 +       }
32976 +
32977 +       v = apic_read(APIC_TMICT);
32978 +       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
32979 +       v = apic_read(APIC_TMCCT);
32980 +       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
32981 +       v = apic_read(APIC_TDCR);
32982 +       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
32983 +       printk("\n");
32984 +}
32985 +
32986 +void print_all_local_APICs (void)
32987 +{
32988 +       on_each_cpu(print_local_APIC, NULL, 1, 1);
32989 +}
32990 +
32991 +void __apicdebuginit print_PIC(void)
32992 +{
32993 +       unsigned int v;
32994 +       unsigned long flags;
32995 +
32996 +       if (apic_verbosity == APIC_QUIET)
32997 +               return;
32998 +
32999 +       printk(KERN_DEBUG "\nprinting PIC contents\n");
33000 +
33001 +       spin_lock_irqsave(&i8259A_lock, flags);
33002 +
33003 +       v = inb(0xa1) << 8 | inb(0x21);
33004 +       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
33005 +
33006 +       v = inb(0xa0) << 8 | inb(0x20);
33007 +       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
33008 +
33009 +       outb(0x0b,0xa0);
33010 +       outb(0x0b,0x20);
33011 +       v = inb(0xa0) << 8 | inb(0x20);
33012 +       outb(0x0a,0xa0);
33013 +       outb(0x0a,0x20);
33014 +
33015 +       spin_unlock_irqrestore(&i8259A_lock, flags);
33016 +
33017 +       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
33018 +
33019 +       v = inb(0x4d1) << 8 | inb(0x4d0);
33020 +       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
33021 +}
33022 +
33023 +#endif  /*  0  */
33024 +
33025 +#else
33026 +void __init print_IO_APIC(void) { }
33027 +#endif /* !CONFIG_XEN */
33028 +
33029 +static void __init enable_IO_APIC(void)
33030 +{
33031 +       union IO_APIC_reg_01 reg_01;
33032 +#ifndef CONFIG_XEN
33033 +       int i8259_apic, i8259_pin;
33034 +#endif
33035 +       int i, apic;
33036 +       unsigned long flags;
33037 +
33038 +       for (i = 0; i < PIN_MAP_SIZE; i++) {
33039 +               irq_2_pin[i].pin = -1;
33040 +               irq_2_pin[i].next = 0;
33041 +       }
33042 +       if (!pirqs_enabled)
33043 +               for (i = 0; i < MAX_PIRQS; i++)
33044 +                       pirq_entries[i] = -1;
33045 +
33046 +       /*
33047 +        * The number of IO-APIC IRQ registers (== #pins):
33048 +        */
33049 +       for (apic = 0; apic < nr_ioapics; apic++) {
33050 +               spin_lock_irqsave(&ioapic_lock, flags);
33051 +               reg_01.raw = io_apic_read(apic, 1);
33052 +               spin_unlock_irqrestore(&ioapic_lock, flags);
33053 +               nr_ioapic_registers[apic] = reg_01.bits.entries+1;
33054 +       }
33055 +#ifndef CONFIG_XEN
33056 +       for(apic = 0; apic < nr_ioapics; apic++) {
33057 +               int pin;
33058 +               /* See if any of the pins is in ExtINT mode */
33059 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
33060 +                       struct IO_APIC_route_entry entry;
33061 +                       spin_lock_irqsave(&ioapic_lock, flags);
33062 +                       *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
33063 +                       *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
33064 +                       spin_unlock_irqrestore(&ioapic_lock, flags);
33065 +
33066 +
33067 +                       /* If the interrupt line is enabled and in ExtInt mode
33068 +                        * I have found the pin where the i8259 is connected.
33069 +                        */
33070 +                       if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
33071 +                               ioapic_i8259.apic = apic;
33072 +                               ioapic_i8259.pin  = pin;
33073 +                               goto found_i8259;
33074 +                       }
33075 +               }
33076 +       }
33077 + found_i8259:
33078 +       /* Look to see if the MP table has reported the ExtINT */
33079 +       i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
33080 +       i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
33081 +       /* Trust the MP table if nothing is setup in the hardware */
33082 +       if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
33083 +               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
33084 +               ioapic_i8259.pin  = i8259_pin;
33085 +               ioapic_i8259.apic = i8259_apic;
33086 +       }
33087 +       /* Complain if the MP table and the hardware disagree */
33088 +       if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
33089 +               (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
33090 +       {
33091 +               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
33092 +       }
33093 +#endif
33094 +
33095 +       /*
33096 +        * Do not trust the IO-APIC being empty at bootup
33097 +        */
33098 +       clear_IO_APIC();
33099 +}
33100 +
33101 +/*
33102 + * Not an __init, needed by the reboot code
33103 + */
33104 +void disable_IO_APIC(void)
33105 +{
33106 +       /*
33107 +        * Clear the IO-APIC before rebooting:
33108 +        */
33109 +       clear_IO_APIC();
33110 +
33111 +#ifndef CONFIG_XEN
33112 +       /*
33113 +        * If the i8259 is routed through an IOAPIC,
33114 +        * put that IOAPIC in virtual wire mode
33115 +        * so legacy interrupts can be delivered.
33116 +        */
33117 +       if (ioapic_i8259.pin != -1) {
33118 +               struct IO_APIC_route_entry entry;
33119 +               unsigned long flags;
33120 +
33121 +               memset(&entry, 0, sizeof(entry));
33122 +               entry.mask            = 0; /* Enabled */
33123 +               entry.trigger         = 0; /* Edge */
33124 +               entry.irr             = 0;
33125 +               entry.polarity        = 0; /* High */
33126 +               entry.delivery_status = 0;
33127 +               entry.dest_mode       = 0; /* Physical */
33128 +               entry.delivery_mode   = dest_ExtINT; /* ExtInt */
33129 +               entry.vector          = 0;
33130 +               entry.dest.physical.physical_dest =
33131 +                                       GET_APIC_ID(apic_read(APIC_ID));
33132 +
33133 +               /*
33134 +                * Add it to the IO-APIC irq-routing table:
33135 +                */
33136 +               spin_lock_irqsave(&ioapic_lock, flags);
33137 +               io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
33138 +                       *(((int *)&entry)+1));
33139 +               io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
33140 +                       *(((int *)&entry)+0));
33141 +               spin_unlock_irqrestore(&ioapic_lock, flags);
33142 +       }
33143 +
33144 +       disconnect_bsp_APIC(ioapic_i8259.pin != -1);
33145 +#endif
33146 +}
33147 +
33148 +/*
33149 + * function to set the IO-APIC physical IDs based on the
33150 + * values stored in the MPC table.
33151 + *
33152 + * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
33153 + */
33154 +
33155 +#ifndef CONFIG_XEN
33156 +static void __init setup_ioapic_ids_from_mpc (void)
33157 +{
33158 +       union IO_APIC_reg_00 reg_00;
33159 +       int apic;
33160 +       int i;
33161 +       unsigned char old_id;
33162 +       unsigned long flags;
33163 +
33164 +       /*
33165 +        * Set the IOAPIC ID to the value stored in the MPC table.
33166 +        */
33167 +       for (apic = 0; apic < nr_ioapics; apic++) {
33168 +
33169 +               /* Read the register 0 value */
33170 +               spin_lock_irqsave(&ioapic_lock, flags);
33171 +               reg_00.raw = io_apic_read(apic, 0);
33172 +               spin_unlock_irqrestore(&ioapic_lock, flags);
33173 +               
33174 +               old_id = mp_ioapics[apic].mpc_apicid;
33175 +
33176 +
33177 +               printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
33178 +
33179 +
33180 +               /*
33181 +                * We need to adjust the IRQ routing table
33182 +                * if the ID changed.
33183 +                */
33184 +               if (old_id != mp_ioapics[apic].mpc_apicid)
33185 +                       for (i = 0; i < mp_irq_entries; i++)
33186 +                               if (mp_irqs[i].mpc_dstapic == old_id)
33187 +                                       mp_irqs[i].mpc_dstapic
33188 +                                               = mp_ioapics[apic].mpc_apicid;
33189 +
33190 +               /*
33191 +                * Read the right value from the MPC table and
33192 +                * write it into the ID register.
33193 +                */
33194 +               apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
33195 +                               mp_ioapics[apic].mpc_apicid);
33196 +
33197 +               reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
33198 +               spin_lock_irqsave(&ioapic_lock, flags);
33199 +               io_apic_write(apic, 0, reg_00.raw);
33200 +               spin_unlock_irqrestore(&ioapic_lock, flags);
33201 +
33202 +               /*
33203 +                * Sanity check
33204 +                */
33205 +               spin_lock_irqsave(&ioapic_lock, flags);
33206 +               reg_00.raw = io_apic_read(apic, 0);
33207 +               spin_unlock_irqrestore(&ioapic_lock, flags);
33208 +               if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
33209 +                       printk("could not set ID!\n");
33210 +               else
33211 +                       apic_printk(APIC_VERBOSE," ok.\n");
33212 +       }
33213 +}
33214 +#else
33215 +static void __init setup_ioapic_ids_from_mpc(void) { }
33216 +#endif
33217 +
33218 +/*
33219 + * There is a nasty bug in some older SMP boards: their mptable lies
33220 + * about the timer IRQ. We do the following to work around the situation:
33221 + *
33222 + *     - timer IRQ defaults to IO-APIC IRQ
33223 + *     - if this function detects that timer IRQs are defunct, then we fall
33224 + *       back to ISA timer IRQs
33225 + */
33226 +#ifndef CONFIG_XEN
33227 +static int __init timer_irq_works(void)
33228 +{
33229 +       unsigned long t1 = jiffies;
33230 +
33231 +       local_irq_enable();
33232 +       /* Let ten ticks pass... */
33233 +       mdelay((10 * 1000) / HZ);
33234 +
33235 +       /*
33236 +        * Expect a few ticks at least, to be sure some possible
33237 +        * glue logic does not lock up after one or two first
33238 +        * ticks in a non-ExtINT mode.  Also the local APIC
33239 +        * might have cached one ExtINT interrupt.  Finally, at
33240 +        * least one tick may be lost due to delays.
33241 +        */
33242 +
33243 +       /* jiffies wrap? */
33244 +       if (jiffies - t1 > 4)
33245 +               return 1;
33246 +       return 0;
33247 +}
33248 +
33249 +/*
33250 + * In the SMP+IOAPIC case it might happen that there are an unspecified
33251 + * number of pending IRQ events unhandled. These cases are very rare,
33252 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
33253 + * better to do it this way as thus we do not have to be aware of
33254 + * 'pending' interrupts in the IRQ path, except at this point.
33255 + */
33256 +/*
33257 + * Edge triggered needs to resend any interrupt
33258 + * that was delayed but this is now handled in the device
33259 + * independent code.
33260 + */
33261 +
33262 +/*
33263 + * Starting up an edge-triggered IO-APIC interrupt is
33264 + * nasty - we need to make sure that we get the edge.
33265 + * If it is already asserted for some reason, we need to
33266 + * return 1 to indicate that it was pending.
33267 + *
33268 + * This is not complete - we should be able to fake
33269 + * an edge even if it isn't on the 8259A...
33270 + */
33271 +
33272 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
33273 +{
33274 +       int was_pending = 0;
33275 +       unsigned long flags;
33276 +
33277 +       spin_lock_irqsave(&ioapic_lock, flags);
33278 +       if (irq < 16) {
33279 +               disable_8259A_irq(irq);
33280 +               if (i8259A_irq_pending(irq))
33281 +                       was_pending = 1;
33282 +       }
33283 +       __unmask_IO_APIC_irq(irq);
33284 +       spin_unlock_irqrestore(&ioapic_lock, flags);
33285 +
33286 +       return was_pending;
33287 +}
33288 +
33289 +/*
33290 + * Once we have recorded IRQ_PENDING already, we can mask the
33291 + * interrupt for real. This prevents IRQ storms from unhandled
33292 + * devices.
33293 + */
33294 +static void ack_edge_ioapic_irq(unsigned int irq)
33295 +{
33296 +       move_irq(irq);
33297 +       if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
33298 +                                       == (IRQ_PENDING | IRQ_DISABLED))
33299 +               mask_IO_APIC_irq(irq);
33300 +       ack_APIC_irq();
33301 +}
33302 +
33303 +/*
33304 + * Level triggered interrupts can just be masked,
33305 + * and shutting down and starting up the interrupt
33306 + * is the same as enabling and disabling them -- except
33307 + * with a startup need to return a "was pending" value.
33308 + *
33309 + * Level triggered interrupts are special because we
33310 + * do not touch any IO-APIC register while handling
33311 + * them. We ack the APIC in the end-IRQ handler, not
33312 + * in the start-IRQ-handler. Protection against reentrance
33313 + * from the same interrupt is still provided, both by the
33314 + * generic IRQ layer and by the fact that an unacked local
33315 + * APIC does not accept IRQs.
33316 + */
33317 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
33318 +{
33319 +       unmask_IO_APIC_irq(irq);
33320 +
33321 +       return 0; /* don't check for pending */
33322 +}
33323 +
33324 +static void end_level_ioapic_irq (unsigned int irq)
33325 +{
33326 +       move_irq(irq);
33327 +       ack_APIC_irq();
33328 +}
33329 +
33330 +#ifdef CONFIG_PCI_MSI
33331 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
33332 +{
33333 +       int irq = vector_to_irq(vector);
33334 +
33335 +       return startup_edge_ioapic_irq(irq);
33336 +}
33337 +
33338 +static void ack_edge_ioapic_vector(unsigned int vector)
33339 +{
33340 +       int irq = vector_to_irq(vector);
33341 +
33342 +       move_native_irq(vector);
33343 +       ack_edge_ioapic_irq(irq);
33344 +}
33345 +
33346 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
33347 +{
33348 +       int irq = vector_to_irq(vector);
33349 +
33350 +       return startup_level_ioapic_irq (irq);
33351 +}
33352 +
33353 +static void end_level_ioapic_vector (unsigned int vector)
33354 +{
33355 +       int irq = vector_to_irq(vector);
33356 +
33357 +       move_native_irq(vector);
33358 +       end_level_ioapic_irq(irq);
33359 +}
33360 +
33361 +static void mask_IO_APIC_vector (unsigned int vector)
33362 +{
33363 +       int irq = vector_to_irq(vector);
33364 +
33365 +       mask_IO_APIC_irq(irq);
33366 +}
33367 +
33368 +static void unmask_IO_APIC_vector (unsigned int vector)
33369 +{
33370 +       int irq = vector_to_irq(vector);
33371 +
33372 +       unmask_IO_APIC_irq(irq);
33373 +}
33374 +
33375 +#ifdef CONFIG_SMP
33376 +static void set_ioapic_affinity_vector (unsigned int vector,
33377 +                                       cpumask_t cpu_mask)
33378 +{
33379 +       int irq = vector_to_irq(vector);
33380 +
33381 +       set_native_irq_info(vector, cpu_mask);
33382 +       set_ioapic_affinity_irq(irq, cpu_mask);
33383 +}
33384 +#endif // CONFIG_SMP
33385 +#endif // CONFIG_PCI_MSI
33386 +
33387 +/*
33388 + * Level and edge triggered IO-APIC interrupts need different handling,
33389 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
33390 + * handled with the level-triggered descriptor, but that one has slightly
33391 + * more overhead. Level-triggered interrupts cannot be handled with the
33392 + * edge-triggered handler, without risking IRQ storms and other ugly
33393 + * races.
33394 + */
33395 +
33396 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
33397 +       .typename = "IO-APIC-edge",
33398 +       .startup        = startup_edge_ioapic,
33399 +       .shutdown       = shutdown_edge_ioapic,
33400 +       .enable         = enable_edge_ioapic,
33401 +       .disable        = disable_edge_ioapic,
33402 +       .ack            = ack_edge_ioapic,
33403 +       .end            = end_edge_ioapic,
33404 +#ifdef CONFIG_SMP
33405 +       .set_affinity = set_ioapic_affinity,
33406 +#endif
33407 +};
33408 +
33409 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
33410 +       .typename = "IO-APIC-level",
33411 +       .startup        = startup_level_ioapic,
33412 +       .shutdown       = shutdown_level_ioapic,
33413 +       .enable         = enable_level_ioapic,
33414 +       .disable        = disable_level_ioapic,
33415 +       .ack            = mask_and_ack_level_ioapic,
33416 +       .end            = end_level_ioapic,
33417 +#ifdef CONFIG_SMP
33418 +       .set_affinity = set_ioapic_affinity,
33419 +#endif
33420 +};
33421 +#endif /* !CONFIG_XEN */
33422 +
33423 +static inline void init_IO_APIC_traps(void)
33424 +{
33425 +       int irq;
33426 +
33427 +       /*
33428 +        * NOTE! The local APIC isn't very good at handling
33429 +        * multiple interrupts at the same interrupt level.
33430 +        * As the interrupt level is determined by taking the
33431 +        * vector number and shifting that right by 4, we
33432 +        * want to spread these out a bit so that they don't
33433 +        * all fall in the same interrupt level.
33434 +        *
33435 +        * Also, we've got to be careful not to trash gate
33436 +        * 0x80, because int 0x80 is hm, kind of importantish. ;)
33437 +        */
33438 +       for (irq = 0; irq < NR_IRQS ; irq++) {
33439 +               int tmp = irq;
33440 +               if (use_pci_vector()) {
33441 +                       if (!platform_legacy_irq(tmp))
33442 +                               if ((tmp = vector_to_irq(tmp)) == -1)
33443 +                                       continue;
33444 +               }
33445 +               if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
33446 +                       /*
33447 +                        * Hmm.. We don't have an entry for this,
33448 +                        * so default to an old-fashioned 8259
33449 +                        * interrupt if we can..
33450 +                        */
33451 +                       if (irq < 16)
33452 +                               make_8259A_irq(irq);
33453 +#ifndef CONFIG_XEN
33454 +                       else
33455 +                               /* Strange. Oh, well.. */
33456 +                               irq_desc[irq].handler = &no_irq_type;
33457 +#endif
33458 +               }
33459 +       }
33460 +}
33461 +
33462 +#ifndef CONFIG_XEN
33463 +static void enable_lapic_irq (unsigned int irq)
33464 +{
33465 +       unsigned long v;
33466 +
33467 +       v = apic_read(APIC_LVT0);
33468 +       apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
33469 +}
33470 +
33471 +static void disable_lapic_irq (unsigned int irq)
33472 +{
33473 +       unsigned long v;
33474 +
33475 +       v = apic_read(APIC_LVT0);
33476 +       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
33477 +}
33478 +
33479 +static void ack_lapic_irq (unsigned int irq)
33480 +{
33481 +       ack_APIC_irq();
33482 +}
33483 +
33484 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
33485 +
33486 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
33487 +       .typename = "local-APIC-edge",
33488 +       .startup = NULL, /* startup_irq() not used for IRQ0 */
33489 +       .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
33490 +       .enable = enable_lapic_irq,
33491 +       .disable = disable_lapic_irq,
33492 +       .ack = ack_lapic_irq,
33493 +       .end = end_lapic_irq,
33494 +};
33495 +
33496 +static void setup_nmi (void)
33497 +{
33498 +       /*
33499 +        * Dirty trick to enable the NMI watchdog ...
33500 +        * We put the 8259A master into AEOI mode and
33501 +        * unmask on all local APICs LVT0 as NMI.
33502 +        *
33503 +        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
33504 +        * is from Maciej W. Rozycki - so we do not have to EOI from
33505 +        * the NMI handler or the timer interrupt.
33506 +        */ 
33507 +       printk(KERN_INFO "activating NMI Watchdog ...");
33508 +
33509 +       enable_NMI_through_LVT0(NULL);
33510 +
33511 +       printk(" done.\n");
33512 +}
33513 +
33514 +/*
33515 + * This looks a bit hackish but it's about the only way of sending
33516 + * a few INTA cycles to 8259As and any associated glue logic.  ICR does
33517 + * not support the ExtINT mode, unfortunately.  We need to send these
33518 + * cycles as some i82489DX-based boards have glue logic that keeps the
33519 + * 8259A interrupt line asserted until INTA.  --macro
33520 + */
33521 +static inline void unlock_ExtINT_logic(void)
33522 +{
33523 +       int apic, pin, i;
33524 +       struct IO_APIC_route_entry entry0, entry1;
33525 +       unsigned char save_control, save_freq_select;
33526 +       unsigned long flags;
33527 +
33528 +       pin  = find_isa_irq_pin(8, mp_INT);
33529 +       apic = find_isa_irq_apic(8, mp_INT);
33530 +       if (pin == -1)
33531 +               return;
33532 +
33533 +       spin_lock_irqsave(&ioapic_lock, flags);
33534 +       *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
33535 +       *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
33536 +       spin_unlock_irqrestore(&ioapic_lock, flags);
33537 +       clear_IO_APIC_pin(apic, pin);
33538 +
33539 +       memset(&entry1, 0, sizeof(entry1));
33540 +
33541 +       entry1.dest_mode = 0;                   /* physical delivery */
33542 +       entry1.mask = 0;                        /* unmask IRQ now */
33543 +       entry1.dest.physical.physical_dest = hard_smp_processor_id();
33544 +       entry1.delivery_mode = dest_ExtINT;
33545 +       entry1.polarity = entry0.polarity;
33546 +       entry1.trigger = 0;
33547 +       entry1.vector = 0;
33548 +
33549 +       spin_lock_irqsave(&ioapic_lock, flags);
33550 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
33551 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
33552 +       spin_unlock_irqrestore(&ioapic_lock, flags);
33553 +
33554 +       save_control = CMOS_READ(RTC_CONTROL);
33555 +       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
33556 +       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
33557 +                  RTC_FREQ_SELECT);
33558 +       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
33559 +
33560 +       i = 100;
33561 +       while (i-- > 0) {
33562 +               mdelay(10);
33563 +               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
33564 +                       i -= 10;
33565 +       }
33566 +
33567 +       CMOS_WRITE(save_control, RTC_CONTROL);
33568 +       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
33569 +       clear_IO_APIC_pin(apic, pin);
33570 +
33571 +       spin_lock_irqsave(&ioapic_lock, flags);
33572 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
33573 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
33574 +       spin_unlock_irqrestore(&ioapic_lock, flags);
33575 +}
33576 +
33577 +int timer_uses_ioapic_pin_0;
33578 +
33579 +/*
33580 + * This code may look a bit paranoid, but it's supposed to cooperate with
33581 + * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
33582 + * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
33583 + * fanatically on his truly buggy board.
33584 + *
33585 + * FIXME: really need to revamp this for modern platforms only.
33586 + */
33587 +static inline void check_timer(void)
33588 +{
33589 +       int apic1, pin1, apic2, pin2;
33590 +       int vector;
33591 +
33592 +       /*
33593 +        * get/set the timer IRQ vector:
33594 +        */
33595 +       disable_8259A_irq(0);
33596 +       vector = assign_irq_vector(0);
33597 +       set_intr_gate(vector, interrupt[0]);
33598 +
33599 +       /*
33600 +        * Subtle: code in do_timer_interrupt() expects an AEOI
33601 +        * mode for the 8259A whenever interrupts are routed
33602 +        * through I/O APICs.  Also IRQ0 has to be enabled in
33603 +        * the 8259A which implies the virtual wire has to be
33604 +        * disabled in the local APIC.
33605 +        */
33606 +       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
33607 +       init_8259A(1);
33608 +       if (timer_over_8254 > 0)
33609 +               enable_8259A_irq(0);
33610 +
33611 +       pin1  = find_isa_irq_pin(0, mp_INT);
33612 +       apic1 = find_isa_irq_apic(0, mp_INT);
33613 +       pin2  = ioapic_i8259.pin;
33614 +       apic2 = ioapic_i8259.apic;
33615 +
33616 +       if (pin1 == 0)
33617 +               timer_uses_ioapic_pin_0 = 1;
33618 +
33619 +       apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
33620 +               vector, apic1, pin1, apic2, pin2);
33621 +
33622 +       if (pin1 != -1) {
33623 +               /*
33624 +                * Ok, does IRQ0 through the IOAPIC work?
33625 +                */
33626 +               unmask_IO_APIC_irq(0);
33627 +               if (!no_timer_check && timer_irq_works()) {
33628 +                       nmi_watchdog_default();
33629 +                       if (nmi_watchdog == NMI_IO_APIC) {
33630 +                               disable_8259A_irq(0);
33631 +                               setup_nmi();
33632 +                               enable_8259A_irq(0);
33633 +                       }
33634 +                       if (disable_timer_pin_1 > 0)
33635 +                               clear_IO_APIC_pin(0, pin1);
33636 +                       return;
33637 +               }
33638 +               clear_IO_APIC_pin(apic1, pin1);
33639 +               apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
33640 +                               "connected to IO-APIC\n");
33641 +       }
33642 +
33643 +       apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
33644 +                               "through the 8259A ... ");
33645 +       if (pin2 != -1) {
33646 +               apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
33647 +                       apic2, pin2);
33648 +               /*
33649 +                * legacy devices should be connected to IO APIC #0
33650 +                */
33651 +               setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
33652 +               if (timer_irq_works()) {
33653 +                       apic_printk(APIC_VERBOSE," works.\n");
33654 +                       nmi_watchdog_default();
33655 +                       if (nmi_watchdog == NMI_IO_APIC) {
33656 +                               setup_nmi();
33657 +                       }
33658 +                       return;
33659 +               }
33660 +               /*
33661 +                * Cleanup, just in case ...
33662 +                */
33663 +               clear_IO_APIC_pin(apic2, pin2);
33664 +       }
33665 +       apic_printk(APIC_VERBOSE," failed.\n");
33666 +
33667 +       if (nmi_watchdog == NMI_IO_APIC) {
33668 +               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
33669 +               nmi_watchdog = 0;
33670 +       }
33671 +
33672 +       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
33673 +
33674 +       disable_8259A_irq(0);
33675 +       irq_desc[0].handler = &lapic_irq_type;
33676 +       apic_write(APIC_LVT0, APIC_DM_FIXED | vector);  /* Fixed mode */
33677 +       enable_8259A_irq(0);
33678 +
33679 +       if (timer_irq_works()) {
33680 +               apic_printk(APIC_VERBOSE," works.\n");
33681 +               return;
33682 +       }
33683 +       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
33684 +       apic_printk(APIC_VERBOSE," failed.\n");
33685 +
33686 +       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
33687 +
33688 +       init_8259A(0);
33689 +       make_8259A_irq(0);
33690 +       apic_write(APIC_LVT0, APIC_DM_EXTINT);
33691 +
33692 +       unlock_ExtINT_logic();
33693 +
33694 +       if (timer_irq_works()) {
33695 +               apic_printk(APIC_VERBOSE," works.\n");
33696 +               return;
33697 +       }
33698 +       apic_printk(APIC_VERBOSE," failed :(.\n");
33699 +       panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
33700 +}
33701 +#else
33702 +int timer_uses_ioapic_pin_0;
33703 +#define check_timer() ((void)0)
33704 +#endif /* !CONFIG_XEN */
33705 +
33706 +static int __init notimercheck(char *s)
33707 +{
33708 +       no_timer_check = 1;
33709 +       return 1;
33710 +}
33711 +__setup("no_timer_check", notimercheck);
33712 +
33713 +/*
33714 + *
33715 + * IRQ's that are handled by the PIC in the MPS IOAPIC case.
33716 + * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
33717 + *   Linux doesn't really care, as it's not actually used
33718 + *   for any interrupt handling anyway.
33719 + */
33720 +#define PIC_IRQS       (1<<2)
33721 +
33722 +void __init setup_IO_APIC(void)
33723 +{
33724 +       enable_IO_APIC();
33725 +
33726 +       if (acpi_ioapic)
33727 +               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
33728 +       else
33729 +               io_apic_irqs = ~PIC_IRQS;
33730 +
33731 +       apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
33732 +
33733 +       /*
33734 +        * Set up the IO-APIC IRQ routing table.
33735 +        */
33736 +       if (!acpi_ioapic)
33737 +               setup_ioapic_ids_from_mpc();
33738 +#ifndef CONFIG_XEN
33739 +       sync_Arb_IDs();
33740 +#endif /* !CONFIG_XEN */
33741 +       setup_IO_APIC_irqs();
33742 +       init_IO_APIC_traps();
33743 +       check_timer();
33744 +       if (!acpi_ioapic)
33745 +               print_IO_APIC();
33746 +}
33747 +
33748 +struct sysfs_ioapic_data {
33749 +       struct sys_device dev;
33750 +       struct IO_APIC_route_entry entry[0];
33751 +};
33752 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
33753 +
33754 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
33755 +{
33756 +       struct IO_APIC_route_entry *entry;
33757 +       struct sysfs_ioapic_data *data;
33758 +       unsigned long flags;
33759 +       int i;
33760 +
33761 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
33762 +       entry = data->entry;
33763 +       spin_lock_irqsave(&ioapic_lock, flags);
33764 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
33765 +               *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
33766 +               *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
33767 +       }
33768 +       spin_unlock_irqrestore(&ioapic_lock, flags);
33769 +
33770 +       return 0;
33771 +}
33772 +
33773 +static int ioapic_resume(struct sys_device *dev)
33774 +{
33775 +       struct IO_APIC_route_entry *entry;
33776 +       struct sysfs_ioapic_data *data;
33777 +       unsigned long flags;
33778 +       union IO_APIC_reg_00 reg_00;
33779 +       int i;
33780 +
33781 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
33782 +       entry = data->entry;
33783 +
33784 +       spin_lock_irqsave(&ioapic_lock, flags);
33785 +       reg_00.raw = io_apic_read(dev->id, 0);
33786 +       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
33787 +               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
33788 +               io_apic_write(dev->id, 0, reg_00.raw);
33789 +       }
33790 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
33791 +               io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
33792 +               io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
33793 +       }
33794 +       spin_unlock_irqrestore(&ioapic_lock, flags);
33795 +
33796 +       return 0;
33797 +}
33798 +
33799 +static struct sysdev_class ioapic_sysdev_class = {
33800 +       set_kset_name("ioapic"),
33801 +       .suspend = ioapic_suspend,
33802 +       .resume = ioapic_resume,
33803 +};
33804 +
33805 +static int __init ioapic_init_sysfs(void)
33806 +{
33807 +       struct sys_device * dev;
33808 +       int i, size, error = 0;
33809 +
33810 +       error = sysdev_class_register(&ioapic_sysdev_class);
33811 +       if (error)
33812 +               return error;
33813 +
33814 +       for (i = 0; i < nr_ioapics; i++ ) {
33815 +               size = sizeof(struct sys_device) + nr_ioapic_registers[i]
33816 +                       * sizeof(struct IO_APIC_route_entry);
33817 +               mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
33818 +               if (!mp_ioapic_data[i]) {
33819 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
33820 +                       continue;
33821 +               }
33822 +               memset(mp_ioapic_data[i], 0, size);
33823 +               dev = &mp_ioapic_data[i]->dev;
33824 +               dev->id = i;
33825 +               dev->cls = &ioapic_sysdev_class;
33826 +               error = sysdev_register(dev);
33827 +               if (error) {
33828 +                       kfree(mp_ioapic_data[i]);
33829 +                       mp_ioapic_data[i] = NULL;
33830 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
33831 +                       continue;
33832 +               }
33833 +       }
33834 +
33835 +       return 0;
33836 +}
33837 +
33838 +device_initcall(ioapic_init_sysfs);
33839 +
33840 +/* --------------------------------------------------------------------------
33841 +                          ACPI-based IOAPIC Configuration
33842 +   -------------------------------------------------------------------------- */
33843 +
33844 +#ifdef CONFIG_ACPI
33845 +
33846 +#define IO_APIC_MAX_ID         0xFE
33847 +
33848 +int __init io_apic_get_version (int ioapic)
33849 +{
33850 +       union IO_APIC_reg_01    reg_01;
33851 +       unsigned long flags;
33852 +
33853 +       spin_lock_irqsave(&ioapic_lock, flags);
33854 +       reg_01.raw = io_apic_read(ioapic, 1);
33855 +       spin_unlock_irqrestore(&ioapic_lock, flags);
33856 +
33857 +       return reg_01.bits.version;
33858 +}
33859 +
33860 +
33861 +int __init io_apic_get_redir_entries (int ioapic)
33862 +{
33863 +       union IO_APIC_reg_01    reg_01;
33864 +       unsigned long flags;
33865 +
33866 +       spin_lock_irqsave(&ioapic_lock, flags);
33867 +       reg_01.raw = io_apic_read(ioapic, 1);
33868 +       spin_unlock_irqrestore(&ioapic_lock, flags);
33869 +
33870 +       return reg_01.bits.entries;
33871 +}
33872 +
33873 +
33874 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
33875 +{
33876 +       struct IO_APIC_route_entry entry;
33877 +       unsigned long flags;
33878 +
33879 +       if (!IO_APIC_IRQ(irq)) {
33880 +               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
33881 +                       ioapic);
33882 +               return -EINVAL;
33883 +       }
33884 +
33885 +       /*
33886 +        * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
33887 +        * Note that we mask (disable) IRQs now -- these get enabled when the
33888 +        * corresponding device driver registers for this IRQ.
33889 +        */
33890 +
33891 +       memset(&entry,0,sizeof(entry));
33892 +
33893 +       entry.delivery_mode = INT_DELIVERY_MODE;
33894 +       entry.dest_mode = INT_DEST_MODE;
33895 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
33896 +       entry.trigger = edge_level;
33897 +       entry.polarity = active_high_low;
33898 +       entry.mask = 1;                                  /* Disabled (masked) */
33899 +
33900 +       irq = gsi_irq_sharing(irq);
33901 +       /*
33902 +        * IRQs < 16 are already in the irq_2_pin[] map
33903 +        */
33904 +       if (irq >= 16)
33905 +               add_pin_to_irq(irq, ioapic, pin);
33906 +
33907 +       entry.vector = assign_irq_vector(irq);
33908 +
33909 +       apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
33910 +               "IRQ %d Mode:%i Active:%i)\n", ioapic, 
33911 +              mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
33912 +              edge_level, active_high_low);
33913 +
33914 +       ioapic_register_intr(irq, entry.vector, edge_level);
33915 +
33916 +       if (!ioapic && (irq < 16))
33917 +               disable_8259A_irq(irq);
33918 +
33919 +       spin_lock_irqsave(&ioapic_lock, flags);
33920 +       io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
33921 +       io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
33922 +       set_native_irq_info(use_pci_vector() ?  entry.vector : irq, TARGET_CPUS);
33923 +       spin_unlock_irqrestore(&ioapic_lock, flags);
33924 +
33925 +       return 0;
33926 +}
33927 +
33928 +#endif /* CONFIG_ACPI */
33929 +
33930 +
33931 +#ifndef CONFIG_XEN
33932 +/*
33933 + * This function is currently only a helper for the i386 smp boot process, where
33934 + * we need to reprogram the ioredtbls to cater for the cpus which have come online,
33935 + * so the mask in all cases should simply be TARGET_CPUS.
33936 + */
33937 +#ifdef CONFIG_SMP
33938 +void __init setup_ioapic_dest(void)
33939 +{
33940 +       int pin, ioapic, irq, irq_entry;
33941 +
33942 +       if (skip_ioapic_setup == 1)
33943 +               return;
33944 +
33945 +       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
33946 +               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
33947 +                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
33948 +                       if (irq_entry == -1)
33949 +                               continue;
33950 +                       irq = pin_2_irq(irq_entry, ioapic, pin);
33951 +                       set_ioapic_affinity_irq(irq, TARGET_CPUS);
33952 +               }
33953 +
33954 +       }
33955 +}
33956 +#endif
33957 +#endif /* !CONFIG_XEN */
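
The hunks above read and write each 64-bit IO-APIC redirection entry as two 32-bit halves, the low dword at register index 0x10 + 2*pin and the high dword at 0x11 + 2*pin, by casting struct IO_APIC_route_entry through an int pointer. A minimal standalone sketch of that register layout (not part of the patch; io_apic_read()/io_apic_write() are stubbed and the entry is simplified, with most flag fields collapsed into one filler field):

#include <stdint.h>
#include <stdio.h>

static uint32_t fake_regs[0x40];        /* stand-in for the IO-APIC register window */

static uint32_t io_apic_read(int reg)              { return fake_regs[reg]; }
static void io_apic_write(int reg, uint32_t value) { fake_regs[reg] = value; }

struct route_entry {                    /* simplified IO_APIC_route_entry */
        uint64_t vector        : 8;
        uint64_t delivery_mode : 3;
        uint64_t dest_mode     : 1;
        uint64_t filler        : 44;    /* status, polarity, irr, trigger, mask, ... */
        uint64_t dest          : 8;
};

int main(void)
{
        int pin = 2;
        union { struct route_entry e; uint32_t w[2]; } u = { { 0 } };

        u.e.vector = 0x31;
        u.e.dest   = 1;

        /* program the entry; like the patch, write the high dword (0x11 + ...)
           before the low dword (0x10 + ...) */
        io_apic_write(0x11 + 2 * pin, u.w[1]);
        io_apic_write(0x10 + 2 * pin, u.w[0]);

        /* read it back the same way enable_IO_APIC() does */
        u.w[0] = io_apic_read(0x10 + 2 * pin);
        u.w[1] = io_apic_read(0x11 + 2 * pin);
        printf("vector=0x%02x dest=%u\n", (unsigned)u.e.vector, (unsigned)u.e.dest);
        return 0;
}
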
33958 diff -urNp linux-2.6/arch/x86_64/kernel/ioport-xen.c new/arch/x86_64/kernel/ioport-xen.c
33959 --- linux-2.6/arch/x86_64/kernel/ioport-xen.c   1970-01-01 01:00:00.000000000 +0100
33960 +++ new/arch/x86_64/kernel/ioport-xen.c 2006-06-28 14:32:14.000000000 +0200
33961 @@ -0,0 +1,99 @@
33962 +/*
33963 + *     linux/arch/x86_64/kernel/ioport.c
33964 + *
33965 + * This contains the io-permission bitmap code - written by obz, with changes
33966 + * by Linus.
33967 + */
33968 +
33969 +#include <linux/sched.h>
33970 +#include <linux/kernel.h>
33971 +#include <linux/capability.h>
33972 +#include <linux/errno.h>
33973 +#include <linux/types.h>
33974 +#include <linux/ioport.h>
33975 +#include <linux/mm.h>
33976 +#include <linux/smp.h>
33977 +#include <linux/smp_lock.h>
33978 +#include <linux/stddef.h>
33979 +#include <linux/slab.h>
33980 +#include <linux/thread_info.h>
33981 +#include <xen/interface/physdev.h>
33982 +
33983 +/* Set EXTENT bits starting at BASE in BITMAP to value NEW_VALUE. */
33984 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
33985 +{
33986 +       int i;
33987 +
33988 +       if (new_value)
33989 +               for (i = base; i < base + extent; i++)
33990 +                       __set_bit(i, bitmap);
33991 +       else
33992 +               for (i = base; i < base + extent; i++)
33993 +                       clear_bit(i, bitmap);
33994 +}
33995 +
33996 +/*
33997 + * this changes the io permissions bitmap in the current task.
33998 + */
33999 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
34000 +{
34001 +       struct thread_struct * t = &current->thread;
34002 +       unsigned long *bitmap;
34003 +       struct physdev_set_iobitmap set_iobitmap;
34004 +
34005 +       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
34006 +               return -EINVAL;
34007 +       if (turn_on && !capable(CAP_SYS_RAWIO))
34008 +               return -EPERM;
34009 +
34010 +       /*
34011 +        * If it's the first ioperm() call in this thread's lifetime, set the
34012 +        * IO bitmap up. ioperm() is much less timing critical than clone(),
34013 +        * which is why we delay this operation until now:
34014 +        */
34015 +       if (!t->io_bitmap_ptr) {
34016 +               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
34017 +               if (!bitmap)
34018 +                       return -ENOMEM;
34019 +
34020 +               memset(bitmap, 0xff, IO_BITMAP_BYTES);
34021 +               t->io_bitmap_ptr = bitmap;
34022 +
34023 +               set_iobitmap.bitmap   = (char *)bitmap;
34024 +               set_iobitmap.nr_ports = IO_BITMAP_BITS;
34025 +               HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap);
34026 +       }
34027 +
34028 +       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
34029 +
34030 +       return 0;
34031 +}
34032 +
34033 +/*
34034 + * sys_iopl has to be used when you want to access the IO ports
34035 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
34036 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
34037 + *
34038 + */
34039 +
34040 +asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
34041 +{
34042 +       unsigned int old_iopl = current->thread.iopl;
34043 +       struct physdev_set_iopl set_iopl;
34044 +
34045 +       if (new_iopl > 3)
34046 +               return -EINVAL;
34047 +
34048 +       /* Need "raw I/O" privileges for direct port access. */
34049 +       if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
34050 +               return -EPERM;
34051 +
34052 +       /* Change our version of the privilege levels. */
34053 +       current->thread.iopl = new_iopl;
34054 +
34055 +       /* Force the change at ring 0. */
34056 +       set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
34057 +       HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
34058 +
34059 +       return 0;
34060 +}
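
From user space the two entry points above are reached through the usual glibc wrappers. A small hypothetical example (not part of the patch; the 0x378 parallel-port base is only an illustrative legacy address, and the call needs CAP_SYS_RAWIO):

#include <stdio.h>
#include <sys/io.h>             /* ioperm(), outb() on x86 */

int main(void)
{
        /* Request the three ports at 0x378..0x37a.  Under Xen this lands in
         * sys_ioperm() above, which allocates the per-thread bitmap on first
         * use and hands it to the hypervisor via PHYSDEVOP_set_iobitmap. */
        if (ioperm(0x378, 3, 1) != 0) {
                perror("ioperm");
                return 1;
        }

        outb(0x00, 0x378);      /* direct port write, no further kernel entry */

        ioperm(0x378, 3, 0);    /* revoke access again */
        return 0;
}
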
34061 diff -urNp linux-2.6/arch/x86_64/kernel/irq.c new/arch/x86_64/kernel/irq.c
34062 --- linux-2.6/arch/x86_64/kernel/irq.c  2006-07-03 14:14:30.000000000 +0200
34063 +++ new/arch/x86_64/kernel/irq.c        2006-05-09 12:33:19.000000000 +0200
34064 @@ -91,8 +91,8 @@ skip:
34065   */
34066  asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
34067  {      
34068 -       /* high bits used in ret_from_ code  */
34069 -       unsigned irq = regs->orig_rax & 0xff;
34070 +       /* high bit used in ret_from_ code  */
34071 +       unsigned irq = ~regs->orig_rax;
34072  
34073         exit_idle();
34074         irq_enter();
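
The hunk above switches do_IRQ() from masking the low byte of orig_rax to taking its bitwise complement, which matters once the number stored there can exceed 255. A standalone sketch of the arithmetic (assuming, as the new code implies, that the interrupt entry stub stores the one's complement of the IRQ number in orig_rax):

#include <stdio.h>

int main(void)
{
        unsigned int irq_in = 0x131;                      /* hypothetical IRQ above 255 */
        unsigned long orig_rax = ~(unsigned long)irq_in;  /* what the entry stub stores */

        unsigned int new_way = ~orig_rax;                 /* patched do_IRQ(): 0x131 */
        unsigned long old_way = orig_rax & 0xff;          /* old mask: truncated to 0xce */

        printf("new: 0x%x  old: 0x%lx\n", new_way, old_way);
        return 0;
}
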
34075 diff -urNp linux-2.6/arch/x86_64/kernel/irq-xen.c new/arch/x86_64/kernel/irq-xen.c
34076 --- linux-2.6/arch/x86_64/kernel/irq-xen.c      1970-01-01 01:00:00.000000000 +0100
34077 +++ new/arch/x86_64/kernel/irq-xen.c    2006-05-09 12:33:19.000000000 +0200
34078 @@ -0,0 +1,163 @@
34079 +/*
34080 + *     linux/arch/x86_64/kernel/irq.c
34081 + *
34082 + *     Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
34083 + *
34084 + * This file contains the lowest level x86_64-specific interrupt
34085 + * entry and irq statistics code. All the remaining irq logic is
34086 + * done by the generic kernel/irq/ code and in the
34087 + * x86_64-specific irq controller code. (e.g. i8259.c and
34088 + * io_apic.c.)
34089 + */
34090 +
34091 +#include <linux/kernel_stat.h>
34092 +#include <linux/interrupt.h>
34093 +#include <linux/seq_file.h>
34094 +#include <linux/module.h>
34095 +#include <linux/delay.h>
34096 +#include <asm/uaccess.h>
34097 +#include <asm/io_apic.h>
34098 +#include <asm/idle.h>
34099 +
34100 +atomic_t irq_err_count;
34101 +#ifdef CONFIG_X86_IO_APIC
34102 +#ifdef APIC_MISMATCH_DEBUG
34103 +atomic_t irq_mis_count;
34104 +#endif
34105 +#endif
34106 +
34107 +/*
34108 + * Generic, controller-independent functions:
34109 + */
34110 +
34111 +int show_interrupts(struct seq_file *p, void *v)
34112 +{
34113 +       int i = *(loff_t *) v, j;
34114 +       struct irqaction * action;
34115 +       unsigned long flags;
34116 +
34117 +       if (i == 0) {
34118 +               seq_printf(p, "           ");
34119 +               for_each_online_cpu(j)
34120 +                       seq_printf(p, "CPU%d       ",j);
34121 +               seq_putc(p, '\n');
34122 +       }
34123 +
34124 +       if (i < NR_IRQS) {
34125 +               spin_lock_irqsave(&irq_desc[i].lock, flags);
34126 +               action = irq_desc[i].action;
34127 +               if (!action) 
34128 +                       goto skip;
34129 +               seq_printf(p, "%3d: ",i);
34130 +#ifndef CONFIG_SMP
34131 +               seq_printf(p, "%10u ", kstat_irqs(i));
34132 +#else
34133 +               for_each_online_cpu(j)
34134 +                       seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
34135 +#endif
34136 +               seq_printf(p, " %14s", irq_desc[i].handler->typename);
34137 +
34138 +               seq_printf(p, "  %s", action->name);
34139 +               for (action=action->next; action; action = action->next)
34140 +                       seq_printf(p, ", %s", action->name);
34141 +               seq_putc(p, '\n');
34142 +skip:
34143 +               spin_unlock_irqrestore(&irq_desc[i].lock, flags);
34144 +       } else if (i == NR_IRQS) {
34145 +               seq_printf(p, "NMI: ");
34146 +               for_each_online_cpu(j)
34147 +                       seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
34148 +               seq_putc(p, '\n');
34149 +#ifdef CONFIG_X86_LOCAL_APIC
34150 +               seq_printf(p, "LOC: ");
34151 +               for_each_online_cpu(j)
34152 +                       seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
34153 +               seq_putc(p, '\n');
34154 +#endif
34155 +               seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
34156 +#ifdef CONFIG_X86_IO_APIC
34157 +#ifdef APIC_MISMATCH_DEBUG
34158 +               seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
34159 +#endif
34160 +#endif
34161 +       }
34162 +       return 0;
34163 +}
34164 +
34165 +/*
34166 + * do_IRQ handles all normal device IRQ's (the special
34167 + * SMP cross-CPU interrupts have their own specific
34168 + * handlers).
34169 + */
34170 +asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
34171 +{      
34172 +       /* high bit used in ret_from_ code  */
34173 +       unsigned irq = ~regs->orig_rax;
34174 +
34175 +       exit_idle();
34176 +       irq_enter();
34177 +
34178 +       __do_IRQ(irq, regs);
34179 +       irq_exit();
34180 +
34181 +       return 1;
34182 +}
34183 +
34184 +#ifdef CONFIG_HOTPLUG_CPU
34185 +void fixup_irqs(cpumask_t map)
34186 +{
34187 +       unsigned int irq;
34188 +       static int warned;
34189 +
34190 +       for (irq = 0; irq < NR_IRQS; irq++) {
34191 +               cpumask_t mask;
34192 +               if (irq == 2)
34193 +                       continue;
34194 +
34195 +               cpus_and(mask, irq_affinity[irq], map);
34196 +               if (any_online_cpu(mask) == NR_CPUS) {
34197 +                       printk("Breaking affinity for irq %i\n", irq);
34198 +                       mask = map;
34199 +               }
34200 +               if (irq_desc[irq].handler->set_affinity)
34201 +                       irq_desc[irq].handler->set_affinity(irq, mask);
34202 +               else if (irq_desc[irq].action && !(warned++))
34203 +                       printk("Cannot set affinity for irq %i\n", irq);
34204 +       }
34205 +
34206 +       /* That doesn't seem sufficient.  Give it 1ms. */
34207 +       local_irq_enable();
34208 +       mdelay(1);
34209 +       local_irq_disable();
34210 +}
34211 +#endif
34212 +
34213 +extern void call_softirq(void);
34214 +
34215 +asmlinkage void do_softirq(void)
34216 +{
34217 +       __u32 pending;
34218 +       unsigned long flags;
34219 +
34220 +       if (in_interrupt())
34221 +               return;
34222 +
34223 +       local_irq_save(flags);
34224 +       pending = local_softirq_pending();
34225 +       /* Switch to interrupt stack */
34226 +       if (pending)
34227 +               call_softirq();
34228 +       local_irq_restore(flags);
34229 +}
34230 +EXPORT_SYMBOL(do_softirq);
34231 +
34232 +#ifndef CONFIG_X86_LOCAL_APIC
34233 +/*
34234 + * 'What should we do if we get a hw irq event on an illegal vector?'
34235 + * Each architecture has to answer this itself.
34236 + */
34237 +void ack_bad_irq(unsigned int irq)
34238 +{
34239 +        printk("unexpected IRQ trap at vector %02x\n", irq);
34240 +}
34241 +#endif
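
show_interrupts() above is the seq_file routine behind /proc/interrupts, so the per-CPU counters it formats can be inspected from user space. A trivial illustrative reader (not part of the patch):

#include <stdio.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/interrupts", "r");

        if (!f) {
                perror("/proc/interrupts");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* one row per IRQ, then NMI/LOC/ERR/MIS */
        fclose(f);
        return 0;
}
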
34242 diff -urNp linux-2.6/arch/x86_64/kernel/ldt-xen.c new/arch/x86_64/kernel/ldt-xen.c
34243 --- linux-2.6/arch/x86_64/kernel/ldt-xen.c      1970-01-01 01:00:00.000000000 +0100
34244 +++ new/arch/x86_64/kernel/ldt-xen.c    2006-05-09 12:33:19.000000000 +0200
34245 @@ -0,0 +1,282 @@
34246 +/*
34247 + * linux/arch/x86_64/kernel/ldt.c
34248 + *
34249 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
34250 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
34251 + * Copyright (C) 2002 Andi Kleen
34252 + * 
34253 + * This handles calls from both 32bit and 64bit mode.
34254 + */
34255 +
34256 +#include <linux/errno.h>
34257 +#include <linux/sched.h>
34258 +#include <linux/string.h>
34259 +#include <linux/mm.h>
34260 +#include <linux/smp.h>
34261 +#include <linux/smp_lock.h>
34262 +#include <linux/vmalloc.h>
34263 +#include <linux/slab.h>
34264 +
34265 +#include <asm/uaccess.h>
34266 +#include <asm/system.h>
34267 +#include <asm/ldt.h>
34268 +#include <asm/desc.h>
34269 +#include <asm/proto.h>
34270 +#include <asm/pgalloc.h>
34271 +
34272 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
34273 +static void flush_ldt(void *null)
34274 +{
34275 +       if (current->active_mm)
34276 +               load_LDT(&current->active_mm->context);
34277 +}
34278 +#endif
34279 +
34280 +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
34281 +{
34282 +       void *oldldt;
34283 +       void *newldt;
34284 +       unsigned oldsize;
34285 +
34286 +       if (mincount <= (unsigned)pc->size)
34287 +               return 0;
34288 +       oldsize = pc->size;
34289 +       mincount = (mincount+511)&(~511);
34290 +       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
34291 +               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
34292 +       else
34293 +               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
34294 +
34295 +       if (!newldt)
34296 +               return -ENOMEM;
34297 +
34298 +       if (oldsize)
34299 +               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
34300 +       oldldt = pc->ldt;
34301 +       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
34302 +       wmb();
34303 +       pc->ldt = newldt;
34304 +       wmb();
34305 +       pc->size = mincount;
34306 +       wmb();
34307 +       if (reload) {
34308 +#ifdef CONFIG_SMP
34309 +               cpumask_t mask;
34310 +
34311 +               preempt_disable();
34312 +#endif
34313 +               make_pages_readonly(
34314 +                       pc->ldt,
34315 +                       (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
34316 +                       XENFEAT_writable_descriptor_tables);
34317 +               load_LDT(pc);
34318 +#ifdef CONFIG_SMP
34319 +               mask = cpumask_of_cpu(smp_processor_id());
34320 +               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
34321 +                       smp_call_function(flush_ldt, NULL, 1, 1);
34322 +               preempt_enable();
34323 +#endif
34324 +       }
34325 +       if (oldsize) {
34326 +               make_pages_writable(
34327 +                       oldldt,
34328 +                       (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
34329 +                       XENFEAT_writable_descriptor_tables);
34330 +               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
34331 +                       vfree(oldldt);
34332 +               else
34333 +                       kfree(oldldt);
34334 +       }
34335 +       return 0;
34336 +}
34337 +
34338 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
34339 +{
34340 +       int err = alloc_ldt(new, old->size, 0);
34341 +       if (err < 0)
34342 +               return err;
34343 +       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
34344 +       make_pages_readonly(
34345 +               new->ldt,
34346 +               (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
34347 +               XENFEAT_writable_descriptor_tables);
34348 +       return 0;
34349 +}
34350 +
34351 +/*
34352 + * we do not have to muck with descriptors here, that is
34353 + * done in switch_mm() as needed.
34354 + */
34355 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
34356 +{
34357 +       struct mm_struct * old_mm;
34358 +       int retval = 0;
34359 +
34360 +       memset(&mm->context, 0, sizeof(mm->context));
34361 +       init_MUTEX(&mm->context.sem);
34362 +       old_mm = current->mm;
34363 +       if (old_mm && old_mm->context.size > 0) {
34364 +               down(&old_mm->context.sem);
34365 +               retval = copy_ldt(&mm->context, &old_mm->context);
34366 +               up(&old_mm->context.sem);
34367 +       }
34368 +       if (retval == 0) {
34369 +               spin_lock(&mm_unpinned_lock);
34370 +               list_add(&mm->context.unpinned, &mm_unpinned);
34371 +               spin_unlock(&mm_unpinned_lock);
34372 +       }
34373 +       return retval;
34374 +}
34375 +
34376 +/*
34377 + * 
34378 + * Don't touch the LDT register - we're already in the next thread.
34379 + */
34380 +void destroy_context(struct mm_struct *mm)
34381 +{
34382 +       if (mm->context.size) {
34383 +               if (mm == current->active_mm)
34384 +                       clear_LDT();
34385 +               make_pages_writable(
34386 +                       mm->context.ldt,
34387 +                       (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
34388 +                       XENFEAT_writable_descriptor_tables);
34389 +               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
34390 +                       vfree(mm->context.ldt);
34391 +               else
34392 +                       kfree(mm->context.ldt);
34393 +               mm->context.size = 0;
34394 +       }
34395 +       if (!mm->context.pinned) {
34396 +               spin_lock(&mm_unpinned_lock);
34397 +               list_del(&mm->context.unpinned);
34398 +               spin_unlock(&mm_unpinned_lock);
34399 +       }
34400 +}
34401 +
34402 +static int read_ldt(void __user * ptr, unsigned long bytecount)
34403 +{
34404 +       int err;
34405 +       unsigned long size;
34406 +       struct mm_struct * mm = current->mm;
34407 +
34408 +       if (!mm->context.size)
34409 +               return 0;
34410 +       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
34411 +               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
34412 +
34413 +       down(&mm->context.sem);
34414 +       size = mm->context.size*LDT_ENTRY_SIZE;
34415 +       if (size > bytecount)
34416 +               size = bytecount;
34417 +
34418 +       err = 0;
34419 +       if (copy_to_user(ptr, mm->context.ldt, size))
34420 +               err = -EFAULT;
34421 +       up(&mm->context.sem);
34422 +       if (err < 0)
34423 +               goto error_return;
34424 +       if (size != bytecount) {
34425 +               /* zero-fill the rest */
34426 +               if (clear_user(ptr+size, bytecount-size) != 0) {
34427 +                       err = -EFAULT;
34428 +                       goto error_return;
34429 +               }
34430 +       }
34431 +       return bytecount;
34432 +error_return:
34433 +       return err;
34434 +}
34435 +
34436 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
34437 +{
34438 +       /* Arbitrary number */ 
34439 +       /* x86-64 default LDT is all zeros */
34440 +       if (bytecount > 128) 
34441 +               bytecount = 128;        
34442 +       if (clear_user(ptr, bytecount))
34443 +               return -EFAULT;
34444 +       return bytecount; 
34445 +}
34446 +
34447 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
34448 +{
34449 +       struct task_struct *me = current;
34450 +       struct mm_struct * mm = me->mm;
34451 +       __u32 entry_1, entry_2, *lp;
34452 +       unsigned long mach_lp;
34453 +       int error;
34454 +       struct user_desc ldt_info;
34455 +
34456 +       error = -EINVAL;
34457 +
34458 +       if (bytecount != sizeof(ldt_info))
34459 +               goto out;
34460 +       error = -EFAULT;        
34461 +       if (copy_from_user(&ldt_info, ptr, bytecount))
34462 +               goto out;
34463 +
34464 +       error = -EINVAL;
34465 +       if (ldt_info.entry_number >= LDT_ENTRIES)
34466 +               goto out;
34467 +       if (ldt_info.contents == 3) {
34468 +               if (oldmode)
34469 +                       goto out;
34470 +               if (ldt_info.seg_not_present == 0)
34471 +                       goto out;
34472 +       }
34473 +
34474 +       down(&mm->context.sem);
34475 +       if (ldt_info.entry_number >= (unsigned)mm->context.size) {
34476 +               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
34477 +               if (error < 0)
34478 +                       goto out_unlock;
34479 +       }
34480 +
34481 +       lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
34482 +       mach_lp = arbitrary_virt_to_machine(lp);
34483 +
34484 +       /* Allow LDTs to be cleared by the user. */
34485 +       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
34486 +               if (oldmode || LDT_empty(&ldt_info)) {
34487 +                       entry_1 = 0;
34488 +                       entry_2 = 0;
34489 +                       goto install;
34490 +               }
34491 +       }
34492 +
34493 +       entry_1 = LDT_entry_a(&ldt_info);
34494 +       entry_2 = LDT_entry_b(&ldt_info);
34495 +       if (oldmode)
34496 +               entry_2 &= ~(1 << 20);
34497 +
34498 +       /* Install the new entry ...  */
34499 +install:
34500 +       error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
34501 +
34502 +out_unlock:
34503 +       up(&mm->context.sem);
34504 +out:
34505 +       return error;
34506 +}
34507 +
34508 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
34509 +{
34510 +       int ret = -ENOSYS;
34511 +
34512 +       switch (func) {
34513 +       case 0:
34514 +               ret = read_ldt(ptr, bytecount);
34515 +               break;
34516 +       case 1:
34517 +               ret = write_ldt(ptr, bytecount, 1);
34518 +               break;
34519 +       case 2:
34520 +               ret = read_default_ldt(ptr, bytecount);
34521 +               break;
34522 +       case 0x11:
34523 +               ret = write_ldt(ptr, bytecount, 0);
34524 +               break;
34525 +       }
34526 +       return ret;
34527 +}
34528 diff -urNp linux-2.6/arch/x86_64/kernel/Makefile new/arch/x86_64/kernel/Makefile
34529 --- linux-2.6/arch/x86_64/kernel/Makefile       2006-07-03 14:14:30.000000000 +0200
34530 +++ new/arch/x86_64/kernel/Makefile     2006-05-09 12:33:17.000000000 +0200
34531 @@ -20,11 +20,13 @@ obj-$(CONFIG_MICROCODE)             += microcode.o
34532  obj-$(CONFIG_X86_CPUID)                += cpuid.o
34533  obj-$(CONFIG_SMP)              += smp.o smpboot.o trampoline.o
34534  obj-$(CONFIG_X86_LOCAL_APIC)   += apic.o  nmi.o
34535 +obj-$(CONFIG_X86_XEN_GENAPIC)  += genapic.o genapic_xen.o
34536  obj-$(CONFIG_X86_IO_APIC)      += io_apic.o mpparse.o \
34537                 genapic.o genapic_cluster.o genapic_flat.o
34538  obj-$(CONFIG_KEXEC)            += machine_kexec.o relocate_kernel.o crash.o
34539  obj-$(CONFIG_CRASH_DUMP)       += crash_dump.o
34540 -obj-$(CONFIG_PM)               += suspend.o
34541 +obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
34542 +obj-$(CONFIG_ACPI_SLEEP)       += suspend.o
34543  obj-$(CONFIG_SUSPEND_SHARED)   += suspend_asm.o
34544  obj-$(CONFIG_CPU_FREQ)         += cpufreq/
34545  obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
34546 @@ -49,3 +51,18 @@ intel_cacheinfo-y            += ../../i386/kernel/
34547  quirks-y                       += ../../i386/kernel/quirks.o
34548  i8237-y                                += ../../i386/kernel/i8237.o
34549  msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../i386/kernel/msr.o
34550 +
34551 +ifdef CONFIG_XEN
34552 +time-y                         += ../../i386/kernel/time-xen.o
34553 +pci-dma-y                      += ../../i386/kernel/pci-dma-xen.o
34554 +microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
34555 +quirks-y                       := ../../i386/kernel/quirks-xen.o
34556 +
34557 +n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
34558 +
34559 +include $(srctree)/scripts/Makefile.xen
34560 +
34561 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
34562 +obj-y := $(call cherrypickxen, $(obj-y))
34563 +extra-y := $(call cherrypickxen, $(extra-y))
34564 +endif
34565 diff -urNp linux-2.6/arch/x86_64/kernel/mpparse-xen.c new/arch/x86_64/kernel/mpparse-xen.c
34566 --- linux-2.6/arch/x86_64/kernel/mpparse-xen.c  1970-01-01 01:00:00.000000000 +0100
34567 +++ new/arch/x86_64/kernel/mpparse-xen.c        2006-05-23 18:37:10.000000000 +0200
34568 @@ -0,0 +1,1012 @@
34569 +/*
34570 + *     Intel Multiprocessor Specification 1.1 and 1.4
34571 + *     compliant MP-table parsing routines.
34572 + *
34573 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
34574 + *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
34575 + *
34576 + *     Fixes
34577 + *             Erich Boleyn    :       MP v1.4 and additional changes.
34578 + *             Alan Cox        :       Added EBDA scanning
34579 + *             Ingo Molnar     :       various cleanups and rewrites
34580 + *             Maciej W. Rozycki:      Bits for default MP configurations
34581 + *             Paul Diefenbaugh:       Added full ACPI support
34582 + */
34583 +
34584 +#include <linux/mm.h>
34585 +#include <linux/init.h>
34586 +#include <linux/delay.h>
34587 +#include <linux/config.h>
34588 +#include <linux/bootmem.h>
34589 +#include <linux/smp_lock.h>
34590 +#include <linux/kernel_stat.h>
34591 +#include <linux/mc146818rtc.h>
34592 +#include <linux/acpi.h>
34593 +#include <linux/module.h>
34594 +
34595 +#include <asm/smp.h>
34596 +#include <asm/mtrr.h>
34597 +#include <asm/mpspec.h>
34598 +#include <asm/pgalloc.h>
34599 +#include <asm/io_apic.h>
34600 +#include <asm/proto.h>
34601 +#include <asm/acpi.h>
34602 +
34603 +/* Have we found an MP table */
34604 +int smp_found_config;
34605 +unsigned int __initdata maxcpus = NR_CPUS;
34606 +
34607 +int acpi_found_madt;
34608 +
34609 +/*
34610 + * Various Linux-internal data structures created from the
34611 + * MP-table.
34612 + */
34613 +unsigned char apic_version [MAX_APICS];
34614 +unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
34615 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
34616 +
34617 +static int mp_current_pci_id = 0;
34618 +/* I/O APIC entries */
34619 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
34620 +
34621 +/* # of MP IRQ source entries */
34622 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
34623 +
34624 +/* MP IRQ source entries */
34625 +int mp_irq_entries;
34626 +
34627 +int nr_ioapics;
34628 +int pic_mode;
34629 +unsigned long mp_lapic_addr = 0;
34630 +
34631 +
34632 +
34633 +/* Processor that is doing the boot up */
34634 +unsigned int boot_cpu_id = -1U;
34635 +/* Internal processor count */
34636 +unsigned int num_processors __initdata = 0;
34637 +
34638 +unsigned disabled_cpus __initdata;
34639 +
34640 +/* Bitmask of physically existing CPUs */
34641 +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
34642 +
34643 +/* ACPI MADT entry parsing functions */
34644 +#ifdef CONFIG_ACPI
34645 +extern struct acpi_boot_flags acpi_boot;
34646 +#ifdef CONFIG_X86_LOCAL_APIC
34647 +extern int acpi_parse_lapic (acpi_table_entry_header *header);
34648 +extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
34649 +extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
34650 +#endif /*CONFIG_X86_LOCAL_APIC*/
34651 +#ifdef CONFIG_X86_IO_APIC
34652 +extern int acpi_parse_ioapic (acpi_table_entry_header *header);
34653 +#endif /*CONFIG_X86_IO_APIC*/
34654 +#endif /*CONFIG_ACPI*/
34655 +
34656 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
34657 +
34658 +
34659 +/*
34660 + * Intel MP BIOS table parsing routines:
34661 + */
34662 +
34663 +/*
34664 + * Checksum an MP configuration block.
34665 + */
34666 +
34667 +static int __init mpf_checksum(unsigned char *mp, int len)
34668 +{
34669 +       int sum = 0;
34670 +
34671 +       while (len--)
34672 +               sum += *mp++;
34673 +
34674 +       return sum & 0xFF;
34675 +}
34676 +
34677 +#ifndef CONFIG_XEN
34678 +static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
34679 +{
34680 +       int cpu;
34681 +       unsigned char ver;
34682 +       cpumask_t tmp_map;
34683 +
34684 +       if (!(m->mpc_cpuflag & CPU_ENABLED)) {
34685 +               disabled_cpus++;
34686 +               return;
34687 +       }
34688 +
34689 +       printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
34690 +               m->mpc_apicid,
34691 +              (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
34692 +              (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
34693 +               m->mpc_apicver);
34694 +
34695 +       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
34696 +               Dprintk("    Bootup CPU\n");
34697 +               boot_cpu_id = m->mpc_apicid;
34698 +       }
34699 +       if (num_processors >= NR_CPUS) {
34700 +               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
34701 +                       " Processor ignored.\n", NR_CPUS);
34702 +               return;
34703 +       }
34704 +
34705 +       num_processors++;
34706 +       cpus_complement(tmp_map, cpu_present_map);
34707 +       cpu = first_cpu(tmp_map);
34708 +
34709 +#if MAX_APICS < 255    
34710 +       if ((int)m->mpc_apicid > MAX_APICS) {
34711 +               printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
34712 +                       m->mpc_apicid, MAX_APICS);
34713 +               return;
34714 +       }
34715 +#endif
34716 +       ver = m->mpc_apicver;
34717 +
34718 +       physid_set(m->mpc_apicid, phys_cpu_present_map);
34719 +       /*
34720 +        * Validate version
34721 +        */
34722 +       if (ver == 0x0) {
34723 +               printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
34724 +               ver = 0x10;
34725 +       }
34726 +       apic_version[m->mpc_apicid] = ver;
34727 +       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
34728 +               /*
34729 +                * bios_cpu_apicid is required to have processors listed
34730 +                * in same order as logical cpu numbers. Hence the first
34731 +                * entry is BSP, and so on.
34732 +                */
34733 +               cpu = 0;
34734 +       }
34735 +       bios_cpu_apicid[cpu] = m->mpc_apicid;
34736 +       x86_cpu_to_apicid[cpu] = m->mpc_apicid;
34737 +
34738 +       cpu_set(cpu, cpu_possible_map);
34739 +       cpu_set(cpu, cpu_present_map);
34740 +}
34741 +#else
34742 +void __init MP_processor_info (struct mpc_config_processor *m)
34743 +{
34744 +       num_processors++;
34745 +}
34746 +#endif /* CONFIG_XEN */
34747 +
34748 +static void __init MP_bus_info (struct mpc_config_bus *m)
34749 +{
34750 +       char str[7];
34751 +
34752 +       memcpy(str, m->mpc_bustype, 6);
34753 +       str[6] = 0;
34754 +       Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
34755 +
34756 +       if (strncmp(str, "ISA", 3) == 0) {
34757 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
34758 +       } else if (strncmp(str, "EISA", 4) == 0) {
34759 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
34760 +       } else if (strncmp(str, "PCI", 3) == 0) {
34761 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
34762 +               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
34763 +               mp_current_pci_id++;
34764 +       } else if (strncmp(str, "MCA", 3) == 0) {
34765 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
34766 +       } else {
34767 +               printk(KERN_ERR "Unknown bustype %s\n", str);
34768 +       }
34769 +}
34770 +
34771 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
34772 +{
34773 +       if (!(m->mpc_flags & MPC_APIC_USABLE))
34774 +               return;
34775 +
34776 +       printk("I/O APIC #%d Version %d at 0x%X.\n",
34777 +               m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
34778 +       if (nr_ioapics >= MAX_IO_APICS) {
34779 +               printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
34780 +                       MAX_IO_APICS, nr_ioapics);
34781 +               panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
34782 +       }
34783 +       if (!m->mpc_apicaddr) {
34784 +               printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
34785 +                       " found in MP table, skipping!\n");
34786 +               return;
34787 +       }
34788 +       mp_ioapics[nr_ioapics] = *m;
34789 +       nr_ioapics++;
34790 +}
34791 +
34792 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
34793 +{
34794 +       mp_irqs [mp_irq_entries] = *m;
34795 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
34796 +               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
34797 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
34798 +                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
34799 +                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
34800 +       if (++mp_irq_entries >= MAX_IRQ_SOURCES)
34801 +               panic("Max # of irq sources exceeded!!\n");
34802 +}
34803 +
34804 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
34805 +{
34806 +       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
34807 +               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
34808 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
34809 +                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
34810 +                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
34811 +       /*
34812 +        * Well it seems all SMP boards in existence
34813 +        * use ExtINT/LVT1 == LINT0 and
34814 +        * NMI/LVT2 == LINT1 - the following check
34815 +        * will show us if these assumptions are false.
34816 +        * Until then we do not have to add baggage.
34817 +        */
34818 +       if ((m->mpc_irqtype == mp_ExtINT) &&
34819 +               (m->mpc_destapiclint != 0))
34820 +                       BUG();
34821 +       if ((m->mpc_irqtype == mp_NMI) &&
34822 +               (m->mpc_destapiclint != 1))
34823 +                       BUG();
34824 +}
34825 +
34826 +/*
34827 + * Read/parse the MPC
34828 + */
34829 +
34830 +static int __init smp_read_mpc(struct mp_config_table *mpc)
34831 +{
34832 +       char str[16];
34833 +       int count=sizeof(*mpc);
34834 +       unsigned char *mpt=((unsigned char *)mpc)+count;
34835 +
34836 +       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
34837 +               printk("SMP mptable: bad signature [%c%c%c%c]!\n",
34838 +                       mpc->mpc_signature[0],
34839 +                       mpc->mpc_signature[1],
34840 +                       mpc->mpc_signature[2],
34841 +                       mpc->mpc_signature[3]);
34842 +               return 0;
34843 +       }
34844 +       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
34845 +               printk("SMP mptable: checksum error!\n");
34846 +               return 0;
34847 +       }
34848 +       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
34849 +               printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
34850 +                       mpc->mpc_spec);
34851 +               return 0;
34852 +       }
34853 +       if (!mpc->mpc_lapic) {
34854 +               printk(KERN_ERR "SMP mptable: null local APIC address!\n");
34855 +               return 0;
34856 +       }
34857 +       memcpy(str,mpc->mpc_oem,8);
34858 +       str[8]=0;
34859 +       printk(KERN_INFO "OEM ID: %s ",str);
34860 +
34861 +       memcpy(str,mpc->mpc_productid,12);
34862 +       str[12]=0;
34863 +       printk("Product ID: %s ",str);
34864 +
34865 +       printk("APIC at: 0x%X\n",mpc->mpc_lapic);
34866 +
34867 +       /* save the local APIC address, it might be non-default */
34868 +       if (!acpi_lapic)
34869 +       mp_lapic_addr = mpc->mpc_lapic;
34870 +
34871 +       /*
34872 +        *      Now process the configuration blocks.
34873 +        */
34874 +       while (count < mpc->mpc_length) {
34875 +               switch(*mpt) {
34876 +                       case MP_PROCESSOR:
34877 +                       {
34878 +                               struct mpc_config_processor *m=
34879 +                                       (struct mpc_config_processor *)mpt;
34880 +                               if (!acpi_lapic)
34881 +                               MP_processor_info(m);
34882 +                               mpt += sizeof(*m);
34883 +                               count += sizeof(*m);
34884 +                               break;
34885 +                       }
34886 +                       case MP_BUS:
34887 +                       {
34888 +                               struct mpc_config_bus *m=
34889 +                                       (struct mpc_config_bus *)mpt;
34890 +                               MP_bus_info(m);
34891 +                               mpt += sizeof(*m);
34892 +                               count += sizeof(*m);
34893 +                               break;
34894 +                       }
34895 +                       case MP_IOAPIC:
34896 +                       {
34897 +                               struct mpc_config_ioapic *m=
34898 +                                       (struct mpc_config_ioapic *)mpt;
34899 +                               MP_ioapic_info(m);
34900 +                               mpt+=sizeof(*m);
34901 +                               count+=sizeof(*m);
34902 +                               break;
34903 +                       }
34904 +                       case MP_INTSRC:
34905 +                       {
34906 +                               struct mpc_config_intsrc *m=
34907 +                                       (struct mpc_config_intsrc *)mpt;
34908 +
34909 +                               MP_intsrc_info(m);
34910 +                               mpt+=sizeof(*m);
34911 +                               count+=sizeof(*m);
34912 +                               break;
34913 +                       }
34914 +                       case MP_LINTSRC:
34915 +                       {
34916 +                               struct mpc_config_lintsrc *m=
34917 +                                       (struct mpc_config_lintsrc *)mpt;
34918 +                               MP_lintsrc_info(m);
34919 +                               mpt+=sizeof(*m);
34920 +                               count+=sizeof(*m);
34921 +                               break;
34922 +                       }
34923 +               }
34924 +       }
34925 +       clustered_apic_check();
34926 +       if (!num_processors)
34927 +               printk(KERN_ERR "SMP mptable: no processors registered!\n");
34928 +       return num_processors;
34929 +}
34930 +
34931 +static int __init ELCR_trigger(unsigned int irq)
34932 +{
34933 +       unsigned int port;
34934 +
34935 +       port = 0x4d0 + (irq >> 3);
34936 +       return (inb(port) >> (irq & 7)) & 1;
34937 +}
34938 +
34939 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
34940 +{
34941 +       struct mpc_config_intsrc intsrc;
34942 +       int i;
34943 +       int ELCR_fallback = 0;
34944 +
34945 +       intsrc.mpc_type = MP_INTSRC;
34946 +       intsrc.mpc_irqflag = 0;                 /* conforming */
34947 +       intsrc.mpc_srcbus = 0;
34948 +       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
34949 +
34950 +       intsrc.mpc_irqtype = mp_INT;
34951 +
34952 +       /*
34953 +        *  If true, we have an ISA/PCI system with no IRQ entries
34954 +        *  in the MP table. To prevent the PCI interrupts from being set up
34955 +        *  incorrectly, we try to use the ELCR. The sanity check to see if
34956 +        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
34957 +        *  never be level sensitive, so we simply see if the ELCR agrees.
34958 +        *  If it does, we assume it's valid.
34959 +        */
34960 +       if (mpc_default_type == 5) {
34961 +               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
34962 +
34963 +               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
34964 +                       printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
34965 +               else {
34966 +                       printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
34967 +                       ELCR_fallback = 1;
34968 +               }
34969 +       }
34970 +
34971 +       for (i = 0; i < 16; i++) {
34972 +               switch (mpc_default_type) {
34973 +               case 2:
34974 +                       if (i == 0 || i == 13)
34975 +                               continue;       /* IRQ0 & IRQ13 not connected */
34976 +                       /* fall through */
34977 +               default:
34978 +                       if (i == 2)
34979 +                               continue;       /* IRQ2 is never connected */
34980 +               }
34981 +
34982 +               if (ELCR_fallback) {
34983 +                       /*
34984 +                        *  If the ELCR indicates a level-sensitive interrupt, we
34985 +                        *  copy that information over to the MP table in the
34986 +                        *  irqflag field (level sensitive, active high polarity).
34987 +                        */
34988 +                       if (ELCR_trigger(i))
34989 +                               intsrc.mpc_irqflag = 13;
34990 +                       else
34991 +                               intsrc.mpc_irqflag = 0;
34992 +               }
34993 +
34994 +               intsrc.mpc_srcbusirq = i;
34995 +               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
34996 +               MP_intsrc_info(&intsrc);
34997 +       }
34998 +
34999 +       intsrc.mpc_irqtype = mp_ExtINT;
35000 +       intsrc.mpc_srcbusirq = 0;
35001 +       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
35002 +       MP_intsrc_info(&intsrc);
35003 +}
35004 +
35005 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
35006 +{
35007 +       struct mpc_config_processor processor;
35008 +       struct mpc_config_bus bus;
35009 +       struct mpc_config_ioapic ioapic;
35010 +       struct mpc_config_lintsrc lintsrc;
35011 +       int linttypes[2] = { mp_ExtINT, mp_NMI };
35012 +       int i;
35013 +
35014 +       /*
35015 +        * local APIC has default address
35016 +        */
35017 +       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
35018 +
35019 +       /*
35020 +        * 2 CPUs, numbered 0 & 1.
35021 +        */
35022 +       processor.mpc_type = MP_PROCESSOR;
35023 +       /* Either an integrated APIC or a discrete 82489DX. */
35024 +       processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
35025 +       processor.mpc_cpuflag = CPU_ENABLED;
35026 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
35027 +                                  (boot_cpu_data.x86_model << 4) |
35028 +                                  boot_cpu_data.x86_mask;
35029 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
35030 +       processor.mpc_reserved[0] = 0;
35031 +       processor.mpc_reserved[1] = 0;
35032 +       for (i = 0; i < 2; i++) {
35033 +               processor.mpc_apicid = i;
35034 +               MP_processor_info(&processor);
35035 +       }
35036 +
35037 +       bus.mpc_type = MP_BUS;
35038 +       bus.mpc_busid = 0;
35039 +       switch (mpc_default_type) {
35040 +               default:
35041 +                       printk(KERN_ERR "???\nUnknown standard configuration %d\n",
35042 +                               mpc_default_type);
35043 +                       /* fall through */
35044 +               case 1:
35045 +               case 5:
35046 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
35047 +                       break;
35048 +               case 2:
35049 +               case 6:
35050 +               case 3:
35051 +                       memcpy(bus.mpc_bustype, "EISA  ", 6);
35052 +                       break;
35053 +               case 4:
35054 +               case 7:
35055 +                       memcpy(bus.mpc_bustype, "MCA   ", 6);
35056 +       }
35057 +       MP_bus_info(&bus);
35058 +       if (mpc_default_type > 4) {
35059 +               bus.mpc_busid = 1;
35060 +               memcpy(bus.mpc_bustype, "PCI   ", 6);
35061 +               MP_bus_info(&bus);
35062 +       }
35063 +
35064 +       ioapic.mpc_type = MP_IOAPIC;
35065 +       ioapic.mpc_apicid = 2;
35066 +       ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
35067 +       ioapic.mpc_flags = MPC_APIC_USABLE;
35068 +       ioapic.mpc_apicaddr = 0xFEC00000;
35069 +       MP_ioapic_info(&ioapic);
35070 +
35071 +       /*
35072 +        * We set up most of the low 16 IO-APIC pins according to MPS rules.
35073 +        */
35074 +       construct_default_ioirq_mptable(mpc_default_type);
35075 +
35076 +       lintsrc.mpc_type = MP_LINTSRC;
35077 +       lintsrc.mpc_irqflag = 0;                /* conforming */
35078 +       lintsrc.mpc_srcbusid = 0;
35079 +       lintsrc.mpc_srcbusirq = 0;
35080 +       lintsrc.mpc_destapic = MP_APIC_ALL;
35081 +       for (i = 0; i < 2; i++) {
35082 +               lintsrc.mpc_irqtype = linttypes[i];
35083 +               lintsrc.mpc_destapiclint = i;
35084 +               MP_lintsrc_info(&lintsrc);
35085 +       }
35086 +}
35087 +
35088 +static struct intel_mp_floating *mpf_found;
35089 +
35090 +/*
35091 + * Scan the memory blocks for an SMP configuration block.
35092 + */
35093 +void __init get_smp_config (void)
35094 +{
35095 +       struct intel_mp_floating *mpf = mpf_found;
35096 +
35097 +       /*
35098 +        * ACPI supports both logical (e.g. Hyper-Threading) and physical 
35099 +        * processors, where MPS only supports physical.
35100 +        */
35101 +       if (acpi_lapic && acpi_ioapic) {
35102 +               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
35103 +               return;
35104 +       }
35105 +       else if (acpi_lapic)
35106 +               printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
35107 +
35108 +       printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
35109 +       if (mpf->mpf_feature2 & (1<<7)) {
35110 +               printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
35111 +               pic_mode = 1;
35112 +       } else {
35113 +               printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
35114 +               pic_mode = 0;
35115 +       }
35116 +
35117 +       /*
35118 +        * Now see if we need to read further.
35119 +        */
35120 +       if (mpf->mpf_feature1 != 0) {
35121 +
35122 +               printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
35123 +               construct_default_ISA_mptable(mpf->mpf_feature1);
35124 +
35125 +       } else if (mpf->mpf_physptr) {
35126 +
35127 +               /*
35128 +                * Read the physical hardware table.  Anything here will
35129 +                * override the defaults.
35130 +                */
35131 +               if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
35132 +                       smp_found_config = 0;
35133 +                       printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
35134 +                       printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
35135 +                       return;
35136 +               }
35137 +               /*
35138 +                * If there are no explicit MP IRQ entries, then we are
35139 +                * broken.  We set up most of the low 16 IO-APIC pins to
35140 +                * ISA defaults and hope it will work.
35141 +                */
35142 +               if (!mp_irq_entries) {
35143 +                       struct mpc_config_bus bus;
35144 +
35145 +                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
35146 +
35147 +                       bus.mpc_type = MP_BUS;
35148 +                       bus.mpc_busid = 0;
35149 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
35150 +                       MP_bus_info(&bus);
35151 +
35152 +                       construct_default_ioirq_mptable(0);
35153 +               }
35154 +
35155 +       } else
35156 +               BUG();
35157 +
35158 +       printk(KERN_INFO "Processors: %d\n", num_processors);
35159 +       /*
35160 +        * Only use the first configuration found.
35161 +        */
35162 +}
35163 +
35164 +static int __init smp_scan_config (unsigned long base, unsigned long length)
35165 +{
35166 +       extern void __bad_mpf_size(void); 
35167 +       unsigned int *bp = isa_bus_to_virt(base);
35168 +       struct intel_mp_floating *mpf;
35169 +
35170 +       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
35171 +       if (sizeof(*mpf) != 16)
35172 +               __bad_mpf_size();
35173 +
35174 +       while (length > 0) {
35175 +               mpf = (struct intel_mp_floating *)bp;
35176 +               if ((*bp == SMP_MAGIC_IDENT) &&
35177 +                       (mpf->mpf_length == 1) &&
35178 +                       !mpf_checksum((unsigned char *)bp, 16) &&
35179 +                       ((mpf->mpf_specification == 1)
35180 +                               || (mpf->mpf_specification == 4)) ) {
35181 +
35182 +                       smp_found_config = 1;
35183 +                       mpf_found = mpf;
35184 +                       return 1;
35185 +               }
35186 +               bp += 4;
35187 +               length -= 16;
35188 +       }
35189 +       return 0;
35190 +}
35191 +
35192 +void __init find_intel_smp (void)
35193 +{
35194 +       unsigned int address;
35195 +
35196 +       /*
35197 +        * FIXME: Linux assumes you have 640K of base ram..
35198 +        * this continues the error...
35199 +        *
35200 +        * 1) Scan the bottom 1K for a signature
35201 +        * 2) Scan the top 1K of base RAM
35202 +        * 3) Scan the 64K of bios
35203 +        */
35204 +       if (smp_scan_config(0x0,0x400) ||
35205 +               smp_scan_config(639*0x400,0x400) ||
35206 +                       smp_scan_config(0xF0000,0x10000))
35207 +               return;
35208 +       /*
35209 +        * If it is an SMP machine we should know now, unless the
35210 +        * configuration is in an EISA/MCA bus machine with an
35211 +        * extended bios data area.
35212 +        *
35213 +        * there is a real-mode segmented pointer pointing to the
35214 +        * 4K EBDA area at 0x40E, calculate and scan it here.
35215 +        *
35216 +        * NOTE! There are Linux loaders that will corrupt the EBDA
35217 +        * area, and as such this kind of SMP config may be less
35218 +        * trustworthy, simply because the SMP table may have been
35219 +        * stomped on during early boot. These loaders are buggy and
35220 +        * should be fixed.
35221 +        */
35222 +
35223 +       address = *(unsigned short *)phys_to_virt(0x40E);
35224 +       address <<= 4;
35225 +       if (smp_scan_config(address, 0x1000))
35226 +               return;
35227 +
35228 +       /* If we have come this far, we did not find an MP table  */
35229 +        printk(KERN_INFO "No mptable found.\n");
35230 +}
35231 +
35232 +/*
35233 + * - Intel MP Configuration Table
35234 + */
35235 +void __init find_smp_config (void)
35236 +{
35237 +#ifdef CONFIG_X86_LOCAL_APIC
35238 +       find_intel_smp();
35239 +#endif
35240 +}
35241 +
35242 +
35243 +/* --------------------------------------------------------------------------
35244 +                            ACPI-based MP Configuration
35245 +   -------------------------------------------------------------------------- */
35246 +
35247 +#ifdef CONFIG_ACPI
35248 +
35249 +void __init mp_register_lapic_address (
35250 +       u64                     address)
35251 +{
35252 +#ifndef CONFIG_XEN
35253 +       mp_lapic_addr = (unsigned long) address;
35254 +
35255 +       set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
35256 +
35257 +       if (boot_cpu_id == -1U)
35258 +               boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
35259 +
35260 +       Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
35261 +#endif
35262 +}
35263 +
35264 +
35265 +void __cpuinit mp_register_lapic (
35266 +       u8                      id, 
35267 +       u8                      enabled)
35268 +{
35269 +       struct mpc_config_processor processor;
35270 +       int                     boot_cpu = 0;
35271 +       
35272 +       if (id >= MAX_APICS) {
35273 +               printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
35274 +                       id, MAX_APICS);
35275 +               return;
35276 +       }
35277 +
35278 +       if (id == boot_cpu_physical_apicid)
35279 +               boot_cpu = 1;
35280 +
35281 +#ifndef CONFIG_XEN
35282 +       processor.mpc_type = MP_PROCESSOR;
35283 +       processor.mpc_apicid = id;
35284 +       processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
35285 +       processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
35286 +       processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
35287 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 
35288 +               (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
35289 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
35290 +       processor.mpc_reserved[0] = 0;
35291 +       processor.mpc_reserved[1] = 0;
35292 +#endif
35293 +
35294 +       MP_processor_info(&processor);
35295 +}
35296 +
35297 +#ifdef CONFIG_X86_IO_APIC
35298 +
35299 +#define MP_ISA_BUS             0
35300 +#define MP_MAX_IOAPIC_PIN      127
35301 +
35302 +static struct mp_ioapic_routing {
35303 +       int                     apic_id;
35304 +       int                     gsi_start;
35305 +       int                     gsi_end;
35306 +       u32                     pin_programmed[4];
35307 +} mp_ioapic_routing[MAX_IO_APICS];
35308 +
35309 +
35310 +static int mp_find_ioapic (
35311 +       int                     gsi)
35312 +{
35313 +       int                     i = 0;
35314 +
35315 +       /* Find the IOAPIC that manages this GSI. */
35316 +       for (i = 0; i < nr_ioapics; i++) {
35317 +               if ((gsi >= mp_ioapic_routing[i].gsi_start)
35318 +                       && (gsi <= mp_ioapic_routing[i].gsi_end))
35319 +                       return i;
35320 +       }
35321 +
35322 +       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
35323 +
35324 +       return -1;
35325 +}
35326 +       
35327 +
35328 +void __init mp_register_ioapic (
35329 +       u8                      id, 
35330 +       u32                     address,
35331 +       u32                     gsi_base)
35332 +{
35333 +       int                     idx = 0;
35334 +
35335 +       if (nr_ioapics >= MAX_IO_APICS) {
35336 +               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
35337 +                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
35338 +               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
35339 +       }
35340 +       if (!address) {
35341 +               printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
35342 +                       " found in MADT table, skipping!\n");
35343 +               return;
35344 +       }
35345 +
35346 +       idx = nr_ioapics++;
35347 +
35348 +       mp_ioapics[idx].mpc_type = MP_IOAPIC;
35349 +       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
35350 +       mp_ioapics[idx].mpc_apicaddr = address;
35351 +
35352 +#ifndef CONFIG_XEN
35353 +       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
35354 +#endif
35355 +       mp_ioapics[idx].mpc_apicid = id;
35356 +       mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
35357 +       
35358 +       /* 
35359 +        * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
35360 +        * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
35361 +        */
35362 +       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
35363 +       mp_ioapic_routing[idx].gsi_start = gsi_base;
35364 +       mp_ioapic_routing[idx].gsi_end = gsi_base + 
35365 +               io_apic_get_redir_entries(idx);
35366 +
35367 +       printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
35368 +               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
35369 +               mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
35370 +               mp_ioapic_routing[idx].gsi_start,
35371 +               mp_ioapic_routing[idx].gsi_end);
35372 +
35373 +       return;
35374 +}
35375 +
35376 +
35377 +void __init mp_override_legacy_irq (
35378 +       u8                      bus_irq,
35379 +       u8                      polarity, 
35380 +       u8                      trigger, 
35381 +       u32                     gsi)
35382 +{
35383 +       struct mpc_config_intsrc intsrc;
35384 +       int                     ioapic = -1;
35385 +       int                     pin = -1;
35386 +
35387 +       /* 
35388 +        * Convert 'gsi' to 'ioapic.pin'.
35389 +        */
35390 +       ioapic = mp_find_ioapic(gsi);
35391 +       if (ioapic < 0)
35392 +               return;
35393 +       pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
35394 +
35395 +       /*
35396 +        * TBD: This check is for faulty timer entries, where the override
35397 +        *      erroneously sets the trigger to level, resulting in a HUGE 
35398 +        *      increase of timer interrupts!
35399 +        */
35400 +       if ((bus_irq == 0) && (trigger == 3))
35401 +               trigger = 1;
35402 +
35403 +       intsrc.mpc_type = MP_INTSRC;
35404 +       intsrc.mpc_irqtype = mp_INT;
35405 +       intsrc.mpc_irqflag = (trigger << 2) | polarity;
35406 +       intsrc.mpc_srcbus = MP_ISA_BUS;
35407 +       intsrc.mpc_srcbusirq = bus_irq;                                /* IRQ */
35408 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;        /* APIC ID */
35409 +       intsrc.mpc_dstirq = pin;                                    /* INTIN# */
35410 +
35411 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", 
35412 +               intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
35413 +               (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
35414 +               intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
35415 +
35416 +       mp_irqs[mp_irq_entries] = intsrc;
35417 +       if (++mp_irq_entries == MAX_IRQ_SOURCES)
35418 +               panic("Max # of irq sources exceeded!\n");
35419 +
35420 +       return;
35421 +}
35422 +
35423 +
35424 +void __init mp_config_acpi_legacy_irqs (void)
35425 +{
35426 +       struct mpc_config_intsrc intsrc;
35427 +       int                     i = 0;
35428 +       int                     ioapic = -1;
35429 +
35430 +       /* 
35431 +        * Fabricate the legacy ISA bus (bus #31).
35432 +        */
35433 +       mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
35434 +       Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
35435 +
35436 +       /* 
35437 +        * Locate the IOAPIC that manages the ISA IRQs (0-15). 
35438 +        */
35439 +       ioapic = mp_find_ioapic(0);
35440 +       if (ioapic < 0)
35441 +               return;
35442 +
35443 +       intsrc.mpc_type = MP_INTSRC;
35444 +       intsrc.mpc_irqflag = 0;                                 /* Conforming */
35445 +       intsrc.mpc_srcbus = MP_ISA_BUS;
35446 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
35447 +
35448 +       /* 
35449 +        * Use the default configuration for the IRQs 0-15.  Unless
35450 +        * overridden by (MADT) interrupt source override entries.
35451 +        */
35452 +       for (i = 0; i < 16; i++) {
35453 +               int idx;
35454 +
35455 +               for (idx = 0; idx < mp_irq_entries; idx++) {
35456 +                       struct mpc_config_intsrc *irq = mp_irqs + idx;
35457 +
35458 +                       /* Do we already have a mapping for this ISA IRQ? */
35459 +                       if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
35460 +                               break;
35461 +
35462 +                       /* Do we already have a mapping for this IOAPIC pin */
35463 +                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
35464 +                               (irq->mpc_dstirq == i))
35465 +                               break;
35466 +               }
35467 +
35468 +               if (idx != mp_irq_entries) {
35469 +                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
35470 +                       continue;                       /* IRQ already used */
35471 +               }
35472 +
35473 +               intsrc.mpc_irqtype = mp_INT;
35474 +               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
35475 +               intsrc.mpc_dstirq = i;
35476 +
35477 +               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
35478 +                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
35479 +                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
35480 +                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
35481 +                       intsrc.mpc_dstirq);
35482 +
35483 +               mp_irqs[mp_irq_entries] = intsrc;
35484 +               if (++mp_irq_entries == MAX_IRQ_SOURCES)
35485 +                       panic("Max # of irq sources exceeded!\n");
35486 +       }
35487 +
35488 +       return;
35489 +}
35490 +
35491 +#define MAX_GSI_NUM    4096
35492 +
35493 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
35494 +{
35495 +       int                     ioapic = -1;
35496 +       int                     ioapic_pin = 0;
35497 +       int                     idx, bit = 0;
35498 +       static int              pci_irq = 16;
35499 +       /*
35500 +        * Mapping between Global System Interrupts, which
35501 +        * represent all possible interrupts, to the IRQs
35502 +        * assigned to actual devices.
35503 +        */
35504 +       static int              gsi_to_irq[MAX_GSI_NUM];
35505 +
35506 +       if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
35507 +               return gsi;
35508 +
35509 +       /* Don't set up the ACPI SCI because it's already set up */
35510 +       if (acpi_fadt.sci_int == gsi)
35511 +               return gsi;
35512 +
35513 +       ioapic = mp_find_ioapic(gsi);
35514 +       if (ioapic < 0) {
35515 +               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
35516 +               return gsi;
35517 +       }
35518 +
35519 +       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
35520 +
35521 +       /* 
35522 +        * Avoid pin reprogramming.  PRTs typically include entries  
35523 +        * with redundant pin->gsi mappings (but unique PCI devices);
35524 +        * we only program the IOAPIC on the first.
35525 +        */
35526 +       bit = ioapic_pin % 32;
35527 +       idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
35528 +       if (idx > 3) {
35529 +               printk(KERN_ERR "Invalid reference to IOAPIC pin "
35530 +                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 
35531 +                       ioapic_pin);
35532 +               return gsi;
35533 +       }
35534 +       if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
35535 +               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
35536 +                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
35537 +               return gsi_to_irq[gsi];
35538 +       }
35539 +
35540 +       mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
35541 +
35542 +       if (triggering == ACPI_LEVEL_SENSITIVE) {
35543 +               /*
35544 +                * For PCI devices assign IRQs in order, avoiding gaps
35545 +                * due to unused I/O APIC pins.
35546 +                */
35547 +               int irq = gsi;
35548 +               if (gsi < MAX_GSI_NUM) {
35549 +                       /*
35550 +                        * Retain the VIA chipset work-around (gsi > 15), but
35551 +                        * avoid a problem where the 8254 timer (IRQ0) is setup
35552 +                        * via an override (so it's not on pin 0 of the ioapic),
35553 +                        * and at the same time, the pin 0 interrupt is a PCI
35554 +                        * type.  The gsi > 15 test could cause these two pins
35555 +                        * to be shared as IRQ0, and they are not shareable.
35556 +                        * So test for this condition, and if necessary, avoid
35557 +                        * the pin collision.
35558 +                        */
35559 +                       if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
35560 +                               gsi = pci_irq++;
35561 +                       /*
35562 +                        * Don't assign IRQ used by ACPI SCI
35563 +                        */
35564 +                       if (gsi == acpi_fadt.sci_int)
35565 +                               gsi = pci_irq++;
35566 +                       gsi_to_irq[irq] = gsi;
35567 +               } else {
35568 +                       printk(KERN_ERR "GSI %u is too high\n", gsi);
35569 +                       return gsi;
35570 +               }
35571 +       }
35572 +
35573 +       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
35574 +               triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
35575 +               polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
35576 +       return gsi;
35577 +}
35578 +
35579 +#endif /*CONFIG_X86_IO_APIC*/
35580 +#endif /*CONFIG_ACPI*/
35581 diff -urNp linux-2.6/arch/x86_64/kernel/pci-swiotlb-xen.c new/arch/x86_64/kernel/pci-swiotlb-xen.c
35582 --- linux-2.6/arch/x86_64/kernel/pci-swiotlb-xen.c      1970-01-01 01:00:00.000000000 +0100
35583 +++ new/arch/x86_64/kernel/pci-swiotlb-xen.c    2006-05-09 12:33:20.000000000 +0200
35584 @@ -0,0 +1,54 @@
35585 +/* Glue code to lib/swiotlb.c */
35586 +
35587 +#include <linux/pci.h>
35588 +#include <linux/cache.h>
35589 +#include <linux/module.h>
35590 +#include <asm/dma-mapping.h>
35591 +#include <asm/proto.h>
35592 +#include <asm/swiotlb.h>
35593 +#include <asm/dma.h>
35594 +
35595 +#if 0
35596 +int swiotlb __read_mostly;
35597 +EXPORT_SYMBOL(swiotlb);
35598 +#endif
35599 +
35600 +struct dma_mapping_ops swiotlb_dma_ops = {
35601 +#if 0
35602 +       .mapping_error = swiotlb_dma_mapping_error,
35603 +       .alloc_coherent = swiotlb_alloc_coherent,
35604 +       .free_coherent = swiotlb_free_coherent,
35605 +       .map_single = swiotlb_map_single,
35606 +       .unmap_single = swiotlb_unmap_single,
35607 +       .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
35608 +       .sync_single_for_device = swiotlb_sync_single_for_device,
35609 +       .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
35610 +       .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
35611 +       .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
35612 +       .sync_sg_for_device = swiotlb_sync_sg_for_device,
35613 +       .map_sg = swiotlb_map_sg,
35614 +       .unmap_sg = swiotlb_unmap_sg,
35615 +       .dma_supported = NULL,
35616 +#endif
35617 +};
35618 +
35619 +void pci_swiotlb_init(void)
35620 +{
35621 +#if 0
35622 +       /* don't initialize swiotlb if iommu=off (no_iommu=1) */
35623 +       if (!iommu_aperture && !no_iommu &&
35624 +           (end_pfn > MAX_DMA32_PFN || force_iommu))
35625 +              swiotlb = 1;
35626 +       if (swiotlb) {
35627 +               printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
35628 +               swiotlb_init();
35629 +               dma_ops = &swiotlb_dma_ops;
35630 +       }
35631 +#else
35632 +       swiotlb_init();
35633 +       if (swiotlb) {
35634 +               printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
35635 +               dma_ops = &swiotlb_dma_ops;
35636 +       }
35637 +#endif
35638 +}
35639 diff -urNp linux-2.6/arch/x86_64/kernel/process-xen.c new/arch/x86_64/kernel/process-xen.c
35640 --- linux-2.6/arch/x86_64/kernel/process-xen.c  1970-01-01 01:00:00.000000000 +0100
35641 +++ new/arch/x86_64/kernel/process-xen.c        2006-06-07 13:15:16.000000000 +0200
35642 @@ -0,0 +1,791 @@
35643 +/*
35644 + *  linux/arch/x86-64/kernel/process.c
35645 + *
35646 + *  Copyright (C) 1995  Linus Torvalds
35647 + *
35648 + *  Pentium III FXSR, SSE support
35649 + *     Gareth Hughes <gareth@valinux.com>, May 2000
35650 + * 
35651 + *  X86-64 port
35652 + *     Andi Kleen.
35653 + *
35654 + *     CPU hotplug support - ashok.raj@intel.com
35655 + *  $Id$
35656 + * 
35657 + *  Jun Nakajima <jun.nakajima@intel.com> 
35658 + *     Modified for Xen
35659 + */
35660 +
35661 +/*
35662 + * This file handles the architecture-dependent parts of process handling..
35663 + */
35664 +
35665 +#include <stdarg.h>
35666 +
35667 +#include <linux/cpu.h>
35668 +#include <linux/errno.h>
35669 +#include <linux/sched.h>
35670 +#include <linux/kernel.h>
35671 +#include <linux/mm.h>
35672 +#include <linux/elfcore.h>
35673 +#include <linux/smp.h>
35674 +#include <linux/slab.h>
35675 +#include <linux/user.h>
35676 +#include <linux/module.h>
35677 +#include <linux/a.out.h>
35678 +#include <linux/interrupt.h>
35679 +#include <linux/delay.h>
35680 +#include <linux/ptrace.h>
35681 +#include <linux/utsname.h>
35682 +#include <linux/random.h>
35683 +#include <linux/notifier.h>
35684 +#include <linux/kprobes.h>
35685 +
35686 +#include <asm/uaccess.h>
35687 +#include <asm/pgtable.h>
35688 +#include <asm/system.h>
35689 +#include <asm/io.h>
35690 +#include <asm/processor.h>
35691 +#include <asm/i387.h>
35692 +#include <asm/mmu_context.h>
35693 +#include <asm/pda.h>
35694 +#include <asm/prctl.h>
35695 +#include <asm/kdebug.h>
35696 +#include <xen/interface/dom0_ops.h>
35697 +#include <xen/interface/physdev.h>
35698 +#include <xen/interface/vcpu.h>
35699 +#include <asm/desc.h>
35700 +#include <asm/proto.h>
35701 +#include <asm/hardirq.h>
35702 +#include <asm/ia32.h>
35703 +#include <asm/idle.h>
35704 +
35705 +#include <xen/cpu_hotplug.h>
35706 +
35707 +asmlinkage extern void ret_from_fork(void);
35708 +
35709 +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
35710 +
35711 +unsigned long boot_option_idle_override = 0;
35712 +EXPORT_SYMBOL(boot_option_idle_override);
35713 +
35714 +/*
35715 + * Power management idle function, if any.
35716 + */
35717 +void (*pm_idle)(void);
35718 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
35719 +
35720 +static ATOMIC_NOTIFIER_HEAD(idle_notifier);
35721 +
35722 +void idle_notifier_register(struct notifier_block *n)
35723 +{
35724 +       atomic_notifier_chain_register(&idle_notifier, n);
35725 +}
35726 +EXPORT_SYMBOL_GPL(idle_notifier_register);
35727 +
35728 +void idle_notifier_unregister(struct notifier_block *n)
35729 +{
35730 +       atomic_notifier_chain_unregister(&idle_notifier, n);
35731 +}
35732 +EXPORT_SYMBOL(idle_notifier_unregister);
35733 +
35734 +enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
35735 +static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
35736 +
35737 +void enter_idle(void)
35738 +{
35739 +       __get_cpu_var(idle_state) = CPU_IDLE;
35740 +       atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
35741 +}
35742 +
35743 +static void __exit_idle(void)
35744 +{
35745 +       __get_cpu_var(idle_state) = CPU_NOT_IDLE;
35746 +       atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
35747 +}
35748 +
35749 +/* Called from interrupts to signify idle end */
35750 +void exit_idle(void)
35751 +{
35752 +       if (current->pid | read_pda(irqcount))
35753 +               return;
35754 +       __exit_idle();
35755 +}
35756 +
35757 +/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
35758 +void xen_idle(void)
35759 +{
35760 +       local_irq_disable();
35761 +
35762 +       if (need_resched())
35763 +               local_irq_enable();
35764 +       else {
35765 +               clear_thread_flag(TIF_POLLING_NRFLAG);
35766 +               smp_mb__after_clear_bit();
35767 +               safe_halt();
35768 +               set_thread_flag(TIF_POLLING_NRFLAG);
35769 +       }
35770 +}
35771 +
35772 +#ifdef CONFIG_HOTPLUG_CPU
35773 +static inline void play_dead(void)
35774 +{
35775 +       idle_task_exit();
35776 +       local_irq_disable();
35777 +       cpu_clear(smp_processor_id(), cpu_initialized);
35778 +       preempt_enable_no_resched();
35779 +       HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
35780 +       cpu_bringup();
35781 +}
35782 +#else
35783 +static inline void play_dead(void)
35784 +{
35785 +       BUG();
35786 +}
35787 +#endif /* CONFIG_HOTPLUG_CPU */
35788 +
35789 +/*
35790 + * The idle thread. There's no useful work to be
35791 + * done, so just try to conserve power and have a
35792 + * low exit latency (ie sit in a loop waiting for
35793 + * somebody to say that they'd like to reschedule)
35794 + */
35795 +void cpu_idle (void)
35796 +{
35797 +       set_thread_flag(TIF_POLLING_NRFLAG);
35798 +
35799 +       /* endless idle loop with no priority at all */
35800 +       while (1) {
35801 +               while (!need_resched()) {
35802 +                       if (__get_cpu_var(cpu_idle_state))
35803 +                               __get_cpu_var(cpu_idle_state) = 0;
35804 +                       rmb();
35805 +                       
35806 +                       if (cpu_is_offline(smp_processor_id()))
35807 +                               play_dead();
35808 +                       enter_idle();
35809 +                       xen_idle();
35810 +                       __exit_idle();
35811 +               }
35812 +
35813 +               preempt_enable_no_resched();
35814 +               schedule();
35815 +               preempt_disable();
35816 +       }
35817 +}
35818 +
35819 +void cpu_idle_wait(void)
35820 +{
35821 +       unsigned int cpu, this_cpu = get_cpu();
35822 +       cpumask_t map;
35823 +
35824 +       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
35825 +       put_cpu();
35826 +
35827 +       cpus_clear(map);
35828 +       for_each_online_cpu(cpu) {
35829 +               per_cpu(cpu_idle_state, cpu) = 1;
35830 +               cpu_set(cpu, map);
35831 +       }
35832 +
35833 +       __get_cpu_var(cpu_idle_state) = 0;
35834 +
35835 +       wmb();
35836 +       do {
35837 +               ssleep(1);
35838 +               for_each_online_cpu(cpu) {
35839 +                       if (cpu_isset(cpu, map) &&
35840 +                                       !per_cpu(cpu_idle_state, cpu))
35841 +                               cpu_clear(cpu, map);
35842 +               }
35843 +               cpus_and(map, map, cpu_online_map);
35844 +       } while (!cpus_empty(map));
35845 +}
35846 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
35847 +
35848 +/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
35849 +/* Always use xen_idle() instead. */
35850 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) {}
35851 +
35852 +/* Prints also some state that isn't saved in the pt_regs */ 
35853 +void __show_regs(struct pt_regs * regs)
35854 +{
35855 +       unsigned long fs, gs, shadowgs;
35856 +       unsigned int fsindex,gsindex;
35857 +       unsigned int ds,cs,es; 
35858 +
35859 +       printk("\n");
35860 +       print_modules();
35861 +       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
35862 +               current->pid, current->comm, print_tainted(),
35863 +               system_utsname.release,
35864 +               (int)strcspn(system_utsname.version, " "),
35865 +               system_utsname.version);
35866 +       printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
35867 +       printk_address(regs->rip); 
35868 +       printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
35869 +               regs->eflags);
35870 +       printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
35871 +              regs->rax, regs->rbx, regs->rcx);
35872 +       printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
35873 +              regs->rdx, regs->rsi, regs->rdi); 
35874 +       printk("RBP: %016lx R08: %016lx R09: %016lx\n",
35875 +              regs->rbp, regs->r8, regs->r9); 
35876 +       printk("R10: %016lx R11: %016lx R12: %016lx\n",
35877 +              regs->r10, regs->r11, regs->r12); 
35878 +       printk("R13: %016lx R14: %016lx R15: %016lx\n",
35879 +              regs->r13, regs->r14, regs->r15); 
35880 +
35881 +       asm("mov %%ds,%0" : "=r" (ds)); 
35882 +       asm("mov %%cs,%0" : "=r" (cs)); 
35883 +       asm("mov %%es,%0" : "=r" (es)); 
35884 +       asm("mov %%fs,%0" : "=r" (fsindex));
35885 +       asm("mov %%gs,%0" : "=r" (gsindex));
35886 +
35887 +       rdmsrl(MSR_FS_BASE, fs);
35888 +       rdmsrl(MSR_GS_BASE, gs); 
35889 +       rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 
35890 +
35891 +       printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 
35892 +              fs,fsindex,gs,gsindex,shadowgs); 
35893 +       printk("CS:  %04x DS: %04x ES: %04x\n", cs, ds, es); 
35894 +
35895 +}
35896 +
35897 +void show_regs(struct pt_regs *regs)
35898 +{
35899 +       printk("CPU %d:", smp_processor_id());
35900 +       __show_regs(regs);
35901 +       show_trace(&regs->rsp);
35902 +}
35903 +
35904 +/*
35905 + * Free current thread data structures etc..
35906 + */
35907 +void exit_thread(void)
35908 +{
35909 +       struct task_struct *me = current;
35910 +       struct thread_struct *t = &me->thread;
35911 +
35912 +       if (me->thread.io_bitmap_ptr) { 
35913 +#ifndef CONFIG_X86_NO_TSS
35914 +               struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
35915 +#endif
35916 +#ifdef CONFIG_XEN
35917 +               struct physdev_set_iobitmap iobmp_op = { 0 };
35918 +#endif
35919 +
35920 +               kfree(t->io_bitmap_ptr);
35921 +               t->io_bitmap_ptr = NULL;
35922 +               /*
35923 +                * Careful, clear this in the TSS too:
35924 +                */
35925 +#ifndef CONFIG_X86_NO_TSS
35926 +               memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
35927 +               put_cpu();
35928 +#endif
35929 +#ifdef CONFIG_XEN
35930 +               HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobmp_op);
35931 +#endif
35932 +               t->io_bitmap_max = 0;
35933 +       }
35934 +}
35935 +
35936 +void load_gs_index(unsigned gs)
35937 +{
35938 +       HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs);
35939 +}
35940 +
35941 +void flush_thread(void)
35942 +{
35943 +       struct task_struct *tsk = current;
35944 +       struct thread_info *t = current_thread_info();
35945 +
35946 +       if (t->flags & _TIF_ABI_PENDING)
35947 +               t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
35948 +
35949 +       tsk->thread.debugreg0 = 0;
35950 +       tsk->thread.debugreg1 = 0;
35951 +       tsk->thread.debugreg2 = 0;
35952 +       tsk->thread.debugreg3 = 0;
35953 +       tsk->thread.debugreg6 = 0;
35954 +       tsk->thread.debugreg7 = 0;
35955 +       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
35956 +       /*
35957 +        * Forget coprocessor state..
35958 +        */
35959 +       clear_fpu(tsk);
35960 +       clear_used_math();
35961 +}
35962 +
35963 +void release_thread(struct task_struct *dead_task)
35964 +{
35965 +       if (dead_task->mm) {
35966 +               if (dead_task->mm->context.size) {
35967 +                       printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
35968 +                                       dead_task->comm,
35969 +                                       dead_task->mm->context.ldt,
35970 +                                       dead_task->mm->context.size);
35971 +                       BUG();
35972 +               }
35973 +       }
35974 +}
35975 +
35976 +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
35977 +{
35978 +       struct user_desc ud = { 
35979 +               .base_addr = addr,
35980 +               .limit = 0xfffff,
35981 +               .contents = (3 << 3), /* user */
35982 +               .seg_32bit = 1,
35983 +               .limit_in_pages = 1,
35984 +               .useable = 1,
35985 +       };
35986 +       struct n_desc_struct *desc = (void *)t->thread.tls_array;
35987 +       desc += tls;
35988 +       desc->a = LDT_entry_a(&ud); 
35989 +       desc->b = LDT_entry_b(&ud); 
35990 +}
35991 +
35992 +static inline u32 read_32bit_tls(struct task_struct *t, int tls)
35993 +{
35994 +       struct desc_struct *desc = (void *)t->thread.tls_array;
35995 +       desc += tls;
35996 +       return desc->base0 | 
35997 +               (((u32)desc->base1) << 16) | 
35998 +               (((u32)desc->base2) << 24);
35999 +}
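For illustration only (not part of the patch): read_32bit_tls() above reassembles the 32-bit base address from the three base fields of a GDT descriptor (16 + 8 + 8 bits). A self-contained round-trip of that packing, using a stand-in struct:

/* Round-trip of the 32-bit TLS base packing: base0 holds bits 0-15,
 * base1 bits 16-23 and base2 bits 24-31 of the descriptor base. */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

struct fake_desc {
        uint16_t base0;
        uint8_t  base1;
        uint8_t  base2;
};

static void set_base(struct fake_desc *d, uint32_t addr)
{
        d->base0 = addr & 0xffff;
        d->base1 = (addr >> 16) & 0xff;
        d->base2 = (addr >> 24) & 0xff;
}

static uint32_t get_base(const struct fake_desc *d)
{
        return d->base0 | ((uint32_t)d->base1 << 16) | ((uint32_t)d->base2 << 24);
}

int main(void)
{
        struct fake_desc d;

        set_base(&d, 0xdeadbeef);
        assert(get_base(&d) == 0xdeadbeef);
        printf("base round-trips: %#x\n", get_base(&d));
        return 0;
}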
36000 +
36001 +/*
36002 + * This gets called before we allocate a new thread and copy
36003 + * the current task into it.
36004 + */
36005 +void prepare_to_copy(struct task_struct *tsk)
36006 +{
36007 +       unlazy_fpu(tsk);
36008 +}
36009 +
36010 +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 
36011 +               unsigned long unused,
36012 +       struct task_struct * p, struct pt_regs * regs)
36013 +{
36014 +       int err;
36015 +       struct pt_regs * childregs;
36016 +       struct task_struct *me = current;
36017 +
36018 +       childregs = ((struct pt_regs *)
36019 +                       (THREAD_SIZE + task_stack_page(p))) - 1;
36020 +       *childregs = *regs;
36021 +
36022 +       childregs->rax = 0;
36023 +       childregs->rsp = rsp;
36024 +       if (rsp == ~0UL)
36025 +               childregs->rsp = (unsigned long)childregs;
36026 +
36027 +       p->thread.rsp = (unsigned long) childregs;
36028 +       p->thread.rsp0 = (unsigned long) (childregs+1);
36029 +       p->thread.userrsp = me->thread.userrsp; 
36030 +
36031 +       set_tsk_thread_flag(p, TIF_FORK);
36032 +
36033 +       p->thread.fs = me->thread.fs;
36034 +       p->thread.gs = me->thread.gs;
36035 +
36036 +       asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
36037 +       asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
36038 +       asm("mov %%es,%0" : "=m" (p->thread.es));
36039 +       asm("mov %%ds,%0" : "=m" (p->thread.ds));
36040 +
36041 +       if (unlikely(me->thread.io_bitmap_ptr != NULL)) { 
36042 +               p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
36043 +               if (!p->thread.io_bitmap_ptr) {
36044 +                       p->thread.io_bitmap_max = 0;
36045 +                       return -ENOMEM;
36046 +               }
36047 +               memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
36048 +                               IO_BITMAP_BYTES);
36049 +       } 
36050 +
36051 +       /*
36052 +        * Set a new TLS for the child thread?
36053 +        */
36054 +       if (clone_flags & CLONE_SETTLS) {
36055 +#ifdef CONFIG_IA32_EMULATION
36056 +               if (test_thread_flag(TIF_IA32))
36057 +                       err = ia32_child_tls(p, childregs); 
36058 +               else                    
36059 +#endif  
36060 +                       err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 
36061 +               if (err) 
36062 +                       goto out;
36063 +       }
36064 +        p->thread.iopl = current->thread.iopl;
36065 +
36066 +       err = 0;
36067 +out:
36068 +       if (err && p->thread.io_bitmap_ptr) {
36069 +               kfree(p->thread.io_bitmap_ptr);
36070 +               p->thread.io_bitmap_max = 0;
36071 +       }
36072 +       return err;
36073 +}
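For illustration only: copy_thread() places the child's pt_regs frame at the very top of the new thread stack (childregs = top_of_stack - 1) and points thread.rsp0 just past it. The pointer arithmetic can be checked in isolation; THREAD_SIZE and the struct size below are illustrative values, not the kernel's:

/* Isolated check of the "pt_regs at the top of the stack" arithmetic:
 * the register frame ends exactly at stack + THREAD_SIZE. */
#include <stdio.h>
#include <stdlib.h>

#define THREAD_SIZE 8192                /* illustrative value only */

struct fake_pt_regs { unsigned long r[27]; };

int main(void)
{
        char *stack = malloc(THREAD_SIZE);
        struct fake_pt_regs *childregs =
                ((struct fake_pt_regs *)(stack + THREAD_SIZE)) - 1;

        printf("stack top     = %p\n", (void *)(stack + THREAD_SIZE));
        printf("childregs     = %p\n", (void *)childregs);
        printf("childregs + 1 = %p (== stack top)\n", (void *)(childregs + 1));
        free(stack);
        return 0;
}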
36074 +
36075 +static inline void __save_init_fpu( struct task_struct *tsk )
36076 +{
36077 +       asm volatile( "rex64 ; fxsave %0 ; fnclex"
36078 +                     : "=m" (tsk->thread.i387.fxsave));
36079 +       tsk->thread_info->status &= ~TS_USEDFPU;
36080 +}
36081 +
36082 +/*
36083 + *     switch_to(x,y) should switch tasks from x to y.
36084 + *
36085 + * This could still be optimized: 
36086 + * - fold all the options into a flag word and test it with a single test.
36087 + * - could test fs/gs bitsliced
36088 + *
36089 + * Kprobes not supported here. Set the probe on schedule instead.
36090 + */
36091 +__kprobes struct task_struct *
36092 +__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
36093 +{
36094 +       struct thread_struct *prev = &prev_p->thread,
36095 +                                *next = &next_p->thread;
36096 +       int cpu = smp_processor_id();  
36097 +#ifndef CONFIG_X86_NO_TSS
36098 +       struct tss_struct *tss = &per_cpu(init_tss, cpu);
36099 +#endif
36100 +       struct physdev_set_iopl iopl_op;
36101 +       struct physdev_set_iobitmap iobmp_op;
36102 +       multicall_entry_t _mcl[8], *mcl = _mcl;
36103 +
36104 +       /*
36105 +        * This is basically '__unlazy_fpu', except that we queue a
36106 +        * multicall to indicate FPU task switch, rather than
36107 +        * synchronously trapping to Xen.
36108 +        */
36109 +       if (prev_p->thread_info->status & TS_USEDFPU) {
36110 +               __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
36111 +               mcl->op      = __HYPERVISOR_fpu_taskswitch;
36112 +               mcl->args[0] = 1;
36113 +               mcl++;
36114 +       }
36115 +
36116 +       /*
36117 +        * Reload esp0, LDT and the page table pointer:
36118 +        */
36119 +       mcl->op      = __HYPERVISOR_stack_switch;
36120 +       mcl->args[0] = __KERNEL_DS;
36121 +       mcl->args[1] = next->rsp0;
36122 +       mcl++;
36123 +
36124 +       /*
36125 +        * Load the per-thread Thread-Local Storage descriptor.
36126 +        * This is load_TLS(next, cpu) with multicalls.
36127 +        */
36128 +#define C(i) do {                                                      \
36129 +       if (unlikely(next->tls_array[i] != prev->tls_array[i])) {       \
36130 +               mcl->op      = __HYPERVISOR_update_descriptor;          \
36131 +               mcl->args[0] = virt_to_machine(                         \
36132 +                       &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]);          \
36133 +               mcl->args[1] = next->tls_array[i];                      \
36134 +               mcl++;                                                  \
36135 +       }                                                               \
36136 +} while (0)
36137 +       C(0); C(1); C(2);
36138 +#undef C
36139 +
36140 +       if (unlikely(prev->iopl != next->iopl)) {
36141 +               iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl;
36142 +               mcl->op      = __HYPERVISOR_physdev_op;
36143 +               mcl->args[0] = PHYSDEVOP_set_iopl;
36144 +               mcl->args[1] = (unsigned long)&iopl_op;
36145 +               mcl++;
36146 +       }
36147 +
36148 +       if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
36149 +               iobmp_op.bitmap   = (char *)next->io_bitmap_ptr;
36150 +               iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
36151 +               mcl->op      = __HYPERVISOR_physdev_op;
36152 +               mcl->args[0] = PHYSDEVOP_set_iobitmap;
36153 +               mcl->args[1] = (unsigned long)&iobmp_op;
36154 +               mcl++;
36155 +       }
36156 +
36157 +       (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
36158 +       /* 
36159 +        * Switch DS and ES.
36160 +        * This won't pick up thread selector changes, but I guess that is ok.
36161 +        */
36162 +       if (unlikely(next->es))
36163 +               loadsegment(es, next->es); 
36164 +       
36165 +       if (unlikely(next->ds))
36166 +               loadsegment(ds, next->ds);
36167 +
36168 +       /* 
36169 +        * Switch FS and GS.
36170 +        */
36171 +       if (unlikely(next->fsindex))
36172 +               loadsegment(fs, next->fsindex);
36173 +
36174 +       if (next->fs)
36175 +               HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs); 
36176 +       
36177 +       if (unlikely(next->gsindex))
36178 +               load_gs_index(next->gsindex);
36179 +
36180 +       if (next->gs)
36181 +               HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs); 
36182 +
36183 +       /* 
36184 +        * Switch the PDA.
36185 +        */
36186 +       prev->userrsp = read_pda(oldrsp); 
36187 +       write_pda(oldrsp, next->userrsp); 
36188 +       write_pda(pcurrent, next_p); 
36189 +       write_pda(kernelstack,
36190 +                 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
36191 +
36192 +       /*
36193 +        * Now maybe reload the debug registers
36194 +        */
36195 +       if (unlikely(next->debugreg7)) {
36196 +               set_debugreg(next->debugreg0, 0);
36197 +               set_debugreg(next->debugreg1, 1);
36198 +               set_debugreg(next->debugreg2, 2);
36199 +               set_debugreg(next->debugreg3, 3);
36200 +               /* no 4 and 5 */
36201 +               set_debugreg(next->debugreg6, 6);
36202 +               set_debugreg(next->debugreg7, 7);
36203 +       }
36204 +
36205 +       return prev_p;
36206 +}
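For illustration only: __switch_to() above batches several hypervisor requests (FPU switch, stack switch, TLS descriptor updates, iopl/iobitmap changes) into a single multicall instead of trapping once per operation. The batching pattern looks roughly like the sketch below; the entry layout and issue_batch() are hypothetical stand-ins, not the real Xen multicall ABI:

/* Sketch of the batching pattern: fill an array of (op, args) entries,
 * skipping work that is not needed, then hand the whole array over once. */
#include <stdio.h>

struct call_entry {
        unsigned long op;
        unsigned long args[4];
};

static void issue_batch(struct call_entry *calls, int n)
{
        int i;

        for (i = 0; i < n; i++)         /* a real batch would be one hypercall */
                printf("op=%lu arg0=%lu\n", calls[i].op, calls[i].args[0]);
}

int main(void)
{
        struct call_entry batch[8], *mcl = batch;
        int fpu_dirty = 1, iopl_changed = 0;

        if (fpu_dirty) {                /* queue only the work that is needed */
                mcl->op = 1; mcl->args[0] = 1; mcl++;
        }
        mcl->op = 2; mcl->args[0] = 0xdead; mcl++;      /* stack switch */
        if (iopl_changed) {
                mcl->op = 3; mcl->args[0] = 3; mcl++;
        }

        issue_batch(batch, mcl - batch);
        return 0;
}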
36207 +
36208 +/*
36209 + * sys_execve() executes a new program.
36210 + */
36211 +asmlinkage 
36212 +long sys_execve(char __user *name, char __user * __user *argv,
36213 +               char __user * __user *envp, struct pt_regs regs)
36214 +{
36215 +       long error;
36216 +       char * filename;
36217 +
36218 +       filename = getname(name);
36219 +       error = PTR_ERR(filename);
36220 +       if (IS_ERR(filename)) 
36221 +               return error;
36222 +       error = do_execve(filename, argv, envp, &regs); 
36223 +       if (error == 0) {
36224 +               task_lock(current);
36225 +               current->ptrace &= ~PT_DTRACE;
36226 +               task_unlock(current);
36227 +       }
36228 +       putname(filename);
36229 +       return error;
36230 +}
36231 +
36232 +void set_personality_64bit(void)
36233 +{
36234 +       /* inherit personality from parent */
36235 +
36236 +       /* Make sure to be in 64bit mode */
36237 +       clear_thread_flag(TIF_IA32); 
36238 +
36239 +       /* TBD: overwrites user setup. Should have two bits.
36240 +          But 64bit processes have always behaved this way,
36241 +          so it's not too bad. The main problem is just that
36242 +          32bit children are affected again. */
36243 +       current->personality &= ~READ_IMPLIES_EXEC;
36244 +}
36245 +
36246 +asmlinkage long sys_fork(struct pt_regs *regs)
36247 +{
36248 +       return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
36249 +}
36250 +
36251 +asmlinkage long
36252 +sys_clone(unsigned long clone_flags, unsigned long newsp,
36253 +         void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
36254 +{
36255 +       if (!newsp)
36256 +               newsp = regs->rsp;
36257 +       return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
36258 +}
36259 +
36260 +/*
36261 + * This is trivial, and on the face of it looks like it
36262 + * could equally well be done in user mode.
36263 + *
36264 + * Not so, for quite unobvious reasons - register pressure.
36265 + * In user mode vfork() cannot have a stack frame, and if
36266 + * done by calling the "clone()" system call directly, you
36267 + * do not have enough call-clobbered registers to hold all
36268 + * the information you need.
36269 + */
36270 +asmlinkage long sys_vfork(struct pt_regs *regs)
36271 +{
36272 +       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
36273 +                   NULL, NULL);
36274 +}
36275 +
36276 +unsigned long get_wchan(struct task_struct *p)
36277 +{
36278 +       unsigned long stack;
36279 +       u64 fp,rip;
36280 +       int count = 0;
36281 +
36282 +       if (!p || p == current || p->state==TASK_RUNNING)
36283 +               return 0; 
36284 +       stack = (unsigned long)task_stack_page(p);
36285 +       if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
36286 +               return 0;
36287 +       fp = *(u64 *)(p->thread.rsp);
36288 +       do { 
36289 +               if (fp < (unsigned long)stack ||
36290 +                   fp > (unsigned long)stack+THREAD_SIZE)
36291 +                       return 0; 
36292 +               rip = *(u64 *)(fp+8); 
36293 +               if (!in_sched_functions(rip))
36294 +                       return rip; 
36295 +               fp = *(u64 *)fp; 
36296 +       } while (count++ < 16); 
36297 +       return 0;
36298 +}
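For illustration only: get_wchan() walks the saved frame-pointer chain, where each frame stores the caller's frame pointer at fp and the return address at fp+8. The same walk can be exercised on a hand-built chain:

/* Hand-built frame-pointer chain: *(fp) is the previous frame pointer,
 * *(fp + 8) the return address, a zero frame pointer ends the chain. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t stack[8];
        uint64_t fp, rip;
        int count = 0;

        /* frame B at stack[0..1], frame A at stack[4..5] */
        stack[0] = (uint64_t)&stack[4];  stack[1] = 0x1111;  /* B: prev fp, ret */
        stack[4] = 0;                    stack[5] = 0x2222;  /* A: chain end */

        fp = (uint64_t)&stack[0];
        while (fp && count++ < 16) {
                rip = *(uint64_t *)(fp + 8);
                printf("frame %d: return address %#lx\n", count, (unsigned long)rip);
                fp = *(uint64_t *)fp;
        }
        return 0;
}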
36299 +
36300 +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
36301 +{ 
36302 +       int ret = 0; 
36303 +       int doit = task == current;
36304 +       int cpu;
36305 +
36306 +       switch (code) { 
36307 +       case ARCH_SET_GS:
36308 +               if (addr >= TASK_SIZE_OF(task))
36309 +                       return -EPERM; 
36310 +               cpu = get_cpu();
36311 +               /* handle small bases via the GDT because that's faster to 
36312 +                  switch. */
36313 +               if (addr <= 0xffffffff) {  
36314 +                       set_32bit_tls(task, GS_TLS, addr); 
36315 +                       if (doit) { 
36316 +                               load_TLS(&task->thread, cpu);
36317 +                               load_gs_index(GS_TLS_SEL); 
36318 +                       }
36319 +                       task->thread.gsindex = GS_TLS_SEL; 
36320 +                       task->thread.gs = 0;
36321 +               } else { 
36322 +                       task->thread.gsindex = 0;
36323 +                       task->thread.gs = addr;
36324 +                       if (doit) {
36325 +                               load_gs_index(0);
36326 +                               ret = HYPERVISOR_set_segment_base(
36327 +                                       SEGBASE_GS_USER, addr);
36328 +                       } 
36329 +               }
36330 +               put_cpu();
36331 +               break;
36332 +       case ARCH_SET_FS:
36333 +               /* Not strictly needed for fs, but do it for symmetry
36334 +                  with gs */
36335 +               if (addr >= TASK_SIZE_OF(task))
36336 +                       return -EPERM; 
36337 +               cpu = get_cpu();
36338 +               /* handle small bases via the GDT because that's faster to 
36339 +                  switch. */
36340 +               if (addr <= 0xffffffff) { 
36341 +                       set_32bit_tls(task, FS_TLS, addr);
36342 +                       if (doit) { 
36343 +                               load_TLS(&task->thread, cpu); 
36344 +                               asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
36345 +                       }
36346 +                       task->thread.fsindex = FS_TLS_SEL;
36347 +                       task->thread.fs = 0;
36348 +               } else { 
36349 +                       task->thread.fsindex = 0;
36350 +                       task->thread.fs = addr;
36351 +                       if (doit) {
36352 +                               /* set the selector to 0 to not confuse
36353 +                                  __switch_to */
36354 +                               asm volatile("movl %0,%%fs" :: "r" (0));
36355 +                                ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
36356 +                                                                 addr);
36357 +                       }
36358 +               }
36359 +               put_cpu();
36360 +               break;
36361 +       case ARCH_GET_FS: { 
36362 +               unsigned long base; 
36363 +               if (task->thread.fsindex == FS_TLS_SEL)
36364 +                       base = read_32bit_tls(task, FS_TLS);
36365 +               else if (doit)
36366 +                       rdmsrl(MSR_FS_BASE, base);
36367 +               else
36368 +                       base = task->thread.fs;
36369 +               ret = put_user(base, (unsigned long __user *)addr); 
36370 +               break; 
36371 +       }
36372 +       case ARCH_GET_GS: { 
36373 +               unsigned long base;
36374 +               unsigned gsindex;
36375 +               if (task->thread.gsindex == GS_TLS_SEL)
36376 +                       base = read_32bit_tls(task, GS_TLS);
36377 +               else if (doit) {
36378 +                       asm("movl %%gs,%0" : "=r" (gsindex));
36379 +                       if (gsindex)
36380 +                               rdmsrl(MSR_KERNEL_GS_BASE, base);
36381 +                       else
36382 +                               base = task->thread.gs;
36383 +               }
36384 +               else
36385 +                       base = task->thread.gs;
36386 +               ret = put_user(base, (unsigned long __user *)addr); 
36387 +               break;
36388 +       }
36389 +
36390 +       default:
36391 +               ret = -EINVAL;
36392 +               break;
36393 +       } 
36394 +
36395 +       return ret;     
36396 +} 
36397 +
36398 +long sys_arch_prctl(int code, unsigned long addr)
36399 +{
36400 +       return do_arch_prctl(current, code, addr);
36401 +} 
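For illustration only: from user space these paths are reached through the arch_prctl(2) system call; large GS bases go through the hypervisor segment-base call, small ones through a GDT entry. A minimal caller, assuming the glibc syscall() wrapper and the ARCH_* constants from <asm/prctl.h> (error handling kept minimal):

/* Minimal user-space caller for the ARCH_GET_FS/ARCH_GET_GS paths above. */
#include <asm/prctl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        unsigned long fsbase = 0, gsbase = 0;

        if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase) != 0)
                perror("arch_prctl(ARCH_GET_FS)");
        if (syscall(SYS_arch_prctl, ARCH_GET_GS, &gsbase) != 0)
                perror("arch_prctl(ARCH_GET_GS)");

        printf("FS base: %#lx  GS base: %#lx\n", fsbase, gsbase);
        return 0;
}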
36402 +
36403 +/* 
36404 + * Capture the user space registers if the task is not running (in user space)
36405 + */
36406 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
36407 +{
36408 +       struct pt_regs *pp, ptregs;
36409 +
36410 +       pp = task_pt_regs(tsk);
36411 +
36412 +       ptregs = *pp; 
36413 +       ptregs.cs &= 0xffff;
36414 +       ptregs.ss &= 0xffff;
36415 +
36416 +       elf_core_copy_regs(regs, &ptregs);
36417 +
36419 +       return 1;
36420 +}
36421 +
36422 +unsigned long arch_align_stack(unsigned long sp)
36423 +{
36424 +       if (randomize_va_space)
36425 +               sp -= get_random_int() % 8192;
36426 +       return sp & ~0xf;
36427 +}
36428 +
36429 +#ifndef CONFIG_SMP
36430 +void _restore_vcpu(void)
36431 +{
36432 +}
36433 +#endif
36434 diff -urNp linux-2.6/arch/x86_64/kernel/setup64-xen.c new/arch/x86_64/kernel/setup64-xen.c
36435 --- linux-2.6/arch/x86_64/kernel/setup64-xen.c  1970-01-01 01:00:00.000000000 +0100
36436 +++ new/arch/x86_64/kernel/setup64-xen.c        2006-05-09 12:33:21.000000000 +0200
36437 @@ -0,0 +1,360 @@
36438 +/* 
36439 + * X86-64 specific CPU setup.
36440 + * Copyright (C) 1995  Linus Torvalds
36441 + * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
36442 + * See setup.c for older changelog.
36443 + * $Id$
36444 + *
36445 + * Jun Nakajima <jun.nakajima@intel.com> 
36446 + *   Modified for Xen
36447 + *
36448 + */ 
36449 +#include <linux/config.h>
36450 +#include <linux/init.h>
36451 +#include <linux/kernel.h>
36452 +#include <linux/sched.h>
36453 +#include <linux/string.h>
36454 +#include <linux/bootmem.h>
36455 +#include <linux/bitops.h>
36456 +#include <linux/module.h>
36457 +#include <asm/bootsetup.h>
36458 +#include <asm/pda.h>
36459 +#include <asm/pgtable.h>
36460 +#include <asm/processor.h>
36461 +#include <asm/desc.h>
36462 +#include <asm/atomic.h>
36463 +#include <asm/mmu_context.h>
36464 +#include <asm/smp.h>
36465 +#include <asm/i387.h>
36466 +#include <asm/percpu.h>
36467 +#include <asm/proto.h>
36468 +#include <asm/sections.h>
36469 +#ifdef CONFIG_XEN
36470 +#include <asm/hypervisor.h>
36471 +#endif
36472 +
36473 +char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
36474 +
36475 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
36476 +
36477 +struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
36478 +struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
36479 +
36480 +#ifndef CONFIG_X86_NO_IDT
36481 +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; 
36482 +#endif
36483 +
36484 +char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
36485 +
36486 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
36487 +static int do_not_nx __cpuinitdata = 0;
36488 +
36489 +/* noexec=on|off
36490 +Control non executable mappings for 64bit processes.
36491 +
36492 +on     Enable(default)
36493 +off    Disable
36494 +*/ 
36495 +int __init nonx_setup(char *str)
36496 +{
36497 +       if (!strncmp(str, "on", 2)) {
36498 +                __supported_pte_mask |= _PAGE_NX; 
36499 +               do_not_nx = 0; 
36500 +       } else if (!strncmp(str, "off", 3)) {
36501 +               do_not_nx = 1;
36502 +               __supported_pte_mask &= ~_PAGE_NX;
36503 +        }
36504 +       return 1;
36505 +} 
36506 +__setup("noexec=", nonx_setup);        /* parsed early actually */
36507 +
36508 +int force_personality32 = 0; 
36509 +
36510 +/* noexec32=on|off
36511 +Control non executable heap for 32bit processes.
36512 +To control the stack too use noexec=off
36513 +
36514 +on     PROT_READ does not imply PROT_EXEC for 32bit processes
36515 +off    PROT_READ implies PROT_EXEC (default)
36516 +*/
36517 +static int __init nonx32_setup(char *str)
36518 +{
36519 +       if (!strcmp(str, "on"))
36520 +               force_personality32 &= ~READ_IMPLIES_EXEC;
36521 +       else if (!strcmp(str, "off"))
36522 +               force_personality32 |= READ_IMPLIES_EXEC;
36523 +       return 1;
36524 +}
36525 +__setup("noexec32=", nonx32_setup);
36526 +
36527 +/*
36528 + * Great future plan:
36529 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
36530 + * Always point %gs to its beginning
36531 + */
36532 +void __init setup_per_cpu_areas(void)
36533 +{ 
36534 +       int i;
36535 +       unsigned long size;
36536 +
36537 +#ifdef CONFIG_HOTPLUG_CPU
36538 +       prefill_possible_map();
36539 +#endif
36540 +
36541 +       /* Copy section for each CPU (we discard the original) */
36542 +       size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
36543 +#ifdef CONFIG_MODULES
36544 +       if (size < PERCPU_ENOUGH_ROOM)
36545 +               size = PERCPU_ENOUGH_ROOM;
36546 +#endif
36547 +
36548 +       for_each_cpu_mask (i, cpu_possible_map) {
36549 +               char *ptr;
36550 +
36551 +               if (!NODE_DATA(cpu_to_node(i))) {
36552 +                       printk("cpu with no node %d, num_online_nodes %d\n",
36553 +                              i, num_online_nodes());
36554 +                       ptr = alloc_bootmem(size);
36555 +               } else { 
36556 +                       ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
36557 +               }
36558 +               if (!ptr)
36559 +                       panic("Cannot allocate cpu data for CPU %d\n", i);
36560 +               cpu_pda(i)->data_offset = ptr - __per_cpu_start;
36561 +               memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
36562 +       }
36563 +} 
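For illustration only: setup_per_cpu_areas() gives every possible CPU a private copy of the .data.percpu template and records the distance to the original as data_offset; a per-CPU variable is then reached by adding that offset to the template address. The offset scheme in miniature (per_cpu_ptr and the template array below are made up for the sketch):

/* Miniature per-CPU offset scheme: each "cpu" gets a copy of a template
 * block, and a variable is located by adding that cpu's data_offset. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NCPUS 2

static long template_block[4] = { 10, 20, 30, 40 };     /* stands for .data.percpu */
static long data_offset[NCPUS];

#define per_cpu_ptr(var, cpu) \
        ((long *)((char *)&(var) + data_offset[cpu]))

int main(void)
{
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++) {
                char *copy = malloc(sizeof(template_block));
                memcpy(copy, template_block, sizeof(template_block));
                data_offset[cpu] = copy - (char *)template_block;
        }

        *per_cpu_ptr(template_block[1], 0) = 111;        /* cpu 0's private copy */
        *per_cpu_ptr(template_block[1], 1) = 222;        /* cpu 1's private copy */

        for (cpu = 0; cpu < NCPUS; cpu++)
                printf("cpu%d sees %ld\n", cpu, *per_cpu_ptr(template_block[1], cpu));
        return 0;
}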
36564 +
36565 +#ifdef CONFIG_XEN
36566 +static void switch_pt(void)
36567 +{
36568 +       xen_pt_switch(__pa(init_level4_pgt));
36569 +        xen_new_user_pt(__pa(init_level4_user_pgt));
36570 +}
36571 +
36572 +void __cpuinit cpu_gdt_init(struct desc_ptr *gdt_descr)
36573 +{
36574 +       unsigned long frames[16];
36575 +       unsigned long va;
36576 +       int f;
36577 +
36578 +       for (va = gdt_descr->address, f = 0;
36579 +            va < gdt_descr->address + gdt_descr->size;
36580 +            va += PAGE_SIZE, f++) {
36581 +               frames[f] = virt_to_mfn(va);
36582 +               make_page_readonly(
36583 +                       (void *)va, XENFEAT_writable_descriptor_tables);
36584 +       }
36585 +       if (HYPERVISOR_set_gdt(frames, gdt_descr->size /
36586 +                               sizeof (struct desc_struct)))
36587 +               BUG();
36588 +}
36589 +#else
36590 +static void switch_pt(void)
36591 +{
36592 +       asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
36593 +}
36594 +
36595 +void __init cpu_gdt_init(struct desc_ptr *gdt_descr)
36596 +{
36597 +       asm volatile("lgdt %0" :: "m" (*gdt_descr));
36598 +       asm volatile("lidt %0" :: "m" (idt_descr));
36599 +}
36600 +#endif
36601 +
36602 +void pda_init(int cpu)
36603 +{ 
36604 +       struct x8664_pda *pda = cpu_pda(cpu);
36605 +
36606 +       /* Set up data that may be needed in __get_free_pages early */
36607 +       asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
36608 +#ifndef CONFIG_XEN
36609 +       wrmsrl(MSR_GS_BASE, pda);
36610 +#else
36611 +       HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, (unsigned long)pda);
36612 +#endif
36613 +       pda->cpunumber = cpu; 
36614 +       pda->irqcount = -1;
36615 +       pda->kernelstack = 
36616 +               (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 
36617 +       pda->active_mm = &init_mm;
36618 +       pda->mmu_state = 0;
36619 +
36620 +       if (cpu == 0) {
36621 +#ifdef CONFIG_XEN
36622 +               xen_init_pt();
36623 +#endif
36624 +               /* others are initialized in smpboot.c */
36625 +               pda->pcurrent = &init_task;
36626 +               pda->irqstackptr = boot_cpu_stack; 
36627 +       } else {
36628 +               pda->irqstackptr = (char *)
36629 +                       __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
36630 +               if (!pda->irqstackptr)
36631 +                       panic("cannot allocate irqstack for cpu %d", cpu); 
36632 +       }
36633 +
36634 +       switch_pt();
36635 +
36636 +       pda->irqstackptr += IRQSTACKSIZE-64;
36637 +} 
36638 +
36639 +#ifndef CONFIG_X86_NO_TSS
36640 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
36641 +__attribute__((section(".bss.page_aligned")));
36642 +#endif
36643 +
36644 +/* May not be marked __init: used by software suspend */
36645 +void syscall_init(void)
36646 +{
36647 +#ifndef CONFIG_XEN
36648 +       /* 
36649 +        * LSTAR and STAR live in a bit strange symbiosis.
36650 +        * They both write to the same internal register. STAR allows setting CS/DS
36651 +        * but only a 32bit target. LSTAR sets the 64bit rip.    
36652 +        */ 
36653 +       wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32); 
36654 +       wrmsrl(MSR_LSTAR, system_call); 
36655 +
36656 +       /* Flags to clear on syscall */
36657 +       wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 
36658 +#endif
36659 +#ifdef CONFIG_IA32_EMULATION                   
36660 +       syscall32_cpu_init ();
36661 +#endif
36662 +}
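For illustration only: the MSR_STAR value written above packs __USER32_CS into bits 63:48 and __KERNEL_CS into bits 47:32, while LSTAR holds the 64-bit entry point. A quick round-trip of that packing; the selector values are placeholders, not the kernel's real selectors:

/* Round-trip of the MSR_STAR packing used in syscall_init(). */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t user32_cs = 0x23, kernel_cs = 0x10;     /* placeholder selectors */
        uint64_t star = (user32_cs << 48) | (kernel_cs << 32);

        assert(((star >> 48) & 0xffff) == user32_cs);
        assert(((star >> 32) & 0xffff) == kernel_cs);
        printf("STAR = %#llx\n", (unsigned long long)star);
        return 0;
}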
36663 +
36664 +void __cpuinit check_efer(void)
36665 +{
36666 +       unsigned long efer;
36667 +
36668 +       rdmsrl(MSR_EFER, efer); 
36669 +        if (!(efer & EFER_NX) || do_not_nx) { 
36670 +                __supported_pte_mask &= ~_PAGE_NX; 
36671 +        }       
36672 +}
36673 +
36674 +/*
36675 + * cpu_init() initializes state that is per-CPU. Some data is already
36676 + * initialized (naturally) in the bootstrap process, such as the GDT
36677 + * and IDT. We reload them nevertheless, this function acts as a
36678 + * 'CPU state barrier', nothing should get across.
36679 + * A lot of state is already set up in PDA init.
36680 + */
36681 +void __cpuinit cpu_init (void)
36682 +{
36683 +       int cpu = stack_smp_processor_id();
36684 +#ifndef CONFIG_X86_NO_TSS
36685 +       struct tss_struct *t = &per_cpu(init_tss, cpu);
36686 +       unsigned long v; 
36687 +       char *estacks = NULL; 
36688 +       unsigned i;
36689 +#endif
36690 +       struct task_struct *me;
36691 +
36692 +       /* CPU 0 is initialised in head64.c */
36693 +       if (cpu != 0) {
36694 +               pda_init(cpu);
36695 +               zap_low_mappings(cpu);
36696 +       }
36697 +#ifndef CONFIG_X86_NO_TSS
36698 +       else
36699 +               estacks = boot_exception_stacks; 
36700 +#endif
36701 +
36702 +       me = current;
36703 +
36704 +       if (cpu_test_and_set(cpu, cpu_initialized))
36705 +               panic("CPU#%d already initialized!\n", cpu);
36706 +
36707 +       printk("Initializing CPU#%d\n", cpu);
36708 +
36709 +       clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
36710 +
36711 +       /*
36712 +        * Initialize the per-CPU GDT with the boot GDT,
36713 +        * and set up the GDT descriptor:
36714 +        */
36715 +#ifndef CONFIG_XEN 
36716 +       if (cpu)
36717 +               memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
36718 +#endif
36719 +
36720 +       cpu_gdt_descr[cpu].size = GDT_SIZE;
36721 +       cpu_gdt_init(&cpu_gdt_descr[cpu]);
36722 +
36723 +       memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
36724 +       syscall_init();
36725 +
36726 +       wrmsrl(MSR_FS_BASE, 0);
36727 +       wrmsrl(MSR_KERNEL_GS_BASE, 0);
36728 +       barrier(); 
36729 +
36730 +       check_efer();
36731 +
36732 +#ifndef CONFIG_X86_NO_TSS
36733 +       /*
36734 +        * set up and load the per-CPU TSS
36735 +        */
36736 +       for (v = 0; v < N_EXCEPTION_STACKS; v++) {
36737 +               if (cpu) {
36738 +                       static const unsigned int order[N_EXCEPTION_STACKS] = {
36739 +                               [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
36740 +                               [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
36741 +                       };
36742 +
36743 +                       estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
36744 +                       if (!estacks)
36745 +                               panic("Cannot allocate exception stack %ld %d\n",
36746 +                                     v, cpu); 
36747 +               }
36748 +               switch (v + 1) {
36749 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
36750 +               case DEBUG_STACK:
36751 +                       cpu_pda(cpu)->debugstack = (unsigned long)estacks;
36752 +                       estacks += DEBUG_STKSZ;
36753 +                       break;
36754 +#endif
36755 +               default:
36756 +                       estacks += EXCEPTION_STKSZ;
36757 +                       break;
36758 +               }
36759 +               t->ist[v] = (unsigned long)estacks;
36760 +       }
36761 +
36762 +       t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
36763 +       /*
36764 +        * <= is required because the CPU will access up to
36765 +        * 8 bits beyond the end of the IO permission bitmap.
36766 +        */
36767 +       for (i = 0; i <= IO_BITMAP_LONGS; i++)
36768 +               t->io_bitmap[i] = ~0UL;
36769 +#endif
36770 +
36771 +       atomic_inc(&init_mm.mm_count);
36772 +       me->active_mm = &init_mm;
36773 +       if (me->mm)
36774 +               BUG();
36775 +       enter_lazy_tlb(&init_mm, me);
36776 +
36777 +#ifndef CONFIG_X86_NO_TSS
36778 +       set_tss_desc(cpu, t);
36779 +#endif
36780 +#ifndef CONFIG_XEN
36781 +       load_TR_desc();
36782 +#endif
36783 +       load_LDT(&init_mm.context);
36784 +
36785 +       /*
36786 +        * Clear all 6 debug registers:
36787 +        */
36788 +
36789 +       set_debugreg(0UL, 0);
36790 +       set_debugreg(0UL, 1);
36791 +       set_debugreg(0UL, 2);
36792 +       set_debugreg(0UL, 3);
36793 +       set_debugreg(0UL, 6);
36794 +       set_debugreg(0UL, 7);
36795 +
36796 +       fpu_init(); 
36797 +}
36798 diff -urNp linux-2.6/arch/x86_64/kernel/setup-xen.c new/arch/x86_64/kernel/setup-xen.c
36799 --- linux-2.6/arch/x86_64/kernel/setup-xen.c    1970-01-01 01:00:00.000000000 +0100
36800 +++ new/arch/x86_64/kernel/setup-xen.c  2006-06-28 14:32:14.000000000 +0200
36801 @@ -0,0 +1,1710 @@
36802 +/*
36803 + *  linux/arch/x86-64/kernel/setup.c
36804 + *
36805 + *  Copyright (C) 1995  Linus Torvalds
36806 + *
36807 + *  Nov 2001 Dave Jones <davej@suse.de>
36808 + *  Forked from i386 setup code.
36809 + *
36810 + *  $Id$
36811 + */
36812 +
36813 +/*
36814 + * This file handles the architecture-dependent parts of initialization
36815 + */
36816 +
36817 +#include <linux/errno.h>
36818 +#include <linux/sched.h>
36819 +#include <linux/kernel.h>
36820 +#include <linux/mm.h>
36821 +#include <linux/stddef.h>
36822 +#include <linux/unistd.h>
36823 +#include <linux/ptrace.h>
36824 +#include <linux/slab.h>
36825 +#include <linux/user.h>
36826 +#include <linux/a.out.h>
36827 +#include <linux/tty.h>
36828 +#include <linux/ioport.h>
36829 +#include <linux/delay.h>
36830 +#include <linux/config.h>
36831 +#include <linux/init.h>
36832 +#include <linux/initrd.h>
36833 +#include <linux/highmem.h>
36834 +#include <linux/bootmem.h>
36835 +#include <linux/module.h>
36836 +#include <asm/processor.h>
36837 +#include <linux/console.h>
36838 +#include <linux/seq_file.h>
36839 +#include <linux/crash_dump.h>
36840 +#include <linux/root_dev.h>
36841 +#include <linux/pci.h>
36842 +#include <linux/acpi.h>
36843 +#include <linux/kallsyms.h>
36844 +#include <linux/edd.h>
36845 +#include <linux/mmzone.h>
36846 +#include <linux/kexec.h>
36847 +#include <linux/cpufreq.h>
36848 +#include <linux/dmi.h>
36849 +#include <linux/dma-mapping.h>
36850 +#include <linux/ctype.h>
36851 +
36852 +#include <asm/mtrr.h>
36853 +#include <asm/uaccess.h>
36854 +#include <asm/system.h>
36855 +#include <asm/io.h>
36856 +#include <asm/smp.h>
36857 +#include <asm/msr.h>
36858 +#include <asm/desc.h>
36859 +#include <video/edid.h>
36860 +#include <asm/e820.h>
36861 +#include <asm/dma.h>
36862 +#include <asm/mpspec.h>
36863 +#include <asm/mmu_context.h>
36864 +#include <asm/bootsetup.h>
36865 +#include <asm/proto.h>
36866 +#include <asm/setup.h>
36867 +#include <asm/mach_apic.h>
36868 +#include <asm/numa.h>
36869 +#include <asm/swiotlb.h>
36870 +#include <asm/sections.h>
36871 +#include <asm/gart-mapping.h>
36872 +#include <asm/dmi.h>
36873 +#ifdef CONFIG_XEN
36874 +#include <linux/percpu.h>
36875 +#include <xen/interface/physdev.h>
36876 +#include "setup_arch_pre.h"
36877 +#include <asm/hypervisor.h>
36878 +#include <xen/interface/nmi.h>
36879 +#include <xen/features.h>
36880 +#define PFN_UP(x)       (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
36881 +#define PFN_PHYS(x)     ((x) << PAGE_SHIFT)
36882 +#include <asm/mach-xen/setup_arch_post.h>
36883 +#include <xen/interface/memory.h>
36884 +
36885 +extern unsigned long start_pfn;
36886 +extern struct edid_info edid_info;
36887 +
36888 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
36889 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
36890 +
36891 +extern char hypercall_page[PAGE_SIZE];
36892 +EXPORT_SYMBOL(hypercall_page);
36893 +
36894 +/* Allows setting of maximum possible memory size  */
36895 +unsigned long xen_override_max_pfn;
36896 +
36897 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
36898 +static struct notifier_block xen_panic_block = {
36899 +       xen_panic_event, NULL, 0 /* try to go last */
36900 +};
36901 +
36902 +unsigned long *phys_to_machine_mapping;
36903 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
36904 +
36905 +EXPORT_SYMBOL(phys_to_machine_mapping);
36906 +
36907 +DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
36908 +DEFINE_PER_CPU(int, nr_multicall_ents);
36909 +
36910 +/* Raw start-of-day parameters from the hypervisor. */
36911 +start_info_t *xen_start_info;
36912 +EXPORT_SYMBOL(xen_start_info);
36913 +#endif
36914 +
36915 +/*
36916 + * Machine setup..
36917 + */
36918 +
36919 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
36920 +
36921 +unsigned long mmu_cr4_features;
36922 +
36923 +int acpi_disabled;
36924 +EXPORT_SYMBOL(acpi_disabled);
36925 +#ifdef CONFIG_ACPI
36926 +extern int __initdata acpi_ht;
36927 +extern acpi_interrupt_flags    acpi_sci_flags;
36928 +int __initdata acpi_force = 0;
36929 +#endif
36930 +
36931 +int acpi_numa __initdata;
36932 +
36933 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
36934 +int bootloader_type;
36935 +
36936 +unsigned long saved_video_mode;
36937 +
36938 +/* 
36939 + * Early DMI memory
36940 + */
36941 +int dmi_alloc_index;
36942 +char dmi_alloc_data[DMI_MAX_DATA];
36943 +
36944 +/*
36945 + * Setup options
36946 + */
36947 +struct screen_info screen_info;
36948 +struct sys_desc_table_struct {
36949 +       unsigned short length;
36950 +       unsigned char table[0];
36951 +};
36952 +
36953 +struct edid_info edid_info;
36954 +struct e820map e820;
36955 +
36956 +extern int root_mountflags;
36957 +
36958 +char command_line[COMMAND_LINE_SIZE];
36959 +
36960 +struct resource standard_io_resources[] = {
36961 +       { .name = "dma1", .start = 0x00, .end = 0x1f,
36962 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
36963 +       { .name = "pic1", .start = 0x20, .end = 0x21,
36964 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
36965 +       { .name = "timer0", .start = 0x40, .end = 0x43,
36966 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
36967 +       { .name = "timer1", .start = 0x50, .end = 0x53,
36968 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
36969 +       { .name = "keyboard", .start = 0x60, .end = 0x6f,
36970 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
36971 +       { .name = "dma page reg", .start = 0x80, .end = 0x8f,
36972 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
36973 +       { .name = "pic2", .start = 0xa0, .end = 0xa1,
36974 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
36975 +       { .name = "dma2", .start = 0xc0, .end = 0xdf,
36976 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
36977 +       { .name = "fpu", .start = 0xf0, .end = 0xff,
36978 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO }
36979 +};
36980 +
36981 +#define STANDARD_IO_RESOURCES \
36982 +       (sizeof standard_io_resources / sizeof standard_io_resources[0])
36983 +
36984 +#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
36985 +
36986 +struct resource data_resource = {
36987 +       .name = "Kernel data",
36988 +       .start = 0,
36989 +       .end = 0,
36990 +       .flags = IORESOURCE_RAM,
36991 +};
36992 +struct resource code_resource = {
36993 +       .name = "Kernel code",
36994 +       .start = 0,
36995 +       .end = 0,
36996 +       .flags = IORESOURCE_RAM,
36997 +};
36998 +
36999 +#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
37000 +
37001 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
37002 +static struct resource system_rom_resource = {
37003 +       .name = "System ROM",
37004 +       .start = 0xf0000,
37005 +       .end = 0xfffff,
37006 +       .flags = IORESOURCE_ROM,
37007 +};
37008 +
37009 +static struct resource extension_rom_resource = {
37010 +       .name = "Extension ROM",
37011 +       .start = 0xe0000,
37012 +       .end = 0xeffff,
37013 +       .flags = IORESOURCE_ROM,
37014 +};
37015 +
37016 +static struct resource adapter_rom_resources[] = {
37017 +       { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
37018 +               .flags = IORESOURCE_ROM },
37019 +       { .name = "Adapter ROM", .start = 0, .end = 0,
37020 +               .flags = IORESOURCE_ROM },
37021 +       { .name = "Adapter ROM", .start = 0, .end = 0,
37022 +               .flags = IORESOURCE_ROM },
37023 +       { .name = "Adapter ROM", .start = 0, .end = 0,
37024 +               .flags = IORESOURCE_ROM },
37025 +       { .name = "Adapter ROM", .start = 0, .end = 0,
37026 +               .flags = IORESOURCE_ROM },
37027 +       { .name = "Adapter ROM", .start = 0, .end = 0,
37028 +               .flags = IORESOURCE_ROM }
37029 +};
37030 +#endif
37031 +
37032 +#define ADAPTER_ROM_RESOURCES \
37033 +       (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
37034 +
37035 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
37036 +static struct resource video_rom_resource = {
37037 +       .name = "Video ROM",
37038 +       .start = 0xc0000,
37039 +       .end = 0xc7fff,
37040 +       .flags = IORESOURCE_ROM,
37041 +};
37042 +#endif
37043 +
37044 +static struct resource video_ram_resource = {
37045 +       .name = "Video RAM area",
37046 +       .start = 0xa0000,
37047 +       .end = 0xbffff,
37048 +       .flags = IORESOURCE_RAM,
37049 +};
37050 +
37051 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
37052 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
37053 +
37054 +static int __init romchecksum(unsigned char *rom, unsigned long length)
37055 +{
37056 +       unsigned char *p, sum = 0;
37057 +
37058 +       for (p = rom; p < rom + length; p++)
37059 +               sum += *p;
37060 +       return sum == 0;
37061 +}
37062 +
37063 +static void __init probe_roms(void)
37064 +{
37065 +       unsigned long start, length, upper;
37066 +       unsigned char *rom;
37067 +       int           i;
37068 +
37069 +       /* video rom */
37070 +       upper = adapter_rom_resources[0].start;
37071 +       for (start = video_rom_resource.start; start < upper; start += 2048) {
37072 +               rom = isa_bus_to_virt(start);
37073 +               if (!romsignature(rom))
37074 +                       continue;
37075 +
37076 +               video_rom_resource.start = start;
37077 +
37078 +               /* 0 < length <= 0x7f * 512, historically */
37079 +               length = rom[2] * 512;
37080 +
37081 +               /* if checksum okay, trust length byte */
37082 +               if (length && romchecksum(rom, length))
37083 +                       video_rom_resource.end = start + length - 1;
37084 +
37085 +               request_resource(&iomem_resource, &video_rom_resource);
37086 +               break;
37087 +       }
37088 +
37089 +       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
37090 +       if (start < upper)
37091 +               start = upper;
37092 +
37093 +       /* system rom */
37094 +       request_resource(&iomem_resource, &system_rom_resource);
37095 +       upper = system_rom_resource.start;
37096 +
37097 +       /* check for extension rom (ignore length byte!) */
37098 +       rom = isa_bus_to_virt(extension_rom_resource.start);
37099 +       if (romsignature(rom)) {
37100 +               length = extension_rom_resource.end - extension_rom_resource.start + 1;
37101 +               if (romchecksum(rom, length)) {
37102 +                       request_resource(&iomem_resource, &extension_rom_resource);
37103 +                       upper = extension_rom_resource.start;
37104 +               }
37105 +       }
37106 +
37107 +       /* check for adapter roms on 2k boundaries */
37108 +       for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
37109 +               rom = isa_bus_to_virt(start);
37110 +               if (!romsignature(rom))
37111 +                       continue;
37112 +
37113 +               /* 0 < length <= 0x7f * 512, historically */
37114 +               length = rom[2] * 512;
37115 +
37116 +               /* but accept any length that fits if checksum okay */
37117 +               if (!length || start + length > upper || !romchecksum(rom, length))
37118 +                       continue;
37119 +
37120 +               adapter_rom_resources[i].start = start;
37121 +               adapter_rom_resources[i].end = start + length - 1;
37122 +               request_resource(&iomem_resource, &adapter_rom_resources[i]);
37123 +
37124 +               start = adapter_rom_resources[i++].end & ~2047UL;
37125 +       }
37126 +}
37127 +#endif
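For illustration only: probe_roms() accepts a ROM only when it starts with the 0xaa55 signature word, takes the length byte at offset 2 in 512-byte units, and requires the byte sum over that length to be zero. The same validation on a synthetic in-memory buffer instead of ISA space:

/* ROM signature and checksum validation as done by probe_roms(). */
#include <stdio.h>
#include <string.h>

static int romchecksum(const unsigned char *rom, unsigned long length)
{
        unsigned char sum = 0;
        unsigned long i;

        for (i = 0; i < length; i++)
                sum += rom[i];
        return sum == 0;
}

int main(void)
{
        unsigned char rom[512];
        unsigned long length, i;
        unsigned char sum = 0;

        memset(rom, 0, sizeof(rom));
        rom[0] = 0x55; rom[1] = 0xaa;           /* signature, little endian */
        rom[2] = 1;                             /* length: 1 * 512 bytes */
        length = rom[2] * 512;

        for (i = 0; i < length - 1; i++)        /* make the byte sum come out to 0 */
                sum += rom[i];
        rom[length - 1] = (unsigned char)(0x100 - sum);

        printf("signature ok: %d, checksum ok: %d\n",
               *(unsigned short *)rom == 0xaa55, romchecksum(rom, length));
        return 0;
}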
37128 +
37129 +/* Check for full argument with no trailing characters */
37130 +static int fullarg(char *p, char *arg)
37131 +{
37132 +       int l = strlen(arg);
37133 +       return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l]));
37134 +}
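For illustration only: fullarg() matches an option only when it is followed by whitespace or the end of the command line, so "acpi=off" does not also match "acpi=offline". A quick stand-alone check of that rule:

/* Quick check of the matching rule implemented by fullarg(). */
#include <ctype.h>
#include <stdio.h>
#include <string.h>

static int fullarg(const char *p, const char *arg)
{
        int l = strlen(arg);
        return !memcmp(p, arg, l) && (p[l] == 0 || isspace((unsigned char)p[l]));
}

int main(void)
{
        printf("%d\n", fullarg("acpi=off console=ttyS0", "acpi=off"));  /* 1 */
        printf("%d\n", fullarg("acpi=offline", "acpi=off"));            /* 0 */
        return 0;
}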
37135 +
37136 +static __init void parse_cmdline_early (char ** cmdline_p)
37137 +{
37138 +       char c = ' ', *to = command_line, *from = COMMAND_LINE;
37139 +       int len = 0;
37140 +       int userdef = 0;
37141 +
37142 +       for (;;) {
37143 +               if (c != ' ') 
37144 +                       goto next_char; 
37145 +
37146 +#ifdef  CONFIG_SMP
37147 +               /*
37148 +                * If the BIOS enumerates physical processors before logical,
37149 +                * maxcpus=N at enumeration-time can be used to disable HT.
37150 +                */
37151 +               else if (!memcmp(from, "maxcpus=", 8)) {
37152 +                       extern unsigned int maxcpus;
37153 +
37154 +                       maxcpus = simple_strtoul(from + 8, NULL, 0);
37155 +               }
37156 +#endif
37157 +#ifdef CONFIG_ACPI
37158 +               /* "acpi=off" disables both ACPI table parsing and interpreter init */
37159 +               if (fullarg(from,"acpi=off"))
37160 +                       disable_acpi();
37161 +
37162 +               if (fullarg(from, "acpi=force")) { 
37163 +                       /* add later when we do DMI horrors: */
37164 +                       acpi_force = 1;
37165 +                       acpi_disabled = 0;
37166 +               }
37167 +
37168 +               /* acpi=ht just means: do ACPI MADT parsing 
37169 +                  at bootup, but don't enable the full ACPI interpreter */
37170 +               if (fullarg(from, "acpi=ht")) { 
37171 +                       if (!acpi_force)
37172 +                               disable_acpi();
37173 +                       acpi_ht = 1; 
37174 +               }
37175 +                else if (fullarg(from, "pci=noacpi")) 
37176 +                       acpi_disable_pci();
37177 +               else if (fullarg(from, "acpi=noirq"))
37178 +                       acpi_noirq_set();
37179 +
37180 +               else if (fullarg(from, "acpi_sci=edge"))
37181 +                       acpi_sci_flags.trigger =  1;
37182 +               else if (fullarg(from, "acpi_sci=level"))
37183 +                       acpi_sci_flags.trigger = 3;
37184 +               else if (fullarg(from, "acpi_sci=high"))
37185 +                       acpi_sci_flags.polarity = 1;
37186 +               else if (fullarg(from, "acpi_sci=low"))
37187 +                       acpi_sci_flags.polarity = 3;
37188 +
37189 +               /* acpi=strict disables out-of-spec workarounds */
37190 +               else if (fullarg(from, "acpi=strict")) {
37191 +                       acpi_strict = 1;
37192 +               }
37193 +#ifdef CONFIG_X86_IO_APIC
37194 +               else if (fullarg(from, "acpi_skip_timer_override"))
37195 +                       acpi_skip_timer_override = 1;
37196 +#endif
37197 +#endif
37198 +
37199 +#ifndef CONFIG_XEN
37200 +               if (fullarg(from, "disable_timer_pin_1"))
37201 +                       disable_timer_pin_1 = 1;
37202 +               if (fullarg(from, "enable_timer_pin_1"))
37203 +                       disable_timer_pin_1 = -1;
37204 +
37205 +               if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
37206 +                       clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
37207 +                       disable_apic = 1;
37208 +               }
37209 +
37210 +               if (fullarg(from, "noapic"))
37211 +                       skip_ioapic_setup = 1;
37212 +
37213 +               if (fullarg(from,"apic")) {
37214 +                       skip_ioapic_setup = 0;
37215 +                       ioapic_force = 1;
37216 +               }
37217 +#endif
37218 +                       
37219 +               if (!memcmp(from, "mem=", 4))
37220 +                       parse_memopt(from+4, &from); 
37221 +
37222 +               if (!memcmp(from, "memmap=", 7)) {
37223 +                       /* exactmap option is for user defined memory */
37224 +                       if (!memcmp(from+7, "exactmap", 8)) {
37225 +#ifdef CONFIG_CRASH_DUMP
37226 +                               /* If we are doing a crash dump, we
37227 +                                * still need to know the real mem
37228 +                                * size before original memory map is
37229 +                                * reset.
37230 +                                */
37231 +                               saved_max_pfn = e820_end_of_ram();
37232 +#endif
37233 +                               from += 8+7;
37234 +                               end_pfn_map = 0;
37235 +                               e820.nr_map = 0;
37236 +                               userdef = 1;
37237 +                       }
37238 +                       else {
37239 +                               parse_memmapopt(from+7, &from);
37240 +                               userdef = 1;
37241 +                       }
37242 +               }
37243 +
37244 +#ifdef CONFIG_NUMA
37245 +               if (!memcmp(from, "numa=", 5))
37246 +                       numa_setup(from+5); 
37247 +#endif
37248 +
37249 +               if (!memcmp(from,"iommu=",6)) { 
37250 +                       iommu_setup(from+6); 
37251 +               }
37252 +
37253 +               if (fullarg(from,"oops=panic"))
37254 +                       panic_on_oops = 1;
37255 +
37256 +               if (!memcmp(from, "noexec=", 7))
37257 +                       nonx_setup(from + 7);
37258 +
37259 +#ifdef CONFIG_KEXEC
37260 +               /* crashkernel=size@addr specifies the location to reserve for
37261 +                * a crash kernel.  By reserving this memory we guarantee
37262 +                * that linux never sets it up as a DMA target.
37263 +                * Useful for holding code to do something appropriate
37264 +                * after a kernel panic.
37265 +                */
37266 +               else if (!memcmp(from, "crashkernel=", 12)) {
37267 +                       unsigned long size, base;
37268 +                       size = memparse(from+12, &from);
37269 +                       if (*from == '@') {
37270 +                               base = memparse(from+1, &from);
37271 +                               /* FIXME: Do I want a sanity check
37272 +                                * to validate the memory range?
37273 +                                */
37274 +                               crashk_res.start = base;
37275 +                               crashk_res.end   = base + size - 1;
37276 +                       }
37277 +               }
37278 +#endif
37279 +
37280 +#ifdef CONFIG_PROC_VMCORE
37281 +               /* elfcorehdr= specifies the location of elf core header
37282 +                * stored by the crashed kernel. This option will be passed
37283 +                * by kexec loader to the capture kernel.
37284 +                */
37285 +               else if(!memcmp(from, "elfcorehdr=", 11))
37286 +                       elfcorehdr_addr = memparse(from+11, &from);
37287 +#endif
37288 +
37289 +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
37290 +               else if (!memcmp(from, "additional_cpus=", 16))
37291 +                       setup_additional_cpus(from+16);
37292 +#endif
37293 +
37294 +       next_char:
37295 +               c = *(from++);
37296 +               if (!c)
37297 +                       break;
37298 +               if (COMMAND_LINE_SIZE <= ++len)
37299 +                       break;
37300 +               *(to++) = c;
37301 +       }
37302 +       if (userdef) {
37303 +               printk(KERN_INFO "user-defined physical RAM map:\n");
37304 +               e820_print_map("user");
37305 +       }
37306 +       *to = '\0';
37307 +       *cmdline_p = command_line;
37308 +}
37309 +
37310 +#ifndef CONFIG_NUMA
37311 +static void __init
37312 +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
37313 +{
37314 +       unsigned long bootmap_size, bootmap;
37315 +
37316 +       bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
37317 +       bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
37318 +       if (bootmap == -1L)
37319 +               panic("Cannot find bootmem map of size %ld\n",bootmap_size);
37320 +       bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
37321 +#ifdef CONFIG_XEN
37322 +       e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT);
37323 +#else
37324 +       e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
37325 +#endif
37326 +       reserve_bootmem(bootmap, bootmap_size);
37327 +} 
37328 +#endif
37329 +
37330 +/* Use inline assembly to define this because the nops are defined 
37331 +   as inline assembly strings in the include files and we cannot 
37332 +   get them easily into strings. */
37333 +asm("\t.data\nk8nops: " 
37334 +    K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
37335 +    K8_NOP7 K8_NOP8); 
37336 +    
37337 +extern unsigned char k8nops[];
37338 +static unsigned char *k8_nops[ASM_NOP_MAX+1] = { 
37339 +     NULL,
37340 +     k8nops,
37341 +     k8nops + 1,
37342 +     k8nops + 1 + 2,
37343 +     k8nops + 1 + 2 + 3,
37344 +     k8nops + 1 + 2 + 3 + 4,
37345 +     k8nops + 1 + 2 + 3 + 4 + 5,
37346 +     k8nops + 1 + 2 + 3 + 4 + 5 + 6,
37347 +     k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
37348 +}; 
37349 +
37350 +extern char __vsyscall_0;
37351 +
37352 +/* Replace instructions with better alternatives for this CPU type.
37353 +
37354 +   This runs before SMP is initialized to avoid SMP problems with
37355 +   self-modifying code. This implies that asymmetric systems where
37356 +   APs have fewer capabilities than the boot processor are not handled. 
37357 +   In this case boot with "noreplacement". */ 
37358 +void apply_alternatives(void *start, void *end) 
37359 +{ 
37360 +       struct alt_instr *a; 
37361 +       int diff, i, k;
37362 +       for (a = start; (void *)a < end; a++) { 
37363 +               u8 *instr;
37364 +
37365 +               if (!boot_cpu_has(a->cpuid))
37366 +                       continue;
37367 +
37368 +               BUG_ON(a->replacementlen > a->instrlen); 
37369 +               instr = a->instr;
37370 +               /* vsyscall code is not mapped yet. resolve it manually. */
37371 +               if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END)
37372 +                       instr -= VSYSCALL_START - (unsigned long)&__vsyscall_0;
37373 +               __inline_memcpy(instr, a->replacement, a->replacementlen);
37374 +               diff = a->instrlen - a->replacementlen; 
37375 +
37376 +               /* Pad the rest with nops */
37377 +               for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
37378 +                       k = diff;
37379 +                       if (k > ASM_NOP_MAX)
37380 +                               k = ASM_NOP_MAX;
37381 +                       __inline_memcpy(instr + i, k8_nops[k], k);
37382 +               } 
37383 +       }
37384 +} 
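The padding loop above fills whatever is left of the original instruction in chunks of at most ASM_NOP_MAX bytes. A minimal stand-alone sketch of that chunking, assuming ASM_NOP_MAX is 8 and substituting single-byte 0x90 NOPs for the multi-byte K8 sequences:

#include <string.h>

#define ASM_NOP_MAX 8

/* Pad 'diff' bytes starting at buf[off] in chunks of at most ASM_NOP_MAX,
 * mirroring the tail of apply_alternatives(). */
static void pad_with_nops(unsigned char *buf, int off, int diff)
{
        int i, k;

        for (i = off; diff > 0; diff -= k, i += k) {
                k = diff;
                if (k > ASM_NOP_MAX)
                        k = ASM_NOP_MAX;
                memset(buf + i, 0x90, k);  /* 0x90 is the one-byte NOP */
        }
}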
37385 +
37386 +static int no_replacement __initdata = 0; 
37387 +
37388 +void __init alternative_instructions(void)
37389 +{
37390 +       extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
37391 +       if (no_replacement) 
37392 +               return;
37393 +       apply_alternatives(__alt_instructions, __alt_instructions_end);
37394 +}
37395 +
37396 +static int __init noreplacement_setup(char *s)
37397 +{ 
37398 +     no_replacement = 1; 
37399 +     return 1;
37400 +} 
37401 +
37402 +__setup("noreplacement", noreplacement_setup); 
37403 +
37404 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
37405 +struct edd edd;
37406 +#ifdef CONFIG_EDD_MODULE
37407 +EXPORT_SYMBOL(edd);
37408 +#endif
37409 +/**
37410 + * copy_edd() - Copy the BIOS EDD information
37411 + *              from boot_params into a safe place.
37412 + *
37413 + */
37414 +static inline void copy_edd(void)
37415 +{
37416 +     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
37417 +     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
37418 +     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
37419 +     edd.edd_info_nr = EDD_NR;
37420 +}
37421 +#else
37422 +static inline void copy_edd(void)
37423 +{
37424 +}
37425 +#endif
37426 +
37427 +#ifndef CONFIG_XEN
37428 +#define EBDA_ADDR_POINTER 0x40E
37429 +
37430 +unsigned __initdata ebda_addr;
37431 +unsigned __initdata ebda_size;
37432 +
37433 +static void discover_ebda(void)
37434 +{
37435 +       /*
37436 +        * there is a real-mode segmented pointer pointing to the 
37437 +        * 4K EBDA area at 0x40E
37438 +        */
37439 +       ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER;
37440 +       ebda_addr <<= 4;
37441 +
37442 +       ebda_size = *(unsigned short *)(unsigned long)ebda_addr;
37443 +
37444 +       /* Round EBDA up to pages */
37445 +       if (ebda_size == 0)
37446 +               ebda_size = 1;
37447 +       ebda_size <<= 10;
37448 +       ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
37449 +       if (ebda_size > 64*1024)
37450 +               ebda_size = 64*1024;
37451 +}
37452 +#endif
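As a worked example with illustrative values: if the word at 0x40E reads 0x9FC0, the shift by 4 gives ebda_addr = 0x9FC00; a size word of 1 means 1 KiB, and rounding that 1 KiB plus the 0xC00 offset within the page up to PAGE_SIZE yields exactly one 4 KiB page, well under the 64 KiB cap.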
37453 +
37454 +void __init setup_arch(char **cmdline_p)
37455 +{
37456 +       unsigned long kernel_end;
37457 +
37458 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST)
37459 +       struct e820entry *machine_e820;
37460 +       struct xen_memory_map memmap;
37461 +#endif
37462 +
37463 +#ifdef CONFIG_XEN
37464 +       /* Register a call for panic conditions. */
37465 +       atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
37466 +
37467 +       ROOT_DEV = MKDEV(RAMDISK_MAJOR,0); 
37468 +       kernel_end = 0;         /* dummy */
37469 +       screen_info = SCREEN_INFO;
37470 +
37471 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
37472 +               /* This is drawn from a dump from vgacon:startup in
37473 +                * standard Linux. */
37474 +               screen_info.orig_video_mode = 3;
37475 +               screen_info.orig_video_isVGA = 1;
37476 +               screen_info.orig_video_lines = 25;
37477 +               screen_info.orig_video_cols = 80;
37478 +               screen_info.orig_video_ega_bx = 3;
37479 +               screen_info.orig_video_points = 16;
37480 +       } else
37481 +               screen_info.orig_video_isVGA = 0;
37482 +
37483 +       edid_info = EDID_INFO;
37484 +       saved_video_mode = SAVED_VIDEO_MODE;
37485 +       bootloader_type = LOADER_TYPE;
37486 +
37487 +#ifdef CONFIG_BLK_DEV_RAM
37488 +       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
37489 +       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
37490 +       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
37491 +
37492 +
37493 +#endif
37494 +
37495 +       setup_xen_features();
37496 +
37497 +       HYPERVISOR_vm_assist(VMASST_CMD_enable,
37498 +                            VMASST_TYPE_writable_pagetables);
37499 +
37500 +       ARCH_SETUP
37501 +#else
37502 +       ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
37503 +       screen_info = SCREEN_INFO;
37504 +       edid_info = EDID_INFO;
37505 +       saved_video_mode = SAVED_VIDEO_MODE;
37506 +       bootloader_type = LOADER_TYPE;
37507 +
37508 +#ifdef CONFIG_BLK_DEV_RAM
37509 +       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
37510 +       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
37511 +       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
37512 +#endif
37513 +#endif /* !CONFIG_XEN */
37514 +       setup_memory_region();
37515 +       copy_edd();
37516 +
37517 +       if (!MOUNT_ROOT_RDONLY)
37518 +               root_mountflags &= ~MS_RDONLY;
37519 +       init_mm.start_code = (unsigned long) &_text;
37520 +       init_mm.end_code = (unsigned long) &_etext;
37521 +       init_mm.end_data = (unsigned long) &_edata;
37522 +       init_mm.brk = (unsigned long) &_end;
37523 +
37524 +       code_resource.start = virt_to_phys(&_text);
37525 +       code_resource.end = virt_to_phys(&_etext)-1;
37526 +       data_resource.start = virt_to_phys(&_etext);
37527 +       data_resource.end = virt_to_phys(&_edata)-1;
37528 +
37529 +       parse_cmdline_early(cmdline_p);
37530 +
37531 +       early_identify_cpu(&boot_cpu_data);
37532 +
37533 +       /*
37534 +        * partially used pages are not usable - thus
37535 +        * we are rounding upwards:
37536 +        */
37537 +       end_pfn = e820_end_of_ram();
37538 +       num_physpages = end_pfn;                /* for pfn_valid */
37539 +
37540 +       check_efer();
37541 +
37542 +#ifndef CONFIG_XEN
37543 +       discover_ebda();
37544 +#endif
37545 +
37546 +       init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
37547 +
37548 +#ifdef CONFIG_ACPI_NUMA
37549 +       /*
37550 +        * Parse SRAT to discover nodes.
37551 +        */
37552 +       acpi_numa_init();
37553 +#endif
37554 +
37555 +#ifdef CONFIG_NUMA
37556 +       numa_initmem_init(0, end_pfn); 
37557 +#else
37558 +       contig_initmem_init(0, end_pfn);
37559 +#endif
37560 +
37561 +       /* Reserve direct mapping */
37562 +       reserve_bootmem_generic(table_start << PAGE_SHIFT, 
37563 +                               (table_end - table_start) << PAGE_SHIFT);
37564 +
37565 +       /* reserve kernel */
37566 +       kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
37567 +       reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
37568 +
37569 +#ifdef CONFIG_XEN
37570 +       /* reserve physmap, start info and initial page tables */
37571 +       reserve_bootmem(kernel_end, (table_start<<PAGE_SHIFT)-kernel_end);
37572 +#else
37573 +       /*
37574 +        * reserve physical page 0 - it's a special BIOS page on many boxes,
37575 +        * enabling clean reboots, SMP operation, laptop functions.
37576 +        */
37577 +       reserve_bootmem_generic(0, PAGE_SIZE);
37578 +
37579 +       /* reserve ebda region */
37580 +       if (ebda_addr)
37581 +               reserve_bootmem_generic(ebda_addr, ebda_size);
37582 +#endif
37583 +
37584 +#ifdef CONFIG_SMP
37585 +       /*
37586 +        * But first pinch a few for the stack/trampoline stuff
37587 +        * FIXME: Don't need the extra page at 4K, but need to fix
37588 +        * trampoline before removing it. (see the GDT stuff)
37589 +        */
37590 +       reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
37591 +
37592 +       /* Reserve SMP trampoline */
37593 +       reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
37594 +#endif
37595 +
37596 +#ifdef CONFIG_ACPI_SLEEP
37597 +       /*
37598 +        * Reserve low memory region for sleep support.
37599 +        */
37600 +       acpi_reserve_bootmem();
37601 +#endif
37602 +#ifdef CONFIG_XEN
37603 +#ifdef CONFIG_BLK_DEV_INITRD
37604 +       if (xen_start_info->mod_start) {
37605 +               if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
37606 +                       /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/
37607 +                       initrd_start = INITRD_START + PAGE_OFFSET;
37608 +                       initrd_end = initrd_start+INITRD_SIZE;
37609 +                       initrd_below_start_ok = 1;
37610 +               } else {
37611 +                       printk(KERN_ERR "initrd extends beyond end of memory "
37612 +                               "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
37613 +                               (unsigned long)(INITRD_START + INITRD_SIZE),
37614 +                               (unsigned long)(end_pfn << PAGE_SHIFT));
37615 +                       initrd_start = 0;
37616 +               }
37617 +       }
37618 +#endif
37619 +#else  /* CONFIG_XEN */
37620 +#ifdef CONFIG_BLK_DEV_INITRD
37621 +       if (LOADER_TYPE && INITRD_START) {
37622 +               if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
37623 +                       reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
37624 +                       initrd_start =
37625 +                               INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
37626 +                       initrd_end = initrd_start+INITRD_SIZE;
37627 +               }
37628 +               else {
37629 +                       printk(KERN_ERR "initrd extends beyond end of memory "
37630 +                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
37631 +                           (unsigned long)(INITRD_START + INITRD_SIZE),
37632 +                           (unsigned long)(end_pfn << PAGE_SHIFT));
37633 +                       initrd_start = 0;
37634 +               }
37635 +       }
37636 +#endif
37637 +#endif /* !CONFIG_XEN */
37638 +#ifdef CONFIG_KEXEC
37639 +       if (crashk_res.start != crashk_res.end) {
37640 +               reserve_bootmem(crashk_res.start,
37641 +                       crashk_res.end - crashk_res.start + 1);
37642 +       }
37643 +#endif
37644 +
37645 +       paging_init();
37646 +#ifdef CONFIG_X86_LOCAL_APIC
37647 +       /*
37648 +        * Find and reserve possible boot-time SMP configuration:
37649 +        */
37650 +       find_smp_config();
37651 +#endif
37652 +#ifdef CONFIG_XEN
37653 +       {
37654 +               int i, j, k, fpp;
37655 +
37656 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
37657 +                       /* Make sure we have a large enough P->M table. */
37658 +                       phys_to_machine_mapping = alloc_bootmem(
37659 +                               end_pfn * sizeof(unsigned long));
37660 +                       memset(phys_to_machine_mapping, ~0,
37661 +                              end_pfn * sizeof(unsigned long));
37662 +                       memcpy(phys_to_machine_mapping,
37663 +                              (unsigned long *)xen_start_info->mfn_list,
37664 +                              xen_start_info->nr_pages * sizeof(unsigned long));
37665 +                       free_bootmem(
37666 +                               __pa(xen_start_info->mfn_list),
37667 +                               PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
37668 +                                               sizeof(unsigned long))));
37669 +
37670 +                       /*
37671 +                        * Initialise the list of frames that hold the lists
37672 +                        * of frames making up the p2m table. Used by
37673 +                        * save/restore.
37674 +                        */
37675 +                       pfn_to_mfn_frame_list_list = alloc_bootmem(PAGE_SIZE);
37676 +                       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
37677 +                               virt_to_mfn(pfn_to_mfn_frame_list_list);
37678 +
37679 +                       fpp = PAGE_SIZE/sizeof(unsigned long);
37680 +                       for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
37681 +                               if ((j % fpp) == 0) {
37682 +                                       k++;
37683 +                                       BUG_ON(k>=fpp);
37684 +                                       pfn_to_mfn_frame_list[k] =
37685 +                                               alloc_bootmem(PAGE_SIZE);
37686 +                                       pfn_to_mfn_frame_list_list[k] =
37687 +                                               virt_to_mfn(pfn_to_mfn_frame_list[k]);
37688 +                                       j=0;
37689 +                               }
37690 +                               pfn_to_mfn_frame_list[k][j] =
37691 +                                       virt_to_mfn(&phys_to_machine_mapping[i]);
37692 +                       }
37693 +                       HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
37694 +               }
37695 +
37696 +       }
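To see the capacity of this two-level structure, assume 4 KiB pages and 8-byte entries: fpp = 4096 / 8 = 512, so each pfn_to_mfn_frame_list page references 512 p2m pages covering 512 * 512 = 262144 pfns (1 GiB), and the single pfn_to_mfn_frame_list_list page can reference 512 such pages, enough to describe up to 512 GiB of pseudo-physical memory for save/restore.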
37697 +
37698 +       if (xen_start_info->flags & SIF_INITDOMAIN)
37699 +               dmi_scan_machine();
37700 +
37701 +       if ( ! (xen_start_info->flags & SIF_INITDOMAIN))
37702 +       {
37703 +               acpi_disabled = 1;
37704 +#ifdef  CONFIG_ACPI
37705 +               acpi_ht = 0;
37706 +#endif
37707 +       }
37708 +#endif
37709 +
37710 +#ifndef CONFIG_XEN
37711 +       check_ioapic();
37712 +#endif
37713 +
37714 +       zap_low_mappings(0);
37715 +
37716 +       /*
37717 +        * Set this early so we don't allocate cpu0
37718 +        * if the MADT list doesn't list the BSP first.
37719 +        * mpparse.c/MP_processor_info() allocates logical cpu numbers.
37720 +        */
37721 +       cpu_set(0, cpu_present_map);
37722 +#ifdef CONFIG_ACPI
37723 +       /*
37724 +        * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
37725 +        * Call this early for SRAT node setup.
37726 +        */
37727 +       acpi_boot_table_init();
37728 +
37729 +       /*
37730 +        * Read APIC and some other early information from ACPI tables.
37731 +        */
37732 +       acpi_boot_init();
37733 +#endif
37734 +
37735 +       init_cpu_to_node();
37736 +
37737 +#ifdef CONFIG_X86_LOCAL_APIC
37738 +       /*
37739 +        * get boot-time SMP configuration:
37740 +        */
37741 +       if (smp_found_config)
37742 +               get_smp_config();
37743 +#ifndef CONFIG_XEN
37744 +       init_apic_mappings();
37745 +#endif
37746 +#endif
37747 +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
37748 +       prefill_possible_map();
37749 +#endif
37750 +
37751 +       /*
37752 +        * Request address space for all standard RAM and ROM resources
37753 +        * and also for regions reported as reserved by the e820.
37754 +        */
37755 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST)
37756 +       probe_roms();
37757 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
37758 +               machine_e820 = alloc_bootmem_low_pages(PAGE_SIZE);
37759 +
37760 +               memmap.nr_entries = E820MAX;
37761 +               set_xen_guest_handle(memmap.buffer, machine_e820);
37762 +
37763 +               BUG_ON(HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap));
37764 +
37765 +               e820_reserve_resources(machine_e820, memmap.nr_entries);
37766 +       } else if (!(xen_start_info->flags & SIF_INITDOMAIN))
37767 +               e820_reserve_resources(e820.map, e820.nr_map);
37768 +#elif defined(CONFIG_XEN)
37769 +       e820_reserve_resources(e820.map, e820.nr_map);
37770 +#else
37771 +       probe_roms();
37772 +       e820_reserve_resources(e820.map, e820.nr_map);
37773 +#endif
37774 +
37775 +       request_resource(&iomem_resource, &video_ram_resource);
37776 +
37777 +       {
37778 +       unsigned i;
37779 +       /* request I/O space for devices used on all i[345]86 PCs */
37780 +       for (i = 0; i < STANDARD_IO_RESOURCES; i++)
37781 +               request_resource(&ioport_resource, &standard_io_resources[i]);
37782 +       }
37783 +
37784 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST)
37785 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
37786 +               e820_setup_gap(machine_e820, memmap.nr_entries);
37787 +               free_bootmem(__pa(machine_e820), PAGE_SIZE);
37788 +       }
37789 +#elif !defined(CONFIG_XEN)
37790 +       e820_setup_gap(e820.map, e820.nr_map);
37791 +#endif
37792 +
37793 +#ifdef CONFIG_GART_IOMMU
37794 +       iommu_hole_init();
37795 +#endif
37796 +
37797 +#ifdef CONFIG_XEN
37798 +       {
37799 +               struct physdev_set_iopl set_iopl;
37800 +
37801 +               set_iopl.iopl = 1;
37802 +               HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
37803 +
37804 +               if (xen_start_info->flags & SIF_INITDOMAIN) {
37805 +                       if (!(xen_start_info->flags & SIF_PRIVILEGED))
37806 +                               panic("Xen granted us console access "
37807 +                                     "but not privileged status");
37808 +                      
37809 +#ifdef CONFIG_VT
37810 +#if defined(CONFIG_VGA_CONSOLE)
37811 +                       conswitchp = &vga_con;
37812 +#elif defined(CONFIG_DUMMY_CONSOLE)
37813 +                       conswitchp = &dummy_con;
37814 +#endif
37815 +#endif
37816 +               } else {
37817 +                       extern int console_use_vt;
37818 +                       console_use_vt = 0;
37819 +               }
37820 +       }
37821 +#else  /* CONFIG_XEN */
37822 +
37823 +#ifdef CONFIG_VT
37824 +#if defined(CONFIG_VGA_CONSOLE)
37825 +       conswitchp = &vga_con;
37826 +#elif defined(CONFIG_DUMMY_CONSOLE)
37827 +       conswitchp = &dummy_con;
37828 +#endif
37829 +#endif
37830 +
37831 +#endif /* !CONFIG_XEN */
37832 +}
37833 +
37834 +#ifdef CONFIG_XEN
37835 +static int
37836 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
37837 +{
37838 +       HYPERVISOR_shutdown(SHUTDOWN_crash);
37839 +       /* we're never actually going to get here... */
37840 +       return NOTIFY_DONE;
37841 +}
37842 +#endif /* CONFIG_XEN */
37843 +
37844 +
37845 +static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
37846 +{
37847 +       unsigned int *v;
37848 +
37849 +       if (c->extended_cpuid_level < 0x80000004)
37850 +               return 0;
37851 +
37852 +       v = (unsigned int *) c->x86_model_id;
37853 +       cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
37854 +       cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
37855 +       cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
37856 +       c->x86_model_id[48] = 0;
37857 +       return 1;
37858 +}
37859 +
37860 +
37861 +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
37862 +{
37863 +       unsigned int n, dummy, eax, ebx, ecx, edx;
37864 +
37865 +       n = c->extended_cpuid_level;
37866 +
37867 +       if (n >= 0x80000005) {
37868 +               cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
37869 +               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
37870 +                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
37871 +               c->x86_cache_size=(ecx>>24)+(edx>>24);
37872 +               /* On K8 L1 TLB is inclusive, so don't count it */
37873 +               c->x86_tlbsize = 0;
37874 +       }
37875 +
37876 +       if (n >= 0x80000006) {
37877 +               cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
37878 +               ecx = cpuid_ecx(0x80000006);
37879 +               c->x86_cache_size = ecx >> 16;
37880 +               c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
37881 +
37882 +               printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
37883 +               c->x86_cache_size, ecx & 0xFF);
37884 +       }
37885 +
37886 +       if (n >= 0x80000007)
37887 +               cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); 
37888 +       if (n >= 0x80000008) {
37889 +               cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); 
37890 +               c->x86_virt_bits = (eax >> 8) & 0xff;
37891 +               c->x86_phys_bits = eax & 0xff;
37892 +       }
37893 +}
37894 +
37895 +#ifdef CONFIG_NUMA
37896 +static int nearby_node(int apicid)
37897 +{
37898 +       int i;
37899 +       for (i = apicid - 1; i >= 0; i--) {
37900 +               int node = apicid_to_node[i];
37901 +               if (node != NUMA_NO_NODE && node_online(node))
37902 +                       return node;
37903 +       }
37904 +       for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
37905 +               int node = apicid_to_node[i];
37906 +               if (node != NUMA_NO_NODE && node_online(node))
37907 +                       return node;
37908 +       }
37909 +       return first_node(node_online_map); /* Shouldn't happen */
37910 +}
37911 +#endif
37912 +
37913 +/*
37914 + * On an AMD dual-core setup the lower bits of the APIC ID distinguish the cores.
37915 + * Assumes number of cores is a power of two.
37916 + */
37917 +static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
37918 +{
37919 +#ifdef CONFIG_SMP
37920 +       int cpu = smp_processor_id();
37921 +       unsigned bits;
37922 +#ifdef CONFIG_NUMA
37923 +       int node = 0;
37924 +       unsigned apicid = hard_smp_processor_id();
37925 +#endif
37926 +
37927 +       bits = 0;
37928 +       while ((1 << bits) < c->x86_max_cores)
37929 +               bits++;
37930 +
37931 +       /* Low order bits define the core id (index of core in socket) */
37932 +       cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1);
37933 +       /* Convert the APIC ID into the socket ID */
37934 +       phys_proc_id[cpu] = phys_pkg_id(bits);
37935 +
37936 +#ifdef CONFIG_NUMA
37937 +       node = phys_proc_id[cpu];
37938 +       if (apicid_to_node[apicid] != NUMA_NO_NODE)
37939 +               node = apicid_to_node[apicid];
37940 +       if (!node_online(node)) {
37941 +               /* Two possibilities here:
37942 +                  - The CPU is missing memory and no node was created.
37943 +                  In that case try picking one from a nearby CPU
37944 +                  - The APIC IDs differ from the HyperTransport node IDs
37945 +                  which the K8 northbridge parsing fills in.
37946 +                  Assume they are all increased by a constant offset,
37947 +                  but in the same order as the HT nodeids.
37948 +                  If that doesn't result in a usable node fall back to the
37949 +                  path for the previous case.  */
37950 +               int ht_nodeid = apicid - (phys_proc_id[0] << bits);
37951 +               if (ht_nodeid >= 0 &&
37952 +                   apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
37953 +                       node = apicid_to_node[ht_nodeid];
37954 +               /* Pick a nearby node */
37955 +               if (!node_online(node))
37956 +                       node = nearby_node(apicid);
37957 +       }
37958 +       numa_set_node(cpu, node);
37959 +
37960 +       printk(KERN_INFO "CPU %d/%x(%d) -> Node %d -> Core %d\n",
37961 +                       cpu, apicid, c->x86_max_cores, node, cpu_core_id[cpu]);
37962 +#endif
37963 +#endif
37964 +}
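A worked example with illustrative numbers, assuming phys_pkg_id() simply shifts the initial APIC ID right by its argument: on a dual-core K8, c->x86_max_cores = 2 gives bits = 1; if the initial APIC ID (and hence the initial phys_proc_id) is 5, then cpu_core_id = 5 & 1 = 1 and phys_pkg_id(1) leaves phys_proc_id = 5 >> 1 = 2, i.e. core 1 of socket 2.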
37965 +
37966 +static int __init init_amd(struct cpuinfo_x86 *c)
37967 +{
37968 +       int r;
37969 +       unsigned level;
37970 +
37971 +#ifdef CONFIG_SMP
37972 +       unsigned long value;
37973 +
37974 +       /*
37975 +        * Disable TLB flush filter by setting HWCR.FFDIS on K8
37976 +        * bit 6 of msr C001_0015
37977 +        *
37978 +        * Errata 63 for SH-B3 steppings
37979 +        * Errata 122 for all steppings (F+ have it disabled by default)
37980 +        */
37981 +       if (c->x86 == 15) {
37982 +               rdmsrl(MSR_K8_HWCR, value);
37983 +               value |= 1 << 6;
37984 +               wrmsrl(MSR_K8_HWCR, value);
37985 +       }
37986 +#endif
37987 +
37988 +       /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
37989 +          3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
37990 +       clear_bit(0*32+31, &c->x86_capability);
37991 +       
37992 +       /* On C+ stepping K8 rep microcode works well for copy/memset */
37993 +       level = cpuid_eax(1);
37994 +       if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
37995 +               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
37996 +
37997 +       /* Enable workaround for FXSAVE leak */
37998 +       if (c->x86 >= 6)
37999 +               set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
38000 +
38001 +       r = get_model_name(c);
38002 +       if (!r) { 
38003 +               switch (c->x86) { 
38004 +               case 15:
38005 +                       /* Should distinguish Models here, but this is only
38006 +                          a fallback anyway. */
38007 +                       strcpy(c->x86_model_id, "Hammer");
38008 +                       break; 
38009 +               } 
38010 +       } 
38011 +       display_cacheinfo(c);
38012 +
38013 +       /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
38014 +       if (c->x86_power & (1<<8))
38015 +               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
38016 +
38017 +       if (c->extended_cpuid_level >= 0x80000008) {
38018 +               c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
38019 +
38020 +               amd_detect_cmp(c);
38021 +       }
38022 +
38023 +       return r;
38024 +}
38025 +
38026 +static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
38027 +{
38028 +#ifdef CONFIG_SMP
38029 +       u32     eax, ebx, ecx, edx;
38030 +       int     index_msb, core_bits;
38031 +       int     cpu = smp_processor_id();
38032 +
38033 +       cpuid(1, &eax, &ebx, &ecx, &edx);
38034 +
38035 +
38036 +       if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
38037 +               return;
38038 +
38039 +       smp_num_siblings = (ebx & 0xff0000) >> 16;
38040 +
38041 +       if (smp_num_siblings == 1) {
38042 +               printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
38043 +       } else if (smp_num_siblings > 1 ) {
38044 +
38045 +               if (smp_num_siblings > NR_CPUS) {
38046 +                       printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
38047 +                       smp_num_siblings = 1;
38048 +                       return;
38049 +               }
38050 +
38051 +               index_msb = get_count_order(smp_num_siblings);
38052 +               phys_proc_id[cpu] = phys_pkg_id(index_msb);
38053 +
38054 +               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
38055 +                      phys_proc_id[cpu]);
38056 +
38057 +               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
38058 +
38059 +               index_msb = get_count_order(smp_num_siblings) ;
38060 +
38061 +               core_bits = get_count_order(c->x86_max_cores);
38062 +
38063 +               cpu_core_id[cpu] = phys_pkg_id(index_msb) &
38064 +                                              ((1 << core_bits) - 1);
38065 +
38066 +               if (c->x86_max_cores > 1)
38067 +                       printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
38068 +                              cpu_core_id[cpu]);
38069 +       }
38070 +#endif
38071 +}
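Worked through with illustrative numbers, again assuming phys_pkg_id() shifts the initial APIC ID right by its argument: CPUID leaf 1 reporting smp_num_siblings = 4 on a dual-core part (x86_max_cores = 2) first gives index_msb = 2, so an initial APIC ID of 5 maps to phys_proc_id = 5 >> 2 = 1; the per-core sibling count becomes 4 / 2 = 2, index_msb and core_bits both become 1, and cpu_core_id = (5 >> 1) & 1 = 0, i.e. the second thread of core 0 in package 1.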
38072 +
38073 +/*
38074 + * find out the number of processor cores on the die
38075 + */
38076 +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
38077 +{
38078 +       unsigned int eax;
38079 +
38080 +       if (c->cpuid_level < 4)
38081 +               return 1;
38082 +
38083 +       __asm__("cpuid"
38084 +               : "=a" (eax)
38085 +               : "0" (4), "c" (0)
38086 +               : "bx", "dx");
38087 +
38088 +       if (eax & 0x1f)
38089 +               return ((eax >> 26) + 1);
38090 +       else
38091 +               return 1;
38092 +}
38093 +
38094 +static void srat_detect_node(void)
38095 +{
38096 +#ifdef CONFIG_NUMA
38097 +       unsigned node;
38098 +       int cpu = smp_processor_id();
38099 +
38100 +       /* Don't do the funky fallback heuristics the AMD version employs
38101 +          for now. */
38102 +       node = apicid_to_node[hard_smp_processor_id()];
38103 +       if (node == NUMA_NO_NODE)
38104 +               node = 0;
38105 +       numa_set_node(cpu, node);
38106 +
38107 +       if (acpi_numa > 0)
38108 +               printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node);
38109 +#endif
38110 +}
38111 +
38112 +static void __cpuinit init_intel(struct cpuinfo_x86 *c)
38113 +{
38114 +       /* Cache sizes */
38115 +       unsigned n;
38116 +
38117 +       init_intel_cacheinfo(c);
38118 +       n = c->extended_cpuid_level;
38119 +       if (n >= 0x80000008) {
38120 +               unsigned eax = cpuid_eax(0x80000008);
38121 +               c->x86_virt_bits = (eax >> 8) & 0xff;
38122 +               c->x86_phys_bits = eax & 0xff;
38123 +               /* CPUID workaround for Intel 0F34 CPU */
38124 +               if (c->x86_vendor == X86_VENDOR_INTEL &&
38125 +                   c->x86 == 0xF && c->x86_model == 0x3 &&
38126 +                   c->x86_mask == 0x4)
38127 +                       c->x86_phys_bits = 36;
38128 +       }
38129 +
38130 +       if (c->x86 == 15)
38131 +               c->x86_cache_alignment = c->x86_clflush_size * 2;
38132 +       if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
38133 +           (c->x86 == 0x6 && c->x86_model >= 0x0e))
38134 +               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
38135 +       set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
38136 +       c->x86_max_cores = intel_num_cpu_cores(c);
38137 +
38138 +       srat_detect_node();
38139 +}
38140 +
38141 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
38142 +{
38143 +       char *v = c->x86_vendor_id;
38144 +
38145 +       if (!strcmp(v, "AuthenticAMD"))
38146 +               c->x86_vendor = X86_VENDOR_AMD;
38147 +       else if (!strcmp(v, "GenuineIntel"))
38148 +               c->x86_vendor = X86_VENDOR_INTEL;
38149 +       else
38150 +               c->x86_vendor = X86_VENDOR_UNKNOWN;
38151 +}
38152 +
38153 +struct cpu_model_info {
38154 +       int vendor;
38155 +       int family;
38156 +       char *model_names[16];
38157 +};
38158 +
38159 +/* Do some early cpuid on the boot CPU to get some parameters that are
38160 +   needed before check_bugs. Everything advanced is in identify_cpu
38161 +   below. */
38162 +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
38163 +{
38164 +       u32 tfms;
38165 +
38166 +       c->loops_per_jiffy = loops_per_jiffy;
38167 +       c->x86_cache_size = -1;
38168 +       c->x86_vendor = X86_VENDOR_UNKNOWN;
38169 +       c->x86_model = c->x86_mask = 0; /* So far unknown... */
38170 +       c->x86_vendor_id[0] = '\0'; /* Unset */
38171 +       c->x86_model_id[0] = '\0';  /* Unset */
38172 +       c->x86_clflush_size = 64;
38173 +       c->x86_cache_alignment = c->x86_clflush_size;
38174 +       c->x86_max_cores = 1;
38175 +       c->extended_cpuid_level = 0;
38176 +       memset(&c->x86_capability, 0, sizeof c->x86_capability);
38177 +
38178 +       /* Get vendor name */
38179 +       cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
38180 +             (unsigned int *)&c->x86_vendor_id[0],
38181 +             (unsigned int *)&c->x86_vendor_id[8],
38182 +             (unsigned int *)&c->x86_vendor_id[4]);
38183 +               
38184 +       get_cpu_vendor(c);
38185 +
38186 +       /* Initialize the standard set of capabilities */
38187 +       /* Note that the vendor-specific code below might override */
38188 +
38189 +       /* Intel-defined flags: level 0x00000001 */
38190 +       if (c->cpuid_level >= 0x00000001) {
38191 +               __u32 misc;
38192 +               cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
38193 +                     &c->x86_capability[0]);
38194 +               c->x86 = (tfms >> 8) & 0xf;
38195 +               c->x86_model = (tfms >> 4) & 0xf;
38196 +               c->x86_mask = tfms & 0xf;
38197 +               if (c->x86 == 0xf)
38198 +                       c->x86 += (tfms >> 20) & 0xff;
38199 +               if (c->x86 >= 0x6)
38200 +                       c->x86_model += ((tfms >> 16) & 0xF) << 4;
38201 +               if (c->x86_capability[0] & (1<<19)) 
38202 +                       c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
38203 +       } else {
38204 +               /* Have CPUID level 0 only - unheard of */
38205 +               c->x86 = 4;
38206 +       }
38207 +
38208 +#ifdef CONFIG_SMP
38209 +       phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
38210 +#endif
38211 +}
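As a worked example: with CPUID leaf 1 returning tfms = 0x00000f4a, the code above yields family (tfms >> 8) & 0xf = 0xf, model (tfms >> 4) & 0xf = 4 and stepping tfms & 0xf = 0xa; because the base family is 0xf, the extended family bits (tfms >> 20) & 0xff = 0 are added (family stays 15), and because the family is >= 6 the extended model bits ((tfms >> 16) & 0xf) << 4 = 0 are folded in, giving family 15, model 4, stepping 10.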
38212 +
38213 +/*
38214 + * This does the hard work of actually picking apart the CPU stuff...
38215 + */
38216 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
38217 +{
38218 +       int i;
38219 +       u32 xlvl;
38220 +
38221 +       early_identify_cpu(c);
38222 +
38223 +       /* AMD-defined flags: level 0x80000001 */
38224 +       xlvl = cpuid_eax(0x80000000);
38225 +       c->extended_cpuid_level = xlvl;
38226 +       if ((xlvl & 0xffff0000) == 0x80000000) {
38227 +               if (xlvl >= 0x80000001) {
38228 +                       c->x86_capability[1] = cpuid_edx(0x80000001);
38229 +                       c->x86_capability[6] = cpuid_ecx(0x80000001);
38230 +               }
38231 +               if (xlvl >= 0x80000004)
38232 +                       get_model_name(c); /* Default name */
38233 +       }
38234 +
38235 +       /* Transmeta-defined flags: level 0x80860001 */
38236 +       xlvl = cpuid_eax(0x80860000);
38237 +       if ((xlvl & 0xffff0000) == 0x80860000) {
38238 +               /* Don't set x86_cpuid_level here for now to not confuse. */
38239 +               if (xlvl >= 0x80860001)
38240 +                       c->x86_capability[2] = cpuid_edx(0x80860001);
38241 +       }
38242 +
38243 +       c->apicid = phys_pkg_id(0);
38244 +
38245 +       /*
38246 +        * Vendor-specific initialization.  In this section we
38247 +        * canonicalize the feature flags, meaning if there are
38248 +        * features a certain CPU supports which CPUID doesn't
38249 +        * tell us, CPUID claiming incorrect flags, or other bugs,
38250 +        * we handle them here.
38251 +        *
38252 +        * At the end of this section, c->x86_capability better
38253 +        * indicate the features this CPU genuinely supports!
38254 +        */
38255 +       switch (c->x86_vendor) {
38256 +       case X86_VENDOR_AMD:
38257 +               init_amd(c);
38258 +               break;
38259 +
38260 +       case X86_VENDOR_INTEL:
38261 +               init_intel(c);
38262 +               break;
38263 +
38264 +       case X86_VENDOR_UNKNOWN:
38265 +       default:
38266 +               display_cacheinfo(c);
38267 +               break;
38268 +       }
38269 +
38270 +       select_idle_routine(c);
38271 +       detect_ht(c); 
38272 +
38273 +       /*
38274 +        * On SMP, boot_cpu_data holds the common feature set between
38275 +        * all CPUs; so make sure that we indicate which features are
38276 +        * common between the CPUs.  The first time this routine gets
38277 +        * executed, c == &boot_cpu_data.
38278 +        */
38279 +       if (c != &boot_cpu_data) {
38280 +               /* AND the already accumulated flags with these */
38281 +               for (i = 0 ; i < NCAPINTS ; i++)
38282 +                       boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
38283 +       }
38284 +
38285 +#ifdef CONFIG_X86_MCE
38286 +       mcheck_init(c);
38287 +#endif
38288 +       if (c == &boot_cpu_data)
38289 +               mtrr_bp_init();
38290 +       else
38291 +               mtrr_ap_init();
38292 +#ifdef CONFIG_NUMA
38293 +       numa_add_cpu(smp_processor_id());
38294 +#endif
38295 +}
38296 +
38297 +
38298 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
38299 +{
38300 +       if (c->x86_model_id[0])
38301 +               printk("%s", c->x86_model_id);
38302 +
38303 +       if (c->x86_mask || c->cpuid_level >= 0) 
38304 +               printk(" stepping %02x\n", c->x86_mask);
38305 +       else
38306 +               printk("\n");
38307 +}
38308 +
38309 +/*
38310 + *     Get CPU information for use by the procfs.
38311 + */
38312 +
38313 +static int show_cpuinfo(struct seq_file *m, void *v)
38314 +{
38315 +       struct cpuinfo_x86 *c = v;
38316 +
38317 +       /* 
38318 +        * These flag bits must match the definitions in <asm/cpufeature.h>.
38319 +        * NULL means this bit is undefined or reserved; either way it doesn't
38320 +        * have meaning as far as Linux is concerned.  Note that it's important
38321 +        * to realize there is a difference between this table and CPUID -- if
38322 +        * applications want to get the raw CPUID data, they should access
38323 +        * /dev/cpu/<cpu_nr>/cpuid instead.
38324 +        */
38325 +       static char *x86_cap_flags[] = {
38326 +               /* Intel-defined */
38327 +               "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
38328 +               "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
38329 +               "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
38330 +               "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
38331 +
38332 +               /* AMD-defined */
38333 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38334 +               NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
38335 +               NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
38336 +               NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow",
38337 +
38338 +               /* Transmeta-defined */
38339 +               "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
38340 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38341 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38342 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38343 +
38344 +               /* Other (Linux-defined) */
38345 +               "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
38346 +               "constant_tsc", NULL, NULL,
38347 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38348 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38349 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38350 +
38351 +               /* Intel-defined (#2) */
38352 +               "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
38353 +               "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
38354 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38355 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38356 +
38357 +               /* VIA/Cyrix/Centaur-defined */
38358 +               NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
38359 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38360 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38361 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38362 +
38363 +               /* AMD-defined (#2) */
38364 +               "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
38365 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38366 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38367 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38368 +       };
38369 +       static char *x86_power_flags[] = { 
38370 +               "ts",   /* temperature sensor */
38371 +               "fid",  /* frequency id control */
38372 +               "vid",  /* voltage id control */
38373 +               "ttp",  /* thermal trip */
38374 +               "tm",
38375 +               "stc",
38376 +               NULL,
38377 +               /* nothing */   /* constant_tsc - moved to flags */
38378 +       };
38379 +
38380 +
38381 +#ifdef CONFIG_SMP
38382 +       if (!cpu_online(c-cpu_data))
38383 +               return 0;
38384 +#endif
38385 +
38386 +       seq_printf(m,"processor\t: %u\n"
38387 +                    "vendor_id\t: %s\n"
38388 +                    "cpu family\t: %d\n"
38389 +                    "model\t\t: %d\n"
38390 +                    "model name\t: %s\n",
38391 +                    (unsigned)(c-cpu_data),
38392 +                    c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
38393 +                    c->x86,
38394 +                    (int)c->x86_model,
38395 +                    c->x86_model_id[0] ? c->x86_model_id : "unknown");
38396 +       
38397 +       if (c->x86_mask || c->cpuid_level >= 0)
38398 +               seq_printf(m, "stepping\t: %d\n", c->x86_mask);
38399 +       else
38400 +               seq_printf(m, "stepping\t: unknown\n");
38401 +       
38402 +       if (cpu_has(c,X86_FEATURE_TSC)) {
38403 +               unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
38404 +               if (!freq)
38405 +                       freq = cpu_khz;
38406 +               seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
38407 +                            freq / 1000, (freq % 1000));
38408 +       }
38409 +
38410 +       /* Cache size */
38411 +       if (c->x86_cache_size >= 0) 
38412 +               seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
38413 +       
38414 +#ifdef CONFIG_SMP
38415 +       if (smp_num_siblings * c->x86_max_cores > 1) {
38416 +               int cpu = c - cpu_data;
38417 +               seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]);
38418 +               seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
38419 +               seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]);
38420 +               seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
38421 +       }
38422 +#endif 
38423 +
38424 +       seq_printf(m,
38425 +               "fpu\t\t: yes\n"
38426 +               "fpu_exception\t: yes\n"
38427 +               "cpuid level\t: %d\n"
38428 +               "wp\t\t: yes\n"
38429 +               "flags\t\t:",
38430 +                  c->cpuid_level);
38431 +
38432 +       { 
38433 +               int i; 
38434 +               for ( i = 0 ; i < 32*NCAPINTS ; i++ )
38435 +                       if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
38436 +                               seq_printf(m, " %s", x86_cap_flags[i]);
38437 +       }
38438 +               
38439 +       seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
38440 +                  c->loops_per_jiffy/(500000/HZ),
38441 +                  (c->loops_per_jiffy/(5000/HZ)) % 100);
38442 +
38443 +       if (c->x86_tlbsize > 0) 
38444 +               seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
38445 +       seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
38446 +       seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
38447 +
38448 +       seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 
38449 +                  c->x86_phys_bits, c->x86_virt_bits);
38450 +
38451 +       seq_printf(m, "power management:");
38452 +       {
38453 +               unsigned i;
38454 +               for (i = 0; i < 32; i++) 
38455 +                       if (c->x86_power & (1 << i)) {
38456 +                               if (i < ARRAY_SIZE(x86_power_flags) &&
38457 +                                       x86_power_flags[i])
38458 +                                       seq_printf(m, "%s%s",
38459 +                                               x86_power_flags[i][0]?" ":"",
38460 +                                               x86_power_flags[i]);
38461 +                               else
38462 +                                       seq_printf(m, " [%d]", i);
38463 +                       }
38464 +       }
38465 +
38466 +       seq_printf(m, "\n\n");
38467 +
38468 +       return 0;
38469 +}
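The bogomips line prints loops_per_jiffy scaled to a nominal 500000 loops per second: with HZ = 1000 and loops_per_jiffy = 3985120, for instance, the integer part is 3985120 / (500000 / 1000) = 7970 and the fractional part is (3985120 / (5000 / 1000)) % 100 = 24, so the entry reads "bogomips : 7970.24".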
38470 +
38471 +static void *c_start(struct seq_file *m, loff_t *pos)
38472 +{
38473 +       return *pos < NR_CPUS ? cpu_data + *pos : NULL;
38474 +}
38475 +
38476 +static void *c_next(struct seq_file *m, void *v, loff_t *pos)
38477 +{
38478 +       ++*pos;
38479 +       return c_start(m, pos);
38480 +}
38481 +
38482 +static void c_stop(struct seq_file *m, void *v)
38483 +{
38484 +}
38485 +
38486 +struct seq_operations cpuinfo_op = {
38487 +       .start =c_start,
38488 +       .next = c_next,
38489 +       .stop = c_stop,
38490 +       .show = show_cpuinfo,
38491 +};
38492 +
38493 +#ifdef CONFIG_INPUT_PCSPKR
38494 +#include <linux/platform_device.h>
38495 +static __init int add_pcspkr(void)
38496 +{
38497 +       struct platform_device *pd;
38498 +       int ret;
38499 +
38500 +       pd = platform_device_alloc("pcspkr", -1);
38501 +       if (!pd)
38502 +               return -ENOMEM;
38503 +
38504 +       ret = platform_device_add(pd);
38505 +       if (ret)
38506 +               platform_device_put(pd);
38507 +
38508 +       return ret;
38509 +}
38510 +device_initcall(add_pcspkr);
38511 +#endif
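One detail worth noting in add_pcspkr(): when platform_device_add() fails, the device is released with platform_device_put() rather than freed directly, since platform_device_alloc() hands back a reference-counted device and dropping the reference is the correct way to undo the allocation.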
38512 diff -urNp linux-2.6/arch/x86_64/kernel/smp.c new/arch/x86_64/kernel/smp.c
38513 --- linux-2.6/arch/x86_64/kernel/smp.c  2006-07-03 14:14:30.000000000 +0200
38514 +++ new/arch/x86_64/kernel/smp.c        2006-05-09 12:33:21.000000000 +0200
38515 @@ -135,10 +135,10 @@ asmlinkage void smp_invalidate_interrupt
38516  
38517         cpu = smp_processor_id();
38518         /*
38519 -        * orig_rax contains the interrupt vector - 256.
38520 +        * orig_rax contains the negated interrupt vector.
38521          * Use that to determine where the sender put the data.
38522          */
38523 -       sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
38524 +       sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
38525         f = &per_cpu(flush_state, sender);
38526  
38527         if (!cpu_isset(cpu, f->flush_cpumask))
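A worked example of the hunk above, with an illustrative base vector: suppose INVALIDATE_TLB_VECTOR_START were 0xf0 and the sender used slot 3, so the interrupt arrives on vector 0xf3 (243). Under the old convention the entry code stored 243 - 256 = -13 in orig_rax and sender = -13 + 256 - 0xf0 = 3; under the new convention it stores ~243 = -244 and sender = ~(-244) - 0xf0 = 243 - 240 = 3. Both recover the same slot; the change only tracks the new orig_rax encoding used by the entry code.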
38528 diff -urNp linux-2.6/arch/x86_64/kernel/smp-xen.c new/arch/x86_64/kernel/smp-xen.c
38529 --- linux-2.6/arch/x86_64/kernel/smp-xen.c      1970-01-01 01:00:00.000000000 +0100
38530 +++ new/arch/x86_64/kernel/smp-xen.c    2006-06-07 13:29:36.000000000 +0200
38531 @@ -0,0 +1,596 @@
38532 +/*
38533 + *     Intel SMP support routines.
38534 + *
38535 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
38536 + *     (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
38537 + *      (c) 2002,2003 Andi Kleen, SuSE Labs.
38538 + *
38539 + *     This code is released under the GNU General Public License version 2 or
38540 + *     later.
38541 + */
38542 +
38543 +#include <linux/init.h>
38544 +
38545 +#include <linux/mm.h>
38546 +#include <linux/delay.h>
38547 +#include <linux/spinlock.h>
38548 +#include <linux/smp_lock.h>
38549 +#include <linux/smp.h>
38550 +#include <linux/kernel_stat.h>
38551 +#include <linux/mc146818rtc.h>
38552 +#include <linux/interrupt.h>
38553 +
38554 +#include <asm/mtrr.h>
38555 +#include <asm/pgalloc.h>
38556 +#include <asm/tlbflush.h>
38557 +#include <asm/mach_apic.h>
38558 +#include <asm/mmu_context.h>
38559 +#include <asm/proto.h>
38560 +#include <asm/apicdef.h>
38561 +#include <asm/idle.h>
38562 +#ifdef CONFIG_XEN
38563 +#include <xen/evtchn.h>
38564 +#endif
38565 +
38566 +#ifndef CONFIG_XEN
38567 +/*
38568 + *     Smarter SMP flushing macros. 
38569 + *             c/o Linus Torvalds.
38570 + *
38571 + *     These mean you can really definitely utterly forget about
38572 + *     writing to user space from interrupts. (It's not allowed anyway).
38573 + *
38574 + *     Optimizations Manfred Spraul <manfred@colorfullife.com>
38575 + *
38576 + *     More scalable flush, from Andi Kleen
38577 + *
38578 + *     To avoid global state use 8 different call vectors.
38579 + *     Each CPU uses a specific vector to trigger flushes on other
38580 + *     CPUs. Depending on the received vector the target CPUs look into
38581 + *     the right per cpu variable for the flush data.
38582 + *
38583 + *     With more than 8 CPUs they are hashed to the 8 available
38584 + *     vectors. The limited global vector space forces us to this right now.
38585 + *     In future when interrupts are split into per CPU domains this could be
38586 + *     fixed, at the cost of triggering multiple IPIs in some cases.
38587 + */
38588 +
38589 +union smp_flush_state {
38590 +       struct {
38591 +               cpumask_t flush_cpumask;
38592 +               struct mm_struct *flush_mm;
38593 +               unsigned long flush_va;
38594 +#define FLUSH_ALL      -1ULL
38595 +               spinlock_t tlbstate_lock;
38596 +       };
38597 +       char pad[SMP_CACHE_BYTES];
38598 +} ____cacheline_aligned;
38599 +
38600 +/* State is put into the per CPU data section, but padded
38601 +   to a full cache line because other CPUs can access it and we don't
38602 +   want false sharing in the per cpu data segment. */
38603 +static DEFINE_PER_CPU(union smp_flush_state, flush_state);
38604 +#endif
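With the eight vectors mentioned above, the sender slot is chosen later in flush_tlb_others() as smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS, so for example CPU 11 hashes to slot 11 % 8 = 3; CPUs 3 and 11 then share a vector and serialize their concurrent flushes on that slot's tlbstate_lock.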
38605 +
38606 +/*
38607 + * We cannot call mmdrop() because we are in interrupt context;
38608 + * instead we update mm->cpu_vm_mask.
38609 + */
38610 +static inline void leave_mm(unsigned long cpu)
38611 +{
38612 +       if (read_pda(mmu_state) == TLBSTATE_OK)
38613 +               BUG();
38614 +       cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
38615 +       load_cr3(swapper_pg_dir);
38616 +}
38617 +
38618 +#ifndef CONFIG_XEN
38619 +/*
38620 + *
38621 + * The flush IPI assumes that a thread switch happens in this order:
38622 + * [cpu0: the cpu that switches]
38623 + * 1) switch_mm() either 1a) or 1b)
38624 + * 1a) thread switch to a different mm
38625 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
38626 + *     Stop ipi delivery for the old mm. This is not synchronized with
38627 + *     the other cpus, but smp_invalidate_interrupt ignore flush ipis
38628 + *     for the wrong mm, and in the worst case we perform a superfluous
38629 + *     tlb flush.
38630 + * 1a2) set cpu mmu_state to TLBSTATE_OK
38631 + *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
38632 + *     was in lazy tlb mode.
38633 + * 1a3) update cpu active_mm
38634 + *     Now cpu0 accepts tlb flushes for the new mm.
38635 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
38636 + *     Now the other cpus will send tlb flush ipis.
38637 + * 1a4) change cr3.
38638 + * 1b) thread switch without mm change
38639 + *     cpu active_mm is correct, cpu0 already handles
38640 + *     flush ipis.
38641 + * 1b1) set cpu mmu_state to TLBSTATE_OK
38642 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
38643 + *     Atomically set the bit [other cpus will start sending flush ipis],
38644 + *     and test the bit.
38645 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
38646 + * 2) switch %%esp, ie current
38647 + *
38648 + * The interrupt must handle 2 special cases:
38649 + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
38650 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
38651 + *   runs in kernel space, the cpu could load tlb entries for user space
38652 + *   pages.
38653 + *
38654 + * The good news is that cpu mmu_state is local to each cpu, no
38655 + * write/read ordering problems.
38656 + */
38657 +
38658 +/*
38659 + * TLB flush IPI:
38660 + *
38661 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
38662 + * 2) Leave the mm if we are in the lazy tlb mode.
38663 + *
38664 + * Interrupts are disabled.
38665 + */
38666 +
38667 +asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
38668 +{
38669 +       int cpu;
38670 +       int sender;
38671 +       union smp_flush_state *f;
38672 +
38673 +       cpu = smp_processor_id();
38674 +       /*
38675 +        * orig_rax contains the interrupt vector - 256.
38676 +        * Use that to determine where the sender put the data.
38677 +        */
38678 +       sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
38679 +       f = &per_cpu(flush_state, sender);
38680 +
38681 +       if (!cpu_isset(cpu, f->flush_cpumask))
38682 +               goto out;
38683 +               /* 
38684 +                * This was a BUG() but until someone can quote me the
38685 +                * line from the Intel manual that guarantees an IPI to
38686 +                * multiple CPUs is retried _only_ on the erroring CPUs,
38687 +                * it's staying as a return.
38688 +                *
38689 +                * BUG();
38690 +                */
38691 +                
38692 +       if (f->flush_mm == read_pda(active_mm)) {
38693 +               if (read_pda(mmu_state) == TLBSTATE_OK) {
38694 +                       if (f->flush_va == FLUSH_ALL)
38695 +                               local_flush_tlb();
38696 +                       else
38697 +                               __flush_tlb_one(f->flush_va);
38698 +               } else
38699 +                       leave_mm(cpu);
38700 +       }
38701 +out:
38702 +       ack_APIC_irq();
38703 +       cpu_clear(cpu, f->flush_cpumask);
38704 +}
38705 +
38706 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
38707 +                                               unsigned long va)
38708 +{
38709 +       int sender;
38710 +       union smp_flush_state *f;
38711 +
38712 +       /* Caller has disabled preemption */
38713 +       sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
38714 +       f = &per_cpu(flush_state, sender);
38715 +
38716 +       /* Could avoid this lock when
38717 +          num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
38718 +          probably not worth checking this for a cache-hot lock. */
38719 +       spin_lock(&f->tlbstate_lock);
38720 +
38721 +       f->flush_mm = mm;
38722 +       f->flush_va = va;
38723 +       cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
38724 +
38725 +       /*
38726 +        * We have to send the IPI only to
38727 +        * CPUs affected.
38728 +        */
38729 +       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
38730 +
38731 +       while (!cpus_empty(f->flush_cpumask))
38732 +               cpu_relax();
38733 +
38734 +       f->flush_mm = NULL;
38735 +       f->flush_va = 0;
38736 +       spin_unlock(&f->tlbstate_lock);
38737 +}
38738 +
38739 +int __cpuinit init_smp_flush(void)
38740 +{
38741 +       int i;
38742 +       for_each_cpu_mask(i, cpu_possible_map) {
38743 +               spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i));
38744 +       }
38745 +       return 0;
38746 +}
38747 +
38748 +core_initcall(init_smp_flush);
38749 +       
38750 +void flush_tlb_current_task(void)
38751 +{
38752 +       struct mm_struct *mm = current->mm;
38753 +       cpumask_t cpu_mask;
38754 +
38755 +       preempt_disable();
38756 +       cpu_mask = mm->cpu_vm_mask;
38757 +       cpu_clear(smp_processor_id(), cpu_mask);
38758 +
38759 +       local_flush_tlb();
38760 +       if (!cpus_empty(cpu_mask))
38761 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
38762 +       preempt_enable();
38763 +}
38764 +
38765 +void flush_tlb_mm (struct mm_struct * mm)
38766 +{
38767 +       cpumask_t cpu_mask;
38768 +
38769 +       preempt_disable();
38770 +       cpu_mask = mm->cpu_vm_mask;
38771 +       cpu_clear(smp_processor_id(), cpu_mask);
38772 +
38773 +       if (current->active_mm == mm) {
38774 +               if (current->mm)
38775 +                       local_flush_tlb();
38776 +               else
38777 +                       leave_mm(smp_processor_id());
38778 +       }
38779 +       if (!cpus_empty(cpu_mask))
38780 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
38781 +
38782 +       preempt_enable();
38783 +}
38784 +
38785 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
38786 +{
38787 +       struct mm_struct *mm = vma->vm_mm;
38788 +       cpumask_t cpu_mask;
38789 +
38790 +       preempt_disable();
38791 +       cpu_mask = mm->cpu_vm_mask;
38792 +       cpu_clear(smp_processor_id(), cpu_mask);
38793 +
38794 +       if (current->active_mm == mm) {
38795 +               if(current->mm)
38796 +                       __flush_tlb_one(va);
38797 +                else
38798 +                       leave_mm(smp_processor_id());
38799 +       }
38800 +
38801 +       if (!cpus_empty(cpu_mask))
38802 +               flush_tlb_others(cpu_mask, mm, va);
38803 +
38804 +       preempt_enable();
38805 +}
38806 +
38807 +static void do_flush_tlb_all(void* info)
38808 +{
38809 +       unsigned long cpu = smp_processor_id();
38810 +
38811 +       __flush_tlb_all();
38812 +       if (read_pda(mmu_state) == TLBSTATE_LAZY)
38813 +               leave_mm(cpu);
38814 +}
38815 +
38816 +void flush_tlb_all(void)
38817 +{
38818 +       on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
38819 +}
38820 +#else
38821 +asmlinkage void smp_invalidate_interrupt (void)
38822 +{ return; }
38823 +void flush_tlb_current_task(void)
38824 +{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
38825 +void flush_tlb_mm (struct mm_struct * mm)
38826 +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
38827 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
38828 +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
38829 +void flush_tlb_all(void)
38830 +{ xen_tlb_flush_all(); }
38831 +#endif /* Xen */
38832 +
38833 +/*
38834 + * this function sends a 'reschedule' IPI to another CPU.
38835 + * it goes straight through and wastes no time serializing
38836 + * anything. Worst case is that we lose a reschedule ...
38837 + */
38838 +
38839 +void smp_send_reschedule(int cpu)
38840 +{
38841 +       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
38842 +}
38843 +
38844 +/*
38845 + * Structure and data for smp_call_function(). This is designed to minimise
38846 + * static memory requirements. It also looks cleaner.
38847 + */
38848 +static DEFINE_SPINLOCK(call_lock);
38849 +
38850 +struct call_data_struct {
38851 +       void (*func) (void *info);
38852 +       void *info;
38853 +       atomic_t started;
38854 +       atomic_t finished;
38855 +       int wait;
38856 +};
38857 +
38858 +static struct call_data_struct * call_data;
38859 +
38860 +void lock_ipi_call_lock(void)
38861 +{
38862 +       spin_lock_irq(&call_lock);
38863 +}
38864 +
38865 +void unlock_ipi_call_lock(void)
38866 +{
38867 +       spin_unlock_irq(&call_lock);
38868 +}
38869 +
38870 +/*
38871 + * this function sends a 'generic call function' IPI to one other CPU
38872 + * in the system.
38873 + *
38874 + * cpu is a standard Linux logical CPU number.
38875 + */
38876 +static void
38877 +__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
38878 +                               int nonatomic, int wait)
38879 +{
38880 +       struct call_data_struct data;
38881 +       int cpus = 1;
38882 +
38883 +       data.func = func;
38884 +       data.info = info;
38885 +       atomic_set(&data.started, 0);
38886 +       data.wait = wait;
38887 +       if (wait)
38888 +               atomic_set(&data.finished, 0);
38889 +
38890 +       call_data = &data;
38891 +       wmb();
38892 +       /* Send a message to the target CPU and wait for it to respond */
38893 +       send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
38894 +
38895 +       /* Wait for response */
38896 +       while (atomic_read(&data.started) != cpus)
38897 +               cpu_relax();
38898 +
38899 +       if (!wait)
38900 +               return;
38901 +
38902 +       while (atomic_read(&data.finished) != cpus)
38903 +               cpu_relax();
38904 +}
38905 +
38906 +/*
38907 + * smp_call_function_single - Run a function on another CPU
38908 + * @func: The function to run. This must be fast and non-blocking.
38909 + * @info: An arbitrary pointer to pass to the function.
38910 + * @nonatomic: Currently unused.
38911 + * @wait: If true, wait until function has completed on other CPUs.
38912 + *
38913 + * Returns 0 on success, else a negative status code.
38914 + *
38915 + * Does not return until the remote CPU is nearly ready to execute <func>,
38916 + * is executing it, or has already executed it.
38917 + */
38918 +
38919 +int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
38920 +       int nonatomic, int wait)
38921 +{
38922 +       /* prevent preemption and reschedule on another processor */
38923 +       int me = get_cpu();
38924 +       if (cpu == me) {
38925 +               WARN_ON(1);
38926 +               put_cpu();
38927 +               return -EBUSY;
38928 +       }
38929 +       spin_lock_bh(&call_lock);
38930 +       __smp_call_function_single(cpu, func, info, nonatomic, wait);
38931 +       spin_unlock_bh(&call_lock);
38932 +       put_cpu();
38933 +       return 0;
38934 +}
38935 +
38936 +/*
38937 + * this function sends a 'generic call function' IPI to all other CPUs
38938 + * in the system.
38939 + */
38940 +static void __smp_call_function (void (*func) (void *info), void *info,
38941 +                               int nonatomic, int wait)
38942 +{
38943 +       struct call_data_struct data;
38944 +       int cpus = num_online_cpus()-1;
38945 +
38946 +       if (!cpus)
38947 +               return;
38948 +
38949 +       data.func = func;
38950 +       data.info = info;
38951 +       atomic_set(&data.started, 0);
38952 +       data.wait = wait;
38953 +       if (wait)
38954 +               atomic_set(&data.finished, 0);
38955 +
38956 +       call_data = &data;
38957 +       wmb();
38958 +       /* Send a message to all other CPUs and wait for them to respond */
38959 +       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
38960 +
38961 +       /* Wait for response */
38962 +       while (atomic_read(&data.started) != cpus)
38963 +#ifndef CONFIG_XEN
38964 +               cpu_relax();
38965 +#else
38966 +               barrier();
38967 +#endif
38968 +
38969 +       if (!wait)
38970 +               return;
38971 +
38972 +       while (atomic_read(&data.finished) != cpus)
38973 +#ifndef CONFIG_XEN
38974 +               cpu_relax();
38975 +#else
38976 +               barrier();
38977 +#endif
38978 +}
38979 +
38980 +/*
38981 + * smp_call_function - run a function on all other CPUs.
38982 + * @func: The function to run. This must be fast and non-blocking.
38983 + * @info: An arbitrary pointer to pass to the function.
38984 + * @nonatomic: currently unused.
38985 + * @wait: If true, wait (atomically) until function has completed on other
38986 + *        CPUs.
38987 + *
38988 + * Returns 0 on success, else a negative status code. Does not return until
38989 + * remote CPUs are nearly ready to execute func or are or have executed.
38990 + *
38991 + * You must not call this function with disabled interrupts or from a
38992 + * hardware interrupt handler or from a bottom half handler.
38993 + * Actually there are a few legal cases, like panic.
38994 + */
38995 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
38996 +                       int wait)
38997 +{
38998 +       spin_lock(&call_lock);
38999 +       __smp_call_function(func,info,nonatomic,wait);
39000 +       spin_unlock(&call_lock);
39001 +       return 0;
39002 +}
39003 +
39004 +void smp_stop_cpu(void)
39005 +{
39006 +       unsigned long flags;
39007 +       /*
39008 +        * Remove this CPU:
39009 +        */
39010 +       cpu_clear(smp_processor_id(), cpu_online_map);
39011 +       local_irq_save(flags);
39012 +#ifndef CONFIG_XEN
39013 +       disable_local_APIC();
39014 +#endif
39015 +       local_irq_restore(flags); 
39016 +}
39017 +
39018 +static void smp_really_stop_cpu(void *dummy)
39019 +{
39020 +       smp_stop_cpu(); 
39021 +       for (;;) 
39022 +               halt();
39023 +} 
39024 +
39025 +void smp_send_stop(void)
39026 +{
39027 +       int nolock = 0;
39028 +#ifndef CONFIG_XEN
39029 +       if (reboot_force)
39030 +               return;
39031 +#endif
39032 +       /* Don't deadlock on the call lock in panic */
39033 +       if (!spin_trylock(&call_lock)) {
39034 +               /* ignore locking because we have panicked anyway */
39035 +               nolock = 1;
39036 +       }
39037 +       __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
39038 +       if (!nolock)
39039 +               spin_unlock(&call_lock);
39040 +
39041 +       local_irq_disable();
39042 +#ifndef CONFIG_XEN
39043 +       disable_local_APIC();
39044 +#endif
39045 +       local_irq_enable();
39046 +}
39047 +
39048 +/*
39049 + * Reschedule call back. Nothing to do,
39050 + * all the work is done automatically when
39051 + * we return from the interrupt.
39052 + */
39053 +#ifndef CONFIG_XEN
39054 +asmlinkage void smp_reschedule_interrupt(void)
39055 +#else
39056 +asmlinkage irqreturn_t smp_reschedule_interrupt(void)
39057 +#endif
39058 +{
39059 +#ifndef CONFIG_XEN
39060 +       ack_APIC_irq();
39061 +#else
39062 +       return IRQ_HANDLED;
39063 +#endif
39064 +}
39065 +
39066 +#ifndef CONFIG_XEN
39067 +asmlinkage void smp_call_function_interrupt(void)
39068 +#else
39069 +asmlinkage irqreturn_t smp_call_function_interrupt(void)
39070 +#endif
39071 +{
39072 +       void (*func) (void *info) = call_data->func;
39073 +       void *info = call_data->info;
39074 +       int wait = call_data->wait;
39075 +
39076 +#ifndef CONFIG_XEN
39077 +       ack_APIC_irq();
39078 +#endif
39079 +       /*
39080 +        * Notify initiating CPU that I've grabbed the data and am
39081 +        * about to execute the function
39082 +        */
39083 +       mb();
39084 +       atomic_inc(&call_data->started);
39085 +       /*
39086 +        * At this point the info structure may be out of scope unless wait==1
39087 +        */
39088 +       exit_idle();
39089 +       irq_enter();
39090 +       (*func)(info);
39091 +       irq_exit();
39092 +       if (wait) {
39093 +               mb();
39094 +               atomic_inc(&call_data->finished);
39095 +       }
39096 +#ifdef CONFIG_XEN
39097 +       return IRQ_HANDLED;
39098 +#endif
39099 +}
39100 +
39101 +int safe_smp_processor_id(void)
39102 +{
39103 +#ifdef CONFIG_XEN
39104 +       return smp_processor_id();
39105 +#else
39106 +       int apicid, i;
39107 +
39108 +       if (disable_apic)
39109 +               return 0;
39110 +
39111 +       apicid = hard_smp_processor_id();
39112 +       if (x86_cpu_to_apicid[apicid] == apicid)
39113 +               return apicid;
39114 +
39115 +       for (i = 0; i < NR_CPUS; ++i) {
39116 +               if (x86_cpu_to_apicid[i] == apicid)
39117 +                       return i;
39118 +       }
39119 +
39120 +       /* No entries in x86_cpu_to_apicid?  Either no MPS|ACPI,
39121 +        * or called too early.  Either way, we must be CPU 0. */
39122 +       if (x86_cpu_to_apicid[0] == BAD_APICID)
39123 +               return 0;
39124 +
39125 +       return 0; /* Should not happen */
39126 +#endif
39127 +}
39128 diff -urNp linux-2.6/arch/x86_64/kernel/traps-xen.c new/arch/x86_64/kernel/traps-xen.c
39129 --- linux-2.6/arch/x86_64/kernel/traps-xen.c    1970-01-01 01:00:00.000000000 +0100
39130 +++ new/arch/x86_64/kernel/traps-xen.c  2006-05-23 18:37:10.000000000 +0200
39131 @@ -0,0 +1,1035 @@
39132 +/*
39133 + *  linux/arch/x86-64/traps.c
39134 + *
39135 + *  Copyright (C) 1991, 1992  Linus Torvalds
39136 + *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
39137 + *
39138 + *  Pentium III FXSR, SSE support
39139 + *     Gareth Hughes <gareth@valinux.com>, May 2000
39140 + *
39141 + *  $Id$
39142 + */
39143 +
39144 +/*
39145 + * 'Traps.c' handles hardware traps and faults after we have saved some
39146 + * state in 'entry.S'.
39147 + */
39148 +#include <linux/config.h>
39149 +#include <linux/sched.h>
39150 +#include <linux/kernel.h>
39151 +#include <linux/string.h>
39152 +#include <linux/errno.h>
39153 +#include <linux/ptrace.h>
39154 +#include <linux/timer.h>
39155 +#include <linux/mm.h>
39156 +#include <linux/init.h>
39157 +#include <linux/delay.h>
39158 +#include <linux/spinlock.h>
39159 +#include <linux/interrupt.h>
39160 +#include <linux/module.h>
39161 +#include <linux/moduleparam.h>
39162 +#include <linux/nmi.h>
39163 +#include <linux/kprobes.h>
39164 +#include <linux/kexec.h>
39165 +
39166 +#include <asm/system.h>
39167 +#include <asm/uaccess.h>
39168 +#include <asm/io.h>
39169 +#include <asm/atomic.h>
39170 +#include <asm/debugreg.h>
39171 +#include <asm/desc.h>
39172 +#include <asm/i387.h>
39173 +#include <asm/kdebug.h>
39174 +#include <asm/processor.h>
39175 +
39176 +#include <asm/smp.h>
39177 +#include <asm/pgalloc.h>
39178 +#include <asm/pda.h>
39179 +#include <asm/proto.h>
39180 +#include <asm/nmi.h>
39181 +
39182 +asmlinkage void divide_error(void);
39183 +asmlinkage void debug(void);
39184 +asmlinkage void nmi(void);
39185 +asmlinkage void int3(void);
39186 +asmlinkage void overflow(void);
39187 +asmlinkage void bounds(void);
39188 +asmlinkage void invalid_op(void);
39189 +asmlinkage void device_not_available(void);
39190 +asmlinkage void double_fault(void);
39191 +asmlinkage void coprocessor_segment_overrun(void);
39192 +asmlinkage void invalid_TSS(void);
39193 +asmlinkage void segment_not_present(void);
39194 +asmlinkage void stack_segment(void);
39195 +asmlinkage void general_protection(void);
39196 +asmlinkage void page_fault(void);
39197 +asmlinkage void coprocessor_error(void);
39198 +asmlinkage void simd_coprocessor_error(void);
39199 +asmlinkage void reserved(void);
39200 +asmlinkage void alignment_check(void);
39201 +asmlinkage void machine_check(void);
39202 +asmlinkage void spurious_interrupt_bug(void);
39203 +
39204 +ATOMIC_NOTIFIER_HEAD(die_chain);
39205 +
39206 +int register_die_notifier(struct notifier_block *nb)
39207 +{
39208 +       vmalloc_sync_all();
39209 +       return atomic_notifier_chain_register(&die_chain, nb);
39210 +}
39211 +EXPORT_SYMBOL(register_die_notifier);
39212 +
39213 +int unregister_die_notifier(struct notifier_block *nb)
39214 +{
39215 +       return atomic_notifier_chain_unregister(&die_chain, nb);
39216 +}
39217 +EXPORT_SYMBOL(unregister_die_notifier);
39218 +
39219 +static inline void conditional_sti(struct pt_regs *regs)
39220 +{
39221 +       if (regs->eflags & X86_EFLAGS_IF)
39222 +               local_irq_enable();
39223 +}
39224 +
39225 +static inline void preempt_conditional_sti(struct pt_regs *regs)
39226 +{
39227 +       preempt_disable();
39228 +       if (regs->eflags & X86_EFLAGS_IF)
39229 +               local_irq_enable();
39230 +}
39231 +
39232 +static inline void preempt_conditional_cli(struct pt_regs *regs)
39233 +{
39234 +       if (regs->eflags & X86_EFLAGS_IF)
39235 +               local_irq_disable();
39236 +       preempt_enable_no_resched();
39237 +}
39238 +
39239 +static int kstack_depth_to_print = 10;
39240 +
39241 +#ifdef CONFIG_KALLSYMS
39242 +#include <linux/kallsyms.h> 
39243 +int printk_address(unsigned long address)
39244 +{ 
39245 +       unsigned long offset = 0, symsize;
39246 +       const char *symname;
39247 +       char *modname;
39248 +       char *delim = ":"; 
39249 +       char namebuf[128];
39250 +
39251 +       symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); 
39252 +       if (!symname) 
39253 +               return printk("[<%016lx>]", address);
39254 +       if (!modname) 
39255 +               modname = delim = "";           
39256 +        return printk("<%016lx>{%s%s%s%s%+ld}",
39257 +                     address, delim, modname, delim, symname, offset); 
39258 +} 
39259 +#else
39260 +int printk_address(unsigned long address)
39261 +{ 
39262 +       return printk("[<%016lx>]", address);
39263 +} 
39264 +#endif
39265 +
39266 +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
39267 +                                       unsigned *usedp, const char **idp)
39268 +{
39269 +#ifndef CONFIG_X86_NO_TSS
39270 +       static char ids[][8] = {
39271 +               [DEBUG_STACK - 1] = "#DB",
39272 +               [NMI_STACK - 1] = "NMI",
39273 +               [DOUBLEFAULT_STACK - 1] = "#DF",
39274 +               [STACKFAULT_STACK - 1] = "#SS",
39275 +               [MCE_STACK - 1] = "#MC",
39276 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
39277 +               [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
39278 +#endif
39279 +       };
39280 +       unsigned k;
39281 +
39282 +       for (k = 0; k < N_EXCEPTION_STACKS; k++) {
39283 +               unsigned long end;
39284 +
39285 +               switch (k + 1) {
39286 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
39287 +               case DEBUG_STACK:
39288 +                       end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
39289 +                       break;
39290 +#endif
39291 +               default:
39292 +                       end = per_cpu(init_tss, cpu).ist[k];
39293 +                       break;
39294 +               }
39295 +               if (stack >= end)
39296 +                       continue;
39297 +               if (stack >= end - EXCEPTION_STKSZ) {
39298 +                       if (*usedp & (1U << k))
39299 +                               break;
39300 +                       *usedp |= 1U << k;
39301 +                       *idp = ids[k];
39302 +                       return (unsigned long *)end;
39303 +               }
39304 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
39305 +               if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
39306 +                       unsigned j = N_EXCEPTION_STACKS - 1;
39307 +
39308 +                       do {
39309 +                               ++j;
39310 +                               end -= EXCEPTION_STKSZ;
39311 +                               ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
39312 +                       } while (stack < end - EXCEPTION_STKSZ);
39313 +                       if (*usedp & (1U << j))
39314 +                               break;
39315 +                       *usedp |= 1U << j;
39316 +                       *idp = ids[j];
39317 +                       return (unsigned long *)end;
39318 +               }
39319 +#endif
39320 +       }
39321 +#endif
39322 +       return NULL;
39323 +}
39324 +
39325 +/*
39326 + * x86-64 can have up to three kernel stacks:
39327 + * process stack
39328 + * interrupt stack
39329 + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
39330 + */
39331 +
39332 +void show_trace(unsigned long *stack)
39333 +{
39334 +       const unsigned cpu = safe_smp_processor_id();
39335 +       unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
39336 +       int i;
39337 +       unsigned used = 0;
39338 +
39339 +       printk("\nCall Trace:");
39340 +
39341 +#define HANDLE_STACK(cond) \
39342 +       do while (cond) { \
39343 +               unsigned long addr = *stack++; \
39344 +               if (kernel_text_address(addr)) { \
39345 +                       if (i > 50) { \
39346 +                               printk("\n       "); \
39347 +                               i = 0; \
39348 +                       } \
39349 +                       else \
39350 +                               i += printk(" "); \
39351 +                       /* \
39352 +                        * If the address is either in the text segment of the \
39353 +                        * kernel, or in the region which contains vmalloc'ed \
39354 +                        * memory, it *may* be the address of a calling \
39355 +                        * routine; if so, print it so that someone tracing \
39356 +                        * down the cause of the crash will be able to figure \
39357 +                        * out the call path that was taken. \
39358 +                        */ \
39359 +                       i += printk_address(addr); \
39360 +               } \
39361 +       } while (0)
39362 +
39363 +       for(i = 11; ; ) {
39364 +               const char *id;
39365 +               unsigned long *estack_end;
39366 +               estack_end = in_exception_stack(cpu, (unsigned long)stack,
39367 +                                               &used, &id);
39368 +
39369 +               if (estack_end) {
39370 +                       i += printk(" <%s>", id);
39371 +                       HANDLE_STACK (stack < estack_end);
39372 +                       i += printk(" <EOE>");
39373 +                       stack = (unsigned long *) estack_end[-2];
39374 +                       continue;
39375 +               }
39376 +               if (irqstack_end) {
39377 +                       unsigned long *irqstack;
39378 +                       irqstack = irqstack_end -
39379 +                               (IRQSTACKSIZE - 64) / sizeof(*irqstack);
39380 +
39381 +                       if (stack >= irqstack && stack < irqstack_end) {
39382 +                               i += printk(" <IRQ>");
39383 +                               HANDLE_STACK (stack < irqstack_end);
39384 +                               stack = (unsigned long *) (irqstack_end[-1]);
39385 +                               irqstack_end = NULL;
39386 +                               i += printk(" <EOI>");
39387 +                               continue;
39388 +                       }
39389 +               }
39390 +               break;
39391 +       }
39392 +
39393 +       HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
39394 +#undef HANDLE_STACK
39395 +       printk("\n");
39396 +}
39397 +
39398 +void show_stack(struct task_struct *tsk, unsigned long * rsp)
39399 +{
39400 +       unsigned long *stack;
39401 +       int i;
39402 +       const int cpu = safe_smp_processor_id();
39403 +       unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
39404 +       unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
39405 +
39406 +       // debugging aid: "show_stack(NULL, NULL);" prints the
39407 +       // back trace for this cpu.
39408 +
39409 +       if (rsp == NULL) {
39410 +               if (tsk)
39411 +                       rsp = (unsigned long *)tsk->thread.rsp;
39412 +               else
39413 +                       rsp = (unsigned long *)&rsp;
39414 +       }
39415 +
39416 +       stack = rsp;
39417 +       for(i=0; i < kstack_depth_to_print; i++) {
39418 +               if (stack >= irqstack && stack <= irqstack_end) {
39419 +                       if (stack == irqstack_end) {
39420 +                               stack = (unsigned long *) (irqstack_end[-1]);
39421 +                               printk(" <EOI> ");
39422 +                       }
39423 +               } else {
39424 +               if (((long) stack & (THREAD_SIZE-1)) == 0)
39425 +                       break;
39426 +               }
39427 +               if (i && ((i % 4) == 0))
39428 +                       printk("\n       ");
39429 +               printk("%016lx ", *stack++);
39430 +               touch_nmi_watchdog();
39431 +       }
39432 +       show_trace((unsigned long *)rsp);
39433 +}
39434 +
39435 +/*
39436 + * The architecture-independent dump_stack generator
39437 + */
39438 +void dump_stack(void)
39439 +{
39440 +       unsigned long dummy;
39441 +       show_trace(&dummy);
39442 +}
39443 +
39444 +EXPORT_SYMBOL(dump_stack);
39445 +
39446 +void show_registers(struct pt_regs *regs)
39447 +{
39448 +       int i;
39449 +       int in_kernel = !user_mode(regs);
39450 +       unsigned long rsp;
39451 +       const int cpu = safe_smp_processor_id(); 
39452 +       struct task_struct *cur = cpu_pda(cpu)->pcurrent;
39453 +
39454 +       rsp = regs->rsp;
39455 +
39456 +       printk("CPU %d ", cpu);
39457 +       __show_regs(regs);
39458 +       printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
39459 +               cur->comm, cur->pid, task_thread_info(cur), cur);
39460 +
39461 +       /*
39462 +        * When in-kernel, we also print out the stack and code at the
39463 +        * time of the fault..
39464 +        */
39465 +       if (in_kernel) {
39466 +
39467 +               printk("Stack: ");
39468 +               show_stack(NULL, (unsigned long*)rsp);
39469 +
39470 +               printk("\nCode: ");
39471 +               if (regs->rip < PAGE_OFFSET)
39472 +                       goto bad;
39473 +
39474 +               for (i=0; i<20; i++) {
39475 +                       unsigned char c;
39476 +                       if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
39477 +bad:
39478 +                               printk(" Bad RIP value.");
39479 +                               break;
39480 +                       }
39481 +                       printk("%02x ", c);
39482 +               }
39483 +       }
39484 +       printk("\n");
39485 +}      
39486 +
39487 +void handle_BUG(struct pt_regs *regs)
39488 +{ 
39489 +       struct bug_frame f;
39490 +       long len;
39491 +       const char *prefix = "";
39492 +
39493 +       if (user_mode(regs))
39494 +               return; 
39495 +       if (__copy_from_user(&f, (const void __user *) regs->rip,
39496 +                            sizeof(struct bug_frame)))
39497 +               return; 
39498 +       if (f.filename >= 0 ||
39499 +           f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) 
39500 +               return;
39501 +       len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
39502 +       if (len < 0 || len >= PATH_MAX)
39503 +               f.filename = (int)(long)"unmapped filename";
39504 +       else if (len > 50) {
39505 +               f.filename += len - 50;
39506 +               prefix = "...";
39507 +       }
39508 +       printk("----------- [cut here ] --------- [please bite here ] ---------\n");
39509 +       printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
39510 +} 
39511 +
39512 +#ifdef CONFIG_BUG
39513 +void out_of_line_bug(void)
39514 +{ 
39515 +       BUG(); 
39516 +} 
39517 +#endif
39518 +
39519 +static DEFINE_SPINLOCK(die_lock);
39520 +static int die_owner = -1;
39521 +static unsigned int die_nest_count;
39522 +
39523 +unsigned __kprobes long oops_begin(void)
39524 +{
39525 +       int cpu = safe_smp_processor_id();
39526 +       unsigned long flags;
39527 +
39528 +       /* racy, but better than risking deadlock. */
39529 +       local_irq_save(flags);
39530 +       if (!spin_trylock(&die_lock)) { 
39531 +               if (cpu == die_owner) 
39532 +                       /* nested oops. should stop eventually */;
39533 +               else
39534 +                       spin_lock(&die_lock);
39535 +       }
39536 +       die_nest_count++;
39537 +       die_owner = cpu;
39538 +       console_verbose();
39539 +       bust_spinlocks(1);
39540 +       return flags;
39541 +}
39542 +
39543 +void __kprobes oops_end(unsigned long flags)
39544 +{ 
39545 +       die_owner = -1;
39546 +       bust_spinlocks(0);
39547 +       die_nest_count--;
39548 +       if (die_nest_count)
39549 +               /* We still own the lock */
39550 +               local_irq_restore(flags);
39551 +       else
39552 +               /* Nest count reaches zero, release the lock. */
39553 +               spin_unlock_irqrestore(&die_lock, flags);
39554 +       if (panic_on_oops)
39555 +               panic("Oops");
39556 +}
39557 +
39558 +void __kprobes __die(const char * str, struct pt_regs * regs, long err)
39559 +{
39560 +       static int die_counter;
39561 +       printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
39562 +#ifdef CONFIG_PREEMPT
39563 +       printk("PREEMPT ");
39564 +#endif
39565 +#ifdef CONFIG_SMP
39566 +       printk("SMP ");
39567 +#endif
39568 +#ifdef CONFIG_DEBUG_PAGEALLOC
39569 +       printk("DEBUG_PAGEALLOC");
39570 +#endif
39571 +       printk("\n");
39572 +       notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
39573 +       show_registers(regs);
39574 +       /* Executive summary in case the oops scrolled away */
39575 +       printk(KERN_ALERT "RIP ");
39576 +       printk_address(regs->rip); 
39577 +       printk(" RSP <%016lx>\n", regs->rsp); 
39578 +       if (kexec_should_crash(current))
39579 +               crash_kexec(regs);
39580 +}
39581 +
39582 +void die(const char * str, struct pt_regs * regs, long err)
39583 +{
39584 +       unsigned long flags = oops_begin();
39585 +
39586 +       handle_BUG(regs);
39587 +       __die(str, regs, err);
39588 +       oops_end(flags);
39589 +       do_exit(SIGSEGV); 
39590 +}
39591 +
39592 +#ifdef CONFIG_X86_LOCAL_APIC
39593 +void __kprobes die_nmi(char *str, struct pt_regs *regs)
39594 +{
39595 +       unsigned long flags = oops_begin();
39596 +
39597 +       /*
39598 +        * We are in trouble anyway, let's at least try
39599 +        * to get a message out.
39600 +        */
39601 +       printk(str, safe_smp_processor_id());
39602 +       show_registers(regs);
39603 +       if (kexec_should_crash(current))
39604 +               crash_kexec(regs);
39605 +       if (panic_on_timeout || panic_on_oops)
39606 +               panic("nmi watchdog");
39607 +       printk("console shuts up ...\n");
39608 +       oops_end(flags);
39609 +       nmi_exit();
39610 +       local_irq_enable();
39611 +       do_exit(SIGSEGV);
39612 +}
39613 +#endif
39614 +
39615 +static void __kprobes do_trap(int trapnr, int signr, char *str,
39616 +                             struct pt_regs * regs, long error_code,
39617 +                             siginfo_t *info)
39618 +{
39619 +       struct task_struct *tsk = current;
39620 +
39621 +       conditional_sti(regs);
39622 +
39623 +       tsk->thread.error_code = error_code;
39624 +       tsk->thread.trap_no = trapnr;
39625 +
39626 +       if (user_mode(regs)) {
39627 +               if (exception_trace && unhandled_signal(tsk, signr))
39628 +                       printk(KERN_INFO
39629 +                              "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
39630 +                              tsk->comm, tsk->pid, str,
39631 +                              regs->rip, regs->rsp, error_code); 
39632 +
39633 +               if (info)
39634 +                       force_sig_info(signr, info, tsk);
39635 +               else
39636 +                       force_sig(signr, tsk);
39637 +               return;
39638 +       }
39639 +
39640 +
39641 +       /* kernel trap */ 
39642 +       {            
39643 +               const struct exception_table_entry *fixup;
39644 +               fixup = search_exception_tables(regs->rip);
39645 +               if (fixup)
39646 +                       regs->rip = fixup->fixup;
39647 +               else    
39648 +                       die(str, regs, error_code);
39649 +               return;
39650 +       }
39651 +}
39652 +
39653 +#define DO_ERROR(trapnr, signr, str, name) \
39654 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
39655 +{ \
39656 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
39657 +                                                       == NOTIFY_STOP) \
39658 +               return; \
39659 +       do_trap(trapnr, signr, str, regs, error_code, NULL); \
39660 +}
39661 +
39662 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
39663 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
39664 +{ \
39665 +       siginfo_t info; \
39666 +       info.si_signo = signr; \
39667 +       info.si_errno = 0; \
39668 +       info.si_code = sicode; \
39669 +       info.si_addr = (void __user *)siaddr; \
39670 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
39671 +                                                       == NOTIFY_STOP) \
39672 +               return; \
39673 +       do_trap(trapnr, signr, str, regs, error_code, &info); \
39674 +}
39675 +
39676 +DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->rip)
39677 +DO_ERROR( 4, SIGSEGV, "overflow", overflow)
39678 +DO_ERROR( 5, SIGSEGV, "bounds", bounds)
39679 +DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
39680 +DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
39681 +DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
39682 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
39683 +DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
39684 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
39685 +DO_ERROR(18, SIGSEGV, "reserved", reserved)
39686 +DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
39687 +
39688 +asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
39689 +{
39690 +       static const char str[] = "double fault";
39691 +       struct task_struct *tsk = current;
39692 +
39693 +       /* Return value not checked because a double fault cannot be ignored */
39694 +       notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
39695 +
39696 +       tsk->thread.error_code = error_code;
39697 +       tsk->thread.trap_no = 8;
39698 +
39699 +       /* This is always a kernel trap and never fixable (and thus must
39700 +          never return). */
39701 +       for (;;)
39702 +               die(str, regs, error_code);
39703 +}
39704 +
39705 +asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
39706 +                                               long error_code)
39707 +{
39708 +       struct task_struct *tsk = current;
39709 +
39710 +       conditional_sti(regs);
39711 +
39712 +       tsk->thread.error_code = error_code;
39713 +       tsk->thread.trap_no = 13;
39714 +
39715 +       if (user_mode(regs)) {
39716 +               if (exception_trace && unhandled_signal(tsk, SIGSEGV))
39717 +                       printk(KERN_INFO
39718 +                      "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
39719 +                              tsk->comm, tsk->pid,
39720 +                              regs->rip, regs->rsp, error_code); 
39721 +
39722 +               force_sig(SIGSEGV, tsk);
39723 +               return;
39724 +       } 
39725 +
39726 +       /* kernel gp */
39727 +       {
39728 +               const struct exception_table_entry *fixup;
39729 +               fixup = search_exception_tables(regs->rip);
39730 +               if (fixup) {
39731 +                       regs->rip = fixup->fixup;
39732 +                       return;
39733 +               }
39734 +               if (notify_die(DIE_GPF, "general protection fault", regs,
39735 +                                       error_code, 13, SIGSEGV) == NOTIFY_STOP)
39736 +                       return;
39737 +               die("general protection fault", regs, error_code);
39738 +       }
39739 +}
39740 +
39741 +static __kprobes void
39742 +mem_parity_error(unsigned char reason, struct pt_regs * regs)
39743 +{
39744 +       printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
39745 +       printk("You probably have a hardware problem with your RAM chips\n");
39746 +
39747 +#if 0 /* XEN */
39748 +       /* Clear and disable the memory parity error line. */
39749 +       reason = (reason & 0xf) | 4;
39750 +       outb(reason, 0x61);
39751 +#endif /* XEN */
39752 +}
39753 +
39754 +static __kprobes void
39755 +io_check_error(unsigned char reason, struct pt_regs * regs)
39756 +{
39757 +       printk("NMI: IOCK error (debug interrupt?)\n");
39758 +       show_registers(regs);
39759 +
39760 +#if 0 /* XEN */
39761 +       /* Re-enable the IOCK line, wait for a few seconds */
39762 +       reason = (reason & 0xf) | 8;
39763 +       outb(reason, 0x61);
39764 +       mdelay(2000);
39765 +       reason &= ~8;
39766 +       outb(reason, 0x61);
39767 +#endif /* XEN */
39768 +}
39769 +
39770 +static __kprobes void
39771 +unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
39772 +{      printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
39773 +       printk("Dazed and confused, but trying to continue\n");
39774 +       printk("Do you have a strange power saving mode enabled?\n");
39775 +}
39776 +
39777 +/* Runs on IST stack. This code must keep interrupts off all the time.
39778 +   Nested NMIs are prevented by the CPU. */
39779 +asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
39780 +{
39781 +       unsigned char reason = 0;
39782 +       int cpu;
39783 +
39784 +       cpu = smp_processor_id();
39785 +
39786 +       /* Only the BSP gets external NMIs from the system.  */
39787 +       if (!cpu)
39788 +               reason = get_nmi_reason();
39789 +
39790 +       if (!(reason & 0xc0)) {
39791 +               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
39792 +                                                               == NOTIFY_STOP)
39793 +                       return;
39794 +#ifdef CONFIG_X86_LOCAL_APIC
39795 +               /*
39796 +                * Ok, so this is none of the documented NMI sources,
39797 +                * so it must be the NMI watchdog.
39798 +                */
39799 +               if (nmi_watchdog > 0) {
39800 +                       nmi_watchdog_tick(regs,reason);
39801 +                       return;
39802 +               }
39803 +#endif
39804 +               unknown_nmi_error(reason, regs);
39805 +               return;
39806 +       }
39807 +       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
39808 +               return; 
39809 +
39810 +       /* AK: following checks seem to be broken on modern chipsets. FIXME */
39811 +
39812 +       if (reason & 0x80)
39813 +               mem_parity_error(reason, regs);
39814 +       if (reason & 0x40)
39815 +               io_check_error(reason, regs);
39816 +}
39817 +
39818 +/* runs on IST stack. */
39819 +asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
39820 +{
39821 +       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
39822 +               return;
39823 +       }
39824 +       do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
39825 +       return;
39826 +}
39827 +
39828 +/* Help handler running on IST stack to switch back to user stack
39829 +   for scheduling or signal handling. The actual stack switch is done in
39830 +   entry.S */
39831 +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
39832 +{
39833 +       struct pt_regs *regs = eregs;
39834 +       /* Did already sync */
39835 +       if (eregs == (struct pt_regs *)eregs->rsp)
39836 +               ;
39837 +       /* Exception from user space */
39838 +       else if (user_mode(eregs))
39839 +               regs = task_pt_regs(current);
39840 +       /* Exception from kernel and interrupts are enabled. Move to
39841 +          kernel process stack. */
39842 +       else if (eregs->eflags & X86_EFLAGS_IF)
39843 +               regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
39844 +       if (eregs != regs)
39845 +               *regs = *eregs;
39846 +       return regs;
39847 +}
39848 +
39849 +/* runs on IST stack. */
39850 +asmlinkage void __kprobes do_debug(struct pt_regs * regs,
39851 +                                  unsigned long error_code)
39852 +{
39853 +       unsigned long condition;
39854 +       struct task_struct *tsk = current;
39855 +       siginfo_t info;
39856 +
39857 +       get_debugreg(condition, 6);
39858 +
39859 +       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
39860 +                                               SIGTRAP) == NOTIFY_STOP)
39861 +               return;
39862 +
39863 +       preempt_conditional_sti(regs);
39864 +
39865 +       /* Mask out spurious debug traps due to lazy DR7 setting */
39866 +       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
39867 +               if (!tsk->thread.debugreg7) { 
39868 +                       goto clear_dr7;
39869 +               }
39870 +       }
39871 +
39872 +       tsk->thread.debugreg6 = condition;
39873 +
39874 +       /* Mask out spurious TF errors due to lazy TF clearing */
39875 +       if (condition & DR_STEP) {
39876 +               /*
39877 +                * The TF error should be masked out only if the current
39878 +                * process is not traced and if the TRAP flag has been set
39879 +                * previously by a tracing process (condition detected by
39880 +                * the PT_DTRACE flag); remember that the i386 TRAP flag
39881 +                * can be modified by the process itself in user mode,
39882 +                * allowing programs to debug themselves without the ptrace()
39883 +                * interface.
39884 +                */
39885 +                if (!user_mode(regs))
39886 +                       goto clear_TF_reenable;
39887 +               /*
39888 +                * Was the TF flag set by a debugger? If so, clear it now,
39889 +                * so that register information is correct.
39890 +                */
39891 +               if (tsk->ptrace & PT_DTRACE) {
39892 +                       regs->eflags &= ~TF_MASK;
39893 +                       tsk->ptrace &= ~PT_DTRACE;
39894 +               }
39895 +       }
39896 +
39897 +       /* Ok, finally something we can handle */
39898 +       tsk->thread.trap_no = 1;
39899 +       tsk->thread.error_code = error_code;
39900 +       info.si_signo = SIGTRAP;
39901 +       info.si_errno = 0;
39902 +       info.si_code = TRAP_BRKPT;
39903 +       info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
39904 +       force_sig_info(SIGTRAP, &info, tsk);
39905 +
39906 +clear_dr7:
39907 +       set_debugreg(0UL, 7);
39908 +       preempt_conditional_cli(regs);
39909 +       return;
39910 +
39911 +clear_TF_reenable:
39912 +       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
39913 +       regs->eflags &= ~TF_MASK;
39914 +       preempt_conditional_cli(regs);
39915 +}
39916 +
39917 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
39918 +{
39919 +       const struct exception_table_entry *fixup;
39920 +       fixup = search_exception_tables(regs->rip);
39921 +       if (fixup) {
39922 +               regs->rip = fixup->fixup;
39923 +               return 1;
39924 +       }
39925 +       notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
39926 +       /* Illegal floating point operation in the kernel */
39927 +       current->thread.trap_no = trapnr;
39928 +       die(str, regs, 0);
39929 +       return 0;
39930 +}
39931 +
39932 +/*
39933 + * Note that we play around with the 'TS' bit in an attempt to get
39934 + * the correct behaviour even in the presence of the asynchronous
39935 + * IRQ13 behaviour
39936 + */
39937 +asmlinkage void do_coprocessor_error(struct pt_regs *regs)
39938 +{
39939 +       void __user *rip = (void __user *)(regs->rip);
39940 +       struct task_struct * task;
39941 +       siginfo_t info;
39942 +       unsigned short cwd, swd;
39943 +
39944 +       conditional_sti(regs);
39945 +       if (!user_mode(regs) &&
39946 +           kernel_math_error(regs, "kernel x87 math error", 16))
39947 +               return;
39948 +
39949 +       /*
39950 +        * Save the info for the exception handler and clear the error.
39951 +        */
39952 +       task = current;
39953 +       save_init_fpu(task);
39954 +       task->thread.trap_no = 16;
39955 +       task->thread.error_code = 0;
39956 +       info.si_signo = SIGFPE;
39957 +       info.si_errno = 0;
39958 +       info.si_code = __SI_FAULT;
39959 +       info.si_addr = rip;
39960 +       /*
39961 +        * (~cwd & swd) will mask out exceptions that are not set to unmasked
39962 +        * status.  0x3f is the exception bits in these regs, 0x200 is the
39963 +        * C1 reg you need in case of a stack fault, 0x040 is the stack
39964 +        * fault bit.  We should only be taking one exception at a time,
39965 +        * so if this combination doesn't produce any single exception,
39966 +        * then we have a bad program that isn't synchronizing its FPU usage
39967 +        * and it will suffer the consequences since we won't be able to
39968 +        * fully reproduce the context of the exception
39969 +        */
39970 +       cwd = get_fpu_cwd(task);
39971 +       swd = get_fpu_swd(task);
39972 +       switch (swd & ~cwd & 0x3f) {
39973 +               case 0x000:
39974 +               default:
39975 +                       break;
39976 +               case 0x001: /* Invalid Op */
39977 +                       /*
39978 +                        * swd & 0x240 == 0x040: Stack Underflow
39979 +                        * swd & 0x240 == 0x240: Stack Overflow
39980 +                        * User must clear the SF bit (0x40) if set
39981 +                        */
39982 +                       info.si_code = FPE_FLTINV;
39983 +                       break;
39984 +               case 0x002: /* Denormalize */
39985 +               case 0x010: /* Underflow */
39986 +                       info.si_code = FPE_FLTUND;
39987 +                       break;
39988 +               case 0x004: /* Zero Divide */
39989 +                       info.si_code = FPE_FLTDIV;
39990 +                       break;
39991 +               case 0x008: /* Overflow */
39992 +                       info.si_code = FPE_FLTOVF;
39993 +                       break;
39994 +               case 0x020: /* Precision */
39995 +                       info.si_code = FPE_FLTRES;
39996 +                       break;
39997 +       }
39998 +       force_sig_info(SIGFPE, &info, task);
39999 +}
40000 +
40001 +asmlinkage void bad_intr(void)
40002 +{
40003 +       printk("bad interrupt"); 
40004 +}
40005 +
40006 +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
40007 +{
40008 +       void __user *rip = (void __user *)(regs->rip);
40009 +       struct task_struct * task;
40010 +       siginfo_t info;
40011 +       unsigned short mxcsr;
40012 +
40013 +       conditional_sti(regs);
40014 +       if (!user_mode(regs) &&
40015 +               kernel_math_error(regs, "kernel simd math error", 19))
40016 +               return;
40017 +
40018 +       /*
40019 +        * Save the info for the exception handler and clear the error.
40020 +        */
40021 +       task = current;
40022 +       save_init_fpu(task);
40023 +       task->thread.trap_no = 19;
40024 +       task->thread.error_code = 0;
40025 +       info.si_signo = SIGFPE;
40026 +       info.si_errno = 0;
40027 +       info.si_code = __SI_FAULT;
40028 +       info.si_addr = rip;
40029 +       /*
40030 +        * The SIMD FPU exceptions are handled a little differently, as there
40031 +        * is only a single status/control register.  Thus, to determine which
40032 +        * unmasked exception was caught we must mask the exception mask bits
40033 +        * at 0x1f80, and then use these to mask the exception bits at 0x3f.
40034 +        */
40035 +       mxcsr = get_fpu_mxcsr(task);
40036 +       switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
40037 +               case 0x000:
40038 +               default:
40039 +                       break;
40040 +               case 0x001: /* Invalid Op */
40041 +                       info.si_code = FPE_FLTINV;
40042 +                       break;
40043 +               case 0x002: /* Denormalize */
40044 +               case 0x010: /* Underflow */
40045 +                       info.si_code = FPE_FLTUND;
40046 +                       break;
40047 +               case 0x004: /* Zero Divide */
40048 +                       info.si_code = FPE_FLTDIV;
40049 +                       break;
40050 +               case 0x008: /* Overflow */
40051 +                       info.si_code = FPE_FLTOVF;
40052 +                       break;
40053 +               case 0x020: /* Precision */
40054 +                       info.si_code = FPE_FLTRES;
40055 +                       break;
40056 +       }
40057 +       force_sig_info(SIGFPE, &info, task);
40058 +}
40059 +
40060 +asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
40061 +{
40062 +}
40063 +
40064 +#if 0
40065 +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
40066 +{
40067 +}
40068 +#endif
40069 +
40070 +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
40071 +{
40072 +}
40073 +
40074 +/*
40075 + *  'math_state_restore()' saves the current math information in the
40076 + * old math state array, and gets the new ones from the current task
40077 + *
40078 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
40079 + * Don't touch unless you *really* know how it works.
40080 + */
40081 +asmlinkage void math_state_restore(void)
40082 +{
40083 +       struct task_struct *me = current;
40084 +        /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
40085 +
40086 +       if (!used_math())
40087 +               init_fpu(me);
40088 +       restore_fpu_checking(&me->thread.i387.fxsave);
40089 +       task_thread_info(me)->status |= TS_USEDFPU;
40090 +}
40091 +
40092 +
40093 +/*
40094 + * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
40095 + * specify <dpl>|4 in the second field.
40096 + */
40097 +static trap_info_t trap_table[] = {
40098 +        {  0, 0|4, __KERNEL_CS, (unsigned long)divide_error               },
40099 +        {  1, 0|4, __KERNEL_CS, (unsigned long)debug                      },
40100 +        {  3, 3|4, __KERNEL_CS, (unsigned long)int3                       },
40101 +        {  4, 3|4, __KERNEL_CS, (unsigned long)overflow                   },
40102 +        {  5, 0|4, __KERNEL_CS, (unsigned long)bounds                     },
40103 +        {  6, 0|4, __KERNEL_CS, (unsigned long)invalid_op                 },
40104 +        {  7, 0|4, __KERNEL_CS, (unsigned long)device_not_available       },
40105 +        {  9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun},
40106 +        { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS                },
40107 +        { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present        },
40108 +        { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment              },
40109 +        { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection         },
40110 +        { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault                 },
40111 +        { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug     },
40112 +        { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error          },
40113 +        { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check            },
40114 +#ifdef CONFIG_X86_MCE
40115 +        { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check              },
40116 +#endif
40117 +        { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error     },
40118 +#ifdef CONFIG_IA32_EMULATION
40119 +       { IA32_SYSCALL_VECTOR, 3|4, __KERNEL_CS, (unsigned long)ia32_syscall},
40120 +#endif
40121 +        {  0, 0,           0, 0                                              }
40122 +};
40123 +
40124 +void __init trap_init(void)
40125 +{
40126 +        int ret;
40127 +
40128 +        ret = HYPERVISOR_set_trap_table(trap_table);
40129 +        
40130 +        if (ret) 
40131 +                printk("HYPERVISOR_set_trap_table failed: error %d\n",
40132 +                       ret);
40133 +
40134 +       /*
40135 +        * Should be a barrier for any external CPU state.
40136 +        */
40137 +       cpu_init();
40138 +}
40139 +
40140 +void smp_trap_init(trap_info_t *trap_ctxt)
40141 +{
40142 +       trap_info_t *t = trap_table;
40143 +
40144 +       for (t = trap_table; t->address; t++) {
40145 +               trap_ctxt[t->vector].flags = t->flags;
40146 +               trap_ctxt[t->vector].cs = t->cs;
40147 +               trap_ctxt[t->vector].address = t->address;
40148 +       }
40149 +}
40150 +
40151 +
40152 +/* Actual parsing is done early in setup.c. */
40153 +static int __init oops_dummy(char *s)
40154 +{ 
40155 +       panic_on_oops = 1;
40156 +       return 1;
40157 +} 
40158 +__setup("oops=", oops_dummy); 
40159 +
40160 +static int __init kstack_setup(char *s)
40161 +{
40162 +       kstack_depth_to_print = simple_strtoul(s,NULL,0);
40163 +       return 1;
40164 +}
40165 +__setup("kstack=", kstack_setup);
40166 +
40167 diff -urNp linux-2.6/arch/x86_64/kernel/vsyscall-xen.c new/arch/x86_64/kernel/vsyscall-xen.c
40168 --- linux-2.6/arch/x86_64/kernel/vsyscall-xen.c 1970-01-01 01:00:00.000000000 +0100
40169 +++ new/arch/x86_64/kernel/vsyscall-xen.c       2006-05-09 12:33:21.000000000 +0200
40170 @@ -0,0 +1,239 @@
40171 +/*
40172 + *  linux/arch/x86_64/kernel/vsyscall.c
40173 + *
40174 + *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
40175 + *  Copyright 2003 Andi Kleen, SuSE Labs.
40176 + *
40177 + *  Thanks to hpa@transmeta.com for some useful hints.
40178 + *  Special thanks to Ingo Molnar for his early experience with
40179 + *  a different vsyscall implementation for Linux/IA32 and for the name.
40180 + *
40181 + *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
40182 + *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
40183 + *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
40184 + *  jumping out of line if necessary. We cannot add more with this
40185 + *  mechanism because older kernels won't return -ENOSYS.
40186 + *  If we want more than four we need a vDSO.
40187 + *
40188 + *  Note: the concept clashes with user mode linux. If you use UML and
40189 + *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
40190 + */
40191 +
40192 +#include <linux/time.h>
40193 +#include <linux/init.h>
40194 +#include <linux/kernel.h>
40195 +#include <linux/timer.h>
40196 +#include <linux/seqlock.h>
40197 +#include <linux/jiffies.h>
40198 +#include <linux/sysctl.h>
40199 +
40200 +#include <asm/vsyscall.h>
40201 +#include <asm/pgtable.h>
40202 +#include <asm/page.h>
40203 +#include <asm/fixmap.h>
40204 +#include <asm/errno.h>
40205 +#include <asm/io.h>
40206 +
40207 +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
40208 +
40209 +int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
40210 +seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
40211 +
40212 +#include <asm/unistd.h>
40213 +
40214 +static __always_inline void timeval_normalize(struct timeval * tv)
40215 +{
40216 +       time_t __sec;
40217 +
40218 +       __sec = tv->tv_usec / 1000000;
40219 +       if (__sec) {
40220 +               tv->tv_usec %= 1000000;
40221 +               tv->tv_sec += __sec;
40222 +       }
40223 +}
40224 +
40225 +static __always_inline void do_vgettimeofday(struct timeval * tv)
40226 +{
40227 +       long sequence, t;
40228 +       unsigned long sec, usec;
40229 +
40230 +       do {
40231 +               sequence = read_seqbegin(&__xtime_lock);
40232 +               
40233 +               sec = __xtime.tv_sec;
40234 +               usec = (__xtime.tv_nsec / 1000) +
40235 +                       (__jiffies - __wall_jiffies) * (1000000 / HZ);
40236 +
40237 +               if (__vxtime.mode != VXTIME_HPET) {
40238 +                       t = get_cycles_sync();
40239 +                       if (t < __vxtime.last_tsc)
40240 +                               t = __vxtime.last_tsc;
40241 +                       usec += ((t - __vxtime.last_tsc) *
40242 +                                __vxtime.tsc_quot) >> 32;
40243 +                       /* See comment in x86_64 do_gettimeofday. */
40244 +               } else {
40245 +                       usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
40246 +                                 __vxtime.last) * __vxtime.quot) >> 32;
40247 +               }
40248 +       } while (read_seqretry(&__xtime_lock, sequence));
40249 +
40250 +       tv->tv_sec = sec + usec / 1000000;
40251 +       tv->tv_usec = usec % 1000000;
40252 +}
40253 +
40254 +/* RED-PEN may want to re-add seq locking, but then the variable should be write-once. */
40255 +static __always_inline void do_get_tz(struct timezone * tz)
40256 +{
40257 +       *tz = __sys_tz;
40258 +}
40259 +
40260 +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
40261 +{
40262 +       int ret;
40263 +       asm volatile("vsysc2: syscall"
40264 +               : "=a" (ret)
40265 +               : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
40266 +       return ret;
40267 +}
40268 +
40269 +static __always_inline long time_syscall(long *t)
40270 +{
40271 +       long secs;
40272 +       asm volatile("vsysc1: syscall"
40273 +               : "=a" (secs)
40274 +               : "0" (__NR_time),"D" (t) : __syscall_clobber);
40275 +       return secs;
40276 +}
40277 +
40278 +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
40279 +{
40280 +       if (unlikely(!__sysctl_vsyscall))
40281 +               return gettimeofday(tv,tz);
40282 +       if (tv)
40283 +               do_vgettimeofday(tv);
40284 +       if (tz)
40285 +               do_get_tz(tz);
40286 +       return 0;
40287 +}
40288 +
40289 +/* This will break when the xtime seconds get inaccurate, but that is
40290 + * unlikely */
40291 +time_t __vsyscall(1) vtime(time_t *t)
40292 +{
40293 +       if (unlikely(!__sysctl_vsyscall))
40294 +               return time_syscall(t);
40295 +       else if (t)
40296 +               *t = __xtime.tv_sec;            
40297 +       return __xtime.tv_sec;
40298 +}
40299 +
40300 +long __vsyscall(2) venosys_0(void)
40301 +{
40302 +       return -ENOSYS;
40303 +}
40304 +
40305 +long __vsyscall(3) venosys_1(void)
40306 +{
40307 +       return -ENOSYS;
40308 +}
40309 +
40310 +#ifdef CONFIG_SYSCTL
40311 +
40312 +#define SYSCALL 0x050f
40313 +#define NOP2    0x9090
40314 +
40315 +/*
40316 + * NOP out syscall in vsyscall page when not needed.
40317 + */
40318 +static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
40319 +                        void __user *buffer, size_t *lenp, loff_t *ppos)
40320 +{
40321 +       extern u16 vsysc1, vsysc2;
40322 +       u16 *map1, *map2;
40323 +       int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
40324 +       if (!write)
40325 +               return ret;
40326 +       /* gcc has some trouble with __va(__pa()), so just do it this
40327 +          way. */
40328 +       map1 = ioremap(__pa_symbol(&vsysc1), 2);
40329 +       if (!map1)
40330 +               return -ENOMEM;
40331 +       map2 = ioremap(__pa_symbol(&vsysc2), 2);
40332 +       if (!map2) {
40333 +               ret = -ENOMEM;
40334 +               goto out;
40335 +       }
40336 +       if (!sysctl_vsyscall) {
40337 +               *map1 = SYSCALL;
40338 +               *map2 = SYSCALL;
40339 +       } else {
40340 +               *map1 = NOP2;
40341 +               *map2 = NOP2;
40342 +       }
40343 +       iounmap(map2);
40344 +out:
40345 +       iounmap(map1);
40346 +       return ret;
40347 +}
40348 +
40349 +static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
40350 +                               void __user *oldval, size_t __user *oldlenp,
40351 +                               void __user *newval, size_t newlen,
40352 +                               void **context)
40353 +{
40354 +       return -ENOSYS;
40355 +}
40356 +
40357 +static ctl_table kernel_table2[] = {
40358 +       { .ctl_name = 99, .procname = "vsyscall64",
40359 +         .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
40360 +         .strategy = vsyscall_sysctl_nostrat,
40361 +         .proc_handler = vsyscall_sysctl_change },
40362 +       { 0, }
40363 +};
40364 +
40365 +static ctl_table kernel_root_table2[] = {
40366 +       { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
40367 +         .child = kernel_table2 },
40368 +       { 0 },
40369 +};
40370 +
40371 +#endif
40372 +
40373 +static void __init map_vsyscall(void)
40374 +{
40375 +       extern char __vsyscall_0;
40376 +       unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
40377 +
40378 +       __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
40379 +}
40380 +
40381 +#ifdef CONFIG_XEN
40382 +static void __init map_vsyscall_user(void)
40383 +{
40384 +       extern void __set_fixmap_user(enum fixed_addresses, unsigned long, pgprot_t);
40385 +       extern char __vsyscall_0;
40386 +       unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
40387 +
40388 +       __set_fixmap_user(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
40389 +}
40390 +#endif
40391 +
40392 +static int __init vsyscall_init(void)
40393 +{
40394 +       BUG_ON(((unsigned long) &vgettimeofday !=
40395 +                       VSYSCALL_ADDR(__NR_vgettimeofday)));
40396 +       BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
40397 +       BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
40398 +       map_vsyscall();
40399 +#ifdef CONFIG_XEN
40400 +       map_vsyscall_user();
40401 +       sysctl_vsyscall = 0; /* disable vgettimeofday() */
40402 +#endif
40403 +#ifdef CONFIG_SYSCTL
40404 +       register_sysctl_table(kernel_root_table2, 0);
40405 +#endif
40406 +       return 0;
40407 +}
40408 +
40409 +__initcall(vsyscall_init);
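
For orientation, here is a minimal userspace sketch (not part of the patch) of how the two entries installed by vsyscall_init() are reached; it assumes the usual x86_64 vsyscall layout of VSYSCALL_START + 1024 * nr. Under CONFIG_XEN the sysctl above is forced to 0, so both stubs fall back to the real syscall internally.

#include <stdio.h>
#include <sys/time.h>
#include <time.h>

#define VSYSCALL_START    0xffffffffff600000UL
#define VSYSCALL_ADDR(nr) (VSYSCALL_START + 1024UL * (nr))

typedef int    (*vgettimeofday_fn)(struct timeval *tv, struct timezone *tz);
typedef time_t (*vtime_fn)(time_t *t);

int main(void)
{
        /* vsyscall 0 is vgettimeofday(), vsyscall 1 is vtime(), as wired up above. */
        vgettimeofday_fn vgtod = (vgettimeofday_fn)VSYSCALL_ADDR(0);
        vtime_fn vtime = (vtime_fn)VSYSCALL_ADDR(1);
        struct timeval tv;

        vgtod(&tv, NULL);
        printf("sec=%ld usec=%ld time=%ld\n",
               (long)tv.tv_sec, (long)tv.tv_usec, (long)vtime(NULL));
        return 0;
}
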
40410 diff -urNp linux-2.6/arch/x86_64/kernel/x8664_ksyms-xen.c new/arch/x86_64/kernel/x8664_ksyms-xen.c
40411 --- linux-2.6/arch/x86_64/kernel/x8664_ksyms-xen.c      1970-01-01 01:00:00.000000000 +0100
40412 +++ new/arch/x86_64/kernel/x8664_ksyms-xen.c    2006-05-09 12:33:21.000000000 +0200
40413 @@ -0,0 +1,155 @@
40414 +#include <linux/config.h>
40415 +#include <linux/module.h>
40416 +#include <linux/smp.h>
40417 +#include <linux/user.h>
40418 +#include <linux/sched.h>
40419 +#include <linux/in6.h>
40420 +#include <linux/interrupt.h>
40421 +#include <linux/smp_lock.h>
40422 +#include <linux/pm.h>
40423 +#include <linux/pci.h>
40424 +#include <linux/apm_bios.h>
40425 +#include <linux/kernel.h>
40426 +#include <linux/string.h>
40427 +#include <linux/syscalls.h>
40428 +#include <linux/tty.h>
40429 +
40430 +#include <asm/semaphore.h>
40431 +#include <asm/processor.h>
40432 +#include <asm/i387.h>
40433 +#include <asm/uaccess.h>
40434 +#include <asm/checksum.h>
40435 +#include <asm/io.h>
40436 +#include <asm/delay.h>
40437 +#include <asm/irq.h>
40438 +#include <asm/mmx.h>
40439 +#include <asm/desc.h>
40440 +#include <asm/pgtable.h>
40441 +#include <asm/pgalloc.h>
40442 +#include <asm/nmi.h>
40443 +#include <asm/kdebug.h>
40444 +#include <asm/unistd.h>
40445 +#include <asm/tlbflush.h>
40446 +#include <asm/kdebug.h>
40447 +
40448 +#ifdef CONFIG_SMP
40449 +extern void __write_lock_failed(rwlock_t *rw);
40450 +extern void __read_lock_failed(rwlock_t *rw);
40451 +#endif
40452 +
40453 +/* platform dependent support */
40454 +EXPORT_SYMBOL(boot_cpu_data);
40455 +//EXPORT_SYMBOL(dump_fpu);
40456 +EXPORT_SYMBOL(kernel_thread);
40457 +EXPORT_SYMBOL(pm_idle);
40458 +EXPORT_SYMBOL(pm_power_off);
40459 +
40460 +EXPORT_SYMBOL(__down_failed);
40461 +EXPORT_SYMBOL(__down_failed_interruptible);
40462 +EXPORT_SYMBOL(__down_failed_trylock);
40463 +EXPORT_SYMBOL(__up_wakeup);
40464 +/* Networking helper routines. */
40465 +EXPORT_SYMBOL(csum_partial_copy_nocheck);
40466 +EXPORT_SYMBOL(ip_compute_csum);
40467 +/* Delay loops */
40468 +EXPORT_SYMBOL(__udelay);
40469 +EXPORT_SYMBOL(__ndelay);
40470 +EXPORT_SYMBOL(__delay);
40471 +EXPORT_SYMBOL(__const_udelay);
40472 +
40473 +EXPORT_SYMBOL(__get_user_1);
40474 +EXPORT_SYMBOL(__get_user_2);
40475 +EXPORT_SYMBOL(__get_user_4);
40476 +EXPORT_SYMBOL(__get_user_8);
40477 +EXPORT_SYMBOL(__put_user_1);
40478 +EXPORT_SYMBOL(__put_user_2);
40479 +EXPORT_SYMBOL(__put_user_4);
40480 +EXPORT_SYMBOL(__put_user_8);
40481 +
40482 +EXPORT_SYMBOL(strncpy_from_user);
40483 +EXPORT_SYMBOL(__strncpy_from_user);
40484 +EXPORT_SYMBOL(clear_user);
40485 +EXPORT_SYMBOL(__clear_user);
40486 +EXPORT_SYMBOL(copy_user_generic);
40487 +EXPORT_SYMBOL(copy_from_user);
40488 +EXPORT_SYMBOL(copy_to_user);
40489 +EXPORT_SYMBOL(copy_in_user);
40490 +EXPORT_SYMBOL(strnlen_user);
40491 +
40492 +#ifdef CONFIG_PCI
40493 +EXPORT_SYMBOL(pci_mem_start);
40494 +#endif
40495 +
40496 +EXPORT_SYMBOL(copy_page);
40497 +EXPORT_SYMBOL(clear_page);
40498 +
40499 +EXPORT_SYMBOL(_cpu_pda);
40500 +#ifdef CONFIG_SMP
40501 +EXPORT_SYMBOL(__write_lock_failed);
40502 +EXPORT_SYMBOL(__read_lock_failed);
40503 +
40504 +EXPORT_SYMBOL(smp_call_function);
40505 +#endif
40506 +
40507 +#ifdef CONFIG_VT
40508 +EXPORT_SYMBOL(screen_info);
40509 +#endif
40510 +
40511 +#ifdef CONFIG_X86_LOCAL_APIC
40512 +EXPORT_SYMBOL_GPL(set_nmi_callback);
40513 +EXPORT_SYMBOL_GPL(unset_nmi_callback);
40514 +#endif
40515 +
40516 +/* Export string functions. We normally rely on gcc builtin for most of these,
40517 +   but gcc sometimes decides not to inline them. */    
40518 +#undef memcpy
40519 +#undef memset
40520 +#undef memmove
40521 +
40522 +extern void * memset(void *,int,__kernel_size_t);
40523 +extern size_t strlen(const char *);
40524 +extern void * memmove(void * dest,const void *src,size_t count);
40525 +extern void * memcpy(void *,const void *,__kernel_size_t);
40526 +extern void * __memcpy(void *,const void *,__kernel_size_t);
40527 +
40528 +EXPORT_SYMBOL(memset);
40529 +EXPORT_SYMBOL(memmove);
40530 +EXPORT_SYMBOL(memcpy);
40531 +EXPORT_SYMBOL(__memcpy);
40532 +
40533 +#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
40534 +/* prototypes are wrong, these are assembly with custom calling conventions */
40535 +extern void rwsem_down_read_failed_thunk(void);
40536 +extern void rwsem_wake_thunk(void);
40537 +extern void rwsem_downgrade_thunk(void);
40538 +extern void rwsem_down_write_failed_thunk(void);
40539 +EXPORT_SYMBOL(rwsem_down_read_failed_thunk);
40540 +EXPORT_SYMBOL(rwsem_wake_thunk);
40541 +EXPORT_SYMBOL(rwsem_downgrade_thunk);
40542 +EXPORT_SYMBOL(rwsem_down_write_failed_thunk);
40543 +#endif
40544 +
40545 +EXPORT_SYMBOL(empty_zero_page);
40546 +
40547 +EXPORT_SYMBOL(die_chain);
40548 +
40549 +#ifdef CONFIG_SMP
40550 +EXPORT_SYMBOL(cpu_sibling_map);
40551 +EXPORT_SYMBOL(smp_num_siblings);
40552 +#endif
40553 +
40554 +#ifdef CONFIG_BUG
40555 +EXPORT_SYMBOL(out_of_line_bug);
40556 +#endif
40557 +
40558 +EXPORT_SYMBOL(init_level4_pgt);
40559 +
40560 +extern unsigned long __supported_pte_mask;
40561 +EXPORT_SYMBOL(__supported_pte_mask);
40562 +
40563 +#ifdef CONFIG_SMP
40564 +EXPORT_SYMBOL(flush_tlb_page);
40565 +#endif
40566 +
40567 +EXPORT_SYMBOL(load_gs_index);
40568 +
40569 diff -urNp linux-2.6/arch/x86_64/kernel/xen_entry.S new/arch/x86_64/kernel/xen_entry.S
40570 --- linux-2.6/arch/x86_64/kernel/xen_entry.S    1970-01-01 01:00:00.000000000 +0100
40571 +++ new/arch/x86_64/kernel/xen_entry.S  2006-05-09 12:33:21.000000000 +0200
40572 @@ -0,0 +1,40 @@
40573 +/*
40574 + * Copied from arch/xen/i386/kernel/entry.S
40575 + */                        
40576 +/* Offsets into shared_info_t. */                
40577 +#define evtchn_upcall_pending          /* 0 */
40578 +#define evtchn_upcall_mask             1
40579 +
40580 +#define sizeof_vcpu_shift              6
40581 +
40582 +#ifdef CONFIG_SMP
40583 +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
40584 +//#define preempt_enable(reg)  decl threadinfo_preempt_count(reg)
40585 +#define preempt_disable(reg)
40586 +#define preempt_enable(reg)
40587 +#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp)                   ; \
40588 +                               movq %gs:pda_cpunumber,reg              ; \
40589 +                               shl  $32, reg                           ; \
40590 +                               shr  $32-sizeof_vcpu_shift,reg          ; \
40591 +                               addq HYPERVISOR_shared_info,reg
40592 +#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp)                    ; \
40593 +#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
40594 +#else
40595 +#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
40596 +#define XEN_PUT_VCPU_INFO(reg)
40597 +#define XEN_PUT_VCPU_INFO_fixup
40598 +#endif
40599 +
40600 +#define XEN_LOCKED_BLOCK_EVENTS(reg)   movb $1,evtchn_upcall_mask(reg)
40601 +#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
40602 +#define XEN_BLOCK_EVENTS(reg)  XEN_GET_VCPU_INFO(reg)                  ; \
40603 +                               XEN_LOCKED_BLOCK_EVENTS(reg)            ; \
40604 +                               XEN_PUT_VCPU_INFO(reg)
40605 +#define XEN_UNBLOCK_EVENTS(reg)        XEN_GET_VCPU_INFO(reg)                  ; \
40606 +                               XEN_LOCKED_UNBLOCK_EVENTS(reg)          ; \
40607 +                               XEN_PUT_VCPU_INFO(reg)
40608 +#define XEN_TEST_PENDING(reg)  testb $0xFF,evtchn_upcall_pending(reg)
40609 +
40610 +VGCF_IN_SYSCALL = (1<<8)
40611 +        
40612 +       
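
A rough C equivalent (a sketch, not part of the patch) of what the assembly macros above expand to: each vcpu_info slot in the shared info page is 64 bytes (sizeof_vcpu_shift = 6), and masking or unmasking event delivery is a single byte store, with evtchn_upcall_pending tested afterwards. The vcpu_info layout assumed here is the one used elsewhere in this patch (see do_page_fault() in fault-xen.c).

static inline void xen_block_events(void)
{
        /* XEN_BLOCK_EVENTS: set evtchn_upcall_mask for this vcpu. */
        HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]
                .evtchn_upcall_mask = 1;
}

static inline void xen_unblock_events(void)
{
        /* XEN_UNBLOCK_EVENTS: clear the mask again. */
        HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]
                .evtchn_upcall_mask = 0;
}

static inline int xen_events_pending(void)
{
        /* XEN_TEST_PENDING: non-zero if an upcall was held back. */
        return HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]
                .evtchn_upcall_pending;
}
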
40613 diff -urNp linux-2.6/arch/x86_64/Makefile new/arch/x86_64/Makefile
40614 --- linux-2.6/arch/x86_64/Makefile      2006-07-03 14:14:30.000000000 +0200
40615 +++ new/arch/x86_64/Makefile    2006-05-09 12:33:17.000000000 +0200
40616 @@ -31,6 +31,10 @@ cflags-$(CONFIG_MK8) += $(call cc-option
40617  cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
40618  cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
40619  
40620 +cppflags-$(CONFIG_XEN) += \
40621 +       -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
40622 +CPPFLAGS += $(cppflags-y)
40623 +
40624  cflags-y += -m64
40625  cflags-y += -mno-red-zone
40626  cflags-y += -mcmodel=kernel
40627 @@ -72,6 +76,21 @@ boot := arch/x86_64/boot
40628  PHONY += bzImage bzlilo install archmrproper \
40629          fdimage fdimage144 fdimage288 isoimage archclean
40630  
40631 +ifdef CONFIG_XEN
40632 +CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS)
40633 +head-y := arch/x86_64/kernel/head-xen.o arch/x86_64/kernel/head64-xen.o arch/x86_64/kernel/init_task.o
40634 +LDFLAGS_vmlinux := -e _start
40635 +boot := arch/i386/boot-xen
40636 +.PHONY: vmlinuz
40637 +#Default target when executing "make"
40638 +all: vmlinuz
40639 +
40640 +vmlinuz: vmlinux
40641 +       $(Q)$(MAKE) $(build)=$(boot) $@
40642 +
40643 +install:
40644 +       $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@
40645 +else
40646  #Default target when executing "make"
40647  all: bzImage
40648  
40649 @@ -92,6 +111,7 @@ fdimage fdimage144 fdimage288 isoimage: 
40650  
40651  install:
40652         $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ 
40653 +endif
40654  
40655  archclean:
40656         $(Q)$(MAKE) $(clean)=$(boot)
40657 diff -urNp linux-2.6/arch/x86_64/mm/fault-xen.c new/arch/x86_64/mm/fault-xen.c
40658 --- linux-2.6/arch/x86_64/mm/fault-xen.c        1970-01-01 01:00:00.000000000 +0100
40659 +++ new/arch/x86_64/mm/fault-xen.c      2006-06-28 16:31:30.000000000 +0200
40660 @@ -0,0 +1,690 @@
40661 +/*
40662 + *  linux/arch/x86-64/mm/fault.c
40663 + *
40664 + *  Copyright (C) 1995  Linus Torvalds
40665 + *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
40666 + */
40667 +
40668 +#include <linux/config.h>
40669 +#include <linux/signal.h>
40670 +#include <linux/sched.h>
40671 +#include <linux/kernel.h>
40672 +#include <linux/errno.h>
40673 +#include <linux/string.h>
40674 +#include <linux/types.h>
40675 +#include <linux/ptrace.h>
40676 +#include <linux/mman.h>
40677 +#include <linux/mm.h>
40678 +#include <linux/smp.h>
40679 +#include <linux/smp_lock.h>
40680 +#include <linux/interrupt.h>
40681 +#include <linux/init.h>
40682 +#include <linux/tty.h>
40683 +#include <linux/vt_kern.h>             /* For unblank_screen() */
40684 +#include <linux/compiler.h>
40685 +#include <linux/module.h>
40686 +#include <linux/kprobes.h>
40687 +
40688 +#include <asm/system.h>
40689 +#include <asm/uaccess.h>
40690 +#include <asm/pgalloc.h>
40691 +#include <asm/smp.h>
40692 +#include <asm/tlbflush.h>
40693 +#include <asm/proto.h>
40694 +#include <asm/kdebug.h>
40695 +#include <asm-generic/sections.h>
40696 +
40697 +/* Page fault error code bits */
40698 +#define PF_PROT        (1<<0)          /* or no page found */
40699 +#define PF_WRITE       (1<<1)
40700 +#define PF_USER        (1<<2)
40701 +#define PF_RSVD        (1<<3)
40702 +#define PF_INSTR       (1<<4)
40703 +
40704 +void bust_spinlocks(int yes)
40705 +{
40706 +       int loglevel_save = console_loglevel;
40707 +       if (yes) {
40708 +               oops_in_progress = 1;
40709 +       } else {
40710 +#ifdef CONFIG_VT
40711 +               unblank_screen();
40712 +#endif
40713 +               oops_in_progress = 0;
40714 +               /*
40715 +                * OK, the message is on the console.  Now we call printk()
40716 +                * without oops_in_progress set so that printk will give klogd
40717 +                * a poke.  Hold onto your hats...
40718 +                */
40719 +               console_loglevel = 15;          /* NMI oopser may have shut the console up */
40720 +               printk(" ");
40721 +               console_loglevel = loglevel_save;
40722 +       }
40723 +}
40724 +
40725 +/* Sometimes the CPU reports invalid exceptions on prefetch.
40726 +   Check that here and ignore.
40727 +   Opcode checker based on code by Richard Brunner */
40728 +static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
40729 +                               unsigned long error_code)
40730 +{ 
40731 +       unsigned char *instr;
40732 +       int scan_more = 1;
40733 +       int prefetch = 0; 
40734 +       unsigned char *max_instr;
40735 +
40736 +       /* If it was an exec fault, ignore it */
40737 +       if (error_code & PF_INSTR)
40738 +               return 0;
40739 +       
40740 +       instr = (unsigned char *)convert_rip_to_linear(current, regs);
40741 +       max_instr = instr + 15;
40742 +
40743 +       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
40744 +               return 0;
40745 +
40746 +       while (scan_more && instr < max_instr) { 
40747 +               unsigned char opcode;
40748 +               unsigned char instr_hi;
40749 +               unsigned char instr_lo;
40750 +
40751 +               if (__get_user(opcode, instr))
40752 +                       break; 
40753 +
40754 +               instr_hi = opcode & 0xf0; 
40755 +               instr_lo = opcode & 0x0f; 
40756 +               instr++;
40757 +
40758 +               switch (instr_hi) { 
40759 +               case 0x20:
40760 +               case 0x30:
40761 +                       /* Values 0x26,0x2E,0x36,0x3E are valid x86
40762 +                          prefixes.  In long mode, the CPU will signal
40763 +                          invalid opcode if some of these prefixes are
40764 +                          present so we will never get here anyway */
40765 +                       scan_more = ((instr_lo & 7) == 0x6);
40766 +                       break;
40767 +                       
40768 +               case 0x40:
40769 +                       /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
40770 +                          Need to figure out under what instruction mode the
40771 +                          instruction was issued ... */
40772 +                       /* Could check the LDT for lm, but for now it's good
40773 +                          enough to assume that long mode only uses well known
40774 +                          segments or kernel. */
40775 +                       scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
40776 +                       break;
40777 +                       
40778 +               case 0x60:
40779 +                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
40780 +                       scan_more = (instr_lo & 0xC) == 0x4;
40781 +                       break;          
40782 +               case 0xF0:
40783 +                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
40784 +                       scan_more = !instr_lo || (instr_lo>>1) == 1;
40785 +                       break;                  
40786 +               case 0x00:
40787 +                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
40788 +                       scan_more = 0;
40789 +                       if (__get_user(opcode, instr)) 
40790 +                               break;
40791 +                       prefetch = (instr_lo == 0xF) &&
40792 +                               (opcode == 0x0D || opcode == 0x18);
40793 +                       break;                  
40794 +               default:
40795 +                       scan_more = 0;
40796 +                       break;
40797 +               } 
40798 +       }
40799 +       return prefetch;
40800 +}
40801 +
40802 +static int bad_address(void *p) 
40803 +{ 
40804 +       unsigned long dummy;
40805 +       return __get_user(dummy, (unsigned long *)p);
40806 +} 
40807 +
40808 +void dump_pagetable(unsigned long address)
40809 +{
40810 +       pgd_t *pgd;
40811 +       pud_t *pud;
40812 +       pmd_t *pmd;
40813 +       pte_t *pte;
40814 +
40815 +       pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
40816 +       pgd += pgd_index(address);
40817 +       if (bad_address(pgd)) goto bad;
40818 +       printk("PGD %lx ", pgd_val(*pgd));
40819 +       if (!pgd_present(*pgd)) goto ret; 
40820 +
40821 +       pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
40822 +       if (bad_address(pud)) goto bad;
40823 +       printk("PUD %lx ", pud_val(*pud));
40824 +       if (!pud_present(*pud)) goto ret;
40825 +
40826 +       pmd = pmd_offset(pud, address);
40827 +       if (bad_address(pmd)) goto bad;
40828 +       printk("PMD %lx ", pmd_val(*pmd));
40829 +       if (!pmd_present(*pmd)) goto ret;        
40830 +
40831 +       pte = pte_offset_kernel(pmd, address);
40832 +       if (bad_address(pte)) goto bad;
40833 +       printk("PTE %lx", pte_val(*pte)); 
40834 +ret:
40835 +       printk("\n");
40836 +       return;
40837 +bad:
40838 +       printk("BAD\n");
40839 +}
40840 +
40841 +static const char errata93_warning[] = 
40842 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
40843 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
40844 +KERN_ERR "******* Please consider a BIOS update.\n"
40845 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
40846 +
40847 +/* Workaround for K8 erratum #93 & buggy BIOS.
40848 +   BIOS SMM functions are required to use a specific workaround
40849 +   to avoid corruption of the 64bit RIP register on C stepping K8. 
40850 +   A lot of BIOS that didn't get tested properly miss this. 
40851 +   The OS sees this as a page fault with the upper 32bits of RIP cleared.
40852 +   Try to work around it here.
40853 +   Note we only handle faults in kernel here. */
40854 +
40855 +static int is_errata93(struct pt_regs *regs, unsigned long address) 
40856 +{
40857 +       static int warned;
40858 +       if (address != regs->rip)
40859 +               return 0;
40860 +       if ((address >> 32) != 0) 
40861 +               return 0;
40862 +       address |= 0xffffffffUL << 32;
40863 +       if ((address >= (u64)_stext && address <= (u64)_etext) || 
40864 +           (address >= MODULES_VADDR && address <= MODULES_END)) { 
40865 +               if (!warned) {
40866 +                       printk(errata93_warning);               
40867 +                       warned = 1;
40868 +               }
40869 +               regs->rip = address;
40870 +               return 1;
40871 +       }
40872 +       return 0;
40873 +} 
40874 +
40875 +int unhandled_signal(struct task_struct *tsk, int sig)
40876 +{
40877 +       if (tsk->pid == 1)
40878 +               return 1;
40879 +       if (tsk->ptrace & PT_PTRACED)
40880 +               return 0;
40881 +       return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
40882 +               (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
40883 +}
40884 +
40885 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
40886 +                                unsigned long error_code)
40887 +{
40888 +       unsigned long flags = oops_begin();
40889 +       struct task_struct *tsk;
40890 +
40891 +       printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
40892 +              current->comm, address);
40893 +       dump_pagetable(address);
40894 +       tsk = current;
40895 +       tsk->thread.cr2 = address;
40896 +       tsk->thread.trap_no = 14;
40897 +       tsk->thread.error_code = error_code;
40898 +       __die("Bad pagetable", regs, error_code);
40899 +       oops_end(flags);
40900 +       do_exit(SIGKILL);
40901 +}
40902 +
40903 +/*
40904 + * Handle a fault on the vmalloc area
40905 + *
40906 + * This assumes no large pages in there.
40907 + */
40908 +static int vmalloc_fault(unsigned long address)
40909 +{
40910 +       pgd_t *pgd, *pgd_ref;
40911 +       pud_t *pud, *pud_ref;
40912 +       pmd_t *pmd, *pmd_ref;
40913 +       pte_t *pte, *pte_ref;
40914 +
40915 +       /* Copy kernel mappings over when needed. This can also
40916 +          happen within a race in page table update. In the latter
40917 +          case just flush. */
40918 +
40919 +       /* On Xen the line below does not always work. Needs investigating! */
40920 +       /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
40921 +       pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
40922 +       pgd += pgd_index(address);
40923 +       pgd_ref = pgd_offset_k(address);
40924 +       if (pgd_none(*pgd_ref))
40925 +               return -1;
40926 +       if (pgd_none(*pgd))
40927 +               set_pgd(pgd, *pgd_ref);
40928 +       else
40929 +               BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
40930 +
40931 +       /* Below here mismatches are bugs because these lower tables
40932 +          are shared */
40933 +
40934 +       pud = pud_offset(pgd, address);
40935 +       pud_ref = pud_offset(pgd_ref, address);
40936 +       if (pud_none(*pud_ref))
40937 +               return -1;
40938 +       if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
40939 +               BUG();
40940 +       pmd = pmd_offset(pud, address);
40941 +       pmd_ref = pmd_offset(pud_ref, address);
40942 +       if (pmd_none(*pmd_ref))
40943 +               return -1;
40944 +       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
40945 +               BUG();
40946 +       pte_ref = pte_offset_kernel(pmd_ref, address);
40947 +       if (!pte_present(*pte_ref))
40948 +               return -1;
40949 +       pte = pte_offset_kernel(pmd, address);
40950 +       /* Don't use pte_page here, because the mappings can point
40951 +          outside mem_map, and the NUMA hash lookup cannot handle
40952 +          that. */
40953 +       if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
40954 +               BUG();
40955 +       return 0;
40956 +}
40957 +
40958 +int page_fault_trace = 0;
40959 +int exception_trace = 1;
40960 +
40961 +
40962 +#define MEM_VERBOSE 1
40963 +
40964 +#ifdef MEM_VERBOSE
40965 +#define MEM_LOG(_f, _a...)                     \
40966 +       printk("fault.c:[%d]-> " _f "\n",       \
40967 +       __LINE__ , ## _a )
40968 +#else
40969 +#define MEM_LOG(_f, _a...) ((void)0)
40970 +#endif
40971 +
40972 +static int spurious_fault(struct pt_regs *regs,
40973 +                         unsigned long address,
40974 +                         unsigned long error_code)
40975 +{
40976 +       pgd_t *pgd;
40977 +       pud_t *pud;
40978 +       pmd_t *pmd;
40979 +       pte_t *pte;
40980 +
40981 +#ifdef CONFIG_XEN
40982 +       /* Faults in hypervisor area are never spurious. */
40983 +       if ((address >= HYPERVISOR_VIRT_START) &&
40984 +           (address < HYPERVISOR_VIRT_END))
40985 +               return 0;
40986 +#endif
40987 +
40988 +       /* Reserved-bit violation or user access to kernel space? */
40989 +       if (error_code & (PF_RSVD|PF_USER))
40990 +               return 0;
40991 +
40992 +       pgd = init_mm.pgd + pgd_index(address);
40993 +       if (!pgd_present(*pgd))
40994 +               return 0;
40995 +
40996 +       pud = pud_offset(pgd, address);
40997 +       if (!pud_present(*pud))
40998 +               return 0;
40999 +
41000 +       pmd = pmd_offset(pud, address);
41001 +       if (!pmd_present(*pmd))
41002 +               return 0;
41003 +
41004 +       pte = pte_offset_kernel(pmd, address);
41005 +       if (!pte_present(*pte))
41006 +               return 0;
41007 +       if ((error_code & PF_WRITE) && !pte_write(*pte))
41008 +               return 0;
41009 +       if ((error_code & PF_INSTR) && (pte_val(*pte) & _PAGE_NX))
41010 +               return 0;
41011 +
41012 +       return 1;
41013 +}
41014 +
41015 +/*
41016 + * This routine handles page faults.  It determines the address,
41017 + * and the problem, and then passes it off to one of the appropriate
41018 + * routines.
41019 + */
41020 +asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
41021 +                                       unsigned long error_code)
41022 +{
41023 +       struct task_struct *tsk;
41024 +       struct mm_struct *mm;
41025 +       struct vm_area_struct * vma;
41026 +       unsigned long address;
41027 +       const struct exception_table_entry *fixup;
41028 +       int write;
41029 +       unsigned long flags;
41030 +       siginfo_t info;
41031 +
41032 +       if (!user_mode(regs))
41033 +               error_code &= ~PF_USER; /* means kernel */
41034 +
41035 +       tsk = current;
41036 +       mm = tsk->mm;
41037 +       prefetchw(&mm->mmap_sem);
41038 +
41039 +       /* get the address */
41040 +       address = HYPERVISOR_shared_info->vcpu_info[
41041 +               smp_processor_id()].arch.cr2;
41042 +
41043 +       info.si_code = SEGV_MAPERR;
41044 +
41045 +
41046 +       /*
41047 +        * We fault-in kernel-space virtual memory on-demand. The
41048 +        * 'reference' page table is init_mm.pgd.
41049 +        *
41050 +        * NOTE! We MUST NOT take any locks for this case. We may
41051 +        * be in an interrupt or a critical region, and should
41052 +        * only copy the information from the master page table,
41053 +        * nothing more.
41054 +        *
41055 +        * This verifies that the fault happens in kernel space
41056 +        * (error_code & 4) == 0, and that the fault was not a
41057 +        * protection error (error_code & 9) == 0.
41058 +        */
41059 +       if (unlikely(address >= TASK_SIZE64)) {
41060 +               /*
41061 +                * Don't check for the module range here: its PML4
41062 +                * is always initialized because it's shared with the main
41063 +                * kernel text. Only vmalloc may need PML4 syncups.
41064 +                */
41065 +               if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
41066 +                   address >= PAGE_OFFSET) {
41067 +                       if (vmalloc_fault(address) >= 0)
41068 +                               return;
41069 +               }
41070 +               if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
41071 +                                               SIGSEGV) == NOTIFY_STOP)
41072 +                       return;
41073 +               /* Can take a spurious fault if mapping changes R/O -> R/W. */
41074 +               if (spurious_fault(regs, address, error_code))
41075 +                       return;
41076 +               /*
41077 +                * Don't take the mm semaphore here. If we fixup a prefetch
41078 +                * fault we could otherwise deadlock.
41079 +                */
41080 +               goto bad_area_nosemaphore;
41081 +       }
41082 +
41083 +       if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
41084 +                                       SIGSEGV) == NOTIFY_STOP)
41085 +               return;
41086 +
41087 +       if (likely(regs->eflags & X86_EFLAGS_IF))
41088 +               local_irq_enable();
41089 +
41090 +       if (unlikely(page_fault_trace))
41091 +               printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
41092 +                      regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 
41093 +
41094 +       if (unlikely(error_code & PF_RSVD))
41095 +               pgtable_bad(address, regs, error_code);
41096 +
41097 +       /*
41098 +        * If we're in an interrupt or have no user
41099 +        * context, we must not take the fault..
41100 +        */
41101 +       if (unlikely(in_atomic() || !mm))
41102 +               goto bad_area_nosemaphore;
41103 +
41104 + again:
41105 +       /* When running in the kernel we expect faults to occur only to
41106 +        * addresses in user space.  All other faults represent errors in the
41107 +        * kernel and should generate an OOPS.  Unfortunately, in the case of an
41108 +        * erroneous fault occurring in a code path which already holds mmap_sem
41109 +        * we will deadlock attempting to validate the fault against the
41110 +        * address space.  Luckily the kernel only validly references user
41111 +        * space from well defined areas of code, which are listed in the
41112 +        * exceptions table.
41113 +        *
41114 +        * As the vast majority of faults will be valid we will only perform
41115 +        * the source reference check when there is a possibility of a deadlock.
41116 +        * Attempt to lock the address space, if we cannot we then validate the
41117 +        * source.  If this is invalid we can skip the address space check,
41118 +        * thus avoiding the deadlock.
41119 +        */
41120 +       if (!down_read_trylock(&mm->mmap_sem)) {
41121 +               if ((error_code & PF_USER) == 0 &&
41122 +                   !search_exception_tables(regs->rip))
41123 +                       goto bad_area_nosemaphore;
41124 +               down_read(&mm->mmap_sem);
41125 +       }
41126 +
41127 +       vma = find_vma(mm, address);
41128 +       if (!vma)
41129 +               goto bad_area;
41130 +       if (likely(vma->vm_start <= address))
41131 +               goto good_area;
41132 +       if (!(vma->vm_flags & VM_GROWSDOWN))
41133 +               goto bad_area;
41134 +       if (error_code & 4) {
41135 +               // XXX: align red zone size with ABI 
41136 +               if (address + 128 < regs->rsp)
41137 +                       goto bad_area;
41138 +       }
41139 +       if (expand_stack(vma, address))
41140 +               goto bad_area;
41141 +/*
41142 + * Ok, we have a good vm_area for this memory access, so
41143 + * we can handle it..
41144 + */
41145 +good_area:
41146 +       info.si_code = SEGV_ACCERR;
41147 +       write = 0;
41148 +       switch (error_code & (PF_PROT|PF_WRITE)) {
41149 +               default:        /* 3: write, present */
41150 +                       /* fall through */
41151 +               case PF_WRITE:          /* write, not present */
41152 +                       if (!(vma->vm_flags & VM_WRITE))
41153 +                               goto bad_area;
41154 +                       write++;
41155 +                       break;
41156 +               case PF_PROT:           /* read, present */
41157 +                       goto bad_area;
41158 +               case 0:                 /* read, not present */
41159 +                       if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
41160 +                               goto bad_area;
41161 +       }
41162 +
41163 +       /*
41164 +        * If for any reason at all we couldn't handle the fault,
41165 +        * make sure we exit gracefully rather than endlessly redo
41166 +        * the fault.
41167 +        */
41168 +       switch (handle_mm_fault(mm, vma, address, write)) {
41169 +       case VM_FAULT_MINOR:
41170 +               tsk->min_flt++;
41171 +               break;
41172 +       case VM_FAULT_MAJOR:
41173 +               tsk->maj_flt++;
41174 +               break;
41175 +       case VM_FAULT_SIGBUS:
41176 +               goto do_sigbus;
41177 +       default:
41178 +               goto out_of_memory;
41179 +       }
41180 +
41181 +       up_read(&mm->mmap_sem);
41182 +       return;
41183 +
41184 +/*
41185 + * Something tried to access memory that isn't in our memory map..
41186 + * Fix it, but check if it's kernel or user first..
41187 + */
41188 +bad_area:
41189 +       up_read(&mm->mmap_sem);
41190 +
41191 +bad_area_nosemaphore:
41192 +       /* User mode accesses just cause a SIGSEGV */
41193 +       if (error_code & PF_USER) {
41194 +               if (is_prefetch(regs, address, error_code))
41195 +                       return;
41196 +
41197 +               /* Work around K8 erratum #100 K8 in compat mode
41198 +                  occasionally jumps to illegal addresses >4GB.  We
41199 +                  catch this here in the page fault handler because
41200 +                  these addresses are not reachable. Just detect this
41201 +                  case and return.  Any code segment in LDT is
41202 +                  compatibility mode. */
41203 +               if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
41204 +                   (address >> 32))
41205 +                       return;
41206 +
41207 +               if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
41208 +                       printk(
41209 +                      "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
41210 +                                       tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
41211 +                                       tsk->comm, tsk->pid, address, regs->rip,
41212 +                                       regs->rsp, error_code);
41213 +               }
41214 +       
41215 +               tsk->thread.cr2 = address;
41216 +               /* Kernel addresses are always protection faults */
41217 +               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
41218 +               tsk->thread.trap_no = 14;
41219 +               info.si_signo = SIGSEGV;
41220 +               info.si_errno = 0;
41221 +               /* info.si_code has been set above */
41222 +               info.si_addr = (void __user *)address;
41223 +               force_sig_info(SIGSEGV, &info, tsk);
41224 +               return;
41225 +       }
41226 +
41227 +no_context:
41228 +       
41229 +       /* Are we prepared to handle this kernel fault?  */
41230 +       fixup = search_exception_tables(regs->rip);
41231 +       if (fixup) {
41232 +               regs->rip = fixup->fixup;
41233 +               return;
41234 +       }
41235 +
41236 +       /* 
41237 +        * Hall of shame of CPU/BIOS bugs.
41238 +        */
41239 +
41240 +       if (is_prefetch(regs, address, error_code))
41241 +               return;
41242 +
41243 +       if (is_errata93(regs, address))
41244 +               return; 
41245 +
41246 +/*
41247 + * Oops. The kernel tried to access some bad page. We'll have to
41248 + * terminate things with extreme prejudice.
41249 + */
41250 +
41251 +       flags = oops_begin();
41252 +
41253 +       if (address < PAGE_SIZE)
41254 +               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
41255 +       else
41256 +               printk(KERN_ALERT "Unable to handle kernel paging request");
41257 +       printk(" at %016lx RIP: \n" KERN_ALERT,address);
41258 +       printk_address(regs->rip);
41259 +       printk("\n");
41260 +       dump_pagetable(address);
41261 +       tsk->thread.cr2 = address;
41262 +       tsk->thread.trap_no = 14;
41263 +       tsk->thread.error_code = error_code;
41264 +       __die("Oops", regs, error_code);
41265 +       /* Executive summary in case the body of the oops scrolled away */
41266 +       printk(KERN_EMERG "CR2: %016lx\n", address);
41267 +       oops_end(flags);
41268 +       do_exit(SIGKILL);
41269 +
41270 +/*
41271 + * We ran out of memory, or some other thing happened to us that made
41272 + * us unable to handle the page fault gracefully.
41273 + */
41274 +out_of_memory:
41275 +       up_read(&mm->mmap_sem);
41276 +       if (current->pid == 1) { 
41277 +               yield();
41278 +               goto again;
41279 +       }
41280 +       printk("VM: killing process %s\n", tsk->comm);
41281 +       if (error_code & 4)
41282 +               do_exit(SIGKILL);
41283 +       goto no_context;
41284 +
41285 +do_sigbus:
41286 +       up_read(&mm->mmap_sem);
41287 +
41288 +       /* Kernel mode? Handle exceptions or die */
41289 +       if (!(error_code & PF_USER))
41290 +               goto no_context;
41291 +
41292 +       tsk->thread.cr2 = address;
41293 +       tsk->thread.error_code = error_code;
41294 +       tsk->thread.trap_no = 14;
41295 +       info.si_signo = SIGBUS;
41296 +       info.si_errno = 0;
41297 +       info.si_code = BUS_ADRERR;
41298 +       info.si_addr = (void __user *)address;
41299 +       force_sig_info(SIGBUS, &info, tsk);
41300 +       return;
41301 +}
41302 +
41303 +DEFINE_SPINLOCK(pgd_lock);
41304 +struct page *pgd_list;
41305 +
41306 +void vmalloc_sync_all(void)
41307 +{
41308 +       /* Note that races in the updates of insync and start aren't 
41309 +          problematic:
41310 +          insync can only get set bits added, and updates to start are only
41311 +          improving performance (without affecting correctness if undone). */
41312 +       static DECLARE_BITMAP(insync, PTRS_PER_PGD);
41313 +       static unsigned long start = VMALLOC_START & PGDIR_MASK;
41314 +       unsigned long address;
41315 +
41316 +       for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
41317 +               if (!test_bit(pgd_index(address), insync)) {
41318 +                       const pgd_t *pgd_ref = pgd_offset_k(address);
41319 +                       struct page *page;
41320 +
41321 +                       if (pgd_none(*pgd_ref))
41322 +                               continue;
41323 +                       spin_lock(&pgd_lock);
41324 +                       for (page = pgd_list; page;
41325 +                            page = (struct page *)page->index) {
41326 +                               pgd_t *pgd;
41327 +                               pgd = (pgd_t *)page_address(page) + pgd_index(address);
41328 +                               if (pgd_none(*pgd))
41329 +                                       set_pgd(pgd, *pgd_ref);
41330 +                               else
41331 +                                       BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
41332 +                       }
41333 +                       spin_unlock(&pgd_lock);
41334 +                       set_bit(pgd_index(address), insync);
41335 +               }
41336 +               if (address == start)
41337 +                       start = address + PGDIR_SIZE;
41338 +       }
41339 +       /* Check that there is no need to do the same for the modules area. */
41340 +       BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
41341 +       BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 
41342 +                               (__START_KERNEL & PGDIR_MASK)));
41343 +}
41344 +
41345 +static int __init enable_pagefaulttrace(char *str)
41346 +{
41347 +       page_fault_trace = 1;
41348 +       return 1;
41349 +}
41350 +__setup("pagefaulttrace", enable_pagefaulttrace);
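
Two quick notes on the fault path above, offered as a sketch rather than as part of the patch: booting with pagefaulttrace on the kernel command line flips page_fault_trace to 1 and prints one line per fault, and the PF_* bits defined at the top of this file decode the hardware error code roughly as follows.

/* Hypothetical helper, shown only to illustrate the PF_* bit meanings. */
static void decode_pf_error(unsigned long error_code)
{
        printk(KERN_DEBUG "fault: %s %s %s%s%s\n",
               (error_code & PF_USER)  ? "user"       : "kernel",
               (error_code & PF_WRITE) ? "write"      : "read",
               (error_code & PF_PROT)  ? "protection" : "not-present",
               (error_code & PF_RSVD)  ? " reserved-bit" : "",
               (error_code & PF_INSTR) ? " instruction-fetch" : "");
}
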
41351 diff -urNp linux-2.6/arch/x86_64/mm/init-xen.c new/arch/x86_64/mm/init-xen.c
41352 --- linux-2.6/arch/x86_64/mm/init-xen.c 1970-01-01 01:00:00.000000000 +0100
41353 +++ new/arch/x86_64/mm/init-xen.c       2006-06-28 14:32:14.000000000 +0200
41354 @@ -0,0 +1,1202 @@
41355 +/*
41356 + *  linux/arch/x86_64/mm/init.c
41357 + *
41358 + *  Copyright (C) 1995  Linus Torvalds
41359 + *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
41360 + *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
41361 + *
41362 + *  Jun Nakajima <jun.nakajima@intel.com>
41363 + *     Modified for Xen.
41364 + */
41365 +
41366 +#include <linux/config.h>
41367 +#include <linux/signal.h>
41368 +#include <linux/sched.h>
41369 +#include <linux/kernel.h>
41370 +#include <linux/errno.h>
41371 +#include <linux/string.h>
41372 +#include <linux/types.h>
41373 +#include <linux/ptrace.h>
41374 +#include <linux/mman.h>
41375 +#include <linux/mm.h>
41376 +#include <linux/swap.h>
41377 +#include <linux/smp.h>
41378 +#include <linux/init.h>
41379 +#include <linux/pagemap.h>
41380 +#include <linux/bootmem.h>
41381 +#include <linux/proc_fs.h>
41382 +#include <linux/pci.h>
41383 +#include <linux/dma-mapping.h>
41384 +#include <linux/module.h>
41385 +#include <linux/memory_hotplug.h>
41386 +
41387 +#include <asm/processor.h>
41388 +#include <asm/system.h>
41389 +#include <asm/uaccess.h>
41390 +#include <asm/pgtable.h>
41391 +#include <asm/pgalloc.h>
41392 +#include <asm/dma.h>
41393 +#include <asm/fixmap.h>
41394 +#include <asm/e820.h>
41395 +#include <asm/apic.h>
41396 +#include <asm/tlb.h>
41397 +#include <asm/mmu_context.h>
41398 +#include <asm/proto.h>
41399 +#include <asm/smp.h>
41400 +#include <asm/sections.h>
41401 +#include <asm/dma-mapping.h>
41402 +#include <asm/swiotlb.h>
41403 +
41404 +#include <xen/features.h>
41405 +
41406 +#ifndef Dprintk
41407 +#define Dprintk(x...)
41408 +#endif
41409 +
41410 +struct dma_mapping_ops* dma_ops;
41411 +EXPORT_SYMBOL(dma_ops);
41412 +
41413 +extern unsigned long *contiguous_bitmap;
41414 +
41415 +static unsigned long dma_reserve __initdata;
41416 +
41417 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
41418 +extern unsigned long start_pfn;
41419 +
41420 +/*
41421 + * Use this until direct mapping is established, i.e. before __va() is 
41422 + * available in init_memory_mapping().
41423 + */
41424 +
41425 +#define addr_to_page(addr, page)                               \
41426 +       (addr) &= PHYSICAL_PAGE_MASK;                           \
41427 +       (page) = ((unsigned long *) ((unsigned long)            \
41428 +       (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
41429 +       __START_KERNEL_map)))
41430 +
41431 +static void early_make_page_readonly(void *va, unsigned int feature)
41432 +{
41433 +       unsigned long addr, _va = (unsigned long)va;
41434 +       pte_t pte, *ptep;
41435 +       unsigned long *page = (unsigned long *) init_level4_pgt;
41436 +
41437 +       if (xen_feature(feature))
41438 +               return;
41439 +
41440 +       addr = (unsigned long) page[pgd_index(_va)];
41441 +       addr_to_page(addr, page);
41442 +
41443 +       addr = page[pud_index(_va)];
41444 +       addr_to_page(addr, page);
41445 +
41446 +       addr = page[pmd_index(_va)];
41447 +       addr_to_page(addr, page);
41448 +
41449 +       ptep = (pte_t *) &page[pte_index(_va)];
41450 +
41451 +       pte.pte = ptep->pte & ~_PAGE_RW;
41452 +       if (HYPERVISOR_update_va_mapping(_va, pte, 0))
41453 +               BUG();
41454 +}
41455 +
41456 +void make_page_readonly(void *va, unsigned int feature)
41457 +{
41458 +       pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
41459 +       unsigned long addr = (unsigned long) va;
41460 +
41461 +       if (xen_feature(feature))
41462 +               return;
41463 +
41464 +       pgd = pgd_offset_k(addr);
41465 +       pud = pud_offset(pgd, addr);
41466 +       pmd = pmd_offset(pud, addr);
41467 +       ptep = pte_offset_kernel(pmd, addr);
41468 +
41469 +       pte.pte = ptep->pte & ~_PAGE_RW;
41470 +       if (HYPERVISOR_update_va_mapping(addr, pte, 0))
41471 +               xen_l1_entry_update(ptep, pte); /* fallback */
41472 +
41473 +       if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
41474 +               make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
41475 +}
41476 +
41477 +void make_page_writable(void *va, unsigned int feature)
41478 +{
41479 +       pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
41480 +       unsigned long addr = (unsigned long) va;
41481 +
41482 +       if (xen_feature(feature))
41483 +               return;
41484 +
41485 +       pgd = pgd_offset_k(addr);
41486 +       pud = pud_offset(pgd, addr);
41487 +       pmd = pmd_offset(pud, addr);
41488 +       ptep = pte_offset_kernel(pmd, addr);
41489 +
41490 +       pte.pte = ptep->pte | _PAGE_RW;
41491 +       if (HYPERVISOR_update_va_mapping(addr, pte, 0))
41492 +               xen_l1_entry_update(ptep, pte); /* fallback */
41493 +
41494 +       if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
41495 +               make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
41496 +}
41497 +
41498 +void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
41499 +{
41500 +       if (xen_feature(feature))
41501 +               return;
41502 +
41503 +       while (nr-- != 0) {
41504 +               make_page_readonly(va, feature);
41505 +               va = (void*)((unsigned long)va + PAGE_SIZE);
41506 +       }
41507 +}
41508 +
41509 +void make_pages_writable(void *va, unsigned nr, unsigned int feature)
41510 +{
41511 +       if (xen_feature(feature))
41512 +               return;
41513 +
41514 +       while (nr-- != 0) {
41515 +               make_page_writable(va, feature);
41516 +               va = (void*)((unsigned long)va + PAGE_SIZE);
41517 +       }
41518 +}
41519 +
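
Usage sketch (hypothetical, mirroring the calls made by set_pte_phys() further down): a freshly allocated page that is about to become a page table must lose its writable mapping before it is installed, unless the hypervisor advertises XENFEAT_writable_page_tables; make_page_readonly() is what enforces that.

static void __init install_pte_page_sketch(pmd_t *pmd)
{
        /* Allocate a page, drop its writable mapping, then install it. */
        pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);

        make_page_readonly(pte, XENFEAT_writable_page_tables);
        set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
}
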
41520 +/*
41521 + * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
41522 + * physical space so we can cache the place of the first one and move
41523 + * around without checking the pgd every time.
41524 + */
41525 +
41526 +void show_mem(void)
41527 +{
41528 +       long i, total = 0, reserved = 0;
41529 +       long shared = 0, cached = 0;
41530 +       pg_data_t *pgdat;
41531 +       struct page *page;
41532 +
41533 +       printk(KERN_INFO "Mem-info:\n");
41534 +       show_free_areas();
41535 +       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
41536 +
41537 +       for_each_online_pgdat(pgdat) {
41538 +               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
41539 +                       page = pfn_to_page(pgdat->node_start_pfn + i);
41540 +                       total++;
41541 +                       if (PageReserved(page))
41542 +                               reserved++;
41543 +                       else if (PageSwapCache(page))
41544 +                               cached++;
41545 +                       else if (page_count(page))
41546 +                               shared += page_count(page) - 1;
41547 +               }
41548 +       }
41549 +       printk(KERN_INFO "%lu pages of RAM\n", total);
41550 +       printk(KERN_INFO "%lu reserved pages\n",reserved);
41551 +       printk(KERN_INFO "%lu pages shared\n",shared);
41552 +       printk(KERN_INFO "%lu pages swap cached\n",cached);
41553 +}
41554 +
41555 +/* References to section boundaries */
41556 +
41557 +int after_bootmem;
41558 +
41559 +static __init void *spp_getpage(void)
41560 +{ 
41561 +       void *ptr;
41562 +       if (after_bootmem)
41563 +               ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
41564 +       else
41565 +               ptr = alloc_bootmem_pages(PAGE_SIZE);
41566 +       if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
41567 +               panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
41568 +
41569 +       Dprintk("spp_getpage %p\n", ptr);
41570 +       return ptr;
41571 +} 
41572 +
41573 +#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))
41574 +
41575 +static inline pud_t *pud_offset_u(unsigned long address)
41576 +{
41577 +       pud_t *pud = level3_user_pgt;
41578 +
41579 +       return pud + pud_index(address);
41580 +}
41581 +
41582 +static __init void set_pte_phys(unsigned long vaddr,
41583 +                        unsigned long phys, pgprot_t prot, int user_mode)
41584 +{
41585 +       pgd_t *pgd;
41586 +       pud_t *pud;
41587 +       pmd_t *pmd;
41588 +       pte_t *pte, new_pte;
41589 +
41590 +       Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
41591 +
41592 +       pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
41593 +       if (pgd_none(*pgd)) {
41594 +               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
41595 +               return;
41596 +       }
41597 +       pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
41598 +       if (pud_none(*pud)) {
41599 +               pmd = (pmd_t *) spp_getpage(); 
41600 +               make_page_readonly(pmd, XENFEAT_writable_page_tables);
41601 +               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
41602 +               if (pmd != pmd_offset(pud, 0)) {
41603 +                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
41604 +                       return;
41605 +               }
41606 +       }
41607 +       pmd = pmd_offset(pud, vaddr);
41608 +       if (pmd_none(*pmd)) {
41609 +               pte = (pte_t *) spp_getpage();
41610 +               make_page_readonly(pte, XENFEAT_writable_page_tables);
41611 +               set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
41612 +               if (pte != pte_offset_kernel(pmd, 0)) {
41613 +                       printk("PAGETABLE BUG #02!\n");
41614 +                       return;
41615 +               }
41616 +       }
41617 +       new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
41618 +
41619 +       pte = pte_offset_kernel(pmd, vaddr);
41620 +       if (!pte_none(*pte) &&
41621 +           pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
41622 +               pte_ERROR(*pte);
41623 +       set_pte(pte, new_pte);
41624 +
41625 +       /*
41626 +        * It's enough to flush this one mapping.
41627 +        * (PGE mappings get flushed as well)
41628 +        */
41629 +       __flush_tlb_one(vaddr);
41630 +}
41631 +
41632 +static void set_pte_phys_ma(unsigned long vaddr,
41633 +                        unsigned long phys, pgprot_t prot)
41634 +{
41635 +       pgd_t *pgd;
41636 +       pud_t *pud;
41637 +       pmd_t *pmd;
41638 +       pte_t *pte, new_pte;
41639 +
41640 +       Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
41641 +
41642 +       pgd = pgd_offset_k(vaddr);
41643 +       if (pgd_none(*pgd)) {
41644 +               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
41645 +               return;
41646 +       }
41647 +       pud = pud_offset(pgd, vaddr);
41648 +       if (pud_none(*pud)) {
41649 +
41650 +               pmd = (pmd_t *) spp_getpage(); 
41651 +               make_page_readonly(pmd, XENFEAT_writable_page_tables);
41652 +
41653 +               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
41654 +
41655 +               if (pmd != pmd_offset(pud, 0)) {
41656 +                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
41657 +                       return;
41658 +               }
41659 +       }
41660 +       pmd = pmd_offset(pud, vaddr);
41661 +
41662 +       if (pmd_none(*pmd)) {
41663 +               pte = (pte_t *) spp_getpage();
41664 +               make_page_readonly(pte, XENFEAT_writable_page_tables);
41665 +
41666 +               set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
41667 +               if (pte != pte_offset_kernel(pmd, 0)) {
41668 +                       printk("PAGETABLE BUG #02!\n");
41669 +                       return;
41670 +               }
41671 +       }
41672 +
41673 +       new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
41674 +       pte = pte_offset_kernel(pmd, vaddr);
41675 +
41676 +       /* 
41677 +        * Note that the pte page is already RO, thus we want to use
41678 +        * xen_l1_entry_update(), not set_pte().
41679 +        */
41680 +       xen_l1_entry_update(pte, 
41681 +                           pfn_pte_ma(phys >> PAGE_SHIFT, prot));
41682 +
41683 +       /*
41684 +        * It's enough to flush this one mapping.
41685 +        * (PGE mappings get flushed as well)
41686 +        */
41687 +       __flush_tlb_one(vaddr);
41688 +}
41689 +
41690 +#define SET_FIXMAP_KERNEL 0
41691 +#define SET_FIXMAP_USER   1
41692 +
41693 +/* NOTE: this is meant to be run only at boot */
41694 +void __init 
41695 +__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
41696 +{
41697 +       unsigned long address = __fix_to_virt(idx);
41698 +
41699 +       if (idx >= __end_of_fixed_addresses) {
41700 +               printk("Invalid __set_fixmap\n");
41701 +               return;
41702 +       }
41703 +       switch (idx) {
41704 +       case VSYSCALL_FIRST_PAGE:
41705 +               set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
41706 +               break;
41707 +       default:
41708 +               set_pte_phys_ma(address, phys, prot);
41709 +               break;
41710 +       }
41711 +}
41712 +
41713 +/*
41714 + * At this point it only supports vsyscall area.
41715 + */
41716 +void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
41717 +{
41718 +       unsigned long address = __fix_to_virt(idx);
41719 +
41720 +       if (idx >= __end_of_fixed_addresses) {
41721 +               printk("Invalid __set_fixmap\n");
41722 +               return;
41723 +       }
41724 +
41725 +       set_pte_phys(address, phys, prot, SET_FIXMAP_USER); 
41726 +}
41727 +
41728 +unsigned long __initdata table_start, table_end; 
41729 +
41730 +#ifndef CONFIG_XEN
41731 +extern pmd_t temp_boot_pmds[]; 
41732 +
41733 +static  struct temp_map { 
41734 +       pmd_t *pmd;
41735 +       void  *address; 
41736 +       int    allocated; 
41737 +} temp_mappings[] __initdata = { 
41738 +       { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
41739 +       { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, 
41740 +       {}
41741 +}; 
41742 +#endif /* !CONFIG_XEN */
41743 +
41744 +unsigned long get_machine_pfn(unsigned long addr)
41745 +{
41746 +       pud_t* pud = pud_offset_k(NULL, addr);
41747 +       pmd_t* pmd = pmd_offset(pud, addr);
41748 +       pte_t *pte = pte_offset_kernel(pmd, addr);
41749 +
41750 +       return pte_mfn(*pte);
41751 +} 
41752 +
41753 +static __meminit void *alloc_static_page(unsigned long *phys)
41754 +{
41755 +       unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
41756 +
41757 +       if (after_bootmem) {
41758 +               void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
41759 +
41760 +               *phys = __pa(adr);
41761 +               return adr;
41762 +       }
41763 +
41764 +       *phys = start_pfn << PAGE_SHIFT;
41765 +       start_pfn++;
41766 +       memset((void *)va, 0, PAGE_SIZE);
41767 +       return (void *)va;
41768 +} 
41769 +
41770 +#define PTE_SIZE PAGE_SIZE
41771 +
41772 +static inline void __set_pte(pte_t *dst, pte_t val)
41773 +{
41774 +       *dst = val;
41775 +}
41776 +
41777 +static inline int make_readonly(unsigned long paddr)
41778 +{
41779 +       int readonly = 0;
41780 +
41781 +       /* Make new page tables read-only. */
41782 +       if (!xen_feature(XENFEAT_writable_page_tables)
41783 +           && (paddr >= (table_start << PAGE_SHIFT))
41784 +           && (paddr < (table_end << PAGE_SHIFT)))
41785 +               readonly = 1;
41786 +       /* Make old page tables read-only. */
41787 +       if (!xen_feature(XENFEAT_writable_page_tables)
41788 +           && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
41789 +           && (paddr < (start_pfn << PAGE_SHIFT)))
41790 +               readonly = 1;
41791 +
41792 +       /*
41793 +        * No need for writable mapping of kernel image. This also ensures that
41794 +        * page and descriptor tables embedded inside don't have writable
41795 +        * mappings. 
41796 +        */
41797 +       if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
41798 +               readonly = 1;
41799 +
41800 +       return readonly;
41801 +}
41802 +
41803 +#ifndef CONFIG_XEN
41804 +/* Must run before zap_low_mappings */
41805 +__init void *early_ioremap(unsigned long addr, unsigned long size)
41806 +{
41807 +       unsigned long map = round_down(addr, LARGE_PAGE_SIZE); 
41808 +
41809 +       /* actually usually some more */
41810 +       if (size >= LARGE_PAGE_SIZE) { 
41811 +               printk("SMBIOS area too long %lu\n", size);
41812 +               return NULL;
41813 +       }
41814 +       set_pmd(temp_mappings[0].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
41815 +       map += LARGE_PAGE_SIZE;
41816 +       set_pmd(temp_mappings[1].pmd,  __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
41817 +       __flush_tlb();
41818 +       return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
41819 +}
41820 +
41821 +/* To avoid virtual aliases later */
41822 +__init void early_iounmap(void *addr, unsigned long size)
41823 +{
41824 +       if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
41825 +               printk("early_iounmap: bad address %p\n", addr);
41826 +       set_pmd(temp_mappings[0].pmd, __pmd(0));
41827 +       set_pmd(temp_mappings[1].pmd, __pmd(0));
41828 +       __flush_tlb();
41829 +}
41830 +#endif /* !CONFIG_XEN */
41831 +
41832 +static void __meminit
41833 +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
41834 +{
41835 +       int i, k;
41836 +
41837 +       for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
41838 +               unsigned long pte_phys;
41839 +               pte_t *pte, *pte_save;
41840 +
41841 +               if (address >= end) {
41842 +                       for (; i < PTRS_PER_PMD; i++, pmd++)
41843 +                               set_pmd(pmd, __pmd(0));
41844 +                       break;
41845 +               }
41846 +               pte = alloc_static_page(&pte_phys);
41847 +               pte_save = pte;
41848 +               for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
41849 +                       if ((address >= end) ||
41850 +                           ((address >> PAGE_SHIFT) >=
41851 +                            xen_start_info->nr_pages)) { 
41852 +                               __set_pte(pte, __pte(0)); 
41853 +                               continue;
41854 +                       }
41855 +                       if (make_readonly(address)) {
41856 +                               __set_pte(pte, 
41857 +                                         __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
41858 +                               continue;
41859 +                       }
41860 +                       __set_pte(pte, __pte(address | _KERNPG_TABLE));
41861 +               }
41862 +               pte = pte_save;
41863 +               early_make_page_readonly(pte, XENFEAT_writable_page_tables);
41864 +               set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
41865 +       }
41866 +}
41867 +
41868 +static void __meminit
41869 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
41870 +{
41871 +       pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
41872 +
41873 +       if (pmd_none(*pmd)) {
41874 +               spin_lock(&init_mm.page_table_lock);
41875 +               phys_pmd_init(pmd, address, end);
41876 +               spin_unlock(&init_mm.page_table_lock);
41877 +               __flush_tlb_all();
41878 +       }
41879 +}
41880 +
41881 +static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
41882 +{ 
41883 +       long i = pud_index(address);
41884 +
41885 +       pud = pud + i;
41886 +
41887 +       if (after_bootmem && pud_val(*pud)) {
41888 +               phys_pmd_update(pud, address, end);
41889 +               return;
41890 +       }
41891 +
41892 +       for (; i < PTRS_PER_PUD; pud++, i++) {
41893 +               unsigned long paddr, pmd_phys;
41894 +               pmd_t *pmd;
41895 +
41896 +               paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
41897 +               if (paddr >= end)
41898 +                       break;
41899 +
41900 +               pmd = alloc_static_page(&pmd_phys);
41901 +               early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
41902 +               spin_lock(&init_mm.page_table_lock);
41903 +               set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
41904 +               phys_pmd_init(pmd, paddr, end);
41905 +               spin_unlock(&init_mm.page_table_lock);
41906 +       }
41907 +       __flush_tlb();
41908 +} 
41909 +
41910 +void __init xen_init_pt(void)
41911 +{
41912 +       unsigned long addr, *page;
41913 +
41914 +       memset((void *)init_level4_pgt,   0, PAGE_SIZE);
41915 +       memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
41916 +       memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);
41917 +
41918 +       /* Find the initial pte page that was built for us. */
41919 +       page = (unsigned long *)xen_start_info->pt_base;
41920 +       addr = page[pgd_index(__START_KERNEL_map)];
41921 +       addr_to_page(addr, page);
41922 +       addr = page[pud_index(__START_KERNEL_map)];
41923 +       addr_to_page(addr, page);
41924 +
41925 +       /* Construct mapping of initial pte page in our own directories. */
41926 +       init_level4_pgt[pgd_index(__START_KERNEL_map)] = 
41927 +               mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
41928 +       level3_kernel_pgt[pud_index(__START_KERNEL_map)] = 
41929 +               __pud(__pa_symbol(level2_kernel_pgt) |
41930 +                     _KERNPG_TABLE | _PAGE_USER);
41931 +       memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);
41932 +
41933 +       early_make_page_readonly(init_level4_pgt,
41934 +                                XENFEAT_writable_page_tables);
41935 +       early_make_page_readonly(init_level4_user_pgt,
41936 +                                XENFEAT_writable_page_tables);
41937 +       early_make_page_readonly(level3_kernel_pgt,
41938 +                                XENFEAT_writable_page_tables);
41939 +       early_make_page_readonly(level3_user_pgt,
41940 +                                XENFEAT_writable_page_tables);
41941 +       early_make_page_readonly(level2_kernel_pgt,
41942 +                                XENFEAT_writable_page_tables);
41943 +
41944 +       xen_pgd_pin(__pa_symbol(init_level4_pgt));
41945 +       xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
41946 +
41947 +       set_pgd((pgd_t *)(init_level4_user_pgt + 511), 
41948 +               mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
41949 +}
41950 +
41951 +void __init extend_init_mapping(unsigned long tables_space)
41952 +{
41953 +       unsigned long va = __START_KERNEL_map;
41954 +       unsigned long phys, addr, *pte_page;
41955 +       pmd_t *pmd;
41956 +       pte_t *pte, new_pte;
41957 +       unsigned long *page = (unsigned long *)init_level4_pgt;
41958 +
41959 +       addr = page[pgd_index(va)];
41960 +       addr_to_page(addr, page);
41961 +       addr = page[pud_index(va)];
41962 +       addr_to_page(addr, page);
41963 +
41964 +       /* Kill mapping of low 1MB. */
41965 +       while (va < (unsigned long)&_text) {
41966 +               HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
41967 +               va += PAGE_SIZE;
41968 +       }
41969 +
41970 +       /* Ensure init mappings cover kernel text/data and initial tables. */
41971 +       while (va < (__START_KERNEL_map
41972 +                    + (start_pfn << PAGE_SHIFT)
41973 +                    + tables_space)) {
41974 +               pmd = (pmd_t *)&page[pmd_index(va)];
41975 +               if (pmd_none(*pmd)) {
41976 +                       pte_page = alloc_static_page(&phys);
41977 +                       early_make_page_readonly(
41978 +                               pte_page, XENFEAT_writable_page_tables);
41979 +                       set_pmd(pmd, __pmd(phys | _KERNPG_TABLE | _PAGE_USER));
41980 +               } else {
41981 +                       addr = page[pmd_index(va)];
41982 +                       addr_to_page(addr, pte_page);
41983 +               }
41984 +               pte = (pte_t *)&pte_page[pte_index(va)];
41985 +               if (pte_none(*pte)) {
41986 +                       new_pte = pfn_pte(
41987 +                               (va - __START_KERNEL_map) >> PAGE_SHIFT, 
41988 +                               __pgprot(_KERNPG_TABLE | _PAGE_USER));
41989 +                       xen_l1_entry_update(pte, new_pte);
41990 +               }
41991 +               va += PAGE_SIZE;
41992 +       }
41993 +
41994 +       /* Finally, blow away any spurious initial mappings. */
41995 +       while (1) {
41996 +               pmd = (pmd_t *)&page[pmd_index(va)];
41997 +               if (pmd_none(*pmd))
41998 +                       break;
41999 +               HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
42000 +               va += PAGE_SIZE;
42001 +       }
42002 +}
42003 +
42004 +static void __init find_early_table_space(unsigned long end)
42005 +{
42006 +       unsigned long puds, pmds, ptes, tables; 
42007 +
42008 +       puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
42009 +       pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
42010 +       ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
42011 +
42012 +       tables = round_up(puds * 8, PAGE_SIZE) + 
42013 +               round_up(pmds * 8, PAGE_SIZE) + 
42014 +               round_up(ptes * 8, PAGE_SIZE); 
42015 +
42016 +       extend_init_mapping(tables);
42017 +
42018 +       table_start = start_pfn;
42019 +       table_end = table_start + (tables>>PAGE_SHIFT);
42020 +
42021 +       early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
42022 +               end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
42023 +}
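As a rough illustration of what find_early_table_space() reserves — a minimal standalone sketch, assuming a hypothetical guest with 4 GiB of memory and the usual x86-64 constants (PUD_SHIFT 30, PMD_SHIFT 21, PAGE_SHIFT 12, 8-byte entries). Since this path maps everything with 4 KiB ptes, the pte level dominates the total:

#include <stdio.h>

int main(void)
{
	unsigned long long end  = 4ULL << 30;           /* assumed: 4 GiB of guest memory */
	unsigned long long page = 4096, entry = 8;

	unsigned long long puds = (end + (1ULL << 30) - 1) >> 30;  /*       4 */
	unsigned long long pmds = (end + (1ULL << 21) - 1) >> 21;  /*    2048 */
	unsigned long long ptes = (end + page - 1) >> 12;          /* 1048576 */

	/* each level rounded up to whole pages, as in the function above */
	unsigned long long tables = ((puds * entry + page - 1) & ~(page - 1))
	                          + ((pmds * entry + page - 1) & ~(page - 1))
	                          + ((ptes * entry + page - 1) & ~(page - 1));

	printf("~%llu MiB of early page-table space\n", tables >> 20);  /* ~8 MiB */
	return 0;
}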
42024 +
42025 +/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
42026 +   This runs before bootmem is initialized and gets pages directly from the 
42027 +   physical memory. To access them they are temporarily mapped. */
42028 +void __meminit init_memory_mapping(unsigned long start, unsigned long end)
42029 +{ 
42030 +       unsigned long next; 
42031 +
42032 +       Dprintk("init_memory_mapping\n");
42033 +
42034 +       /* 
42035 +        * Find space for the kernel direct mapping tables.
42036 +        * Later we should allocate these tables in the local node of the memory
42037 +        * mapped.  Unfortunately this is done currently before the nodes are 
42038 +        * discovered.
42039 +        */
42040 +       if (!after_bootmem)
42041 +               find_early_table_space(end);
42042 +
42043 +       start = (unsigned long)__va(start);
42044 +       end = (unsigned long)__va(end);
42045 +
42046 +       for (; start < end; start = next) {
42047 +               unsigned long pud_phys; 
42048 +               pgd_t *pgd = pgd_offset_k(start);
42049 +               pud_t *pud;
42050 +
42051 +               if (after_bootmem) {
42052 +                       pud = pud_offset_k(pgd, start & PGDIR_MASK);
42053 +                       make_page_readonly(pud, XENFEAT_writable_page_tables);
42054 +                       pud_phys = __pa(pud);
42055 +               } else {
42056 +                       pud = alloc_static_page(&pud_phys);
42057 +                       early_make_page_readonly(pud, XENFEAT_writable_page_tables);
42058 +               }
42059 +               next = start + PGDIR_SIZE;
42060 +               if (next > end) 
42061 +                       next = end; 
42062 +               phys_pud_init(pud, __pa(start), __pa(next));
42063 +               if (!after_bootmem)
42064 +                       set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
42065 +       }
42066 +
42067 +       if (!after_bootmem) {
42068 +               BUG_ON(start_pfn != table_end);
42069 +
42070 +               /* Re-vector virtual addresses pointing into the initial
42071 +                  mapping to the just-established permanent ones. */
42072 +               xen_start_info = __va(__pa(xen_start_info));
42073 +               xen_start_info->pt_base = (unsigned long)
42074 +                       __va(__pa(xen_start_info->pt_base));
42075 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
42076 +                       phys_to_machine_mapping =
42077 +                               __va(__pa(xen_start_info->mfn_list));
42078 +                       xen_start_info->mfn_list = (unsigned long)
42079 +                               phys_to_machine_mapping;
42080 +               }
42081 +               if (xen_start_info->mod_start)
42082 +                       xen_start_info->mod_start = (unsigned long)
42083 +                               __va(__pa(xen_start_info->mod_start));
42084 +
42085 +               /* Destroy the Xen-created mappings beyond the kernel image as
42086 +                * well as the temporary mappings created above. Prevents
42087 +                * overlap with modules area (if init mapping is very big).
42088 +                */
42089 +               start = PAGE_ALIGN((unsigned long)_end);
42090 +               end   = __START_KERNEL_map + (table_end << PAGE_SHIFT);
42091 +               for (; start < end; start += PAGE_SIZE)
42092 +                       WARN_ON(HYPERVISOR_update_va_mapping(
42093 +                               start, __pte_ma(0), 0));
42094 +       }
42095 +
42096 +       __flush_tlb_all();
42097 +}
42098 +
42099 +void __cpuinit zap_low_mappings(int cpu)
42100 +{
42101 +       /* this is not required for Xen */
42102 +#if 0
42103 +       swap_low_mappings();
42104 +#endif
42105 +}
42106 +
42107 +/* Compute zone sizes for the DMA and DMA32 zones in a node. */
42108 +__init void
42109 +size_zones(unsigned long *z, unsigned long *h,
42110 +          unsigned long start_pfn, unsigned long end_pfn)
42111 +{
42112 +       int i;
42113 +#ifndef CONFIG_XEN
42114 +       unsigned long w;
42115 +#endif
42116 +
42117 +       for (i = 0; i < MAX_NR_ZONES; i++)
42118 +               z[i] = 0;
42119 +
42120 +#ifndef CONFIG_XEN
42121 +       if (start_pfn < MAX_DMA_PFN)
42122 +               z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
42123 +       if (start_pfn < MAX_DMA32_PFN) {
42124 +               unsigned long dma32_pfn = MAX_DMA32_PFN;
42125 +               if (dma32_pfn > end_pfn)
42126 +                       dma32_pfn = end_pfn;
42127 +               z[ZONE_DMA32] = dma32_pfn - start_pfn;
42128 +       }
42129 +       z[ZONE_NORMAL] = end_pfn - start_pfn;
42130 +
42131 +       /* Remove lower zones from higher ones. */
42132 +       w = 0;
42133 +       for (i = 0; i < MAX_NR_ZONES; i++) {
42134 +               if (z[i])
42135 +                       z[i] -= w;
42136 +               w += z[i];
42137 +       }
42138 +
42139 +       /* Compute holes */
42140 +       w = start_pfn;
42141 +       for (i = 0; i < MAX_NR_ZONES; i++) {
42142 +               unsigned long s = w;
42143 +               w += z[i];
42144 +               h[i] = e820_hole_size(s, w);
42145 +       }
42146 +
42147 +       /* Add the space needed for mem_map to the holes too. */
42148 +       for (i = 0; i < MAX_NR_ZONES; i++)
42149 +               h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
42150 +
42151 +       /* The 16MB DMA zone has the kernel and other misc mappings.
42152 +          Account for them too. */
42153 +       if (h[ZONE_DMA]) {
42154 +               h[ZONE_DMA] += dma_reserve;
42155 +               if (h[ZONE_DMA] >= z[ZONE_DMA]) {
42156 +                       printk(KERN_WARNING
42157 +                               "Kernel too large and filling up ZONE_DMA?\n");
42158 +                       h[ZONE_DMA] = z[ZONE_DMA];
42159 +               }
42160 +       }
42161 +#else
42162 +       z[ZONE_DMA] = end_pfn;
42163 +       for (i = 0; i < MAX_NR_ZONES; i++)
42164 +               h[i] = 0;
42165 +#endif
42166 +}
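To make the zone bookkeeping above concrete, here is a small standalone sketch of the "remove lower zones from higher ones" pass, under assumed figures: a non-Xen machine with 8 GiB, start_pfn = 0, MAX_DMA_PFN = 4096 (16 MiB) and MAX_DMA32_PFN = 1048576 (4 GiB). Under CONFIG_XEN the whole range instead lands in ZONE_DMA, as the #else branch shows.

#include <stdio.h>

int main(void)
{
	/* cumulative limits in pfns: DMA, DMA32, NORMAL */
	unsigned long z[3] = { 4096, 1048576, 2097152 };
	const char *name[3] = { "ZONE_DMA", "ZONE_DMA32", "ZONE_NORMAL" };
	unsigned long w = 0;
	int i;

	for (i = 0; i < 3; i++) {       /* remove lower zones from higher ones */
		if (z[i])
			z[i] -= w;
		w += z[i];
		printf("%-11s %8lu pages\n", name[i], z[i]);
	}
	/* -> 4096, 1044480 and 1048576 pages respectively */
	return 0;
}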
42167 +
42168 +#ifndef CONFIG_NUMA
42169 +void __init paging_init(void)
42170 +{
42171 +       unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
42172 +       int i;
42173 +
42174 +       memory_present(0, 0, end_pfn);
42175 +       sparse_init();
42176 +       size_zones(zones, holes, 0, end_pfn);
42177 +       free_area_init_node(0, NODE_DATA(0), zones,
42178 +                           __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
42179 +
42180 +       /* Switch to the real shared_info page, and clear the
42181 +        * dummy page. */
42182 +       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
42183 +       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
42184 +       memset(empty_zero_page, 0, sizeof(empty_zero_page));
42185 +
42186 +       init_mm.context.pinned = 1;
42187 +
42188 +       /* Set up mapping of the lower 1MB */
42189 +       for (i = 0; i < NR_FIX_ISAMAPS; i++)
42190 +               if (xen_start_info->flags & SIF_PRIVILEGED)
42191 +                       set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
42192 +               else
42193 +                       __set_fixmap(FIX_ISAMAP_BEGIN - i,
42194 +                                    virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
42195 +                                    PAGE_KERNEL_RO);
42196 +}
42197 +#endif
42198 +
42199 +/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
42200 +   from the CPU leading to inconsistent cache lines. address and size
42201 +   must be aligned to 2MB boundaries. 
42202 +   Does nothing when the mapping doesn't exist. */
42203 +void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
42204 +{
42205 +       unsigned long end = address + size;
42206 +
42207 +       BUG_ON(address & ~LARGE_PAGE_MASK);
42208 +       BUG_ON(size & ~LARGE_PAGE_MASK); 
42209 +       
42210 +       for (; address < end; address += LARGE_PAGE_SIZE) { 
42211 +               pgd_t *pgd = pgd_offset_k(address);
42212 +               pud_t *pud;
42213 +               pmd_t *pmd;
42214 +               if (pgd_none(*pgd))
42215 +                       continue;
42216 +               pud = pud_offset(pgd, address);
42217 +               if (pud_none(*pud))
42218 +                       continue; 
42219 +               pmd = pmd_offset(pud, address);
42220 +               if (!pmd || pmd_none(*pmd))
42221 +                       continue; 
42222 +               if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
42223 +                       /* Could handle this, but it should not happen currently. */
42224 +                       printk(KERN_ERR 
42225 +              "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
42226 +                       pmd_ERROR(*pmd); 
42227 +               }
42228 +               set_pmd(pmd, __pmd(0));                 
42229 +       }
42230 +       __flush_tlb_all();
42231 +} 
42232 +
42233 +/*
42234 + * Memory hotplug specific functions
42235 + */
42236 +#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
42237 +
42238 +void online_page(struct page *page)
42239 +{
42240 +       ClearPageReserved(page);
42241 +       init_page_count(page);
42242 +       __free_page(page);
42243 +       totalram_pages++;
42244 +       num_physpages++;
42245 +}
42246 +
42247 +#ifndef CONFIG_MEMORY_HOTPLUG
42248 +/*
42249 + * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
42250 + * just online the pages.
42251 + */
42252 +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
42253 +{
42254 +       int err = -EIO;
42255 +       unsigned long pfn;
42256 +       unsigned long total = 0, mem = 0;
42257 +       for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
42258 +               if (pfn_valid(pfn)) {
42259 +                       online_page(pfn_to_page(pfn));
42260 +                       err = 0;
42261 +                       mem++;
42262 +               }
42263 +               total++;
42264 +       }
42265 +       if (!err) {
42266 +               z->spanned_pages += total;
42267 +               z->present_pages += mem;
42268 +               z->zone_pgdat->node_spanned_pages += total;
42269 +               z->zone_pgdat->node_present_pages += mem;
42270 +       }
42271 +       return err;
42272 +}
42273 +#endif
42274 +
42275 +/*
42276 + * Memory is always added to the NORMAL zone. This means you will never get
42277 + * additional DMA/DMA32 memory.
42278 + */
42279 +int add_memory(u64 start, u64 size)
42280 +{
42281 +       struct pglist_data *pgdat = NODE_DATA(0);
42282 +       struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
42283 +       unsigned long start_pfn = start >> PAGE_SHIFT;
42284 +       unsigned long nr_pages = size >> PAGE_SHIFT;
42285 +       int ret;
42286 +
42287 +       ret = __add_pages(zone, start_pfn, nr_pages);
42288 +       if (ret)
42289 +               goto error;
42290 +
42291 +       init_memory_mapping(start, (start + size -1));
42292 +
42293 +       return ret;
42294 +error:
42295 +       printk("%s: Problem encountered in __add_pages!\n", __func__);
42296 +       return ret;
42297 +}
42298 +EXPORT_SYMBOL_GPL(add_memory);
42299 +
42300 +int remove_memory(u64 start, u64 size)
42301 +{
42302 +       return -EINVAL;
42303 +}
42304 +EXPORT_SYMBOL_GPL(remove_memory);
42305 +
42306 +#endif
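A hypothetical caller sketch, not part of the patch: roughly how a hot-add path might hand a new range to the add_memory() above (the start address, size and header locations are assumptions). The new pages always end up in the NORMAL zone, and add_memory() extends the direct mapping via init_memory_mapping().

#include <linux/types.h>
#include <linux/init.h>
#include <linux/memory_hotplug.h>   /* assumed header for add_memory() */

static int __init example_hotadd(void)
{
	u64 start = 0x100000000ULL;     /* assumed: new memory at 4 GiB */
	u64 size  = 256ULL << 20;       /* assumed: 256 MiB             */

	/* onlines the pages, then init_memory_mapping(start, start + size - 1) */
	return add_memory(start, size);
}
late_initcall(example_hotadd);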
42307 +
42308 +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
42309 +                        kcore_vsyscall;
42310 +
42311 +void __init mem_init(void)
42312 +{
42313 +       long codesize, reservedpages, datasize, initsize;
42314 +       unsigned long pfn;
42315 +
42316 +       contiguous_bitmap = alloc_bootmem_low_pages(
42317 +               (end_pfn + 2*BITS_PER_LONG) >> 3);
42318 +       BUG_ON(!contiguous_bitmap);
42319 +       memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);
42320 +
42321 +#if defined(CONFIG_SWIOTLB)
42322 +       pci_swiotlb_init();     
42323 +#endif
42324 +       no_iommu_init();
42325 +
42326 +       /* How many end-of-memory variables you have, grandma! */
42327 +       max_low_pfn = end_pfn;
42328 +       max_pfn = end_pfn;
42329 +       num_physpages = end_pfn;
42330 +       high_memory = (void *) __va(end_pfn * PAGE_SIZE);
42331 +
42332 +       /* clear the zero-page */
42333 +       memset(empty_zero_page, 0, PAGE_SIZE);
42334 +
42335 +       reservedpages = 0;
42336 +
42337 +       /* this will put all low memory onto the freelists */
42338 +#ifdef CONFIG_NUMA
42339 +       totalram_pages = numa_free_all_bootmem();
42340 +#else
42341 +       totalram_pages = free_all_bootmem();
42342 +#endif
42343 +       /* XEN: init and count pages outside initial allocation. */
42344 +       for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
42345 +               ClearPageReserved(&mem_map[pfn]);
42346 +               init_page_count(&mem_map[pfn]);
42347 +               totalram_pages++;
42348 +       }
42349 +       reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
42350 +
42351 +       after_bootmem = 1;
42352 +
42353 +       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
42354 +       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
42355 +       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
42356 +
42357 +       /* Register memory areas for /proc/kcore */
42358 +       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
42359 +       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
42360 +                  VMALLOC_END-VMALLOC_START);
42361 +       kclist_add(&kcore_kernel, &_stext, _end - _stext);
42362 +       kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
42363 +       kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
42364 +                                VSYSCALL_END - VSYSCALL_START);
42365 +
42366 +       printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
42367 +               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
42368 +               end_pfn << (PAGE_SHIFT-10),
42369 +               codesize >> 10,
42370 +               reservedpages << (PAGE_SHIFT-10),
42371 +               datasize >> 10,
42372 +               initsize >> 10);
42373 +
42374 +#ifndef CONFIG_XEN
42375 +#ifdef CONFIG_SMP
42376 +       /*
42377 +        * Sync boot_level4_pgt mappings with the init_level4_pgt
42378 +        * except for the low identity mappings which are already zapped
42379 +        * in init_level4_pgt. This sync-up is essential for AP's bringup
42380 +        */
42381 +       memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
42382 +#endif
42383 +#endif
42384 +}
42385 +
42386 +void free_initmem(void)
42387 +{
42388 +#ifdef __DO_LATER__
42389 +       /*
42390 +        * Some pages can be pinned, but some are not. Unpinning such pages 
42391 +        * triggers BUG(). 
42392 +        */
42393 +       unsigned long addr;
42394 +
42395 +       addr = (unsigned long)(&__init_begin);
42396 +       for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
42397 +               ClearPageReserved(virt_to_page(addr));
42398 +               init_page_count(virt_to_page(addr));
42399 +               memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
42400 +               make_page_writable(
42401 +                       __va(__pa(addr)), XENFEAT_writable_page_tables);
42402 +               /*
42403 +                * Make pages from __PAGE_OFFSET address as well
42404 +                */
42405 +               make_page_writable(
42406 +                       (void *)addr, XENFEAT_writable_page_tables);
42407 +               free_page(addr);
42408 +               totalram_pages++;
42409 +       }
42410 +       memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
42411 +       printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
42412 +#endif
42413 +}
42414 +
42415 +#ifdef CONFIG_DEBUG_RODATA
42416 +
42417 +extern char __start_rodata, __end_rodata;
42418 +void mark_rodata_ro(void)
42419 +{
42420 +       unsigned long addr = (unsigned long)&__start_rodata;
42421 +
42422 +       for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
42423 +               change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
42424 +
42425 +       printk ("Write protecting the kernel read-only data: %luk\n",
42426 +                       (&__end_rodata - &__start_rodata) >> 10);
42427 +
42428 +       /*
42429 +        * change_page_attr_addr() requires a global_flush_tlb() call after it.
42430 +        * We do this after the printk so that if something went wrong in the
42431 +        * change, the printk gets out at least to give a better debug hint
42432 +        * of who is the culprit.
42433 +        */
42434 +       global_flush_tlb();
42435 +}
42436 +#endif
42437 +
42438 +#ifdef CONFIG_BLK_DEV_INITRD
42439 +void free_initrd_mem(unsigned long start, unsigned long end)
42440 +{
42441 +       if (start >= end)
42442 +               return;
42443 +       printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
42444 +       for (; start < end; start += PAGE_SIZE) {
42445 +               ClearPageReserved(virt_to_page(start));
42446 +               init_page_count(virt_to_page(start));
42447 +               free_page(start);
42448 +               totalram_pages++;
42449 +       }
42450 +}
42451 +#endif
42452 +
42453 +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
42454 +{ 
42455 +       /* Should check here against the e820 map to avoid double free */ 
42456 +#ifdef CONFIG_NUMA
42457 +       int nid = phys_to_nid(phys);
42458 +       reserve_bootmem_node(NODE_DATA(nid), phys, len);
42459 +#else                  
42460 +       reserve_bootmem(phys, len);    
42461 +#endif
42462 +       if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
42463 +               dma_reserve += len / PAGE_SIZE;
42464 +}
42465 +
42466 +int kern_addr_valid(unsigned long addr) 
42467 +{ 
42468 +       unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
42469 +       pgd_t *pgd;
42470 +       pud_t *pud;
42471 +       pmd_t *pmd;
42472 +       pte_t *pte;
42473 +
42474 +       if (above != 0 && above != -1UL)
42475 +               return 0; 
42476 +       
42477 +       pgd = pgd_offset_k(addr);
42478 +       if (pgd_none(*pgd))
42479 +               return 0;
42480 +
42481 +       pud = pud_offset_k(pgd, addr);
42482 +       if (pud_none(*pud))
42483 +               return 0; 
42484 +
42485 +       pmd = pmd_offset(pud, addr);
42486 +       if (pmd_none(*pmd))
42487 +               return 0;
42488 +       if (pmd_large(*pmd))
42489 +               return pfn_valid(pmd_pfn(*pmd));
42490 +
42491 +       pte = pte_offset_kernel(pmd, addr);
42492 +       if (pte_none(*pte))
42493 +               return 0;
42494 +       return pfn_valid(pte_pfn(*pte));
42495 +}
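The "above" test at the top of kern_addr_valid() is just a canonical-address check: with the x86-64 value __VIRTUAL_MASK_SHIFT = 47 assumed here, only addresses whose top 17 bits are all zero or all one pass. A tiny standalone illustration:

#include <stdio.h>

static int canonical(unsigned long long addr)
{
	long long above = (long long)addr >> 47;   /* arithmetic shift, as in the kernel */
	return above == 0 || above == -1LL;
}

int main(void)
{
	printf("%d\n", canonical(0x00007f0000000000ULL)); /* 1: user-canonical      */
	printf("%d\n", canonical(0xffff880000000000ULL)); /* 1: kernel direct map   */
	printf("%d\n", canonical(0x0000800000000000ULL)); /* 0: non-canonical hole  */
	return 0;
}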
42496 +
42497 +#ifdef CONFIG_SYSCTL
42498 +#include <linux/sysctl.h>
42499 +
42500 +extern int exception_trace, page_fault_trace;
42501 +
42502 +static ctl_table debug_table2[] = {
42503 +       { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
42504 +         proc_dointvec },
42505 +       { 0, }
42506 +}; 
42507 +
42508 +static ctl_table debug_root_table2[] = { 
42509 +       { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, 
42510 +          .child = debug_table2 }, 
42511 +       { 0 }, 
42512 +}; 
42513 +
42514 +static __init int x8664_sysctl_init(void)
42515 +{ 
42516 +       register_sysctl_table(debug_root_table2, 1);
42517 +       return 0;
42518 +}
42519 +__initcall(x8664_sysctl_init);
42520 +#endif
42521 +
42522 +/* A pseudo VMA to allow ptrace access to the vsyscall page.   This only
42523 +   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
42524 +   not need special handling anymore. */
42525 +
42526 +static struct vm_area_struct gate_vma = {
42527 +       .vm_start = VSYSCALL_START,
42528 +       .vm_end = VSYSCALL_END,
42529 +       .vm_page_prot = PAGE_READONLY
42530 +};
42531 +
42532 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
42533 +{
42534 +#ifdef CONFIG_IA32_EMULATION
42535 +       if (test_tsk_thread_flag(tsk, TIF_IA32))
42536 +               return NULL;
42537 +#endif
42538 +       return &gate_vma;
42539 +}
42540 +
42541 +int in_gate_area(struct task_struct *task, unsigned long addr)
42542 +{
42543 +       struct vm_area_struct *vma = get_gate_vma(task);
42544 +       if (!vma)
42545 +               return 0;
42546 +       return (addr >= vma->vm_start) && (addr < vma->vm_end);
42547 +}
42548 +
42549 +/* Use this when you have no reliable task/vma, typically from interrupt
42550 + * context.  It is less reliable than using the task's vma and may give
42551 + * false positives.
42552 + */
42553 +int in_gate_area_no_task(unsigned long addr)
42554 +{
42555 +       return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
42556 +}
42557 diff -urNp linux-2.6/arch/x86_64/mm/Makefile new/arch/x86_64/mm/Makefile
42558 --- linux-2.6/arch/x86_64/mm/Makefile   2006-07-03 14:14:30.000000000 +0200
42559 +++ new/arch/x86_64/mm/Makefile 2006-05-09 12:33:21.000000000 +0200
42560 @@ -9,3 +9,13 @@ obj-$(CONFIG_K8_NUMA) += k8topology.o
42561  obj-$(CONFIG_ACPI_NUMA) += srat.o
42562  
42563  hugetlbpage-y = ../../i386/mm/hugetlbpage.o
42564 +
42565 +ifdef CONFIG_XEN
42566 +include $(srctree)/scripts/Makefile.xen
42567 +
42568 +ioremap-y      += ../../i386/mm/ioremap-xen.o
42569 +hypervisor-y   += ../../i386/mm/hypervisor.o
42570 +obj-y          += hypervisor.o
42571 +
42572 +obj-y := $(call cherrypickxen, $(obj-y))
42573 +endif
42574 diff -urNp linux-2.6/arch/x86_64/mm/pageattr-xen.c new/arch/x86_64/mm/pageattr-xen.c
42575 --- linux-2.6/arch/x86_64/mm/pageattr-xen.c     1970-01-01 01:00:00.000000000 +0100
42576 +++ new/arch/x86_64/mm/pageattr-xen.c   2006-06-28 14:32:14.000000000 +0200
42577 @@ -0,0 +1,396 @@
42578 +/* 
42579 + * Copyright 2002 Andi Kleen, SuSE Labs. 
42580 + * Thanks to Ben LaHaise for precious feedback.
42581 + */ 
42582 +
42583 +#include <linux/config.h>
42584 +#include <linux/mm.h>
42585 +#include <linux/sched.h>
42586 +#include <linux/highmem.h>
42587 +#include <linux/module.h>
42588 +#include <linux/slab.h>
42589 +#include <asm/uaccess.h>
42590 +#include <asm/processor.h>
42591 +#include <asm/tlbflush.h>
42592 +#include <asm/io.h>
42593 +
42594 +#ifdef CONFIG_XEN
42595 +#include <asm/pgalloc.h>
42596 +#include <asm/mmu_context.h>
42597 +
42598 +LIST_HEAD(mm_unpinned);
42599 +DEFINE_SPINLOCK(mm_unpinned_lock);
42600 +
42601 +static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
42602 +{
42603 +       struct page *page = virt_to_page(pt);
42604 +       unsigned long pfn = page_to_pfn(page);
42605 +
42606 +       BUG_ON(HYPERVISOR_update_va_mapping(
42607 +                      (unsigned long)__va(pfn << PAGE_SHIFT),
42608 +                      pfn_pte(pfn, flags), 0));
42609 +}
42610 +
42611 +static void mm_walk(struct mm_struct *mm, pgprot_t flags)
42612 +{
42613 +       pgd_t       *pgd;
42614 +       pud_t       *pud;
42615 +       pmd_t       *pmd;
42616 +       pte_t       *pte;
42617 +       int          g,u,m;
42618 +
42619 +       pgd = mm->pgd;
42620 +       /*
42621 +        * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
42622 +        * be the 'current' task's pagetables (e.g., current may be 32-bit,
42623 +        * but the pagetables may be for a 64-bit task).
42624 +        * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
42625 +        * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
42626 +        */
42627 +       for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
42628 +               if (pgd_none(*pgd))
42629 +                       continue;
42630 +               pud = pud_offset(pgd, 0);
42631 +               if (PTRS_PER_PUD > 1) /* not folded */ 
42632 +                       mm_walk_set_prot(pud,flags);
42633 +               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
42634 +                       if (pud_none(*pud))
42635 +                               continue;
42636 +                       pmd = pmd_offset(pud, 0);
42637 +                       if (PTRS_PER_PMD > 1) /* not folded */ 
42638 +                               mm_walk_set_prot(pmd,flags);
42639 +                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
42640 +                               if (pmd_none(*pmd))
42641 +                                       continue;
42642 +                               pte = pte_offset_kernel(pmd,0);
42643 +                               mm_walk_set_prot(pte,flags);
42644 +                       }
42645 +               }
42646 +       }
42647 +}
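A quick sanity check of the loop bound explained in the comment above, using assumed values of that era (TASK_SIZE64 = 0x800000000000 - 4096 and PGDIR_SIZE = 512 GiB): the pgd loop visits entries 0..255, i.e. exactly the user half of the 512-entry pgd, whether or not TASK_SIZE64 is a multiple of PGDIR_SIZE.

#include <stdio.h>

int main(void)
{
	unsigned long long task_size64 = 0x800000000000ULL - 4096; /* assumed value */
	unsigned long long pgdir_size  = 1ULL << 39;               /* 512 GiB       */

	unsigned long long last = (task_size64 - 1) / pgdir_size;  /* 255 */
	printf("pgd entries walked: 0..%llu (%llu of 512)\n", last, last + 1);
	return 0;
}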
42648 +
42649 +void mm_pin(struct mm_struct *mm)
42650 +{
42651 +       if (xen_feature(XENFEAT_writable_page_tables))
42652 +               return;
42653 +
42654 +       spin_lock(&mm->page_table_lock);
42655 +
42656 +       mm_walk(mm, PAGE_KERNEL_RO);
42657 +       BUG_ON(HYPERVISOR_update_va_mapping(
42658 +                      (unsigned long)mm->pgd,
42659 +                      pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
42660 +                      UVMF_TLB_FLUSH));
42661 +       BUG_ON(HYPERVISOR_update_va_mapping(
42662 +                      (unsigned long)__user_pgd(mm->pgd),
42663 +                      pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL_RO),
42664 +                      UVMF_TLB_FLUSH));
42665 +       xen_pgd_pin(__pa(mm->pgd)); /* kernel */
42666 +       xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
42667 +       mm->context.pinned = 1;
42668 +       spin_lock(&mm_unpinned_lock);
42669 +       list_del(&mm->context.unpinned);
42670 +       spin_unlock(&mm_unpinned_lock);
42671 +
42672 +       spin_unlock(&mm->page_table_lock);
42673 +}
42674 +
42675 +void mm_unpin(struct mm_struct *mm)
42676 +{
42677 +       if (xen_feature(XENFEAT_writable_page_tables))
42678 +               return;
42679 +
42680 +       spin_lock(&mm->page_table_lock);
42681 +
42682 +       xen_pgd_unpin(__pa(mm->pgd));
42683 +       xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
42684 +       BUG_ON(HYPERVISOR_update_va_mapping(
42685 +                      (unsigned long)mm->pgd,
42686 +                      pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
42687 +       BUG_ON(HYPERVISOR_update_va_mapping(
42688 +                      (unsigned long)__user_pgd(mm->pgd),
42689 +                      pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL), 0));
42690 +       mm_walk(mm, PAGE_KERNEL);
42691 +       xen_tlb_flush();
42692 +       mm->context.pinned = 0;
42693 +       spin_lock(&mm_unpinned_lock);
42694 +       list_add(&mm->context.unpinned, &mm_unpinned);
42695 +       spin_unlock(&mm_unpinned_lock);
42696 +
42697 +       spin_unlock(&mm->page_table_lock);
42698 +}
42699 +
42700 +void mm_pin_all(void)
42701 +{
42702 +       if (xen_feature(XENFEAT_writable_page_tables))
42703 +               return;
42704 +
42705 +       while (!list_empty(&mm_unpinned))       
42706 +               mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
42707 +                                 context.unpinned));
42708 +}
42709 +
42710 +void _arch_dup_mmap(struct mm_struct *mm)
42711 +{
42712 +    if (!mm->context.pinned)
42713 +        mm_pin(mm);
42714 +}
42715 +
42716 +void _arch_exit_mmap(struct mm_struct *mm)
42717 +{
42718 +    struct task_struct *tsk = current;
42719 +
42720 +    task_lock(tsk);
42721 +
42722 +    /*
42723 +     * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
42724 +     * *much* faster this way, as avoiding TLB flushes allows bigger writable-pagetable batches.
42725 +     */
42726 +    if ( tsk->active_mm == mm )
42727 +    {
42728 +        tsk->active_mm = &init_mm;
42729 +        atomic_inc(&init_mm.mm_count);
42730 +
42731 +        switch_mm(mm, &init_mm, tsk);
42732 +
42733 +        atomic_dec(&mm->mm_count);
42734 +        BUG_ON(atomic_read(&mm->mm_count) == 0);
42735 +    }
42736 +
42737 +    task_unlock(tsk);
42738 +
42739 +    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
42740 +        mm_unpin(mm);
42741 +}
42742 +
42743 +void pte_free(struct page *pte)
42744 +{
42745 +       unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
42746 +
42747 +       if (!pte_write(*virt_to_ptep(va)))
42748 +               BUG_ON(HYPERVISOR_update_va_mapping(
42749 +                       va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
42750 +       __free_page(pte);
42751 +}
42752 +#endif /* CONFIG_XEN */
42753 +
42754 +static inline pte_t *lookup_address(unsigned long address) 
42755 +{ 
42756 +       pgd_t *pgd = pgd_offset_k(address);
42757 +       pud_t *pud;
42758 +       pmd_t *pmd;
42759 +       pte_t *pte;
42760 +       if (pgd_none(*pgd))
42761 +               return NULL;
42762 +       pud = pud_offset(pgd, address);
42763 +       if (!pud_present(*pud))
42764 +               return NULL; 
42765 +       pmd = pmd_offset(pud, address);
42766 +       if (!pmd_present(*pmd))
42767 +               return NULL; 
42768 +       if (pmd_large(*pmd))
42769 +               return (pte_t *)pmd;
42770 +       pte = pte_offset_kernel(pmd, address);
42771 +       if (pte && !pte_present(*pte))
42772 +               pte = NULL; 
42773 +       return pte;
42774 +} 
42775 +
42776 +static struct page *split_large_page(unsigned long address, pgprot_t prot,
42777 +                                    pgprot_t ref_prot)
42778 +{ 
42779 +       int i; 
42780 +       unsigned long addr;
42781 +       struct page *base = alloc_pages(GFP_KERNEL, 0);
42782 +       pte_t *pbase;
42783 +       if (!base) 
42784 +               return NULL;
42785 +       /*
42786 +        * page_private is used to track the number of entries in
42787 +        * the page table page that have non-standard attributes.
42788 +        */
42789 +       SetPagePrivate(base);
42790 +       page_private(base) = 0;
42791 +
42792 +       address = __pa(address);
42793 +       addr = address & LARGE_PAGE_MASK; 
42794 +       pbase = (pte_t *)page_address(base);
42795 +       for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
42796 +               pbase[i] = pfn_pte(addr >> PAGE_SHIFT, 
42797 +                                  addr == address ? prot : ref_prot);
42798 +       }
42799 +       return base;
42800 +} 
42801 +
42802 +
42803 +static void flush_kernel_map(void *address) 
42804 +{
42805 +       if (0 && address && cpu_has_clflush) {
42806 +               /* is this worth it? */ 
42807 +               int i;
42808 +               for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) 
42809 +                       asm volatile("clflush (%0)" :: "r" (address + i)); 
42810 +       } else
42811 +               asm volatile("wbinvd":::"memory"); 
42812 +       if (address)
42813 +               __flush_tlb_one(address);
42814 +       else
42815 +               __flush_tlb_all();
42816 +}
42817 +
42818 +
42819 +static inline void flush_map(unsigned long address)
42820 +{      
42821 +       on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
42822 +}
42823 +
42824 +static struct page *deferred_pages; /* protected by init_mm.mmap_sem */
42825 +
42826 +static inline void save_page(struct page *fpage)
42827 +{
42828 +       fpage->lru.next = (struct list_head *)deferred_pages;
42829 +       deferred_pages = fpage;
42830 +}
42831 +
42832 +/* 
42833 + * No more special protections in this 2/4MB area - revert to a
42834 + * large page again. 
42835 + */
42836 +static void revert_page(unsigned long address, pgprot_t ref_prot)
42837 +{
42838 +       pgd_t *pgd;
42839 +       pud_t *pud;
42840 +       pmd_t *pmd;
42841 +       pte_t large_pte;
42842 +
42843 +       pgd = pgd_offset_k(address);
42844 +       BUG_ON(pgd_none(*pgd));
42845 +       pud = pud_offset(pgd,address);
42846 +       BUG_ON(pud_none(*pud));
42847 +       pmd = pmd_offset(pud, address);
42848 +       BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
42849 +       pgprot_val(ref_prot) |= _PAGE_PSE;
42850 +       large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
42851 +       set_pte((pte_t *)pmd, large_pte);
42852 +}      
42853 +
42854 +static int
42855 +__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
42856 +                                  pgprot_t ref_prot)
42857 +{ 
42858 +       pte_t *kpte; 
42859 +       struct page *kpte_page;
42860 +       unsigned kpte_flags;
42861 +       pgprot_t ref_prot2;
42862 +       kpte = lookup_address(address);
42863 +       if (!kpte) return 0;
42864 +       kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
42865 +       kpte_flags = pte_val(*kpte); 
42866 +       if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
42867 +               if ((kpte_flags & _PAGE_PSE) == 0) { 
42868 +                       set_pte(kpte, pfn_pte(pfn, prot));
42869 +               } else {
42870 +                       /*
42871 +                        * split_large_page will take the reference for this
42872 +                        * change_page_attr on the split page.
42873 +                        */
42874 +
42875 +                       struct page *split;
42876 +                       ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
42877 +
42878 +                       split = split_large_page(address, prot, ref_prot2);
42879 +                       if (!split)
42880 +                               return -ENOMEM;
42881 +                       set_pte(kpte,mk_pte(split, ref_prot2));
42882 +                       kpte_page = split;
42883 +               }       
42884 +               page_private(kpte_page)++;
42885 +       } else if ((kpte_flags & _PAGE_PSE) == 0) { 
42886 +               set_pte(kpte, pfn_pte(pfn, ref_prot));
42887 +               BUG_ON(page_private(kpte_page) == 0);
42888 +               page_private(kpte_page)--;
42889 +       } else
42890 +               BUG();
42891 +
42892 +       /* on x86-64 the direct mapping set at boot is not using 4k pages */
42893 +       /*
42894 +        * ..., but the XEN guest kernels (currently) do:
42895 +        * If the pte was reserved, it means it was created at boot
42896 +        * time (not via split_large_page) and in turn we must not
42897 +        * replace it with a large page.
42898 +        */
42899 +#ifndef CONFIG_XEN
42900 +       BUG_ON(PageReserved(kpte_page));
42901 +#endif
42902 +       if (page_private(kpte_page) == 0) {
42903 +               save_page(kpte_page);
42904 +               revert_page(address, ref_prot);
42905 +       }
42906 +       return 0;
42907 +} 
42908 +
42909 +/*
42910 + * Change the page attributes of a page in the linear mapping.
42911 + *
42912 + * This should be used when a page is mapped with a different caching policy
42913 + * than write-back somewhere - some CPUs do not like it when mappings with
42914 + * different caching policies exist. This changes the page attributes of the
42915 + * in-kernel linear mapping too.
42916 + * 
42917 + * The caller needs to ensure that there are no conflicting mappings elsewhere.
42918 + * This function only deals with the kernel linear map.
42919 + * 
42920 + * Caller must call global_flush_tlb() after this.
42921 + */
42922 +int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
42923 +{
42924 +       int err = 0; 
42925 +       int i; 
42926 +
42927 +       down_write(&init_mm.mmap_sem);
42928 +       for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
42929 +               unsigned long pfn = __pa(address) >> PAGE_SHIFT;
42930 +
42931 +               err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
42932 +               if (err) 
42933 +                       break; 
42934 +               /* Handle kernel mapping too which aliases part of the
42935 +                * lowmem */
42936 +               if (__pa(address) < KERNEL_TEXT_SIZE) {
42937 +                       unsigned long addr2;
42938 +                       pgprot_t prot2 = prot;
42939 +                       addr2 = __START_KERNEL_map + __pa(address);
42940 +                       pgprot_val(prot2) &= ~_PAGE_NX;
42941 +                       err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
42942 +               } 
42943 +       }       
42944 +       up_write(&init_mm.mmap_sem); 
42945 +       return err;
42946 +}
42947 +
42948 +/* Don't call this for MMIO areas that may not have a mem_map entry */
42949 +int change_page_attr(struct page *page, int numpages, pgprot_t prot)
42950 +{
42951 +       unsigned long addr = (unsigned long)page_address(page);
42952 +       return change_page_attr_addr(addr, numpages, prot);
42953 +}
42954 +
42955 +void global_flush_tlb(void)
42956 +{ 
42957 +       struct page *dpage;
42958 +
42959 +       down_read(&init_mm.mmap_sem);
42960 +       dpage = xchg(&deferred_pages, NULL);
42961 +       up_read(&init_mm.mmap_sem);
42962 +
42963 +       flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0);
42964 +       while (dpage) {
42965 +               struct page *tmp = dpage;
42966 +               dpage = (struct page *)dpage->lru.next;
42967 +               ClearPagePrivate(tmp);
42968 +               __free_page(tmp);
42969 +       } 
42970 +} 
42971 +
42972 +EXPORT_SYMBOL(change_page_attr);
42973 +EXPORT_SYMBOL(global_flush_tlb);
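A minimal usage sketch of the contract documented above (make_range_uncached(), some_addr and some_pages are illustrative names; header locations assumed): batch the attribute changes, then issue a single global_flush_tlb(), which also frees any pte pages queued through save_page()/revert_page().

#include <asm/cacheflush.h>   /* change_page_attr_addr(), global_flush_tlb() */
#include <asm/pgtable.h>      /* PAGE_KERNEL_NOCACHE */

static void make_range_uncached(unsigned long some_addr, int some_pages)
{
	/* remap the linear-mapping alias of the range as uncacheable */
	if (change_page_attr_addr(some_addr, some_pages, PAGE_KERNEL_NOCACHE))
		return;

	/* required by change_page_attr*(): flush stale TLB/cache entries */
	global_flush_tlb();
}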
42974 diff -urNp linux-2.6/arch/x86_64/oprofile/Makefile new/arch/x86_64/oprofile/Makefile
42975 --- linux-2.6/arch/x86_64/oprofile/Makefile     2006-07-03 14:14:30.000000000 +0200
42976 +++ new/arch/x86_64/oprofile/Makefile   2006-05-09 12:33:21.000000000 +0200
42977 @@ -11,9 +11,12 @@ DRIVER_OBJS = $(addprefix ../../../drive
42978         oprofilefs.o oprofile_stats.o \
42979         timer_int.o )
42980  
42981 +ifdef CONFIG_XEN
42982 +OPROFILE-y := xenoprof.o
42983 +else
42984  OPROFILE-y := init.o backtrace.o
42985  OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \
42986                                      op_model_ppro.o
42987  OPROFILE-$(CONFIG_X86_IO_APIC)    += nmi_timer_int.o 
42988 -
42989 +endif
42990  oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y))
42991 diff -urNp linux-2.6/arch/x86_64/pci/Makefile new/arch/x86_64/pci/Makefile
42992 --- linux-2.6/arch/x86_64/pci/Makefile  2006-07-03 14:14:30.000000000 +0200
42993 +++ new/arch/x86_64/pci/Makefile        2006-05-09 12:33:21.000000000 +0200
42994 @@ -15,11 +15,23 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
42995  
42996  obj-$(CONFIG_NUMA)     += k8-bus.o
42997  
42998 +# pcifront should be after mmconfig.o and direct.o as it should only
42999 +# take over if direct access to the PCI bus is unavailable
43000 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront.o
43001 +
43002  direct-y += ../../i386/pci/direct.o
43003  acpi-y   += ../../i386/pci/acpi.o
43004 +pcifront-y += ../../i386/pci/pcifront.o
43005  legacy-y += ../../i386/pci/legacy.o
43006  irq-y    += ../../i386/pci/irq.o
43007  common-y += ../../i386/pci/common.o
43008  fixup-y  += ../../i386/pci/fixup.o
43009  i386-y  += ../../i386/pci/i386.o
43010  init-y += ../../i386/pci/init.o
43011 +
43012 +ifdef CONFIG_XEN
43013 +irq-y          := ../../i386/pci/irq-xen.o
43014 +include $(srctree)/scripts/Makefile.xen
43015 +
43016 +obj-y := $(call cherrypickxen, $(obj-y))
43017 +endif
43018 diff -urNp linux-2.6/Documentation/networking/netdevices.txt new/Documentation/networking/netdevices.txt
43019 --- linux-2.6/Documentation/networking/netdevices.txt   2006-07-03 14:14:08.000000000 +0200
43020 +++ new/Documentation/networking/netdevices.txt 2006-07-07 15:10:03.000000000 +0200
43021 @@ -42,9 +42,9 @@ dev->get_stats:
43022         Context: nominally process, but don't sleep inside an rwlock
43023  
43024  dev->hard_start_xmit:
43025 -       Synchronization: dev->xmit_lock spinlock.
43026 +       Synchronization: netif_tx_lock spinlock.
43027         When the driver sets NETIF_F_LLTX in dev->features this will be
43028 -       called without holding xmit_lock. In this case the driver 
43029 +       called without holding netif_tx_lock. In this case the driver
43030         has to lock by itself when needed. It is recommended to use a try lock
43031         for this and return -1 when the spin lock fails. 
43032         The locking there should also properly protect against 
43033 @@ -62,12 +62,12 @@ dev->hard_start_xmit:
43034           Only valid when NETIF_F_LLTX is set.
43035  
43036  dev->tx_timeout:
43037 -       Synchronization: dev->xmit_lock spinlock.
43038 +       Synchronization: netif_tx_lock spinlock.
43039         Context: BHs disabled
43040         Notes: netif_queue_stopped() is guaranteed true
43041  
43042  dev->set_multicast_list:
43043 -       Synchronization: dev->xmit_lock spinlock.
43044 +       Synchronization: netif_tx_lock spinlock.
43045         Context: BHs disabled
43046  
43047  dev->poll:
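A minimal sketch of the NETIF_F_LLTX locking pattern the text above recommends (struct my_priv, my_queue_frame() and the lock are illustrative names; NETDEV_TX_LOCKED is the -1 return the text refers to):

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>

struct my_priv {
	spinlock_t tx_lock;             /* driver's own TX lock (NETIF_F_LLTX) */
};

static void my_queue_frame(struct my_priv *priv, struct sk_buff *skb)
{
	/* hardware-specific transmit path; omitted in this sketch */
}

static int my_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct my_priv *priv = netdev_priv(dev);

	/* With NETIF_F_LLTX the core does not hold netif_tx_lock for us,
	 * so take the driver's own lock - and only try, as recommended above. */
	if (!spin_trylock(&priv->tx_lock))
		return NETDEV_TX_LOCKED;

	my_queue_frame(priv, skb);      /* hand the frame to the hardware */

	spin_unlock(&priv->tx_lock);
	return NETDEV_TX_OK;
}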
43048 diff -urNp linux-2.6/drivers/acpi/Kconfig new/drivers/acpi/Kconfig
43049 --- linux-2.6/drivers/acpi/Kconfig      2006-07-03 14:14:31.000000000 +0200
43050 +++ new/drivers/acpi/Kconfig    2006-05-09 12:33:23.000000000 +0200
43051 @@ -46,7 +46,7 @@ if ACPI
43052  
43053  config ACPI_SLEEP
43054         bool "Sleep States"
43055 -       depends on X86 && (!SMP || SUSPEND_SMP)
43056 +       depends on X86 && (!SMP || SUSPEND_SMP) && !XEN
43057         depends on PM
43058         default y
43059         ---help---
43060 @@ -300,6 +300,7 @@ config ACPI_SYSTEM
43061  config X86_PM_TIMER
43062         bool "Power Management Timer Support" if EMBEDDED
43063         depends on X86
43064 +       depends on !XEN
43065         default y
43066         help
43067           The Power Management Timer is available on all ACPI-capable,
43068 diff -urNp linux-2.6/drivers/char/mem.c new/drivers/char/mem.c
43069 --- linux-2.6/drivers/char/mem.c        2006-07-03 14:14:34.000000000 +0200
43070 +++ new/drivers/char/mem.c      2006-05-09 12:33:33.000000000 +0200
43071 @@ -103,6 +103,7 @@ static inline int valid_mmap_phys_addr_r
43072  }
43073  #endif
43074  
43075 +#ifndef ARCH_HAS_DEV_MEM
43076  /*
43077   * This function reads the *physical* memory. The f_pos points directly to the 
43078   * memory location. 
43079 @@ -225,6 +226,7 @@ static ssize_t write_mem(struct file * f
43080         *ppos += written;
43081         return written;
43082  }
43083 +#endif
43084  
43085  #ifndef __HAVE_PHYS_MEM_ACCESS_PROT
43086  static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
43087 @@ -778,6 +780,7 @@ static int open_port(struct inode * inod
43088  #define open_kmem      open_mem
43089  #define open_oldmem    open_mem
43090  
43091 +#ifndef ARCH_HAS_DEV_MEM
43092  static struct file_operations mem_fops = {
43093         .llseek         = memory_lseek,
43094         .read           = read_mem,
43095 @@ -785,6 +788,9 @@ static struct file_operations mem_fops =
43096         .mmap           = mmap_mem,
43097         .open           = open_mem,
43098  };
43099 +#else
43100 +extern struct file_operations mem_fops;
43101 +#endif
43102  
43103  static struct file_operations kmem_fops = {
43104         .llseek         = memory_lseek,
43105 diff -urNp linux-2.6/drivers/char/tpm/Kconfig new/drivers/char/tpm/Kconfig
43106 --- linux-2.6/drivers/char/tpm/Kconfig  2006-07-03 14:14:34.000000000 +0200
43107 +++ new/drivers/char/tpm/Kconfig        2006-07-07 15:17:16.000000000 +0200
43108 @@ -31,7 +31,7 @@ config TCG_TIS
43109  
43110  config TCG_NSC
43111         tristate "National Semiconductor TPM Interface"
43112 -       depends on TCG_TPM && PNPACPI
43113 +       depends on TCG_TPM && PNPACPI && !XEN_UNPRIVILEGED_GUEST
43114         ---help---
43115           If you have a TPM security chip from National Semiconductor 
43116           say Yes and it will be accessible from within Linux.  To 
43117 @@ -58,5 +58,13 @@ config TCG_INFINEON
43118           Further information on this driver and the supported hardware
43119           can be found at http://www.prosec.rub.de/tpm
43120  
43121 -endmenu
43122 +config TCG_XEN
43123 +       tristate "XEN TPM Interface"
43124 +       depends on TCG_TPM && XEN
43125 +       ---help---
43126 +         If you want to make TPM support available to a Xen user domain,
43127 +         say Yes and it will be accessible from within Linux.
43128 +         To compile this driver as a module, choose M here; the module
43129 +         will be called tpm_xenu.
43130  
43131 +endmenu
43132 diff -urNp linux-2.6/drivers/char/tpm/Makefile new/drivers/char/tpm/Makefile
43133 --- linux-2.6/drivers/char/tpm/Makefile 2006-07-03 14:14:34.000000000 +0200
43134 +++ new/drivers/char/tpm/Makefile       2006-07-07 15:17:21.000000000 +0200
43135 @@ -9,3 +9,5 @@ obj-$(CONFIG_TCG_TIS) += tpm_tis.o
43136  obj-$(CONFIG_TCG_NSC) += tpm_nsc.o
43137  obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o
43138  obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o
43139 +obj-$(CONFIG_TCG_XEN) += tpm_xenu.o
43140 +tpm_xenu-y = tpm_xen.o tpm_vtpm.o
43141 diff -urNp linux-2.6/drivers/char/tpm/tpm.c new/drivers/char/tpm/tpm.c
43142 --- linux-2.6/drivers/char/tpm/tpm.c    2006-07-03 14:14:34.000000000 +0200
43143 +++ new/drivers/char/tpm/tpm.c  2006-07-07 15:19:52.000000000 +0200
43144 @@ -30,7 +30,9 @@
43145  
43146  enum tpm_const {
43147         TPM_MINOR = 224,        /* officially assigned */
43148 +#ifndef CONFIG_XEN
43149         TPM_BUFSIZE = 2048,
43150 +#endif
43151         TPM_NUM_DEVICES = 256,
43152  };
43153  
43154 @@ -331,7 +333,11 @@ static void timeout_work(void *ptr)
43155  
43156         down(&chip->buffer_mutex);
43157         atomic_set(&chip->data_pending, 0);
43158 +#ifndef CONFIG_XEN
43159         memset(chip->data_buffer, 0, TPM_BUFSIZE);
43160 +#else
43161 +       memset(chip->data_buffer, 0, get_chip_buffersize(chip));
43162 +#endif
43163         up(&chip->buffer_mutex);
43164  }
43165  
43166 @@ -921,7 +927,12 @@ int tpm_open(struct inode *inode, struct
43167  
43168         spin_unlock(&driver_lock);
43169  
43170 +#ifndef CONFIG_XEN
43171         chip->data_buffer = kmalloc(TPM_BUFSIZE * sizeof(u8), GFP_KERNEL);
43172 +#else
43173 +       chip->data_buffer = kmalloc(get_chip_buffersize(chip) * sizeof(u8),
43174 +                                   GFP_KERNEL);
43175 +#endif
43176         if (chip->data_buffer == NULL) {
43177                 chip->num_opens--;
43178                 put_device(chip->dev);
43179 @@ -969,8 +980,13 @@ ssize_t tpm_write(struct file *file, con
43180  
43181         down(&chip->buffer_mutex);
43182  
43183 +#ifndef CONFIG_XEN
43184         if (in_size > TPM_BUFSIZE)
43185                 in_size = TPM_BUFSIZE;
43186 +#else
43187 +       if (in_size > get_chip_buffersize(chip))
43188 +               in_size = get_chip_buffersize(chip);
43189 +#endif
43190  
43191         if (copy_from_user
43192             (chip->data_buffer, (void __user *) buf, in_size)) {
43193 @@ -979,9 +995,17 @@ ssize_t tpm_write(struct file *file, con
43194         }
43195  
43196         /* atomic tpm command send and result receive */
43197 +#ifndef CONFIG_XEN
43198         out_size = tpm_transmit(chip, chip->data_buffer, TPM_BUFSIZE);
43199 +#else
43200 +       out_size = tpm_transmit(chip, chip->data_buffer,
43201 +                               get_chip_buffersize(chip));
43202 +#endif
43203  
43204         atomic_set(&chip->data_pending, out_size);
43205 +#ifdef CONFIG_XEN
43206 +       atomic_set(&chip->data_position, 0);
43207 +#endif
43208         up(&chip->buffer_mutex);
43209  
43210         /* Set a timeout by which the reader must come claim the result */
43211 @@ -996,21 +1020,52 @@ ssize_t tpm_read(struct file *file, char
43212  {
43213         struct tpm_chip *chip = file->private_data;
43214         int ret_size;
43215 +#ifdef CONFIG_XEN
43216 +       int pos, pending = 0;
43217 +#endif
43218  
43219 +#ifndef CONFIG_XEN
43220         del_singleshot_timer_sync(&chip->user_read_timer);
43221         flush_scheduled_work();
43222 +#endif
43223         ret_size = atomic_read(&chip->data_pending);
43224 +#ifndef CONFIG_XEN
43225         atomic_set(&chip->data_pending, 0);
43226 +#endif
43227         if (ret_size > 0) {     /* relay data */
43228                 if (size < ret_size)
43229                         ret_size = size;
43230  
43231 +#ifdef CONFIG_XEN
43232 +               pos = atomic_read(&chip->data_position);
43233 +#endif
43234                 down(&chip->buffer_mutex);
43235 +#ifndef CONFIG_XEN
43236                 if (copy_to_user(buf, chip->data_buffer, ret_size))
43237 +#else
43238 +               if (copy_to_user(buf, &chip->data_buffer[pos], ret_size)) {
43239 +#endif
43240                         ret_size = -EFAULT;
43241 +#ifdef CONFIG_XEN
43242 +               } else {
43243 +                       pending = atomic_read(&chip->data_pending) - ret_size;
43244 +                       if ( pending ) {
43245 +                               atomic_set(&chip->data_pending, pending);
43246 +                               atomic_set(&chip->data_position,
43247 +                                          pos+ret_size);
43248 +                       }
43249 +               }
43250 +#endif
43251                 up(&chip->buffer_mutex);
43252         }
43253  
43254 +#ifdef CONFIG_XEN
43255 +       if ( ret_size <= 0 || pending == 0 ) {
43256 +               atomic_set(&chip->data_pending, 0);
43257 +               del_singleshot_timer_sync(&chip->user_read_timer);
43258 +               flush_scheduled_work();
43259 +       }
43260 +#endif
43261         return ret_size;
43262  }
43263  EXPORT_SYMBOL_GPL(tpm_read);
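
The CONFIG_XEN branches in tpm.c above replace the fixed TPM_BUFSIZE with get_chip_buffersize(chip) and allow tpm_read() to return a response in several chunks: data_pending counts the bytes not yet handed to the reader, the new data_position field records where the next read should resume, and the timer and pending count are only cleared once everything has been consumed. A small self-contained sketch of that bookkeeping, with simplified names and no locking (user-space, not kernel code):

#include <stdio.h>
#include <string.h>

static char data_buffer[16] = "response-bytes!";
static int data_pending = 15;   /* bytes produced by the last command */
static int data_position = 0;   /* read offset into data_buffer */

static int demo_read(char *dst, int size)
{
        int ret = data_pending < size ? data_pending : size;

        if (ret > 0) {
                memcpy(dst, &data_buffer[data_position], ret);
                data_pending -= ret;
                data_position += ret;
        }
        if (ret <= 0 || data_pending == 0) {
                data_pending = 0;
                data_position = 0;      /* everything consumed: reset */
        }
        return ret;
}

int main(void)
{
        char out[8];
        int n;

        while ((n = demo_read(out, 6)) > 0)
                printf("got %d bytes: %.*s\n", n, n, out);
        return 0;
}
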
43264 diff -urNp linux-2.6/drivers/char/tpm/tpm.h new/drivers/char/tpm/tpm.h
43265 --- linux-2.6/drivers/char/tpm/tpm.h    2006-07-03 14:14:34.000000000 +0200
43266 +++ new/drivers/char/tpm/tpm.h  2006-07-07 15:21:18.000000000 +0200
43267 @@ -61,6 +61,7 @@ struct tpm_vendor_specific {
43268         const u8 req_complete_mask;
43269         const u8 req_complete_val;
43270         const u8 req_canceled;
43271 +       u32 buffersize;
43272         void __iomem *iobase;           /* ioremapped address */
43273         unsigned long base;             /* TPM base address */
43274  
43275 @@ -94,6 +95,7 @@ struct tpm_chip {
43276         /* Data passed to and from the tpm via the read/write calls */
43277         u8 *data_buffer;
43278         atomic_t data_pending;
43279 +       atomic_t data_position;
43280         struct semaphore buffer_mutex;
43281  
43282         struct timer_list user_read_timer;      /* user needs to claim result */
43283 @@ -121,6 +123,11 @@ static inline void tpm_write_index(int b
43284         outb(value & 0xFF, base+1);
43285  }
43286  
43287 +static inline u32 get_chip_buffersize(struct tpm_chip *chip)
43288 +{
43289 +       return chip->vendor.buffersize;
43290 +}
43291 +
43292  extern void tpm_get_timeouts(struct tpm_chip *);
43293  extern void tpm_gen_interrupt(struct tpm_chip *);
43294  extern void tpm_continue_selftest(struct tpm_chip *);
43295 diff -urNp linux-2.6/drivers/char/tpm/tpm_vtpm.c new/drivers/char/tpm/tpm_vtpm.c
43296 --- linux-2.6/drivers/char/tpm/tpm_vtpm.c       1970-01-01 01:00:00.000000000 +0100
43297 +++ new/drivers/char/tpm/tpm_vtpm.c     2006-07-07 15:10:03.000000000 +0200
43298 @@ -0,0 +1,547 @@
43299 +/*
43300 + * Copyright (C) 2006 IBM Corporation
43301 + *
43302 + * Authors:
43303 + * Stefan Berger <stefanb@us.ibm.com>
43304 + *
43305 + * Generic device driver part for device drivers in a virtualized
43306 + * environment.
43307 + *
43308 + * This program is free software; you can redistribute it and/or
43309 + * modify it under the terms of the GNU General Public License as
43310 + * published by the Free Software Foundation, version 2 of the
43311 + * License.
43312 + *
43313 + */
43314 +
43315 +#include <asm/uaccess.h>
43316 +#include <linux/list.h>
43317 +#include <linux/device.h>
43318 +#include <linux/interrupt.h>
43319 +#include <linux/platform_device.h>
43320 +#include "tpm.h"
43321 +#include "tpm_vtpm.h"
43322 +
43323 +/* read status bits */
43324 +enum {
43325 +       STATUS_BUSY = 0x01,
43326 +       STATUS_DATA_AVAIL = 0x02,
43327 +       STATUS_READY = 0x04
43328 +};
43329 +
43330 +struct transmission {
43331 +       struct list_head next;
43332 +
43333 +       unsigned char *request;
43334 +       size_t  request_len;
43335 +       size_t  request_buflen;
43336 +
43337 +       unsigned char *response;
43338 +       size_t  response_len;
43339 +       size_t  response_buflen;
43340 +
43341 +       unsigned int flags;
43342 +};
43343 +
43344 +enum {
43345 +       TRANSMISSION_FLAG_WAS_QUEUED = 0x1
43346 +};
43347 +
43348 +
43349 +enum {
43350 +       DATAEX_FLAG_QUEUED_ONLY = 0x1
43351 +};
43352 +
43353 +
43354 +/* local variables */
43355 +
43356 +/* local function prototypes */
43357 +static int _vtpm_send_queued(struct tpm_chip *chip);
43358 +
43359 +
43360 +/* =============================================================
43361 + * Some utility functions
43362 + * =============================================================
43363 + */
43364 +static void vtpm_state_init(struct vtpm_state *vtpms)
43365 +{
43366 +       vtpms->current_request = NULL;
43367 +       spin_lock_init(&vtpms->req_list_lock);
43368 +       init_waitqueue_head(&vtpms->req_wait_queue);
43369 +       INIT_LIST_HEAD(&vtpms->queued_requests);
43370 +
43371 +       vtpms->current_response = NULL;
43372 +       spin_lock_init(&vtpms->resp_list_lock);
43373 +       init_waitqueue_head(&vtpms->resp_wait_queue);
43374 +
43375 +       vtpms->disconnect_time = jiffies;
43376 +}
43377 +
43378 +
43379 +static inline struct transmission *transmission_alloc(void)
43380 +{
43381 +       return kzalloc(sizeof(struct transmission), GFP_ATOMIC);
43382 +}
43383 +
43384 +static unsigned char *
43385 +transmission_set_req_buffer(struct transmission *t,
43386 +                            unsigned char *buffer, size_t len)
43387 +{
43388 +       if (t->request_buflen < len) {
43389 +               kfree(t->request);
43390 +               t->request = kmalloc(len, GFP_KERNEL);
43391 +               if (!t->request) {
43392 +                       t->request_buflen = 0;
43393 +                       return NULL;
43394 +               }
43395 +               t->request_buflen = len;
43396 +       }
43397 +
43398 +       memcpy(t->request, buffer, len);
43399 +       t->request_len = len;
43400 +
43401 +       return t->request;
43402 +}
43403 +
43404 +static unsigned char *
43405 +transmission_set_res_buffer(struct transmission *t,
43406 +                            const unsigned char *buffer, size_t len)
43407 +{
43408 +       if (t->response_buflen < len) {
43409 +               kfree(t->response);
43410 +               t->response = kmalloc(len, GFP_ATOMIC);
43411 +               if (!t->response) {
43412 +                       t->response_buflen = 0;
43413 +                       return NULL;
43414 +               }
43415 +               t->response_buflen = len;
43416 +       }
43417 +
43418 +       memcpy(t->response, buffer, len);
43419 +       t->response_len = len;
43420 +
43421 +       return t->response;
43422 +}
43423 +
43424 +static inline void transmission_free(struct transmission *t)
43425 +{
43426 +       kfree(t->request);
43427 +       kfree(t->response);
43428 +       kfree(t);
43429 +}
43430 +
43431 +/* =============================================================
43432 + * Interface with the lower layer driver
43433 + * =============================================================
43434 + */
43435 +/*
43436 + * Lower layer uses this function to make a response available.
43437 + */
43438 +int vtpm_vd_recv(const struct tpm_chip *chip,
43439 +                 const unsigned char *buffer, size_t count,
43440 +                 void *ptr)
43441 +{
43442 +       unsigned long flags;
43443 +       int ret_size = 0;
43444 +       struct transmission *t;
43445 +       struct vtpm_state *vtpms;
43446 +
43447 +       vtpms = (struct vtpm_state *)chip_get_private(chip);
43448 +
43449 +       /*
43450 +        * The list with requests must contain one request
43451 +        * only and the element there must be the one that
43452 +        * was passed to me from the front-end.
43453 +        */
43454 +       spin_lock_irqsave(&vtpms->resp_list_lock, flags);
43455 +       if (vtpms->current_request != ptr) {
43456 +               spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
43457 +               return 0;
43458 +       }
43459 +
43460 +       if ((t = vtpms->current_request)) {
43461 +               transmission_free(t);
43462 +               vtpms->current_request = NULL;
43463 +       }
43464 +
43465 +       t = transmission_alloc();
43466 +       if (t) {
43467 +               if (!transmission_set_res_buffer(t, buffer, count)) {
43468 +                       transmission_free(t);
43469 +                       spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
43470 +                       return -ENOMEM;
43471 +               }
43472 +               ret_size = count;
43473 +               vtpms->current_response = t;
43474 +               wake_up_interruptible(&vtpms->resp_wait_queue);
43475 +       }
43476 +       spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
43477 +
43478 +       return ret_size;
43479 +}
43480 +
43481 +
43482 +/*
43483 + * Lower layer indicates its status (connected/disconnected)
43484 + */
43485 +void vtpm_vd_status(const struct tpm_chip *chip, u8 vd_status)
43486 +{
43487 +       struct vtpm_state *vtpms;
43488 +
43489 +       vtpms = (struct vtpm_state *)chip_get_private(chip);
43490 +
43491 +       vtpms->vd_status = vd_status;
43492 +       if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) {
43493 +               vtpms->disconnect_time = jiffies;
43494 +       }
43495 +}
43496 +
43497 +/* =============================================================
43498 + * Interface with the generic TPM driver
43499 + * =============================================================
43500 + */
43501 +static int vtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count)
43502 +{
43503 +       int rc = 0;
43504 +       unsigned long flags;
43505 +       struct vtpm_state *vtpms;
43506 +
43507 +       vtpms = (struct vtpm_state *)chip_get_private(chip);
43508 +
43509 +       /*
43510 +        * Check if the previous operation only queued the command.
43511 +        * In this case there won't be a response, so I just
43512 +        * return from here and reset that flag. In any other
43513 +        * case I should receive a response from the back-end.
43514 +        */
43515 +       spin_lock_irqsave(&vtpms->resp_list_lock, flags);
43516 +       if ((vtpms->flags & DATAEX_FLAG_QUEUED_ONLY) != 0) {
43517 +               vtpms->flags &= ~DATAEX_FLAG_QUEUED_ONLY;
43518 +               spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
43519 +               /*
43520 +                * The first few commands (measurements) must be
43521 +                * queued since it might not be possible to talk to the
43522 +                * TPM, yet.
43523 +                * Return a response of up to 30 '0's.
43524 +                */
43525 +
43526 +               count = min_t(size_t, count, 30);
43527 +               memset(buf, 0x0, count);
43528 +               return count;
43529 +       }
43530 +       /*
43531 +        * Check whether something is in the response list and if
43532 +        * there's nothing in the list wait for something to appear.
43533 +        */
43534 +
43535 +       if (!vtpms->current_response) {
43536 +               spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
43537 +               interruptible_sleep_on_timeout(&vtpms->resp_wait_queue,
43538 +                                              1000);
43539 +               spin_lock_irqsave(&vtpms->resp_list_lock ,flags);
43540 +       }
43541 +
43542 +       if (vtpms->current_response) {
43543 +               struct transmission *t = vtpms->current_response;
43544 +               vtpms->current_response = NULL;
43545 +               rc = min(count, t->response_len);
43546 +               memcpy(buf, t->response, rc);
43547 +               transmission_free(t);
43548 +       }
43549 +
43550 +       spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
43551 +       return rc;
43552 +}
43553 +
43554 +static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t count)
43555 +{
43556 +       int rc = 0;
43557 +       unsigned long flags;
43558 +       struct transmission *t = transmission_alloc();
43559 +       struct vtpm_state *vtpms;
43560 +
43561 +       vtpms = (struct vtpm_state *)chip_get_private(chip);
43562 +
43563 +       if (!t)
43564 +               return -ENOMEM;
43565 +       /*
43566 +        * If there's a current request, it must be the
43567 +        * previous request that has timed out.
43568 +        */
43569 +       spin_lock_irqsave(&vtpms->req_list_lock, flags);
43570 +       if (vtpms->current_request != NULL) {
43571 +               printk("WARNING: Sending although there is a request outstanding.\n"
43572 +                      "         Previous request must have timed out.\n");
43573 +               transmission_free(vtpms->current_request);
43574 +               vtpms->current_request = NULL;
43575 +       }
43576 +       spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
43577 +
43578 +       /*
43579 +        * Queue the packet if the driver below is not
43580 +        * ready, yet, or there is any packet already
43581 +        * in the queue.
43582 +        * If the driver below is ready, unqueue all
43583 +        * packets first before sending our current
43584 +        * packet.
43585 +        * For each unqueued packet, except for the
43586 +        * last (=current) packet, call the function
43587 +        * tpm_xen_recv to wait for the response to come
43588 +        * back.
43589 +        */
43590 +       if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) {
43591 +               if (time_after(jiffies,
43592 +                              vtpms->disconnect_time + HZ * 10)) {
43593 +                       rc = -ENOENT;
43594 +               } else {
43595 +                       goto queue_it;
43596 +               }
43597 +       } else {
43598 +               /*
43599 +                * Send all queued packets.
43600 +                */
43601 +               if (_vtpm_send_queued(chip) == 0) {
43602 +
43603 +                       vtpms->current_request = t;
43604 +
43605 +                       rc = vtpm_vd_send(vtpms->tpm_private,
43606 +                                         buf,
43607 +                                         count,
43608 +                                         t);
43609 +                       /*
43610 +                        * The generic TPM driver will call
43611 +                        * the function to receive the response.
43612 +                        */
43613 +                       if (rc < 0) {
43614 +                               vtpms->current_request = NULL;
43615 +                               goto queue_it;
43616 +                       }
43617 +               } else {
43618 +queue_it:
43619 +                       if (!transmission_set_req_buffer(t, buf, count)) {
43620 +                               transmission_free(t);
43621 +                               rc = -ENOMEM;
43622 +                               goto exit;
43623 +                       }
43624 +                       /*
43625 +                        * An error occurred. Don't even try
43626 +                        * to send the current request. Just
43627 +                        * queue it.
43628 +                        */
43629 +                       spin_lock_irqsave(&vtpms->req_list_lock, flags);
43630 +                       vtpms->flags |= DATAEX_FLAG_QUEUED_ONLY;
43631 +                       list_add_tail(&t->next, &vtpms->queued_requests);
43632 +                       spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
43633 +               }
43634 +       }
43635 +
43636 +exit:
43637 +       return rc;
43638 +}
43639 +
43640 +
43641 +/*
43642 + * Send all queued requests.
43643 + */
43644 +static int _vtpm_send_queued(struct tpm_chip *chip)
43645 +{
43646 +       int rc;
43647 +       int error = 0;
43648 +       long flags;
43649 +       unsigned char buffer[1];
43650 +       struct vtpm_state *vtpms;
43651 +       vtpms = (struct vtpm_state *)chip_get_private(chip);
43652 +
43653 +       spin_lock_irqsave(&vtpms->req_list_lock, flags);
43654 +
43655 +       while (!list_empty(&vtpms->queued_requests)) {
43656 +               /*
43657 +                * Need to dequeue them.
43658 +                * Read the result into a dummy buffer.
43659 +                */
43660 +               struct transmission *qt = (struct transmission *)
43661 +                                         vtpms->queued_requests.next;
43662 +               list_del(&qt->next);
43663 +               vtpms->current_request = qt;
43664 +               spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
43665 +
43666 +               rc = vtpm_vd_send(vtpms->tpm_private,
43667 +                                 qt->request,
43668 +                                 qt->request_len,
43669 +                                 qt);
43670 +
43671 +               if (rc < 0) {
43672 +                       spin_lock_irqsave(&vtpms->req_list_lock, flags);
43673 +                       if ((qt = vtpms->current_request) != NULL) {
43674 +                               /*
43675 +                                * requeue it at the beginning
43676 +                                * of the list
43677 +                                */
43678 +                               list_add(&qt->next,
43679 +                                        &vtpms->queued_requests);
43680 +                       }
43681 +                       vtpms->current_request = NULL;
43682 +                       error = 1;
43683 +                       break;
43684 +               }
43685 +               /*
43686 +                * After this point qt is not valid anymore!
43687 +                * It is freed when the front-end is delivering
43688 +                * the data by calling tpm_recv
43689 +                */
43690 +               /*
43691 +                * Receive response into provided dummy buffer
43692 +                */
43693 +               rc = vtpm_recv(chip, buffer, sizeof(buffer));
43694 +               spin_lock_irqsave(&vtpms->req_list_lock, flags);
43695 +       }
43696 +
43697 +       spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
43698 +
43699 +       return error;
43700 +}
43701 +
43702 +static void vtpm_cancel(struct tpm_chip *chip)
43703 +{
43704 +       unsigned long flags;
43705 +       struct vtpm_state *vtpms = (struct vtpm_state *)chip_get_private(chip);
43706 +
43707 +       spin_lock_irqsave(&vtpms->resp_list_lock,flags);
43708 +
43709 +       if (!vtpms->current_response && vtpms->current_request) {
43710 +               spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
43711 +               interruptible_sleep_on(&vtpms->resp_wait_queue);
43712 +               spin_lock_irqsave(&vtpms->resp_list_lock,flags);
43713 +       }
43714 +
43715 +       if (vtpms->current_response) {
43716 +               struct transmission *t = vtpms->current_response;
43717 +               vtpms->current_response = NULL;
43718 +               transmission_free(t);
43719 +       }
43720 +
43721 +       spin_unlock_irqrestore(&vtpms->resp_list_lock,flags);
43722 +}
43723 +
43724 +static u8 vtpm_status(struct tpm_chip *chip)
43725 +{
43726 +       u8 rc = 0;
43727 +       unsigned long flags;
43728 +       struct vtpm_state *vtpms;
43729 +
43730 +       vtpms = (struct vtpm_state *)chip_get_private(chip);
43731 +
43732 +       spin_lock_irqsave(&vtpms->resp_list_lock, flags);
43733 +       /*
43734 +        * Data are available if:
43735 +        *  - there's a current response
43736 +        *  - the last packet was queued only (this is fake, but necessary to
43737 +        *      get the generic TPM layer to call the receive function.)
43738 +        */
43739 +       if (vtpms->current_response ||
43740 +           0 != (vtpms->flags & DATAEX_FLAG_QUEUED_ONLY)) {
43741 +               rc = STATUS_DATA_AVAIL;
43742 +       } else if (!vtpms->current_response && !vtpms->current_request) {
43743 +               rc = STATUS_READY;
43744 +       }
43745 +
43746 +       spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
43747 +       return rc;
43748 +}
43749 +
43750 +static struct file_operations vtpm_ops = {
43751 +       .owner = THIS_MODULE,
43752 +       .llseek = no_llseek,
43753 +       .open = tpm_open,
43754 +       .read = tpm_read,
43755 +       .write = tpm_write,
43756 +       .release = tpm_release,
43757 +};
43758 +
43759 +static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
43760 +static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
43761 +static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
43762 +static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
43763 +static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
43764 +static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated,
43765 +                  NULL);
43766 +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
43767 +static DEVICE_ATTR(cancel, S_IWUSR |S_IWGRP, NULL, tpm_store_cancel);
43768 +
43769 +static struct attribute *vtpm_attrs[] = {
43770 +       &dev_attr_pubek.attr,
43771 +       &dev_attr_pcrs.attr,
43772 +       &dev_attr_enabled.attr,
43773 +       &dev_attr_active.attr,
43774 +       &dev_attr_owned.attr,
43775 +       &dev_attr_temp_deactivated.attr,
43776 +       &dev_attr_caps.attr,
43777 +       &dev_attr_cancel.attr,
43778 +       NULL,
43779 +};
43780 +
43781 +static struct attribute_group vtpm_attr_grp = { .attrs = vtpm_attrs };
43782 +
43783 +#define TPM_LONG_TIMEOUT   (10 * 60 * HZ)
43784 +
43785 +static struct tpm_vendor_specific tpm_vtpm = {
43786 +       .recv = vtpm_recv,
43787 +       .send = vtpm_send,
43788 +       .cancel = vtpm_cancel,
43789 +       .status = vtpm_status,
43790 +       .req_complete_mask = STATUS_BUSY | STATUS_DATA_AVAIL,
43791 +       .req_complete_val  = STATUS_DATA_AVAIL,
43792 +       .req_canceled = STATUS_READY,
43793 +       .attr_group = &vtpm_attr_grp,
43794 +       .miscdev = {
43795 +               .fops = &vtpm_ops,
43796 +       },
43797 +       .duration = {
43798 +               TPM_LONG_TIMEOUT,
43799 +               TPM_LONG_TIMEOUT,
43800 +               TPM_LONG_TIMEOUT,
43801 +       },
43802 +};
43803 +
43804 +struct tpm_chip *init_vtpm(struct device *dev,
43805 +                           struct tpm_virtual_device *tvd,
43806 +                           struct tpm_private *tp)
43807 +{
43808 +       long rc;
43809 +       struct tpm_chip *chip;
43810 +       struct vtpm_state *vtpms;
43811 +
43812 +       vtpms = kzalloc(sizeof(struct vtpm_state), GFP_KERNEL);
43813 +       if (!vtpms)
43814 +               return ERR_PTR(-ENOMEM);
43815 +
43816 +       vtpm_state_init(vtpms);
43817 +       vtpms->tpmvd = tvd;
43818 +       vtpms->tpm_private = tp;
43819 +
43820 +       if (tvd)
43821 +               tpm_vtpm.buffersize = tvd->max_tx_size;
43822 +
43823 +       chip = tpm_register_hardware(dev, &tpm_vtpm);
43824 +       if (!chip) {
43825 +               rc = -ENODEV;
43826 +               goto err_free_mem;
43827 +       }
43828 +
43829 +       chip_set_private(chip, vtpms);
43830 +
43831 +       return chip;
43832 +
43833 +err_free_mem:
43834 +       kfree(vtpms);
43835 +
43836 +       return ERR_PTR(rc);
43837 +}
43838 +
43839 +void cleanup_vtpm(struct device *dev)
43840 +{
43841 +       struct tpm_chip *chip = dev_get_drvdata(dev);
43842 +       struct vtpm_state *vtpms = (struct vtpm_state*)chip_get_private(chip);
43843 +       tpm_remove_hardware(dev);
43844 +       kfree(vtpms);
43845 +}
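
tpm_vtpm.c above sits between the generic TPM driver and the Xen front-end: vtpm_send() queues requests while the back-end is not yet connected (marking them DATAEX_FLAG_QUEUED_ONLY), flushes the queue via _vtpm_send_queued() once it is, and vtpm_vd_recv()/vtpm_recv() hand the response back. A stripped-down user-space sketch of the queue-then-flush policy (simplified, with no locking, timeouts, or error paths):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct request {
        struct request *next;
        char payload[32];
};

static struct request *queue_head, *queue_tail;
static int connected;

static void send_or_queue(const char *payload)
{
        if (!connected) {
                struct request *r = calloc(1, sizeof(*r));
                if (!r)
                        return;
                snprintf(r->payload, sizeof(r->payload), "%s", payload);
                if (queue_tail)
                        queue_tail->next = r;
                else
                        queue_head = r;
                queue_tail = r;
                printf("queued: %s\n", payload);
                return;
        }
        /* connected: drain anything queued first, in order */
        while (queue_head) {
                struct request *r = queue_head;
                queue_head = r->next;
                if (!queue_head)
                        queue_tail = NULL;
                printf("sent (from queue): %s\n", r->payload);
                free(r);
        }
        printf("sent: %s\n", payload);
}

int main(void)
{
        send_or_queue("measurement-1");
        send_or_queue("measurement-2");
        connected = 1;
        send_or_queue("tpm-command");
        return 0;
}
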
43846 diff -urNp linux-2.6/drivers/char/tpm/tpm_vtpm.h new/drivers/char/tpm/tpm_vtpm.h
43847 --- linux-2.6/drivers/char/tpm/tpm_vtpm.h       1970-01-01 01:00:00.000000000 +0100
43848 +++ new/drivers/char/tpm/tpm_vtpm.h     2006-07-07 15:10:03.000000000 +0200
43849 @@ -0,0 +1,68 @@
43850 +#ifndef TPM_VTPM_H
43851 +#define TPM_VTPM_H
43852 +
43853 +struct tpm_chip;
43854 +struct tpm_private;
43855 +
43856 +struct tpm_virtual_device {
43857 +       /*
43858 +        * This field indicates the maximum size the driver can
43859 +        * transfer in one chunk. It is filled in by the front-end
43860 +        * driver and should be propagated to the generic tpm driver
43861 +        * for allocation of buffers.
43862 +        */
43863 +       unsigned int max_tx_size;
43864 +};
43865 +
43866 +struct vtpm_state {
43867 +       struct transmission *current_request;
43868 +       spinlock_t           req_list_lock;
43869 +       wait_queue_head_t    req_wait_queue;
43870 +
43871 +       struct list_head     queued_requests;
43872 +
43873 +       struct transmission *current_response;
43874 +       spinlock_t           resp_list_lock;
43875 +       wait_queue_head_t    resp_wait_queue;     // processes waiting for responses
43876 +
43877 +       u8                   vd_status;
43878 +       u8                   flags;
43879 +
43880 +       unsigned long        disconnect_time;
43881 +
43882 +       struct tpm_virtual_device *tpmvd;
43883 +
43884 +       /*
43885 +        * The following is a private structure of the underlying
43886 +        * driver. It is passed as parameter in the send function.
43887 +        */
43888 +       struct tpm_private *tpm_private;
43889 +};
43890 +
43891 +
43892 +enum vdev_status {
43893 +       TPM_VD_STATUS_DISCONNECTED = 0x0,
43894 +       TPM_VD_STATUS_CONNECTED = 0x1
43895 +};
43896 +
43897 +/* this function is called from tpm_vtpm.c */
43898 +int vtpm_vd_send(struct tpm_private * tp,
43899 +                 const u8 * buf, size_t count, void *ptr);
43900 +
43901 +/* these functions are offered by tpm_vtpm.c */
43902 +struct tpm_chip *init_vtpm(struct device *,
43903 +                           struct tpm_virtual_device *,
43904 +                           struct tpm_private *);
43905 +void cleanup_vtpm(struct device *);
43906 +int vtpm_vd_recv(const struct tpm_chip* chip,
43907 +                 const unsigned char *buffer, size_t count, void *ptr);
43908 +void vtpm_vd_status(const struct tpm_chip *, u8 status);
43909 +
43910 +static inline struct tpm_private *tpm_private_from_dev(struct device *dev)
43911 +{
43912 +       struct tpm_chip *chip = dev_get_drvdata(dev);
43913 +       struct vtpm_state *vtpms = chip_get_private(chip);
43914 +       return vtpms->tpm_private;
43915 +}
43916 +
43917 +#endif
43918 diff -urNp linux-2.6/drivers/char/tpm/tpm_xen.c new/drivers/char/tpm/tpm_xen.c
43919 --- linux-2.6/drivers/char/tpm/tpm_xen.c        1970-01-01 01:00:00.000000000 +0100
43920 +++ new/drivers/char/tpm/tpm_xen.c      2006-07-07 15:10:03.000000000 +0200
43921 @@ -0,0 +1,758 @@
43922 +/*
43923 + * Copyright (c) 2005, IBM Corporation
43924 + *
43925 + * Author: Stefan Berger, stefanb@us.ibm.com
43926 + * Grant table support: Mahadevan Gomathisankaran
43927 + *
43928 + * This code has been derived from drivers/xen/netfront/netfront.c
43929 + *
43930 + * Copyright (c) 2002-2004, K A Fraser
43931 + *
43932 + * This program is free software; you can redistribute it and/or
43933 + * modify it under the terms of the GNU General Public License version 2
43934 + * as published by the Free Software Foundation; or, when distributed
43935 + * separately from the Linux kernel or incorporated into other
43936 + * software packages, subject to the following license:
43937 + *
43938 + * Permission is hereby granted, free of charge, to any person obtaining a copy
43939 + * of this source file (the "Software"), to deal in the Software without
43940 + * restriction, including without limitation the rights to use, copy, modify,
43941 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
43942 + * and to permit persons to whom the Software is furnished to do so, subject to
43943 + * the following conditions:
43944 + *
43945 + * The above copyright notice and this permission notice shall be included in
43946 + * all copies or substantial portions of the Software.
43947 + *
43948 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
43949 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
43950 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43951 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
43952 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
43953 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
43954 + * IN THE SOFTWARE.
43955 + */
43956 +
43957 +#include <linux/errno.h>
43958 +#include <linux/err.h>
43959 +#include <linux/interrupt.h>
43960 +#include <linux/mutex.h>
43961 +#include <asm/uaccess.h>
43962 +#include <xen/evtchn.h>
43963 +#include <xen/interface/grant_table.h>
43964 +#include <xen/interface/io/tpmif.h>
43965 +#include <xen/xenbus.h>
43966 +#include "tpm.h"
43967 +#include "tpm_vtpm.h"
43968 +
43969 +#undef DEBUG
43970 +
43971 +/* local structures */
43972 +struct tpm_private {
43973 +       struct tpm_chip *chip;
43974 +
43975 +       tpmif_tx_interface_t *tx;
43976 +       atomic_t refcnt;
43977 +       unsigned int evtchn;
43978 +       unsigned int irq;
43979 +       u8 is_connected;
43980 +       u8 is_suspended;
43981 +
43982 +       spinlock_t tx_lock;
43983 +
43984 +       struct tx_buffer *tx_buffers[TPMIF_TX_RING_SIZE];
43985 +
43986 +       atomic_t tx_busy;
43987 +       void *tx_remember;
43988 +
43989 +       domid_t backend_id;
43990 +       wait_queue_head_t wait_q;
43991 +
43992 +       struct xenbus_device *dev;
43993 +       int ring_ref;
43994 +};
43995 +
43996 +struct tx_buffer {
43997 +       unsigned int size;      // available space in data
43998 +       unsigned int len;       // used space in data
43999 +       unsigned char *data;    // pointer to a page
44000 +};
44001 +
44002 +
44003 +/* locally visible variables */
44004 +static grant_ref_t gref_head;
44005 +static struct tpm_private *my_priv;
44006 +
44007 +/* local function prototypes */
44008 +static irqreturn_t tpmif_int(int irq,
44009 +                             void *tpm_priv,
44010 +                             struct pt_regs *ptregs);
44011 +static void tpmif_rx_action(unsigned long unused);
44012 +static int tpmif_connect(struct xenbus_device *dev,
44013 +                         struct tpm_private *tp,
44014 +                         domid_t domid);
44015 +static DECLARE_TASKLET(tpmif_rx_tasklet, tpmif_rx_action, 0);
44016 +static int tpmif_allocate_tx_buffers(struct tpm_private *tp);
44017 +static void tpmif_free_tx_buffers(struct tpm_private *tp);
44018 +static void tpmif_set_connected_state(struct tpm_private *tp,
44019 +                                      u8 newstate);
44020 +static int tpm_xmit(struct tpm_private *tp,
44021 +                    const u8 * buf, size_t count, int userbuffer,
44022 +                    void *remember);
44023 +static void destroy_tpmring(struct tpm_private *tp);
44024 +void __exit tpmif_exit(void);
44025 +
44026 +#define DPRINTK(fmt, args...) \
44027 +    pr_debug("xen_tpm_fr (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args)
44028 +#define IPRINTK(fmt, args...) \
44029 +    printk(KERN_INFO "xen_tpm_fr: " fmt, ##args)
44030 +#define WPRINTK(fmt, args...) \
44031 +    printk(KERN_WARNING "xen_tpm_fr: " fmt, ##args)
44032 +
44033 +#define GRANT_INVALID_REF      0
44034 +
44035 +
44036 +static inline int
44037 +tx_buffer_copy(struct tx_buffer *txb, const u8 * src, int len,
44038 +               int isuserbuffer)
44039 +{
44040 +       int copied = len;
44041 +
44042 +       if (len > txb->size) {
44043 +               copied = txb->size;
44044 +       }
44045 +       if (isuserbuffer) {
44046 +               if (copy_from_user(txb->data, src, copied))
44047 +                       return -EFAULT;
44048 +       } else {
44049 +               memcpy(txb->data, src, copied);
44050 +       }
44051 +       txb->len = len;
44052 +       return copied;
44053 +}
44054 +
44055 +static inline struct tx_buffer *tx_buffer_alloc(void)
44056 +{
44057 +       struct tx_buffer *txb = kzalloc(sizeof (struct tx_buffer),
44058 +                                       GFP_KERNEL);
44059 +
44060 +       if (txb) {
44061 +               txb->len = 0;
44062 +               txb->size = PAGE_SIZE;
44063 +               txb->data = (unsigned char *)__get_free_page(GFP_KERNEL);
44064 +               if (txb->data == NULL) {
44065 +                       kfree(txb);
44066 +                       txb = NULL;
44067 +               }
44068 +       }
44069 +       return txb;
44070 +}
44071 +
44072 +
44073 +static inline void tx_buffer_free(struct tx_buffer *txb)
44074 +{
44075 +       if (txb) {
44076 +               free_page((long)txb->data);
44077 +               kfree(txb);
44078 +       }
44079 +}
44080 +
44081 +/**************************************************************
44082 + Utility function for the tpm_private structure
44083 +**************************************************************/
44084 +static inline void tpm_private_init(struct tpm_private *tp)
44085 +{
44086 +       spin_lock_init(&tp->tx_lock);
44087 +       init_waitqueue_head(&tp->wait_q);
44088 +       atomic_set(&tp->refcnt, 1);
44089 +}
44090 +
44091 +static inline void tpm_private_put(void)
44092 +{
44093 +       if ( atomic_dec_and_test(&my_priv->refcnt)) {
44094 +               tpmif_free_tx_buffers(my_priv);
44095 +               kfree(my_priv);
44096 +               my_priv = NULL;
44097 +       }
44098 +}
44099 +
44100 +static struct tpm_private *tpm_private_get(void)
44101 +{
44102 +       int err;
44103 +       if (!my_priv) {
44104 +               my_priv = kzalloc(sizeof(struct tpm_private), GFP_KERNEL);
44105 +               if (my_priv) {
44106 +                       tpm_private_init(my_priv);
44107 +                       err = tpmif_allocate_tx_buffers(my_priv);
44108 +                       if (err < 0) {
44109 +                               tpm_private_put();
44110 +                       }
44111 +               }
44112 +       } else {
44113 +               atomic_inc(&my_priv->refcnt);
44114 +       }
44115 +       return my_priv;
44116 +}
44117 +
44118 +/**************************************************************
44119 +
44120 + The interface to let the tpm plugin register its callback
44121 + function and send data to another partition using this module
44122 +
44123 +**************************************************************/
44124 +
44125 +static DEFINE_MUTEX(suspend_lock);
44126 +/*
44127 + * Send data via this module by calling this function
44128 + */
44129 +int vtpm_vd_send(struct tpm_private *tp,
44130 +                 const u8 * buf, size_t count, void *ptr)
44131 +{
44132 +       int sent;
44133 +
44134 +       mutex_lock(&suspend_lock);
44135 +       sent = tpm_xmit(tp, buf, count, 0, ptr);
44136 +       mutex_unlock(&suspend_lock);
44137 +
44138 +       return sent;
44139 +}
44140 +
44141 +/**************************************************************
44142 + XENBUS support code
44143 +**************************************************************/
44144 +
44145 +static int setup_tpmring(struct xenbus_device *dev,
44146 +                         struct tpm_private *tp)
44147 +{
44148 +       tpmif_tx_interface_t *sring;
44149 +       int err;
44150 +
44151 +       tp->ring_ref = GRANT_INVALID_REF;
44152 +
44153 +       sring = (void *)__get_free_page(GFP_KERNEL);
44154 +       if (!sring) {
44155 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
44156 +               return -ENOMEM;
44157 +       }
44158 +       tp->tx = sring;
44159 +
44160 +       err = xenbus_grant_ring(dev, virt_to_mfn(tp->tx));
44161 +       if (err < 0) {
44162 +               free_page((unsigned long)sring);
44163 +               tp->tx = NULL;
44164 +               xenbus_dev_fatal(dev, err, "allocating grant reference");
44165 +               goto fail;
44166 +       }
44167 +       tp->ring_ref = err;
44168 +
44169 +       err = tpmif_connect(dev, tp, dev->otherend_id);
44170 +       if (err)
44171 +               goto fail;
44172 +
44173 +       return 0;
44174 +fail:
44175 +       destroy_tpmring(tp);
44176 +       return err;
44177 +}
44178 +
44179 +
44180 +static void destroy_tpmring(struct tpm_private *tp)
44181 +{
44182 +       tpmif_set_connected_state(tp, 0);
44183 +
44184 +       if (tp->ring_ref != GRANT_INVALID_REF) {
44185 +               gnttab_end_foreign_access(tp->ring_ref, 0,
44186 +                                         (unsigned long)tp->tx);
44187 +               tp->ring_ref = GRANT_INVALID_REF;
44188 +               tp->tx = NULL;
44189 +       }
44190 +
44191 +       if (tp->irq)
44192 +               unbind_from_irqhandler(tp->irq, tp);
44193 +
44194 +       tp->evtchn = tp->irq = 0;
44195 +}
44196 +
44197 +
44198 +static int talk_to_backend(struct xenbus_device *dev,
44199 +                           struct tpm_private *tp)
44200 +{
44201 +       const char *message = NULL;
44202 +       int err;
44203 +       struct xenbus_transaction xbt;
44204 +
44205 +       err = setup_tpmring(dev, tp);
44206 +       if (err) {
44207 +               xenbus_dev_fatal(dev, err, "setting up ring");
44208 +               goto out;
44209 +       }
44210 +
44211 +again:
44212 +       err = xenbus_transaction_start(&xbt);
44213 +       if (err) {
44214 +               xenbus_dev_fatal(dev, err, "starting transaction");
44215 +               goto destroy_tpmring;
44216 +       }
44217 +
44218 +       err = xenbus_printf(xbt, dev->nodename,
44219 +                           "ring-ref","%u", tp->ring_ref);
44220 +       if (err) {
44221 +               message = "writing ring-ref";
44222 +               goto abort_transaction;
44223 +       }
44224 +
44225 +       err = xenbus_printf(xbt, dev->nodename,
44226 +                           "event-channel", "%u", tp->evtchn);
44227 +       if (err) {
44228 +               message = "writing event-channel";
44229 +               goto abort_transaction;
44230 +       }
44231 +
44232 +       err = xenbus_transaction_end(xbt, 0);
44233 +       if (err == -EAGAIN)
44234 +               goto again;
44235 +       if (err) {
44236 +               xenbus_dev_fatal(dev, err, "completing transaction");
44237 +               goto destroy_tpmring;
44238 +       }
44239 +
44240 +       xenbus_switch_state(dev, XenbusStateConnected);
44241 +
44242 +       return 0;
44243 +
44244 +abort_transaction:
44245 +       xenbus_transaction_end(xbt, 1);
44246 +       if (message)
44247 +               xenbus_dev_error(dev, err, "%s", message);
44248 +destroy_tpmring:
44249 +       destroy_tpmring(tp);
44250 +out:
44251 +       return err;
44252 +}
44253 +
44254 +/**
44255 + * Callback received when the backend's state changes.
44256 + */
44257 +static void backend_changed(struct xenbus_device *dev,
44258 +                           enum xenbus_state backend_state)
44259 +{
44260 +       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
44261 +       DPRINTK("\n");
44262 +
44263 +       switch (backend_state) {
44264 +       case XenbusStateInitialising:
44265 +       case XenbusStateInitWait:
44266 +       case XenbusStateInitialised:
44267 +       case XenbusStateUnknown:
44268 +               break;
44269 +
44270 +       case XenbusStateConnected:
44271 +               tpmif_set_connected_state(tp, 1);
44272 +               break;
44273 +
44274 +       case XenbusStateClosing:
44275 +               tpmif_set_connected_state(tp, 0);
44276 +               break;
44277 +
44278 +       case XenbusStateClosed:
44279 +               if (tp->is_suspended == 0) {
44280 +                       device_unregister(&dev->dev);
44281 +               }
44282 +               xenbus_switch_state(dev, XenbusStateClosed);
44283 +               break;
44284 +       }
44285 +}
44286 +
44287 +struct tpm_virtual_device tvd = {
44288 +       .max_tx_size = PAGE_SIZE * TPMIF_TX_RING_SIZE,
44289 +};
44290 +
44291 +static int tpmfront_probe(struct xenbus_device *dev,
44292 +                          const struct xenbus_device_id *id)
44293 +{
44294 +       int err;
44295 +       int handle;
44296 +       struct tpm_private *tp = tpm_private_get();
44297 +
44298 +       if (!tp)
44299 +               return -ENOMEM;
44300 +
44301 +       tp->chip = init_vtpm(&dev->dev, &tvd, tp);
44302 +
44303 +       if (IS_ERR(tp->chip)) {
44304 +               return PTR_ERR(tp->chip);
44305 +       }
44306 +
44307 +       err = xenbus_scanf(XBT_NIL, dev->nodename,
44308 +                          "handle", "%i", &handle);
44309 +       if (XENBUS_EXIST_ERR(err))
44310 +               return err;
44311 +
44312 +       if (err < 0) {
44313 +               xenbus_dev_fatal(dev,err,"reading virtual-device");
44314 +               return err;
44315 +       }
44316 +
44317 +       tp->dev = dev;
44318 +
44319 +       err = talk_to_backend(dev, tp);
44320 +       if (err) {
44321 +               tpm_private_put();
44322 +               return err;
44323 +       }
44324 +       return 0;
44325 +}
44326 +
44327 +
44328 +static int tpmfront_remove(struct xenbus_device *dev)
44329 +{
44330 +       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
44331 +       destroy_tpmring(tp);
44332 +       cleanup_vtpm(&dev->dev);
44333 +       return 0;
44334 +}
44335 +
44336 +static int tpmfront_suspend(struct xenbus_device *dev)
44337 +{
44338 +       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
44339 +       u32 ctr;
44340 +       /* lock, so no app can send */
44341 +       mutex_lock(&suspend_lock);
44342 +       tp->is_suspended = 1;
44343 +
44344 +       for (ctr = 0; atomic_read(&tp->tx_busy) && ctr <= 25; ctr++) {
44345 +               if ((ctr % 10) == 0)
44346 +                       printk("TPM-FE [INFO]: Waiting for outstanding request.\n");
44347 +               /*
44348 +                * Wait for a request to be responded to.
44349 +                */
44350 +               interruptible_sleep_on_timeout(&tp->wait_q, 100);
44351 +       }
44352 +       xenbus_switch_state(dev, XenbusStateClosing);
44353 +
44354 +       if (atomic_read(&tp->tx_busy)) {
44355 +               /*
44356 +                * A temporary work-around.
44357 +                */
44358 +               printk("TPM-FE [WARNING]: Resetting busy flag.\n");
44359 +               atomic_set(&tp->tx_busy, 0);
44360 +       }
44361 +
44362 +       return 0;
44363 +}
44364 +
44365 +static int tpmfront_resume(struct xenbus_device *dev)
44366 +{
44367 +       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
44368 +       destroy_tpmring(tp);
44369 +       return talk_to_backend(dev, tp);
44370 +}
44371 +
44372 +static int tpmif_connect(struct xenbus_device *dev,
44373 +                         struct tpm_private *tp,
44374 +                         domid_t domid)
44375 +{
44376 +       int err;
44377 +
44378 +       tp->backend_id = domid;
44379 +
44380 +       err = xenbus_alloc_evtchn(dev, &tp->evtchn);
44381 +       if (err)
44382 +               return err;
44383 +
44384 +       err = bind_evtchn_to_irqhandler(tp->evtchn,
44385 +                                       tpmif_int, SA_SAMPLE_RANDOM, "tpmif",
44386 +                                       tp);
44387 +       if (err <= 0) {
44388 +               WPRINTK("bind_evtchn_to_irqhandler failed (err=%d)\n", err);
44389 +               return err;
44390 +       }
44391 +
44392 +       tp->irq = err;
44393 +       return 0;
44394 +}
44395 +
44396 +static struct xenbus_device_id tpmfront_ids[] = {
44397 +       { "vtpm" },
44398 +       { "" }
44399 +};
44400 +
44401 +static struct xenbus_driver tpmfront = {
44402 +       .name = "vtpm",
44403 +       .owner = THIS_MODULE,
44404 +       .ids = tpmfront_ids,
44405 +       .probe = tpmfront_probe,
44406 +       .remove =  tpmfront_remove,
44407 +       .resume = tpmfront_resume,
44408 +       .otherend_changed = backend_changed,
44409 +       .suspend = tpmfront_suspend,
44410 +};
44411 +
44412 +static void __init init_tpm_xenbus(void)
44413 +{
44414 +       xenbus_register_frontend(&tpmfront);
44415 +}
44416 +
44417 +static void __exit exit_tpm_xenbus(void)
44418 +{
44419 +       xenbus_unregister_driver(&tpmfront);
44420 +}
44421 +
44422 +static int tpmif_allocate_tx_buffers(struct tpm_private *tp)
44423 +{
44424 +       unsigned int i;
44425 +
44426 +       for (i = 0; i < TPMIF_TX_RING_SIZE; i++) {
44427 +               tp->tx_buffers[i] = tx_buffer_alloc();
44428 +               if (!tp->tx_buffers[i]) {
44429 +                       tpmif_free_tx_buffers(tp);
44430 +                       return -ENOMEM;
44431 +               }
44432 +       }
44433 +       return 0;
44434 +}
44435 +
44436 +static void tpmif_free_tx_buffers(struct tpm_private *tp)
44437 +{
44438 +       unsigned int i;
44439 +
44440 +       for (i = 0; i < TPMIF_TX_RING_SIZE; i++) {
44441 +               tx_buffer_free(tp->tx_buffers[i]);
44442 +       }
44443 +}
44444 +
44445 +static void tpmif_rx_action(unsigned long priv)
44446 +{
44447 +       struct tpm_private *tp = (struct tpm_private *)priv;
44448 +
44449 +       int i = 0;
44450 +       unsigned int received;
44451 +       unsigned int offset = 0;
44452 +       u8 *buffer;
44453 +       tpmif_tx_request_t *tx;
44454 +       tx = &tp->tx->ring[i].req;
44455 +
44456 +       atomic_set(&tp->tx_busy, 0);
44457 +       wake_up_interruptible(&tp->wait_q);
44458 +
44459 +       received = tx->size;
44460 +
44461 +       buffer = kmalloc(received, GFP_ATOMIC);
44462 +       if (NULL == buffer) {
44463 +               goto exit;
44464 +       }
44465 +
44466 +       for (i = 0; i < TPMIF_TX_RING_SIZE && offset < received; i++) {
44467 +               struct tx_buffer *txb = tp->tx_buffers[i];
44468 +               tpmif_tx_request_t *tx;
44469 +               unsigned int tocopy;
44470 +
44471 +               tx = &tp->tx->ring[i].req;
44472 +               tocopy = tx->size;
44473 +               if (tocopy > PAGE_SIZE) {
44474 +                       tocopy = PAGE_SIZE;
44475 +               }
44476 +
44477 +               memcpy(&buffer[offset], txb->data, tocopy);
44478 +
44479 +               gnttab_release_grant_reference(&gref_head, tx->ref);
44480 +
44481 +               offset += tocopy;
44482 +       }
44483 +
44484 +       vtpm_vd_recv(tp->chip, buffer, received, tp->tx_remember);
44485 +       kfree(buffer);
44486 +
44487 +exit:
44488 +
44489 +       return;
44490 +}
44491 +
44492 +
44493 +static irqreturn_t tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs)
44494 +{
44495 +       struct tpm_private *tp = tpm_priv;
44496 +       unsigned long flags;
44497 +
44498 +       spin_lock_irqsave(&tp->tx_lock, flags);
44499 +       tpmif_rx_tasklet.data = (unsigned long)tp;
44500 +       tasklet_schedule(&tpmif_rx_tasklet);
44501 +       spin_unlock_irqrestore(&tp->tx_lock, flags);
44502 +
44503 +       return IRQ_HANDLED;
44504 +}
44505 +
44506 +
44507 +static int tpm_xmit(struct tpm_private *tp,
44508 +                    const u8 * buf, size_t count, int isuserbuffer,
44509 +                    void *remember)
44510 +{
44511 +       tpmif_tx_request_t *tx;
44512 +       TPMIF_RING_IDX i;
44513 +       unsigned int offset = 0;
44514 +
44515 +       spin_lock_irq(&tp->tx_lock);
44516 +
44517 +       if (unlikely(atomic_read(&tp->tx_busy))) {
44518 +               printk("tpm_xmit: There's an outstanding request/response "
44519 +                      "on the way!\n");
44520 +               spin_unlock_irq(&tp->tx_lock);
44521 +               return -EBUSY;
44522 +       }
44523 +
44524 +       if (tp->is_connected != 1) {
44525 +               spin_unlock_irq(&tp->tx_lock);
44526 +               return -EIO;
44527 +       }
44528 +
44529 +       for (i = 0; count > 0 && i < TPMIF_TX_RING_SIZE; i++) {
44530 +               struct tx_buffer *txb = tp->tx_buffers[i];
44531 +               int copied;
44532 +
44533 +               if (NULL == txb) {
44534 +                       DPRINTK("txb (i=%d) is NULL. buffers initialized?\n"
44535 +                               "Not transmitting anything!\n", i);
44536 +                       spin_unlock_irq(&tp->tx_lock);
44537 +                       return -EFAULT;
44538 +               }
44539 +               copied = tx_buffer_copy(txb, &buf[offset], count,
44540 +                                       isuserbuffer);
44541 +               if (copied < 0) {
44542 +                       /* An error occurred */
44543 +                       spin_unlock_irq(&tp->tx_lock);
44544 +                       return copied;
44545 +               }
44546 +               count -= copied;
44547 +               offset += copied;
44548 +
44549 +               tx = &tp->tx->ring[i].req;
44550 +
44551 +               tx->addr = virt_to_machine(txb->data);
44552 +               tx->size = txb->len;
44553 +
44554 +               DPRINTK("First 4 characters sent by TPM-FE are 0x%02x 0x%02x 0x%02x 0x%02x\n",
44555 +                       txb->data[0],txb->data[1],txb->data[2],txb->data[3]);
44556 +
44557 +               /* get the granttable reference for this page */
44558 +               tx->ref = gnttab_claim_grant_reference(&gref_head);
44559 +
44560 +               if (-ENOSPC == tx->ref) {
44561 +                       spin_unlock_irq(&tp->tx_lock);
44562 +                       DPRINTK(" Grant table claim reference failed in func:%s line:%d file:%s\n", __FUNCTION__, __LINE__, __FILE__);
44563 +                       return -ENOSPC;
44564 +               }
44565 +               gnttab_grant_foreign_access_ref( tx->ref,
44566 +                                                tp->backend_id,
44567 +                                                (tx->addr >> PAGE_SHIFT),
44568 +                                                0 /*RW*/);
44569 +               wmb();
44570 +       }
44571 +
44572 +       atomic_set(&tp->tx_busy, 1);
44573 +       tp->tx_remember = remember;
44574 +
44575 +       mb();
44576 +
44577 +       DPRINTK("Notifying backend via event channel %d\n",
44578 +               tp->evtchn);
44579 +
44580 +       notify_remote_via_irq(tp->irq);
44581 +
44582 +       spin_unlock_irq(&tp->tx_lock);
44583 +       return offset;
44584 +}
44585 +
44586 +
44587 +static void tpmif_notify_upperlayer(struct tpm_private *tp)
44588 +{
44589 +       /*
44590 +        * Notify upper layer about the state of the connection
44591 +        * to the BE.
44592 +        */
44593 +       if (tp->is_connected) {
44594 +               vtpm_vd_status(tp->chip, TPM_VD_STATUS_CONNECTED);
44595 +       } else {
44596 +               vtpm_vd_status(tp->chip, TPM_VD_STATUS_DISCONNECTED);
44597 +       }
44598 +}
44599 +
44600 +
44601 +static void tpmif_set_connected_state(struct tpm_private *tp, u8 is_connected)
44602 +{
44603 +       /*
44604 +        * Don't notify upper layer if we are in suspend mode and
44605 +        * should disconnect - assumption is that we will resume.
44606 +        * The mutex keeps apps from sending.
44607 +        */
44608 +       if (is_connected == 0 && tp->is_suspended == 1) {
44609 +               return;
44610 +       }
44611 +
44612 +       /*
44613 +        * Unlock the mutex if we are connected again
44614 +        * after being suspended - now resuming.
44615 +        * This also removes the suspend state.
44616 +        */
44617 +       if (is_connected == 1 && tp->is_suspended == 1) {
44618 +               tp->is_suspended = 0;
44619 +               /* unlock, so apps can resume sending */
44620 +               mutex_unlock(&suspend_lock);
44621 +       }
44622 +
44623 +       if (is_connected != tp->is_connected) {
44624 +               tp->is_connected = is_connected;
44625 +               tpmif_notify_upperlayer(tp);
44626 +       }
44627 +}
44628 +
44629 +
44630 +
44631 +/* =================================================================
44632 + * Initialization function.
44633 + * =================================================================
44634 + */
44635 +
44636 +
44637 +static int __init tpmif_init(void)
44638 +{
44639 +       long rc = 0;
44640 +       struct tpm_private *tp;
44641 +
44642 +       if ((xen_start_info->flags & SIF_INITDOMAIN)) {
44643 +               return -EPERM;
44644 +       }
44645 +
44646 +       tp = tpm_private_get();
44647 +       if (!tp) {
44648 +               rc = -ENOMEM;
44649 +               goto failexit;
44650 +       }
44651 +
44652 +       IPRINTK("Initialising the vTPM driver.\n");
44653 +       if ( gnttab_alloc_grant_references ( TPMIF_TX_RING_SIZE,
44654 +                                            &gref_head ) < 0) {
44655 +               rc = -EFAULT;
44656 +               goto gnttab_alloc_failed;
44657 +       }
44658 +
44659 +       init_tpm_xenbus();
44660 +       return 0;
44661 +
44662 +gnttab_alloc_failed:
44663 +       tpm_private_put();
44664 +failexit:
44665 +
44666 +       return (int)rc;
44667 +}
44668 +
44669 +
44670 +void __exit tpmif_exit(void)
44671 +{
44672 +       exit_tpm_xenbus();
44673 +       tpm_private_put();
44674 +       gnttab_free_grant_references(gref_head);
44675 +}
44676 +
44677 +module_init(tpmif_init);
44678 +
44679 +MODULE_LICENSE("Dual BSD/GPL");
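
In tpm_xen.c above, tpm_xmit() scatters a request across the page-sized tx_buffers of the shared ring, grants the backend access to each page and notifies it over the event channel; tpmif_rx_action() later gathers the response from the same slots. The splitting step can be illustrated with this self-contained sketch (slot and ring sizes are made up for the demo; this is not kernel code):

#include <stdio.h>
#include <string.h>

#define RING_SIZE  4
#define SLOT_SIZE  8            /* stands in for PAGE_SIZE */

static char slots[RING_SIZE][SLOT_SIZE];

static int xmit_demo(const char *buf, size_t count)
{
        size_t offset = 0;
        int i;

        for (i = 0; count > 0 && i < RING_SIZE; i++) {
                size_t chunk = count < SLOT_SIZE ? count : SLOT_SIZE;

                memcpy(slots[i], buf + offset, chunk);
                printf("slot %d carries %zu bytes\n", i, chunk);
                offset += chunk;
                count -= chunk;
        }
        return (int)offset;     /* bytes actually placed on the ring */
}

int main(void)
{
        const char msg[] = "a request longer than one slot";

        printf("placed %d of %zu bytes\n", xmit_demo(msg, strlen(msg)), strlen(msg));
        return 0;
}
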
44680 diff -urNp linux-2.6/drivers/char/tty_io.c new/drivers/char/tty_io.c
44681 --- linux-2.6/drivers/char/tty_io.c     2006-07-03 14:14:34.000000000 +0200
44682 +++ new/drivers/char/tty_io.c   2006-05-27 02:49:07.000000000 +0200
44683 @@ -132,6 +132,8 @@ LIST_HEAD(tty_drivers);                     /* linked list
44684     vt.c for deeply disgusting hack reasons */
44685  DEFINE_MUTEX(tty_mutex);
44686  
44687 +int console_use_vt = 1;
44688 +
44689  #ifdef CONFIG_UNIX98_PTYS
44690  extern struct tty_driver *ptm_driver;  /* Unix98 pty masters; for /dev/ptmx */
44691  extern int pty_limit;          /* Config limit on Unix98 ptys */
44692 @@ -2060,7 +2062,7 @@ retry_open:
44693                 goto got_driver;
44694         }
44695  #ifdef CONFIG_VT
44696 -       if (device == MKDEV(TTY_MAJOR,0)) {
44697 +       if (console_use_vt && (device == MKDEV(TTY_MAJOR,0))) {
44698                 extern struct tty_driver *console_driver;
44699                 driver = console_driver;
44700                 index = fg_console;
44701 @@ -3258,6 +3260,8 @@ static int __init tty_init(void)
44702  #endif
44703  
44704  #ifdef CONFIG_VT
44705 +       if (!console_use_vt)
44706 +               goto out_vt;
44707         cdev_init(&vc0_cdev, &console_fops);
44708         if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) ||
44709             register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0)
44710 @@ -3266,6 +3270,7 @@ static int __init tty_init(void)
44711         class_device_create(tty_class, NULL, MKDEV(TTY_MAJOR, 0), NULL, "tty0");
44712  
44713         vty_init();
44714 + out_vt:
44715  #endif
44716         return 0;
44717  }
44718 diff -urNp linux-2.6/drivers/firmware/Kconfig new/drivers/firmware/Kconfig
44719 --- linux-2.6/drivers/firmware/Kconfig  2006-07-03 14:14:35.000000000 +0200
44720 +++ new/drivers/firmware/Kconfig        2006-05-09 12:33:37.000000000 +0200
44721 @@ -8,7 +8,7 @@ menu "Firmware Drivers"
44722  config EDD
44723         tristate "BIOS Enhanced Disk Drive calls determine boot disk (EXPERIMENTAL)"
44724         depends on EXPERIMENTAL
44725 -       depends on !IA64
44726 +       depends on !IA64 && !XEN
44727         help
44728           Say Y or M here if you want to enable BIOS Enhanced Disk Drive
44729           Services real mode BIOS calls to determine which disk
44730 diff -urNp linux-2.6/drivers/ide/ide-lib.c new/drivers/ide/ide-lib.c
44731 --- linux-2.6/drivers/ide/ide-lib.c     2006-07-03 14:14:36.000000000 +0200
44732 +++ new/drivers/ide/ide-lib.c   2006-05-27 02:49:07.000000000 +0200
44733 @@ -410,10 +410,10 @@ void ide_toggle_bounce(ide_drive_t *driv
44734  {
44735         u64 addr = BLK_BOUNCE_HIGH;     /* dma64_addr_t */
44736  
44737 -       if (!PCI_DMA_BUS_IS_PHYS) {
44738 -               addr = BLK_BOUNCE_ANY;
44739 -       } else if (on && drive->media == ide_disk) {
44740 -               if (HWIF(drive)->pci_dev)
44741 +       if (on && drive->media == ide_disk) {
44742 +               if (!PCI_DMA_BUS_IS_PHYS)
44743 +                       addr = BLK_BOUNCE_ANY;
44744 +               else if (HWIF(drive)->pci_dev)
44745                         addr = HWIF(drive)->pci_dev->dma_mask;
44746         }
44747  
44748 diff -urNp linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_multicast.c new/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
44749 --- linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_multicast.c    2006-07-03 14:14:40.000000000 +0200
44750 +++ new/drivers/infiniband/ulp/ipoib/ipoib_multicast.c  2006-07-07 15:21:25.000000000 +0200
44751 @@ -821,7 +821,8 @@ void ipoib_mcast_restart_task(void *dev_
44752  
44753         ipoib_mcast_stop_thread(dev, 0);
44754  
44755 -       spin_lock_irqsave(&dev->xmit_lock, flags);
44756 +       local_irq_save(flags);
44757 +       netif_tx_lock(dev);
44758         spin_lock(&priv->lock);
44759  
44760         /*
44761 @@ -896,7 +897,8 @@ void ipoib_mcast_restart_task(void *dev_
44762         }
44763  
44764         spin_unlock(&priv->lock);
44765 -       spin_unlock_irqrestore(&dev->xmit_lock, flags);
44766 +       netif_tx_unlock(dev);
44767 +       local_irq_restore(flags);
44768  
44769         /* We have to cancel outside of the spinlock */
44770         list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
44771 diff -urNp linux-2.6/drivers/Makefile new/drivers/Makefile
44772 --- linux-2.6/drivers/Makefile  2006-07-03 14:14:31.000000000 +0200
44773 +++ new/drivers/Makefile        2006-05-09 12:33:23.000000000 +0200
44774 @@ -31,6 +31,7 @@ obj-y                         += base/ block/ misc/ mfd/ net/
44775  obj-$(CONFIG_NUBUS)            += nubus/
44776  obj-$(CONFIG_ATM)              += atm/
44777  obj-$(CONFIG_PPC_PMAC)         += macintosh/
44778 +obj-$(CONFIG_XEN)              += xen/
44779  obj-$(CONFIG_IDE)              += ide/
44780  obj-$(CONFIG_FC4)              += fc4/
44781  obj-$(CONFIG_SCSI)             += scsi/
44782 diff -urNp linux-2.6/drivers/media/dvb/dvb-core/dvb_net.c new/drivers/media/dvb/dvb-core/dvb_net.c
44783 --- linux-2.6/drivers/media/dvb/dvb-core/dvb_net.c      2006-07-03 14:14:43.000000000 +0200
44784 +++ new/drivers/media/dvb/dvb-core/dvb_net.c    2006-07-07 15:21:25.000000000 +0200
44785 @@ -1052,7 +1052,7 @@ static void wq_set_multicast_list (void 
44786  
44787         dvb_net_feed_stop(dev);
44788         priv->rx_mode = RX_MODE_UNI;
44789 -       spin_lock_bh(&dev->xmit_lock);
44790 +       netif_tx_lock_bh(dev);
44791  
44792         if (dev->flags & IFF_PROMISC) {
44793                 dprintk("%s: promiscuous mode\n", dev->name);
44794 @@ -1077,7 +1077,7 @@ static void wq_set_multicast_list (void 
44795                 }
44796         }
44797  
44798 -       spin_unlock_bh(&dev->xmit_lock);
44799 +       netif_tx_unlock_bh(dev);
44800         dvb_net_feed_start(dev);
44801  }
44802  
44803 diff -urNp linux-2.6/drivers/net/8139cp.c new/drivers/net/8139cp.c
44804 --- linux-2.6/drivers/net/8139cp.c      2006-07-03 14:14:45.000000000 +0200
44805 +++ new/drivers/net/8139cp.c    2006-07-07 15:21:26.000000000 +0200
44806 @@ -792,7 +792,7 @@ static int cp_start_xmit (struct sk_buff
44807         entry = cp->tx_head;
44808         eor = (entry == (CP_TX_RING_SIZE - 1)) ? RingEnd : 0;
44809         if (dev->features & NETIF_F_TSO)
44810 -               mss = skb_shinfo(skb)->tso_size;
44811 +               mss = skb_shinfo(skb)->gso_size;
44812  
44813         if (skb_shinfo(skb)->nr_frags == 0) {
44814                 struct cp_desc *txd = &cp->tx_ring[entry];
44815 diff -urNp linux-2.6/drivers/net/bnx2.c new/drivers/net/bnx2.c
44816 --- linux-2.6/drivers/net/bnx2.c        2006-07-03 14:14:45.000000000 +0200
44817 +++ new/drivers/net/bnx2.c      2006-07-07 15:21:26.000000000 +0200
44818 @@ -1638,7 +1638,7 @@ bnx2_tx_int(struct bnx2 *bp)
44819                 skb = tx_buf->skb;
44820  #ifdef BCM_TSO 
44821                 /* partial BD completions possible with TSO packets */
44822 -               if (skb_shinfo(skb)->tso_size) {
44823 +               if (skb_shinfo(skb)->gso_size) {
44824                         u16 last_idx, last_ring_idx;
44825  
44826                         last_idx = sw_cons +
44827 @@ -2009,7 +2009,7 @@ bnx2_poll(struct net_device *dev, int *b
44828         return 1;
44829  }
44830  
44831 -/* Called with rtnl_lock from vlan functions and also dev->xmit_lock
44832 +/* Called with rtnl_lock from vlan functions and also netif_tx_lock
44833   * from set_multicast.
44834   */
44835  static void
44836 @@ -4252,7 +4252,7 @@ bnx2_vlan_rx_kill_vid(struct net_device 
44837  }
44838  #endif
44839  
44840 -/* Called with dev->xmit_lock.
44841 +/* Called with netif_tx_lock.
44842   * hard_start_xmit is pseudo-lockless - a lock is only required when
44843   * the tx queue is full. This way, we get the benefit of lockless
44844   * operations most of the time without the complexities to handle
44845 @@ -4290,7 +4290,7 @@ bnx2_start_xmit(struct sk_buff *skb, str
44846                         (TX_BD_FLAGS_VLAN_TAG | (vlan_tx_tag_get(skb) << 16));
44847         }
44848  #ifdef BCM_TSO 
44849 -       if ((mss = skb_shinfo(skb)->tso_size) &&
44850 +       if ((mss = skb_shinfo(skb)->gso_size) &&
44851                 (skb->len > (bp->dev->mtu + ETH_HLEN))) {
44852                 u32 tcp_opt_len, ip_tcp_len;
44853  
44854 diff -urNp linux-2.6/drivers/net/bonding/bond_main.c new/drivers/net/bonding/bond_main.c
44855 --- linux-2.6/drivers/net/bonding/bond_main.c   2006-07-03 14:14:46.000000000 +0200
44856 +++ new/drivers/net/bonding/bond_main.c 2006-07-07 15:21:27.000000000 +0200
44857 @@ -1199,8 +1199,7 @@ int bond_sethwaddr(struct net_device *bo
44858  }
44859  
44860  #define BOND_INTERSECT_FEATURES \
44861 -       (NETIF_F_SG|NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM|\
44862 -       NETIF_F_TSO|NETIF_F_UFO)
44863 +       (NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_TSO | NETIF_F_UFO)
44864  
44865  /* 
44866   * Compute the common dev->feature set available to all slaves.  Some
44867 @@ -1218,9 +1217,7 @@ static int bond_compute_features(struct 
44868                 features &= (slave->dev->features & BOND_INTERSECT_FEATURES);
44869  
44870         if ((features & NETIF_F_SG) && 
44871 -           !(features & (NETIF_F_IP_CSUM |
44872 -                         NETIF_F_NO_CSUM |
44873 -                         NETIF_F_HW_CSUM)))
44874 +           !(features & NETIF_F_ALL_CSUM))
44875                 features &= ~NETIF_F_SG;
44876  
44877         /* 
44878 @@ -4191,7 +4188,7 @@ static int bond_init(struct net_device *
44879          */
44880         bond_dev->features |= NETIF_F_VLAN_CHALLENGED;
44881  
44882 -       /* don't acquire bond device's xmit_lock when 
44883 +       /* don't acquire bond device's netif_tx_lock when
44884          * transmitting */
44885         bond_dev->features |= NETIF_F_LLTX;
44886  
44887 diff -urNp linux-2.6/drivers/net/chelsio/sge.c new/drivers/net/chelsio/sge.c
44888 --- linux-2.6/drivers/net/chelsio/sge.c 2006-07-03 14:14:46.000000000 +0200
44889 +++ new/drivers/net/chelsio/sge.c       2006-07-07 15:21:27.000000000 +0200
44890 @@ -1418,7 +1418,7 @@ int t1_start_xmit(struct sk_buff *skb, s
44891         struct cpl_tx_pkt *cpl;
44892  
44893  #ifdef NETIF_F_TSO
44894 -       if (skb_shinfo(skb)->tso_size) {
44895 +       if (skb_shinfo(skb)->gso_size) {
44896                 int eth_type;
44897                 struct cpl_tx_pkt_lso *hdr;
44898  
44899 @@ -1433,7 +1433,7 @@ int t1_start_xmit(struct sk_buff *skb, s
44900                 hdr->ip_hdr_words = skb->nh.iph->ihl;
44901                 hdr->tcp_hdr_words = skb->h.th->doff;
44902                 hdr->eth_type_mss = htons(MK_ETH_TYPE_MSS(eth_type,
44903 -                                               skb_shinfo(skb)->tso_size));
44904 +                                               skb_shinfo(skb)->gso_size));
44905                 hdr->len = htonl(skb->len - sizeof(*hdr));
44906                 cpl = (struct cpl_tx_pkt *)hdr;
44907                 sge->stats.tx_lso_pkts++;
44908 diff -urNp linux-2.6/drivers/net/e1000/e1000_main.c new/drivers/net/e1000/e1000_main.c
44909 --- linux-2.6/drivers/net/e1000/e1000_main.c    2006-07-03 14:14:46.000000000 +0200
44910 +++ new/drivers/net/e1000/e1000_main.c  2006-07-07 16:40:08.000000000 +0200
44911 @@ -2413,7 +2413,7 @@ e1000_tso(struct e1000_adapter *adapter,
44912         uint8_t ipcss, ipcso, tucss, tucso, hdr_len;
44913         int err;
44914  
44915 -       if (skb_shinfo(skb)->tso_size) {
44916 +       if (skb_shinfo(skb)->gso_size) {
44917                 if (skb_header_cloned(skb)) {
44918                         err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
44919                         if (err)
44920 @@ -2421,7 +2421,7 @@ e1000_tso(struct e1000_adapter *adapter,
44921                 }
44922  
44923                 hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
44924 -               mss = skb_shinfo(skb)->tso_size;
44925 +               mss = skb_shinfo(skb)->gso_size;
44926                 if (skb->protocol == ntohs(ETH_P_IP)) {
44927                         skb->nh.iph->tot_len = 0;
44928                         skb->nh.iph->check = 0;
44929 @@ -2538,7 +2538,7 @@ e1000_tx_map(struct e1000_adapter *adapt
44930                  * tso gets written back prematurely before the data is fully
44931                  * DMA'd to the controller */
44932                 if (!skb->data_len && tx_ring->last_tx_tso &&
44933 -                   !skb_shinfo(skb)->tso_size) {
44934 +                               !skb_shinfo(skb)->gso_size) {
44935                         tx_ring->last_tx_tso = 0;
44936                         size -= 4;
44937                 }
44938 @@ -2776,7 +2776,7 @@ e1000_xmit_frame(struct sk_buff *skb, st
44939         }
44940  
44941  #ifdef NETIF_F_TSO
44942 -       mss = skb_shinfo(skb)->tso_size;
44943 +       mss = skb_shinfo(skb)->gso_size;
44944         /* The controller does a simple calculation to 
44945          * make sure there is enough room in the FIFO before
44946          * initiating the DMA for each buffer.  The calc is:
44947 @@ -2826,7 +2826,7 @@ e1000_xmit_frame(struct sk_buff *skb, st
44948  #ifdef NETIF_F_TSO
44949         /* Controller Erratum workaround */
44950         if (!skb->data_len && tx_ring->last_tx_tso &&
44951 -           !skb_shinfo(skb)->tso_size)
44952 +           !skb_shinfo(skb)->gso_size)
44953                 count++;
44954  #endif
44955  
44956 diff -urNp linux-2.6/drivers/net/forcedeth.c new/drivers/net/forcedeth.c
44957 --- linux-2.6/drivers/net/forcedeth.c   2006-07-03 14:14:46.000000000 +0200
44958 +++ new/drivers/net/forcedeth.c 2006-07-07 15:57:14.000000000 +0200
44959 @@ -533,9 +533,9 @@ typedef union _ring_type {
44960   * critical parts:
44961   * - rx is (pseudo-) lockless: it relies on the single-threading provided
44962   *     by the arch code for interrupts.
44963 - * - tx setup is lockless: it relies on dev->xmit_lock. Actual submission
44964 + * - tx setup is lockless: it relies on netif_tx_lock. Actual submission
44965   *     needs dev->priv->lock :-(
44966 - * - set_multicast_list: preparation lockless, relies on dev->xmit_lock.
44967 + * - set_multicast_list: preparation lockless, relies on netif_tx_lock.
44968   */
44969  
44970  /* in dev: base, irq */
44971 @@ -1213,7 +1213,7 @@ static void drain_ring(struct net_device
44972  
44973  /*
44974   * nv_start_xmit: dev->hard_start_xmit function
44975 - * Called with dev->xmit_lock held.
44976 + * Called with netif_tx_lock held.
44977   */
44978  static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
44979  {
44980 @@ -1303,8 +1303,8 @@ static int nv_start_xmit(struct sk_buff 
44981         np->tx_skbuff[nr] = skb;
44982  
44983  #ifdef NETIF_F_TSO
44984 -       if (skb_shinfo(skb)->tso_size)
44985 -               tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->tso_size << NV_TX2_TSO_SHIFT);
44986 +       if (skb_shinfo(skb)->gso_size)
44987 +               tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->gso_size << NV_TX2_TSO_SHIFT);
44988         else
44989  #endif
44990         tx_flags_extra = (skb->ip_summed == CHECKSUM_HW ? (NV_TX2_CHECKSUM_L3|NV_TX2_CHECKSUM_L4) : 0);
44991 @@ -1407,7 +1407,7 @@ static void nv_tx_done(struct net_device
44992  
44993  /*
44994   * nv_tx_timeout: dev->tx_timeout function
44995 - * Called with dev->xmit_lock held.
44996 + * Called with netif_tx_lock held.
44997   */
44998  static void nv_tx_timeout(struct net_device *dev)
44999  {
45000 @@ -1736,8 +1736,8 @@ static int nv_change_mtu(struct net_devi
45001                  * guessed, there is probably a simpler approach.
45002                  * Changing the MTU is a rare event, it shouldn't matter.
45003                  */
45004 -               nv_disable_irq(dev);
45005 -               spin_lock_bh(&dev->xmit_lock);
45006 +               disable_irq(dev->irq);
45007 +               netif_tx_lock_bh(dev);
45008                 spin_lock(&np->lock);
45009                 /* stop engines */
45010                 nv_stop_rx(dev);
45011 @@ -1768,8 +1768,8 @@ static int nv_change_mtu(struct net_devi
45012                 nv_start_rx(dev);
45013                 nv_start_tx(dev);
45014                 spin_unlock(&np->lock);
45015 -               spin_unlock_bh(&dev->xmit_lock);
45016 -               nv_enable_irq(dev);
45017 +               netif_tx_unlock_bh(dev);
45018 +               enable_irq(dev->irq);
45019         }
45020         return 0;
45021  }
45022 @@ -1803,7 +1803,7 @@ static int nv_set_mac_address(struct net
45023         memcpy(dev->dev_addr, macaddr->sa_data, ETH_ALEN);
45024  
45025         if (netif_running(dev)) {
45026 -               spin_lock_bh(&dev->xmit_lock);
45027 +               netif_tx_lock_bh(dev);
45028                 spin_lock_irq(&np->lock);
45029  
45030                 /* stop rx engine */
45031 @@ -1815,7 +1815,7 @@ static int nv_set_mac_address(struct net
45032                 /* restart rx engine */
45033                 nv_start_rx(dev);
45034                 spin_unlock_irq(&np->lock);
45035 -               spin_unlock_bh(&dev->xmit_lock);
45036 +               netif_tx_unlock_bh(dev);
45037         } else {
45038                 nv_copy_mac_to_hw(dev);
45039         }
45040 @@ -1824,7 +1824,7 @@ static int nv_set_mac_address(struct net
45041  
45042  /*
45043   * nv_set_multicast: dev->set_multicast function
45044 - * Called with dev->xmit_lock held.
45045 + * Called with netif_tx_lock held.
45046   */
45047  static void nv_set_multicast(struct net_device *dev)
45048  {
45049 diff -urNp linux-2.6/drivers/net/hamradio/6pack.c new/drivers/net/hamradio/6pack.c
45050 --- linux-2.6/drivers/net/hamradio/6pack.c      2006-07-03 14:14:46.000000000 +0200
45051 +++ new/drivers/net/hamradio/6pack.c    2006-07-07 15:10:03.000000000 +0200
45052 @@ -308,9 +308,9 @@ static int sp_set_mac_address(struct net
45053  {
45054         struct sockaddr_ax25 *sa = addr;
45055  
45056 -       spin_lock_irq(&dev->xmit_lock);
45057 +       netif_tx_lock_bh(dev);
45058         memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN);
45059 -       spin_unlock_irq(&dev->xmit_lock);
45060 +       netif_tx_unlock_bh(dev);
45061  
45062         return 0;
45063  }
45064 @@ -767,9 +767,9 @@ static int sixpack_ioctl(struct tty_stru
45065                         break;
45066                 }
45067  
45068 -               spin_lock_irq(&dev->xmit_lock);
45069 +               netif_tx_lock_bh(dev);
45070                 memcpy(dev->dev_addr, &addr, AX25_ADDR_LEN);
45071 -               spin_unlock_irq(&dev->xmit_lock);
45072 +               netif_tx_unlock_bh(dev);
45073  
45074                 err = 0;
45075                 break;
45076 diff -urNp linux-2.6/drivers/net/hamradio/mkiss.c new/drivers/net/hamradio/mkiss.c
45077 --- linux-2.6/drivers/net/hamradio/mkiss.c      2006-07-03 14:14:46.000000000 +0200
45078 +++ new/drivers/net/hamradio/mkiss.c    2006-07-07 15:57:37.000000000 +0200
45079 @@ -357,9 +357,9 @@ static int ax_set_mac_address(struct net
45080  {
45081         struct sockaddr_ax25 *sa = addr;
45082  
45083 -       spin_lock_irq(&dev->xmit_lock);
45084 +       netif_tx_lock_bh(dev);
45085         memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN);
45086 -       spin_unlock_irq(&dev->xmit_lock);
45087 +       netif_tx_unlock_bh(dev);
45088  
45089         return 0;
45090  }
45091 @@ -886,9 +886,9 @@ static int mkiss_ioctl(struct tty_struct
45092                         break;
45093                 }
45094  
45095 -               spin_lock_irq(&dev->xmit_lock);
45096 +               netif_tx_lock_bh(dev);
45097                 memcpy(dev->dev_addr, addr, AX25_ADDR_LEN);
45098 -               spin_unlock_irq(&dev->xmit_lock);
45099 +               netif_tx_unlock_bh(dev);
45100  
45101                 err = 0;
45102                 break;
45103 diff -urNp linux-2.6/drivers/net/ifb.c new/drivers/net/ifb.c
45104 --- linux-2.6/drivers/net/ifb.c 2006-07-03 14:14:46.000000000 +0200
45105 +++ new/drivers/net/ifb.c       2006-07-07 15:10:03.000000000 +0200
45106 @@ -76,13 +76,13 @@ static void ri_tasklet(unsigned long dev
45107         dp->st_task_enter++;
45108         if ((skb = skb_peek(&dp->tq)) == NULL) {
45109                 dp->st_txq_refl_try++;
45110 -               if (spin_trylock(&_dev->xmit_lock)) {
45111 +               if (netif_tx_trylock(_dev)) {
45112                         dp->st_rxq_enter++;
45113                         while ((skb = skb_dequeue(&dp->rq)) != NULL) {
45114                                 skb_queue_tail(&dp->tq, skb);
45115                                 dp->st_rx2tx_tran++;
45116                         }
45117 -                       spin_unlock(&_dev->xmit_lock);
45118 +                       netif_tx_unlock(_dev);
45119                 } else {
45120                         /* reschedule */
45121                         dp->st_rxq_notenter++;
45122 @@ -110,7 +110,7 @@ static void ri_tasklet(unsigned long dev
45123                 }
45124         }
45125  
45126 -       if (spin_trylock(&_dev->xmit_lock)) {
45127 +       if (netif_tx_trylock(_dev)) {
45128                 dp->st_rxq_check++;
45129                 if ((skb = skb_peek(&dp->rq)) == NULL) {
45130                         dp->tasklet_pending = 0;
45131 @@ -118,10 +118,10 @@ static void ri_tasklet(unsigned long dev
45132                                 netif_wake_queue(_dev);
45133                 } else {
45134                         dp->st_rxq_rsch++;
45135 -                       spin_unlock(&_dev->xmit_lock);
45136 +                       netif_tx_unlock(_dev);
45137                         goto resched;
45138                 }
45139 -               spin_unlock(&_dev->xmit_lock);
45140 +               netif_tx_unlock(_dev);
45141         } else {
45142  resched:
45143                 dp->tasklet_pending = 1;
45144 diff -urNp linux-2.6/drivers/net/irda/vlsi_ir.c new/drivers/net/irda/vlsi_ir.c
45145 --- linux-2.6/drivers/net/irda/vlsi_ir.c        2006-07-03 14:14:46.000000000 +0200
45146 +++ new/drivers/net/irda/vlsi_ir.c      2006-07-07 15:57:37.000000000 +0200
45147 @@ -959,7 +959,7 @@ static int vlsi_hard_start_xmit(struct s
45148                             ||  (now.tv_sec==ready.tv_sec && now.tv_usec>=ready.tv_usec))
45149                                 break;
45150                         udelay(100);
45151 -                       /* must not sleep here - we are called under xmit_lock! */
45152 +                       /* must not sleep here - called under netif_tx_lock! */
45153                 }
45154         }
45155  
45156 diff -urNp linux-2.6/drivers/net/ixgb/ixgb_main.c new/drivers/net/ixgb/ixgb_main.c
45157 --- linux-2.6/drivers/net/ixgb/ixgb_main.c      2006-07-03 14:14:46.000000000 +0200
45158 +++ new/drivers/net/ixgb/ixgb_main.c    2006-07-07 15:57:38.000000000 +0200
45159 @@ -1168,7 +1168,7 @@ ixgb_tso(struct ixgb_adapter *adapter, s
45160         uint16_t ipcse, tucse, mss;
45161         int err;
45162  
45163 -       if(likely(skb_shinfo(skb)->tso_size)) {
45164 +       if(likely(skb_shinfo(skb)->gso_size)) {
45165                 if (skb_header_cloned(skb)) {
45166                         err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
45167                         if (err)
45168 @@ -1176,7 +1176,7 @@ ixgb_tso(struct ixgb_adapter *adapter, s
45169                 }
45170  
45171                 hdr_len = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
45172 -               mss = skb_shinfo(skb)->tso_size;
45173 +               mss = skb_shinfo(skb)->gso_size;
45174                 skb->nh.iph->tot_len = 0;
45175                 skb->nh.iph->check = 0;
45176                 skb->h.th->check = ~csum_tcpudp_magic(skb->nh.iph->saddr,
45177 diff -urNp linux-2.6/drivers/net/loopback.c new/drivers/net/loopback.c
45178 --- linux-2.6/drivers/net/loopback.c    2006-07-03 14:14:46.000000000 +0200
45179 +++ new/drivers/net/loopback.c  2006-07-07 15:57:38.000000000 +0200
45180 @@ -74,7 +74,7 @@ static void emulate_large_send_offload(s
45181         struct iphdr *iph = skb->nh.iph;
45182         struct tcphdr *th = (struct tcphdr*)(skb->nh.raw + (iph->ihl * 4));
45183         unsigned int doffset = (iph->ihl + th->doff) * 4;
45184 -       unsigned int mtu = skb_shinfo(skb)->tso_size + doffset;
45185 +       unsigned int mtu = skb_shinfo(skb)->gso_size + doffset;
45186         unsigned int offset = 0;
45187         u32 seq = ntohl(th->seq);
45188         u16 id  = ntohs(iph->id);
45189 @@ -139,7 +139,7 @@ static int loopback_xmit(struct sk_buff 
45190  #endif
45191  
45192  #ifdef LOOPBACK_TSO
45193 -       if (skb_shinfo(skb)->tso_size) {
45194 +       if (skb_shinfo(skb)->gso_size) {
45195                 BUG_ON(skb->protocol != htons(ETH_P_IP));
45196                 BUG_ON(skb->nh.iph->protocol != IPPROTO_TCP);
45197  
45198 diff -urNp linux-2.6/drivers/net/mv643xx_eth.c new/drivers/net/mv643xx_eth.c
45199 --- linux-2.6/drivers/net/mv643xx_eth.c 2006-07-03 14:14:46.000000000 +0200
45200 +++ new/drivers/net/mv643xx_eth.c       2006-07-07 15:57:38.000000000 +0200
45201 @@ -1200,7 +1200,7 @@ static int mv643xx_eth_start_xmit(struct
45202         }
45203  
45204         if (has_tiny_unaligned_frags(skb)) {
45205 -               if ((skb_linearize(skb, GFP_ATOMIC) != 0)) {
45206 +               if (__skb_linearize(skb)) {
45207                         stats->tx_dropped++;
45208                         printk(KERN_DEBUG "%s: failed to linearize tiny "
45209                                         "unaligned fragment\n", dev->name);
45210 diff -urNp linux-2.6/drivers/net/natsemi.c new/drivers/net/natsemi.c
45211 --- linux-2.6/drivers/net/natsemi.c     2006-07-03 14:14:46.000000000 +0200
45212 +++ new/drivers/net/natsemi.c   2006-07-07 15:57:39.000000000 +0200
45213 @@ -318,12 +318,12 @@ performance critical codepaths:
45214  The rx process only runs in the interrupt handler. Access from outside
45215  the interrupt handler is only permitted after disable_irq().
45216  
45217 -The rx process usually runs under the dev->xmit_lock. If np->intr_tx_reap
45218 +The rx process usually runs under the netif_tx_lock. If np->intr_tx_reap
45219  is set, then access is permitted under spin_lock_irq(&np->lock).
45220  
45221  Thus configuration functions that want to access everything must call
45222         disable_irq(dev->irq);
45223 -       spin_lock_bh(dev->xmit_lock);
45224 +       netif_tx_lock_bh(dev);
45225         spin_lock_irq(&np->lock);
45226  
45227  IV. Notes
45228 diff -urNp linux-2.6/drivers/net/r8169.c new/drivers/net/r8169.c
45229 --- linux-2.6/drivers/net/r8169.c       2006-07-03 14:14:47.000000000 +0200
45230 +++ new/drivers/net/r8169.c     2006-07-07 15:57:39.000000000 +0200
45231 @@ -2171,7 +2171,7 @@ static int rtl8169_xmit_frags(struct rtl
45232  static inline u32 rtl8169_tso_csum(struct sk_buff *skb, struct net_device *dev)
45233  {
45234         if (dev->features & NETIF_F_TSO) {
45235 -               u32 mss = skb_shinfo(skb)->tso_size;
45236 +               u32 mss = skb_shinfo(skb)->gso_size;
45237  
45238                 if (mss)
45239                         return LargeSend | ((mss & MSSMask) << MSSShift);
45240 diff -urNp linux-2.6/drivers/net/s2io.c new/drivers/net/s2io.c
45241 --- linux-2.6/drivers/net/s2io.c        2006-07-03 14:14:47.000000000 +0200
45242 +++ new/drivers/net/s2io.c      2006-07-07 15:57:39.000000000 +0200
45243 @@ -3564,8 +3564,8 @@ static int s2io_xmit(struct sk_buff *skb
45244         txdp->Control_1 = 0;
45245         txdp->Control_2 = 0;
45246  #ifdef NETIF_F_TSO
45247 -       mss = skb_shinfo(skb)->tso_size;
45248 -       if (mss) {
45249 +       mss = skb_shinfo(skb)->gso_size;
45250 +       if (skb_shinfo(skb)->gso_type == SKB_GSO_TCPV4) {
45251                 txdp->Control_1 |= TXD_TCP_LSO_EN;
45252                 txdp->Control_1 |= TXD_TCP_LSO_MSS(mss);
45253         }
45254 @@ -3585,10 +3585,10 @@ static int s2io_xmit(struct sk_buff *skb
45255         }
45256  
45257         frg_len = skb->len - skb->data_len;
45258 -       if (skb_shinfo(skb)->ufo_size) {
45259 +       if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4) {
45260                 int ufo_size;
45261  
45262 -               ufo_size = skb_shinfo(skb)->ufo_size;
45263 +               ufo_size = skb_shinfo(skb)->gso_size;
45264                 ufo_size &= ~7;
45265                 txdp->Control_1 |= TXD_UFO_EN;
45266                 txdp->Control_1 |= TXD_UFO_MSS(ufo_size);
45267 @@ -3614,7 +3614,7 @@ static int s2io_xmit(struct sk_buff *skb
45268         txdp->Host_Control = (unsigned long) skb;
45269         txdp->Control_1 |= TXD_BUFFER0_SIZE(frg_len);
45270  
45271 -       if (skb_shinfo(skb)->ufo_size)
45272 +       if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
45273                 txdp->Control_1 |= TXD_UFO_EN;
45274  
45275         frg_cnt = skb_shinfo(skb)->nr_frags;
45276 @@ -3629,12 +3629,12 @@ static int s2io_xmit(struct sk_buff *skb
45277                     (sp->pdev, frag->page, frag->page_offset,
45278                      frag->size, PCI_DMA_TODEVICE);
45279                 txdp->Control_1 = TXD_BUFFER0_SIZE(frag->size);
45280 -               if (skb_shinfo(skb)->ufo_size)
45281 +               if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
45282                         txdp->Control_1 |= TXD_UFO_EN;
45283         }
45284         txdp->Control_1 |= TXD_GATHER_CODE_LAST;
45285  
45286 -       if (skb_shinfo(skb)->ufo_size)
45287 +       if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
45288                 frg_cnt++; /* as Txd0 was used for inband header */
45289  
45290         tx_fifo = mac_control->tx_FIFO_start[queue];
45291 @@ -3648,7 +3648,7 @@ static int s2io_xmit(struct sk_buff *skb
45292         if (mss)
45293                 val64 |= TX_FIFO_SPECIAL_FUNC;
45294  #endif
45295 -       if (skb_shinfo(skb)->ufo_size)
45296 +       if (skb_shinfo(skb)->gso_type == SKB_GSO_UDPV4)
45297                 val64 |= TX_FIFO_SPECIAL_FUNC;
45298         writeq(val64, &tx_fifo->List_Control);
45299  
45300 diff -urNp linux-2.6/drivers/net/sky2.c new/drivers/net/sky2.c
45301 --- linux-2.6/drivers/net/sky2.c        2006-07-03 14:14:47.000000000 +0200
45302 +++ new/drivers/net/sky2.c      2006-07-07 15:57:39.000000000 +0200
45303 @@ -1160,7 +1160,7 @@ static unsigned tx_le_req(const struct s
45304         count = sizeof(dma_addr_t) / sizeof(u32);
45305         count += skb_shinfo(skb)->nr_frags * count;
45306  
45307 -       if (skb_shinfo(skb)->tso_size)
45308 +       if (skb_shinfo(skb)->gso_size)
45309                 ++count;
45310  
45311         if (skb->ip_summed == CHECKSUM_HW)
45312 @@ -1232,7 +1232,7 @@ static int sky2_xmit_frame(struct sk_buf
45313         }
45314  
45315         /* Check for TCP Segmentation Offload */
45316 -       mss = skb_shinfo(skb)->tso_size;
45317 +       mss = skb_shinfo(skb)->gso_size;
45318         if (mss != 0) {
45319                 /* just drop the packet if non-linear expansion fails */
45320                 if (skb_header_cloned(skb) &&
45321 diff -urNp linux-2.6/drivers/net/tg3.c new/drivers/net/tg3.c
45322 --- linux-2.6/drivers/net/tg3.c 2006-07-03 14:14:47.000000000 +0200
45323 +++ new/drivers/net/tg3.c       2006-07-07 16:41:44.000000000 +0200
45324 @@ -3743,7 +3743,7 @@ static int tg3_start_xmit(struct sk_buff
45325  #if TG3_TSO_SUPPORT != 0
45326         mss = 0;
45327         if (skb->len > (tp->dev->mtu + ETH_HLEN) &&
45328 -           (mss = skb_shinfo(skb)->tso_size) != 0) {
45329 +           (mss = skb_shinfo(skb)->gso_size) != 0) {
45330                 int tcp_opt_len, ip_tcp_len;
45331  
45332                 if (skb_header_cloned(skb) &&
45333 @@ -3871,7 +3871,7 @@ static int tg3_start_xmit_dma_bug(struct
45334  #if TG3_TSO_SUPPORT != 0
45335         mss = 0;
45336         if (skb->len > (tp->dev->mtu + ETH_HLEN) &&
45337 -           (mss = skb_shinfo(skb)->tso_size) != 0) {
45338 +           (mss = skb_shinfo(skb)->gso_size) != 0) {
45339                 int tcp_opt_len, ip_tcp_len;
45340  
45341                 if (skb_header_cloned(skb) &&
45342 diff -urNp linux-2.6/drivers/net/tulip/winbond-840.c new/drivers/net/tulip/winbond-840.c
45343 --- linux-2.6/drivers/net/tulip/winbond-840.c   2006-07-03 14:14:48.000000000 +0200
45344 +++ new/drivers/net/tulip/winbond-840.c 2006-07-07 15:57:40.000000000 +0200
45345 @@ -1605,11 +1605,11 @@ static void __devexit w840_remove1 (stru
45346   * - get_stats:
45347   *     spin_lock_irq(np->lock), doesn't touch hw if not present
45348   * - hard_start_xmit:
45349 - *     netif_stop_queue + spin_unlock_wait(&dev->xmit_lock);
45350 + *     synchronize_irq + netif_tx_disable;
45351   * - tx_timeout:
45352 - *     netif_device_detach + spin_unlock_wait(&dev->xmit_lock);
45353 + *     netif_device_detach + netif_tx_disable;
45354   * - set_multicast_list
45355 - *     netif_device_detach + spin_unlock_wait(&dev->xmit_lock);
45356 + *     netif_device_detach + netif_tx_disable;
45357   * - interrupt handler
45358   *     doesn't touch hw if not present, synchronize_irq waits for
45359   *     running instances of the interrupt handler.
45360 @@ -1635,11 +1635,10 @@ static int w840_suspend (struct pci_dev 
45361                 netif_device_detach(dev);
45362                 update_csr6(dev, 0);
45363                 iowrite32(0, ioaddr + IntrEnable);
45364 -               netif_stop_queue(dev);
45365                 spin_unlock_irq(&np->lock);
45366  
45367 -               spin_unlock_wait(&dev->xmit_lock);
45368                 synchronize_irq(dev->irq);
45369 +               netif_tx_disable(dev);
45370         
45371                 np->stats.rx_missed_errors += ioread32(ioaddr + RxMissed) & 0xffff;
45372  
45373 diff -urNp linux-2.6/drivers/net/typhoon.c new/drivers/net/typhoon.c
45374 --- linux-2.6/drivers/net/typhoon.c     2006-07-03 14:14:48.000000000 +0200
45375 +++ new/drivers/net/typhoon.c   2006-07-07 15:57:40.000000000 +0200
45376 @@ -340,7 +340,7 @@ enum state_values {
45377  #endif
45378  
45379  #if defined(NETIF_F_TSO)
45380 -#define skb_tso_size(x)                (skb_shinfo(x)->tso_size)
45381 +#define skb_tso_size(x)                (skb_shinfo(x)->gso_size)
45382  #define TSO_NUM_DESCRIPTORS    2
45383  #define TSO_OFFLOAD_ON         TYPHOON_OFFLOAD_TCP_SEGMENT
45384  #else
45385 diff -urNp linux-2.6/drivers/net/via-velocity.c new/drivers/net/via-velocity.c
45386 --- linux-2.6/drivers/net/via-velocity.c        2006-07-03 14:14:48.000000000 +0200
45387 +++ new/drivers/net/via-velocity.c      2006-07-07 15:10:03.000000000 +0200
45388 @@ -1899,6 +1899,13 @@ static int velocity_xmit(struct sk_buff 
45389  
45390         int pktlen = skb->len;
45391  
45392 +#ifdef VELOCITY_ZERO_COPY_SUPPORT
45393 +       if (skb_shinfo(skb)->nr_frags > 6 && __skb_linearize(skb)) {
45394 +               kfree_skb(skb);
45395 +               return 0;
45396 +       }
45397 +#endif
45398 +
45399         spin_lock_irqsave(&vptr->lock, flags);
45400  
45401         index = vptr->td_curr[qnum];
45402 @@ -1914,8 +1921,6 @@ static int velocity_xmit(struct sk_buff 
45403          */
45404         if (pktlen < ETH_ZLEN) {
45405                 /* Cannot occur until ZC support */
45406 -               if(skb_linearize(skb, GFP_ATOMIC))
45407 -                       return 0; 
45408                 pktlen = ETH_ZLEN;
45409                 memcpy(tdinfo->buf, skb->data, skb->len);
45410                 memset(tdinfo->buf + skb->len, 0, ETH_ZLEN - skb->len);
45411 @@ -1933,7 +1938,6 @@ static int velocity_xmit(struct sk_buff 
45412                 int nfrags = skb_shinfo(skb)->nr_frags;
45413                 tdinfo->skb = skb;
45414                 if (nfrags > 6) {
45415 -                       skb_linearize(skb, GFP_ATOMIC);
45416                         memcpy(tdinfo->buf, skb->data, skb->len);
45417                         tdinfo->skb_dma[0] = tdinfo->buf_dma;
45418                         td_ptr->tdesc0.pktsize = 
45419 diff -urNp linux-2.6/drivers/net/wireless/orinoco.c new/drivers/net/wireless/orinoco.c
45420 --- linux-2.6/drivers/net/wireless/orinoco.c    2006-07-03 14:14:49.000000000 +0200
45421 +++ new/drivers/net/wireless/orinoco.c  2006-07-07 15:57:40.000000000 +0200
45422 @@ -1833,7 +1833,9 @@ static int __orinoco_program_rids(struct
45423         /* Set promiscuity / multicast*/
45424         priv->promiscuous = 0;
45425         priv->mc_count = 0;
45426 -       __orinoco_set_multicast_list(dev); /* FIXME: what about the xmit_lock */
45427 +
45428 +       /* FIXME: what about netif_tx_lock */
45429 +       __orinoco_set_multicast_list(dev);
45430  
45431         return 0;
45432  }
45433 diff -urNp linux-2.6/drivers/oprofile/buffer_sync.c new/drivers/oprofile/buffer_sync.c
45434 --- linux-2.6/drivers/oprofile/buffer_sync.c    2006-07-03 14:14:50.000000000 +0200
45435 +++ new/drivers/oprofile/buffer_sync.c  2006-07-07 15:58:36.000000000 +0200
45436 @@ -6,6 +6,10 @@
45437   *
45438   * @author John Levon <levon@movementarian.org>
45439   *
45440 + * Modified by Aravind Menon for Xen
45441 + * These modifications are:
45442 + * Copyright (C) 2005 Hewlett-Packard Co.
45443 + *
45444   * This is the core of the buffer management. Each
45445   * CPU buffer is processed and entered into the
45446   * global event buffer. Such processing is necessary
45447 @@ -275,15 +279,30 @@ static void add_cpu_switch(int i)
45448         last_cookie = INVALID_COOKIE;
45449  }
45450  
45451 -static void add_kernel_ctx_switch(unsigned int in_kernel)
45452 +static void add_cpu_mode_switch(unsigned int cpu_mode)
45453  {
45454         add_event_entry(ESCAPE_CODE);
45455 -       if (in_kernel)
45456 -               add_event_entry(KERNEL_ENTER_SWITCH_CODE); 
45457 -       else
45458 -               add_event_entry(KERNEL_EXIT_SWITCH_CODE); 
45459 +       switch (cpu_mode) {
45460 +       case CPU_MODE_USER:
45461 +               add_event_entry(USER_ENTER_SWITCH_CODE);
45462 +               break;
45463 +       case CPU_MODE_KERNEL:
45464 +               add_event_entry(KERNEL_ENTER_SWITCH_CODE);
45465 +               break;
45466 +       case CPU_MODE_XEN:
45467 +               add_event_entry(XEN_ENTER_SWITCH_CODE);
45468 +               break;
45469 +        case CPU_MODE_PASSIVE_START:
45470 +                add_event_entry(PASSIVE_START_CODE);
45471 +                break;
45472 +        case CPU_MODE_PASSIVE_STOP:
45473 +                add_event_entry(PASSIVE_STOP_CODE);
45474 +                break;
45475 +       default:
45476 +               break;
45477 +       }
45478  }
45479
45480 +
45481  static void
45482  add_user_ctx_switch(struct task_struct const * task, unsigned long cookie)
45483  {
45484 @@ -348,9 +367,9 @@ static int add_us_sample(struct mm_struc
45485   * for later lookup from userspace.
45486   */
45487  static int
45488 -add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel)
45489 +add_sample(struct mm_struct * mm, struct op_sample * s, int cpu_mode)
45490  {
45491 -       if (in_kernel) {
45492 +       if (cpu_mode >= CPU_MODE_KERNEL) {
45493                 add_sample_entry(s->eip, s->event);
45494                 return 1;
45495         } else if (mm) {
45496 @@ -496,10 +515,11 @@ void sync_buffer(int cpu)
45497         struct mm_struct *mm = NULL;
45498         struct task_struct * new;
45499         unsigned long cookie = 0;
45500 -       int in_kernel = 1;
45501 +       int cpu_mode = 1;
45502         unsigned int i;
45503         sync_buffer_state state = sb_buffer_start;
45504         unsigned long available;
45505 +       int domain_switch = NO_DOMAIN_SWITCH;
45506  
45507         down(&buffer_sem);
45508   
45509 @@ -513,12 +533,19 @@ void sync_buffer(int cpu)
45510                 struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos];
45511   
45512                 if (is_code(s->eip)) {
45513 -                       if (s->event <= CPU_IS_KERNEL) {
45514 +                       if (s->event < CPU_TRACE_BEGIN) {
45515                                 /* kernel/userspace switch */
45516 -                               in_kernel = s->event;
45517 +                               cpu_mode = s->event;
45518                                 if (state == sb_buffer_start)
45519                                         state = sb_sample_start;
45520 -                               add_kernel_ctx_switch(s->event);
45521 +
45522 +                               if (s->event == CPU_MODE_PASSIVE_START)
45523 +                                       domain_switch = DOMAIN_SWITCH_START_EVENT1;
45524 +                               else if (s->event == CPU_MODE_PASSIVE_STOP)
45525 +                                       domain_switch = DOMAIN_SWITCH_STOP_EVENT1;
45526 +
45527 +                               if (domain_switch != DOMAIN_SWITCH_START_EVENT2)
45528 +                                       add_cpu_mode_switch(s->event);
45529                         } else if (s->event == CPU_TRACE_BEGIN) {
45530                                 state = sb_bt_start;
45531                                 add_trace_begin();
45532 @@ -535,11 +562,20 @@ void sync_buffer(int cpu)
45533                                 add_user_ctx_switch(new, cookie);
45534                         }
45535                 } else {
45536 -                       if (state >= sb_bt_start &&
45537 -                           !add_sample(mm, s, in_kernel)) {
45538 -                               if (state == sb_bt_start) {
45539 -                                       state = sb_bt_ignore;
45540 -                                       atomic_inc(&oprofile_stats.bt_lost_no_mapping);
45541 +                       if (domain_switch == DOMAIN_SWITCH_START_EVENT1) {
45542 +                               add_event_entry(s->event);
45543 +                               domain_switch = DOMAIN_SWITCH_START_EVENT2;
45544 +                       } else if (domain_switch == DOMAIN_SWITCH_START_EVENT2) {
45545 +                               add_sample_entry(s->eip, s->event);
45546 +                       } else if (domain_switch == DOMAIN_SWITCH_STOP_EVENT1) {
45547 +                               domain_switch = NO_DOMAIN_SWITCH;
45548 +                       } else {
45549 +                               if (state >= sb_bt_start &&
45550 +                                   !add_sample(mm, s, cpu_mode)) {
45551 +                                       if (state == sb_bt_start) {
45552 +                                               state = sb_bt_ignore;
45553 +                                               atomic_inc(&oprofile_stats.bt_lost_no_mapping);
45554 +                                       }
45555                                 }
45556                         }
45557                 }
45558 diff -urNp linux-2.6/drivers/oprofile/buffer_sync.h new/drivers/oprofile/buffer_sync.h
45559 --- linux-2.6/drivers/oprofile/buffer_sync.h    2006-07-03 14:14:50.000000000 +0200
45560 +++ new/drivers/oprofile/buffer_sync.h  2006-07-07 15:10:03.000000000 +0200
45561 @@ -9,6 +9,11 @@
45562  
45563  #ifndef OPROFILE_BUFFER_SYNC_H
45564  #define OPROFILE_BUFFER_SYNC_H
45565 +
45566 +#define NO_DOMAIN_SWITCH               -1
45567 +#define DOMAIN_SWITCH_START_EVENT1     0
45568 +#define DOMAIN_SWITCH_START_EVENT2     1
45569 +#define DOMAIN_SWITCH_STOP_EVENT1      2
45570   
45571  /* add the necessary profiling hooks */
45572  int sync_start(void);
45573 diff -urNp linux-2.6/drivers/oprofile/cpu_buffer.c new/drivers/oprofile/cpu_buffer.c
45574 --- linux-2.6/drivers/oprofile/cpu_buffer.c     2006-07-03 14:14:50.000000000 +0200
45575 +++ new/drivers/oprofile/cpu_buffer.c   2006-07-07 16:00:19.000000000 +0200
45576 @@ -6,6 +6,10 @@
45577   *
45578   * @author John Levon <levon@movementarian.org>
45579   *
45580 + * Modified by Aravind Menon for Xen
45581 + * These modifications are:
45582 + * Copyright (C) 2005 Hewlett-Packard Co.
45583 + *
45584   * Each CPU has a local buffer that stores PC value/event
45585   * pairs. We also log context switches when we notice them.
45586   * Eventually each CPU's buffer is processed into the global
45587 @@ -57,7 +61,7 @@ int alloc_cpu_buffers(void)
45588                         goto fail;
45589   
45590                 b->last_task = NULL;
45591 -               b->last_is_kernel = -1;
45592 +               b->last_cpu_mode = -1;
45593                 b->tracing = 0;
45594                 b->buffer_size = buffer_size;
45595                 b->tail_pos = 0;
45596 @@ -113,7 +117,7 @@ void cpu_buffer_reset(struct oprofile_cp
45597          * collected will populate the buffer with proper
45598          * values to initialize the buffer
45599          */
45600 -       cpu_buf->last_is_kernel = -1;
45601 +       cpu_buf->last_cpu_mode = -1;
45602         cpu_buf->last_task = NULL;
45603  }
45604  
45605 @@ -163,13 +167,13 @@ add_code(struct oprofile_cpu_buffer * bu
45606   * because of the head/tail separation of the writer and reader
45607   * of the CPU buffer.
45608   *
45609 - * is_kernel is needed because on some architectures you cannot
45610 + * cpu_mode is needed because on some architectures you cannot
45611   * tell if you are in kernel or user space simply by looking at
45612 - * pc. We tag this in the buffer by generating kernel enter/exit
45613 - * events whenever is_kernel changes
45614 + * pc. We tag this in the buffer by generating kernel/user (and xen)
45615 + *  enter events whenever cpu_mode changes
45616   */
45617  static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc,
45618 -                     int is_kernel, unsigned long event)
45619 +                     int cpu_mode, unsigned long event)
45620  {
45621         struct task_struct * task;
45622  
45623 @@ -180,16 +184,16 @@ static int log_sample(struct oprofile_cp
45624                 return 0;
45625         }
45626  
45627 -       is_kernel = !!is_kernel;
45628 +       WARN_ON(cpu_mode > CPU_MODE_XEN);
45629  
45630         task = current;
45631  
45632         /* notice a switch from user->kernel or vice versa */
45633 -       if (cpu_buf->last_is_kernel != is_kernel) {
45634 -               cpu_buf->last_is_kernel = is_kernel;
45635 -               add_code(cpu_buf, is_kernel);
45636 +       if (cpu_buf->last_cpu_mode != cpu_mode) {
45637 +               cpu_buf->last_cpu_mode = cpu_mode;
45638 +               add_code(cpu_buf, cpu_mode);
45639         }
45640 -
45641 +       
45642         /* notice a task switch */
45643         if (cpu_buf->last_task != task) {
45644                 cpu_buf->last_task = task;
45645 diff -urNp linux-2.6/drivers/oprofile/cpu_buffer.h new/drivers/oprofile/cpu_buffer.h
45646 --- linux-2.6/drivers/oprofile/cpu_buffer.h     2006-07-03 14:14:50.000000000 +0200
45647 +++ new/drivers/oprofile/cpu_buffer.h   2006-07-07 16:00:46.000000000 +0200
45648 @@ -36,7 +36,7 @@ struct oprofile_cpu_buffer {
45649         volatile unsigned long tail_pos;
45650         unsigned long buffer_size;
45651         struct task_struct * last_task;
45652 -       int last_is_kernel;
45653 +       int last_cpu_mode;
45654         int tracing;
45655         struct op_sample * buffer;
45656         unsigned long sample_received;
45657 @@ -51,7 +51,13 @@ extern struct oprofile_cpu_buffer cpu_bu
45658  void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf);
45659  
45660  /* transient events for the CPU buffer -> event buffer */
45661 -#define CPU_IS_KERNEL 1
45662 -#define CPU_TRACE_BEGIN 2
45663 +#define CPU_MODE_USER           0
45664 +#define CPU_MODE_KERNEL         1
45665 +#define CPU_MODE_XEN            2
45666 +#define CPU_MODE_PASSIVE_START  3
45667 +#define CPU_MODE_PASSIVE_STOP   4
45668 +#define CPU_TRACE_BEGIN         5
45669 +
45670 +#define IGNORED_PC              0
45671  
45672  #endif /* OPROFILE_CPU_BUFFER_H */
45673 diff -urNp linux-2.6/drivers/oprofile/event_buffer.h new/drivers/oprofile/event_buffer.h
45674 --- linux-2.6/drivers/oprofile/event_buffer.h   2006-07-03 14:14:50.000000000 +0200
45675 +++ new/drivers/oprofile/event_buffer.h 2006-07-07 16:01:13.000000000 +0200
45676 @@ -29,11 +29,14 @@ void wake_up_buffer_waiter(void);
45677  #define CPU_SWITCH_CODE                2
45678  #define COOKIE_SWITCH_CODE             3
45679  #define KERNEL_ENTER_SWITCH_CODE       4
45680 -#define KERNEL_EXIT_SWITCH_CODE                5
45681 +#define USER_ENTER_SWITCH_CODE         5
45682  #define MODULE_LOADED_CODE             6
45683  #define CTX_TGID_CODE                  7
45684  #define TRACE_BEGIN_CODE               8
45685  #define TRACE_END_CODE                 9
45686 +#define XEN_ENTER_SWITCH_CODE          10
45687 +#define PASSIVE_START_CODE             11
45688 +#define PASSIVE_STOP_CODE              12
45689   
45690  #define INVALID_COOKIE ~0UL
45691  #define NO_COOKIE 0UL
45692 diff -urNp linux-2.6/drivers/oprofile/oprof.c new/drivers/oprofile/oprof.c
45693 --- linux-2.6/drivers/oprofile/oprof.c  2006-07-03 14:14:50.000000000 +0200
45694 +++ new/drivers/oprofile/oprof.c        2006-07-07 16:01:51.000000000 +0200
45695 @@ -5,6 +5,10 @@
45696   * @remark Read the file COPYING
45697   *
45698   * @author John Levon <levon@movementarian.org>
45699 + *
45700 + * Modified by Aravind Menon for Xen
45701 + * These modifications are:
45702 + * Copyright (C) 2005 Hewlett-Packard Co.
45703   */
45704  
45705  #include <linux/kernel.h>
45706 @@ -19,7 +23,7 @@
45707  #include "cpu_buffer.h"
45708  #include "buffer_sync.h"
45709  #include "oprofile_stats.h"
45710
45711 +
45712  struct oprofile_operations oprofile_ops;
45713  
45714  unsigned long oprofile_started;
45715 @@ -33,6 +37,32 @@ static DECLARE_MUTEX(start_sem);
45716   */
45717  static int timer = 0;
45718  
45719 +int oprofile_set_active(int active_domains[], unsigned int adomains)
45720 +{
45721 +       int err;
45722 +
45723 +       if (!oprofile_ops.set_active)
45724 +               return -EINVAL;
45725 +
45726 +       down(&start_sem);
45727 +       err = oprofile_ops.set_active(active_domains, adomains);
45728 +       up(&start_sem);
45729 +       return err;
45730 +}
45731 +
45732 +int oprofile_set_passive(int passive_domains[], unsigned int pdomains)
45733 +{
45734 +       int err;
45735 +
45736 +       if (!oprofile_ops.set_passive)
45737 +               return -EINVAL;
45738 +
45739 +       down(&start_sem);
45740 +       err = oprofile_ops.set_passive(passive_domains, pdomains);
45741 +       up(&start_sem);
45742 +       return err;
45743 +}
45744 +
45745  int oprofile_setup(void)
45746  {
45747         int err;
45748 diff -urNp linux-2.6/drivers/oprofile/oprof.h new/drivers/oprofile/oprof.h
45749 --- linux-2.6/drivers/oprofile/oprof.h  2006-07-03 14:14:50.000000000 +0200
45750 +++ new/drivers/oprofile/oprof.h        2006-07-07 16:02:16.000000000 +0200
45751 @@ -35,5 +35,8 @@ void oprofile_create_files(struct super_
45752  void oprofile_timer_init(struct oprofile_operations * ops);
45753  
45754  int oprofile_set_backtrace(unsigned long depth);
45755 +
45756 +int oprofile_set_active(int active_domains[], unsigned int adomains);
45757 +int oprofile_set_passive(int passive_domains[], unsigned int pdomains);
45758   
45759  #endif /* OPROF_H */
45760 diff -urNp linux-2.6/drivers/oprofile/oprofile_files.c new/drivers/oprofile/oprofile_files.c
45761 --- linux-2.6/drivers/oprofile/oprofile_files.c 2006-07-03 14:14:50.000000000 +0200
45762 +++ new/drivers/oprofile/oprofile_files.c       2006-07-07 16:04:26.000000000 +0200
45763 @@ -5,15 +5,21 @@
45764   * @remark Read the file COPYING
45765   *
45766   * @author John Levon <levon@movementarian.org>
45767 + *
45768 + * Modified by Aravind Menon for Xen
45769 + * These modifications are:
45770 + * Copyright (C) 2005 Hewlett-Packard Co.      
45771   */
45772  
45773  #include <linux/fs.h>
45774  #include <linux/oprofile.h>
45775 +#include <asm/uaccess.h>
45776 +#include <linux/ctype.h>
45777  
45778  #include "event_buffer.h"
45779  #include "oprofile_stats.h"
45780  #include "oprof.h"
45781
45782 +
45783  unsigned long fs_buffer_size = 131072;
45784  unsigned long fs_cpu_buffer_size = 8192;
45785  unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */
45786 @@ -117,11 +123,206 @@ static ssize_t dump_write(struct file * 
45787  static struct file_operations dump_fops = {
45788         .write          = dump_write,
45789  };
45790
45791 +
45792 +#define TMPBUFSIZE 512
45793 +
45794 +#ifndef MAX_OPROF_DOMAINS
45795 +#define MAX_OPROF_DOMAINS   25 
45796 +#endif
45797 +
45798 +static unsigned int adomains = 0;
45799 +static int active_domains[MAX_OPROF_DOMAINS + 1];
45800 +static DEFINE_MUTEX(adom_mutex);
45801 +
45802 +static ssize_t adomain_write(struct file * file, char const __user * buf, 
45803 +                            size_t count, loff_t * offset)
45804 +{
45805 +       char *tmpbuf;
45806 +       char *startp, *endp;
45807 +       int i;
45808 +       unsigned long val;
45809 +       ssize_t retval = count;
45810 +       
45811 +       if (*offset)
45812 +               return -EINVAL; 
45813 +       if (count > TMPBUFSIZE - 1)
45814 +               return -EINVAL;
45815 +
45816 +       if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
45817 +               return -ENOMEM;
45818 +
45819 +       if (copy_from_user(tmpbuf, buf, count)) {
45820 +               kfree(tmpbuf);
45821 +               return -EFAULT;
45822 +       }
45823 +       tmpbuf[count] = 0;
45824 +
45825 +       mutex_lock(&adom_mutex);
45826 +
45827 +       startp = tmpbuf;
45828 +       /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
45829 +       for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
45830 +               val = simple_strtoul(startp, &endp, 0);
45831 +               if (endp == startp)
45832 +                       break;
45833 +               while (ispunct(*endp) || isspace(*endp))
45834 +                       endp++;
45835 +               active_domains[i] = val;
45836 +               if (active_domains[i] != val)
45837 +                       /* Overflow, force error below */
45838 +                       i = MAX_OPROF_DOMAINS + 1;
45839 +               startp = endp;
45840 +       }
45841 +       /* Force error on trailing junk */
45842 +       adomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
45843 +
45844 +       kfree(tmpbuf);
45845 +
45846 +       if (adomains > MAX_OPROF_DOMAINS
45847 +           || oprofile_set_active(active_domains, adomains)) {
45848 +               adomains = 0;
45849 +               retval = -EINVAL;
45850 +       }
45851 +
45852 +       mutex_unlock(&adom_mutex);
45853 +       return retval;
45854 +}
45855 +
45856 +static ssize_t adomain_read(struct file * file, char __user * buf, 
45857 +                           size_t count, loff_t * offset)
45858 +{
45859 +       char * tmpbuf;
45860 +       size_t len;
45861 +       int i;
45862 +       ssize_t retval;
45863 +
45864 +       if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
45865 +               return -ENOMEM;
45866 +
45867 +       mutex_lock(&adom_mutex);
45868 +
45869 +       len = 0;
45870 +       for (i = 0; i < adomains; i++)
45871 +               len += snprintf(tmpbuf + len,
45872 +                               len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
45873 +                               "%u ", active_domains[i]);
45874 +       WARN_ON(len > TMPBUFSIZE);
45875 +       if (len != 0 && len <= TMPBUFSIZE)
45876 +               tmpbuf[len-1] = '\n';
45877 +
45878 +       mutex_unlock(&adom_mutex);
45879 +
45880 +       retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
45881 +
45882 +       kfree(tmpbuf);
45883 +       return retval;
45884 +}
45885 +
45886 +
45887 +static struct file_operations active_domain_ops = {
45888 +       .read           = adomain_read,
45889 +       .write          = adomain_write,
45890 +};
45891 +
45892 +static unsigned int pdomains = 0;
45893 +static int passive_domains[MAX_OPROF_DOMAINS];
45894 +static DEFINE_MUTEX(pdom_mutex);
45895 +
45896 +static ssize_t pdomain_write(struct file * file, char const __user * buf, 
45897 +                            size_t count, loff_t * offset)
45898 +{
45899 +       char *tmpbuf;
45900 +       char *startp, *endp;
45901 +       int i;
45902 +       unsigned long val;
45903 +       ssize_t retval = count;
45904 +       
45905 +       if (*offset)
45906 +               return -EINVAL; 
45907 +       if (count > TMPBUFSIZE - 1)
45908 +               return -EINVAL;
45909 +
45910 +       if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
45911 +               return -ENOMEM;
45912 +
45913 +       if (copy_from_user(tmpbuf, buf, count)) {
45914 +               kfree(tmpbuf);
45915 +               return -EFAULT;
45916 +       }
45917 +       tmpbuf[count] = 0;
45918 +
45919 +       mutex_lock(&pdom_mutex);
45920 +
45921 +       startp = tmpbuf;
45922 +       /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
45923 +       for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
45924 +               val = simple_strtoul(startp, &endp, 0);
45925 +               if (endp == startp)
45926 +                       break;
45927 +               while (ispunct(*endp) || isspace(*endp))
45928 +                       endp++;
45929 +               passive_domains[i] = val;
45930 +               if (passive_domains[i] != val)
45931 +                       /* Overflow, force error below */
45932 +                       i = MAX_OPROF_DOMAINS + 1;
45933 +               startp = endp;
45934 +       }
45935 +       /* Force error on trailing junk */
45936 +       pdomains = *startp ? MAX_OPROF_DOMAINS + 1 : i;
45937 +
45938 +       kfree(tmpbuf);
45939 +
45940 +       if (pdomains > MAX_OPROF_DOMAINS
45941 +           || oprofile_set_passive(passive_domains, pdomains)) {
45942 +               pdomains = 0;
45943 +               retval = -EINVAL;
45944 +       }
45945 +
45946 +       mutex_unlock(&pdom_mutex);
45947 +       return retval;
45948 +}
45949 +
45950 +static ssize_t pdomain_read(struct file * file, char __user * buf, 
45951 +                           size_t count, loff_t * offset)
45952 +{
45953 +       char * tmpbuf;
45954 +       size_t len;
45955 +       int i;
45956 +       ssize_t retval;
45957 +
45958 +       if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
45959 +               return -ENOMEM;
45960 +
45961 +       mutex_lock(&pdom_mutex);
45962 +
45963 +       len = 0;
45964 +       for (i = 0; i < pdomains; i++)
45965 +               len += snprintf(tmpbuf + len,
45966 +                               len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
45967 +                               "%u ", passive_domains[i]);
45968 +       WARN_ON(len > TMPBUFSIZE);
45969 +       if (len != 0 && len <= TMPBUFSIZE)
45970 +               tmpbuf[len-1] = '\n';
45971 +
45972 +       mutex_unlock(&pdom_mutex);
45973 +
45974 +       retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
45975 +
45976 +       kfree(tmpbuf);
45977 +       return retval;
45978 +}
45979 +
45980 +static struct file_operations passive_domain_ops = {
45981 +       .read           = pdomain_read,
45982 +       .write          = pdomain_write,
45983 +};
45984 +
45985  void oprofile_create_files(struct super_block * sb, struct dentry * root)
45986  {
45987         oprofilefs_create_file(sb, root, "enable", &enable_fops);
45988         oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666);
45989 +       oprofilefs_create_file(sb, root, "active_domains", &active_domain_ops);
45990 +       oprofilefs_create_file(sb, root, "passive_domains", &passive_domain_ops);
45991         oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops);
45992         oprofilefs_create_ulong(sb, root, "buffer_size", &fs_buffer_size);
45993         oprofilefs_create_ulong(sb, root, "buffer_watershed", &fs_buffer_watershed);
45994 diff -urNp linux-2.6/drivers/pci/Kconfig new/drivers/pci/Kconfig
45995 --- linux-2.6/drivers/pci/Kconfig       2006-07-03 14:14:50.000000000 +0200
45996 +++ new/drivers/pci/Kconfig     2006-05-09 12:34:08.000000000 +0200
45997 @@ -5,6 +5,7 @@ config PCI_MSI
45998         bool "Message Signaled Interrupts (MSI and MSI-X)"
45999         depends on PCI
46000         depends on (X86_LOCAL_APIC && X86_IO_APIC) || IA64
46001 +       depends on !XEN
46002         help
46003            This allows device drivers to enable MSI (Message Signaled
46004            Interrupts).  Message Signaled Interrupts enable a device to
46005 diff -urNp linux-2.6/drivers/s390/net/qeth_eddp.c new/drivers/s390/net/qeth_eddp.c
46006 --- linux-2.6/drivers/s390/net/qeth_eddp.c      2006-07-03 14:14:51.000000000 +0200
46007 +++ new/drivers/s390/net/qeth_eddp.c    2006-07-07 16:04:32.000000000 +0200
46008 @@ -420,7 +420,7 @@ __qeth_eddp_fill_context_tcp(struct qeth
46009         }
46010         tcph = eddp->skb->h.th;
46011         while (eddp->skb_offset < eddp->skb->len) {
46012 -               data_len = min((int)skb_shinfo(eddp->skb)->tso_size,
46013 +               data_len = min((int)skb_shinfo(eddp->skb)->gso_size,
46014                                (int)(eddp->skb->len - eddp->skb_offset));
46015                 /* prepare qdio hdr */
46016                 if (eddp->qh.hdr.l2.id == QETH_HEADER_TYPE_LAYER2){
46017 @@ -515,20 +515,20 @@ qeth_eddp_calc_num_pages(struct qeth_edd
46018  
46019         QETH_DBF_TEXT(trace, 5, "eddpcanp");
46020         /* can we put multiple skbs in one page? */
46021 -       skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->tso_size + hdr_len);
46022 +       skbs_per_page = PAGE_SIZE / (skb_shinfo(skb)->gso_size + hdr_len);
46023         if (skbs_per_page > 1){
46024 -               ctx->num_pages = (skb_shinfo(skb)->tso_segs + 1) /
46025 +               ctx->num_pages = (skb_shinfo(skb)->gso_segs + 1) /
46026                                  skbs_per_page + 1;
46027                 ctx->elements_per_skb = 1;
46028         } else {
46029                 /* no -> how many elements per skb? */
46030 -               ctx->elements_per_skb = (skb_shinfo(skb)->tso_size + hdr_len +
46031 +               ctx->elements_per_skb = (skb_shinfo(skb)->gso_size + hdr_len +
46032                                      PAGE_SIZE) >> PAGE_SHIFT;
46033                 ctx->num_pages = ctx->elements_per_skb *
46034 -                                (skb_shinfo(skb)->tso_segs + 1);
46035 +                                (skb_shinfo(skb)->gso_segs + 1);
46036         }
46037         ctx->num_elements = ctx->elements_per_skb *
46038 -                           (skb_shinfo(skb)->tso_segs + 1);
46039 +                           (skb_shinfo(skb)->gso_segs + 1);
46040  }
46041  
46042  static inline struct qeth_eddp_context *
46043 diff -urNp linux-2.6/drivers/s390/net/qeth_main.c new/drivers/s390/net/qeth_main.c
46044 --- linux-2.6/drivers/s390/net/qeth_main.c      2006-07-03 14:14:51.000000000 +0200
46045 +++ new/drivers/s390/net/qeth_main.c    2006-07-07 16:39:53.000000000 +0200
46046 @@ -4417,7 +4417,6 @@ qeth_send_packet(struct qeth_card *card,
46047         struct qeth_eddp_context *ctx = NULL;
46048         int tx_bytes = skb->len;
46049         unsigned short nr_frags = skb_shinfo(skb)->nr_frags;
46050 -       unsigned short tso_size = skb_shinfo(skb)->tso_size;
46051         int rc;
46052  
46053         QETH_DBF_TEXT(trace, 6, "sendpkt");
46054 @@ -4453,7 +4452,7 @@ qeth_send_packet(struct qeth_card *card,
46055         queue = card->qdio.out_qs
46056                 [qeth_get_priority_queue(card, skb, ipv, cast_type)];
46057  
46058 -       if (skb_shinfo(skb)->tso_size)
46059 +       if (skb_shinfo(skb)->gso_size)
46060                 large_send = card->options.large_send;
46061  
46062         /*are we able to do TSO ? If so ,prepare and send it from here */
46063 @@ -4500,7 +4499,7 @@ qeth_send_packet(struct qeth_card *card,
46064                 card->stats.tx_packets++;
46065                 card->stats.tx_bytes += tx_bytes;
46066  #ifdef CONFIG_QETH_PERF_STATS
46067 -               if (tso_size &&
46068 +               if (skb_shinfo(skb)->gso_size &&
46069                    !(large_send == QETH_LARGE_SEND_NO)) {
46070                         card->perf_stats.large_send_bytes += tx_bytes;
46071                         card->perf_stats.large_send_cnt++;
46072 diff -urNp linux-2.6/drivers/s390/net/qeth_tso.h new/drivers/s390/net/qeth_tso.h
46073 --- linux-2.6/drivers/s390/net/qeth_tso.h       2006-07-03 14:14:51.000000000 +0200
46074 +++ new/drivers/s390/net/qeth_tso.h     2006-07-07 16:05:51.000000000 +0200
46075 @@ -51,7 +51,7 @@ qeth_tso_fill_header(struct qeth_card *c
46076         hdr->ext.hdr_version = 1;
46077         hdr->ext.hdr_len     = 28;
46078         /*insert non-fix values */
46079 -       hdr->ext.mss = skb_shinfo(skb)->tso_size;
46080 +       hdr->ext.mss = skb_shinfo(skb)->gso_size;
46081         hdr->ext.dg_hdr_len = (__u16)(iph->ihl*4 + tcph->doff*4);
46082         hdr->ext.payload_len = (__u16)(skb->len - hdr->ext.dg_hdr_len -
46083                                        sizeof(struct qeth_hdr_tso));
46084 diff -urNp linux-2.6/drivers/serial/Kconfig new/drivers/serial/Kconfig
46085 --- linux-2.6/drivers/serial/Kconfig    2006-07-03 14:14:57.000000000 +0200
46086 +++ new/drivers/serial/Kconfig  2006-05-09 12:34:23.000000000 +0200
46087 @@ -11,6 +11,7 @@ menu "Serial drivers"
46088  config SERIAL_8250
46089         tristate "8250/16550 and compatible serial support"
46090         depends on (BROKEN || !SPARC)
46091 +       depends on !XEN_DISABLE_SERIAL
46092         select SERIAL_CORE
46093         ---help---
46094           This selects whether you want to include the driver for the standard
46095 diff -urNp linux-2.6/drivers/video/Kconfig new/drivers/video/Kconfig
46096 --- linux-2.6/drivers/video/Kconfig     2006-07-03 14:15:00.000000000 +0200
46097 +++ new/drivers/video/Kconfig   2006-05-09 12:34:31.000000000 +0200
46098 @@ -513,7 +513,7 @@ config FB_HGA_ACCEL
46099  
46100  config VIDEO_SELECT
46101         bool
46102 -       depends on (FB = y) && X86
46103 +       depends on (FB = y) && X86 && !XEN
46104         default y
46105  
46106  config FB_SGIVW
46107 diff -urNp linux-2.6/drivers/xen/balloon/balloon.c new/drivers/xen/balloon/balloon.c
46108 --- linux-2.6/drivers/xen/balloon/balloon.c     1970-01-01 01:00:00.000000000 +0100
46109 +++ new/drivers/xen/balloon/balloon.c   2006-07-07 16:05:51.000000000 +0200
46110 @@ -0,0 +1,608 @@
46111 +/******************************************************************************
46112 + * balloon.c
46113 + *
46114 + * Xen balloon driver - enables returning/claiming memory to/from Xen.
46115 + *
46116 + * Copyright (c) 2003, B Dragovic
46117 + * Copyright (c) 2003-2004, M Williamson, K Fraser
46118 + * Copyright (c) 2005 Dan M. Smith, IBM Corporation
46119 + * 
46120 + * This program is free software; you can redistribute it and/or
46121 + * modify it under the terms of the GNU General Public License version 2
46122 + * as published by the Free Software Foundation; or, when distributed
46123 + * separately from the Linux kernel or incorporated into other
46124 + * software packages, subject to the following license:
46125 + * 
46126 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46127 + * of this source file (the "Software"), to deal in the Software without
46128 + * restriction, including without limitation the rights to use, copy, modify,
46129 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46130 + * and to permit persons to whom the Software is furnished to do so, subject to
46131 + * the following conditions:
46132 + * 
46133 + * The above copyright notice and this permission notice shall be included in
46134 + * all copies or substantial portions of the Software.
46135 + * 
46136 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46137 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46138 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46139 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46140 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46141 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46142 + * IN THE SOFTWARE.
46143 + */
46144 +
46145 +#include <linux/config.h>
46146 +#include <linux/kernel.h>
46147 +#include <linux/module.h>
46148 +#include <linux/sched.h>
46149 +#include <linux/errno.h>
46150 +#include <linux/mm.h>
46151 +#include <linux/mman.h>
46152 +#include <linux/smp_lock.h>
46153 +#include <linux/pagemap.h>
46154 +#include <linux/bootmem.h>
46155 +#include <linux/highmem.h>
46156 +#include <linux/vmalloc.h>
46157 +#include <xen/xen_proc.h>
46158 +#include <asm/hypervisor.h>
46159 +#include <xen/balloon.h>
46160 +#include <xen/interface/memory.h>
46161 +#include <asm/pgalloc.h>
46162 +#include <asm/pgtable.h>
46163 +#include <asm/uaccess.h>
46164 +#include <asm/tlb.h>
46165 +#include <linux/list.h>
46166 +
46167 +#include <xen/xenbus.h>
46168 +
46169 +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
46170 +
46171 +#ifdef CONFIG_PROC_FS
46172 +static struct proc_dir_entry *balloon_pde;
46173 +#endif
46174 +
46175 +static DECLARE_MUTEX(balloon_mutex);
46176 +
46177 +/*
46178 + * Protects atomic reservation decrease/increase against concurrent increases.
46179 + * Also protects non-atomic updates of current_pages and driver_pages, and
46180 + * balloon lists.
46181 + */
46182 +DEFINE_SPINLOCK(balloon_lock);
46183 +
46184 +/* We aim for 'current allocation' == 'target allocation'. */
46185 +static unsigned long current_pages;
46186 +static unsigned long target_pages;
46187 +
46188 +/* We increase/decrease in batches which fit in a page */
46189 +static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; 
46190 +
46191 +/* VM /proc information for memory */
46192 +extern unsigned long totalram_pages;
46193 +
46194 +/* We may hit the hard limit in Xen. If we do then we remember it. */
46195 +static unsigned long hard_limit;
46196 +
46197 +/*
46198 + * Drivers may alter the memory reservation independently, but they must
46199 + * inform the balloon driver so that we can avoid hitting the hard limit.
46200 + */
46201 +static unsigned long driver_pages;
46202 +
46203 +/* List of ballooned pages, threaded through the mem_map array. */
46204 +static LIST_HEAD(ballooned_pages);
46205 +static unsigned long balloon_low, balloon_high;
46206 +
46207 +/* Main work function, always executed in process context. */
46208 +static void balloon_process(void *unused);
46209 +static DECLARE_WORK(balloon_worker, balloon_process, NULL);
46210 +static struct timer_list balloon_timer;
46211 +
46212 +/* When ballooning out (allocating memory to return to Xen) we don't really 
46213 +   want the kernel to try too hard since that can trigger the oom killer. */
46214 +#define GFP_BALLOON \
46215 +       (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
46216 +
46217 +#define PAGE_TO_LIST(p) (&(p)->lru)
46218 +#define LIST_TO_PAGE(l) list_entry((l), struct page, lru)
46219 +#define UNLIST_PAGE(p)                         \
46220 +       do {                                    \
46221 +               list_del(PAGE_TO_LIST(p));      \
46222 +               PAGE_TO_LIST(p)->next = NULL;   \
46223 +               PAGE_TO_LIST(p)->prev = NULL;   \
46224 +       } while(0)
46225 +
46226 +#define IPRINTK(fmt, args...) \
46227 +       printk(KERN_INFO "xen_mem: " fmt, ##args)
46228 +#define WPRINTK(fmt, args...) \
46229 +       printk(KERN_WARNING "xen_mem: " fmt, ##args)
46230 +
46231 +/* balloon_append: add the given page to the balloon. */
46232 +static void balloon_append(struct page *page)
46233 +{
46234 +       /* Lowmem is re-populated first, so highmem pages go at list tail. */
46235 +       if (PageHighMem(page)) {
46236 +               list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
46237 +               balloon_high++;
46238 +       } else {
46239 +               list_add(PAGE_TO_LIST(page), &ballooned_pages);
46240 +               balloon_low++;
46241 +       }
46242 +}
46243 +
46244 +/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
46245 +static struct page *balloon_retrieve(void)
46246 +{
46247 +       struct page *page;
46248 +
46249 +       if (list_empty(&ballooned_pages))
46250 +               return NULL;
46251 +
46252 +       page = LIST_TO_PAGE(ballooned_pages.next);
46253 +       UNLIST_PAGE(page);
46254 +
46255 +       if (PageHighMem(page))
46256 +               balloon_high--;
46257 +       else
46258 +               balloon_low--;
46259 +
46260 +       return page;
46261 +}
46262 +
46263 +static struct page *balloon_first_page(void)
46264 +{
46265 +       if (list_empty(&ballooned_pages))
46266 +               return NULL;
46267 +       return LIST_TO_PAGE(ballooned_pages.next);
46268 +}
46269 +
46270 +static struct page *balloon_next_page(struct page *page)
46271 +{
46272 +       struct list_head *next = PAGE_TO_LIST(page)->next;
46273 +       if (next == &ballooned_pages)
46274 +               return NULL;
46275 +       return LIST_TO_PAGE(next);
46276 +}
46277 +
46278 +static void balloon_alarm(unsigned long unused)
46279 +{
46280 +       schedule_work(&balloon_worker);
46281 +}
46282 +
46283 +static unsigned long current_target(void)
46284 +{
46285 +       unsigned long target = min(target_pages, hard_limit);
46286 +       if (target > (current_pages + balloon_low + balloon_high))
46287 +               target = current_pages + balloon_low + balloon_high;
46288 +       return target;
46289 +}
46290 +
46291 +static int increase_reservation(unsigned long nr_pages)
46292 +{
46293 +       unsigned long  pfn, i, flags;
46294 +       struct page   *page;
46295 +       long           rc;
46296 +       struct xen_memory_reservation reservation = {
46297 +               .address_bits = 0,
46298 +               .extent_order = 0,
46299 +               .domid        = DOMID_SELF
46300 +       };
46301 +
46302 +       if (nr_pages > ARRAY_SIZE(frame_list))
46303 +               nr_pages = ARRAY_SIZE(frame_list);
46304 +
46305 +       balloon_lock(flags);
46306 +
46307 +       page = balloon_first_page();
46308 +       for (i = 0; i < nr_pages; i++) {
46309 +               BUG_ON(page == NULL);
46310 +               frame_list[i] = page_to_pfn(page);
46311 +               page = balloon_next_page(page);
46312 +       }
46313 +
46314 +       set_xen_guest_handle(reservation.extent_start, frame_list);
46315 +       reservation.nr_extents   = nr_pages;
46316 +       rc = HYPERVISOR_memory_op(
46317 +               XENMEM_populate_physmap, &reservation);
46318 +       if (rc < nr_pages) {
46319 +               if (rc > 0) {
46320 +                       int ret;
46321 +
46322 +                       /* We hit the Xen hard limit: reprobe. */
46323 +                       reservation.nr_extents = rc;
46324 +                       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
46325 +                                       &reservation);
46326 +                       BUG_ON(ret != rc);
46327 +               }
46328 +               if (rc >= 0)
46329 +                       hard_limit = current_pages + rc - driver_pages;
46330 +               goto out;
46331 +       }
46332 +
46333 +       for (i = 0; i < nr_pages; i++) {
46334 +               page = balloon_retrieve();
46335 +               BUG_ON(page == NULL);
46336 +
46337 +               pfn = page_to_pfn(page);
46338 +               BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
46339 +                      phys_to_machine_mapping_valid(pfn));
46340 +
46341 +               /* Update P->M and M->P tables. */
46342 +               set_phys_to_machine(pfn, frame_list[i]);
46343 +               xen_machphys_update(frame_list[i], pfn);
46344 +
46345 +               /* Link back into the page tables if not highmem. */
46346 +               if (pfn < max_low_pfn) {
46347 +                       int ret;
46348 +                       ret = HYPERVISOR_update_va_mapping(
46349 +                               (unsigned long)__va(pfn << PAGE_SHIFT),
46350 +                               pfn_pte_ma(frame_list[i], PAGE_KERNEL),
46351 +                               0);
46352 +                       BUG_ON(ret);
46353 +               }
46354 +
46355 +               /* Relinquish the page back to the allocator. */
46356 +               ClearPageReserved(page);
46357 +               init_page_count(page);
46358 +               __free_page(page);
46359 +       }
46360 +
46361 +       current_pages += nr_pages;
46362 +       totalram_pages = current_pages;
46363 +
46364 + out:
46365 +       balloon_unlock(flags);
46366 +
46367 +       return 0;
46368 +}
46369 +
46370 +static int decrease_reservation(unsigned long nr_pages)
46371 +{
46372 +       unsigned long  pfn, i, flags;
46373 +       struct page   *page;
46374 +       void          *v;
46375 +       int            need_sleep = 0;
46376 +       int ret;
46377 +       struct xen_memory_reservation reservation = {
46378 +               .address_bits = 0,
46379 +               .extent_order = 0,
46380 +               .domid        = DOMID_SELF
46381 +       };
46382 +
46383 +       if (nr_pages > ARRAY_SIZE(frame_list))
46384 +               nr_pages = ARRAY_SIZE(frame_list);
46385 +
46386 +       for (i = 0; i < nr_pages; i++) {
46387 +               if ((page = alloc_page(GFP_BALLOON)) == NULL) {
46388 +                       nr_pages = i;
46389 +                       need_sleep = 1;
46390 +                       break;
46391 +               }
46392 +
46393 +               pfn = page_to_pfn(page);
46394 +               frame_list[i] = pfn_to_mfn(pfn);
46395 +
46396 +               if (!PageHighMem(page)) {
46397 +                       v = phys_to_virt(pfn << PAGE_SHIFT);
46398 +                       scrub_pages(v, 1);
46399 +                       ret = HYPERVISOR_update_va_mapping(
46400 +                               (unsigned long)v, __pte_ma(0), 0);
46401 +                       BUG_ON(ret);
46402 +               }
46403 +#ifdef CONFIG_XEN_SCRUB_PAGES
46404 +               else {
46405 +                       v = kmap(page);
46406 +                       scrub_pages(v, 1);
46407 +                       kunmap(page);
46408 +               }
46409 +#endif
46410 +       }
46411 +
46412 +       /* Ensure that ballooned highmem pages don't have kmaps. */
46413 +       kmap_flush_unused();
46414 +       flush_tlb_all();
46415 +
46416 +       balloon_lock(flags);
46417 +
46418 +       /* No more mappings: invalidate P2M and add to balloon. */
46419 +       for (i = 0; i < nr_pages; i++) {
46420 +               pfn = mfn_to_pfn(frame_list[i]);
46421 +               set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
46422 +               balloon_append(pfn_to_page(pfn));
46423 +       }
46424 +
46425 +       set_xen_guest_handle(reservation.extent_start, frame_list);
46426 +       reservation.nr_extents   = nr_pages;
46427 +       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
46428 +       BUG_ON(ret != nr_pages);
46429 +
46430 +       current_pages -= nr_pages;
46431 +       totalram_pages = current_pages;
46432 +
46433 +       balloon_unlock(flags);
46434 +
46435 +       return need_sleep;
46436 +}
46437 +
46438 +/*
46439 + * We avoid multiple worker processes conflicting via the balloon mutex.
46440 + * We may of course race updates of the target counts (which are protected
46441 + * by the balloon lock), or with changes to the Xen hard limit, but we will
46442 + * recover from these in time.
46443 + */
46444 +static void balloon_process(void *unused)
46445 +{
46446 +       int need_sleep = 0;
46447 +       long credit;
46448 +
46449 +       down(&balloon_mutex);
46450 +
46451 +       do {
46452 +               credit = current_target() - current_pages;
46453 +               if (credit > 0)
46454 +                       need_sleep = (increase_reservation(credit) != 0);
46455 +               if (credit < 0)
46456 +                       need_sleep = (decrease_reservation(-credit) != 0);
46457 +
46458 +#ifndef CONFIG_PREEMPT
46459 +               if (need_resched())
46460 +                       schedule();
46461 +#endif
46462 +       } while ((credit != 0) && !need_sleep);
46463 +
46464 +       /* Schedule more work if there is some still to be done. */
46465 +       if (current_target() != current_pages)
46466 +               mod_timer(&balloon_timer, jiffies + HZ);
46467 +
46468 +       up(&balloon_mutex);
46469 +}
46470 +
46471 +/* Resets the Xen limit, sets new target, and kicks off processing. */
46472 +static void set_new_target(unsigned long target)
46473 +{
46474 +       /* No need for lock. Not read-modify-write updates. */
46475 +       hard_limit   = ~0UL;
46476 +       target_pages = target;
46477 +       schedule_work(&balloon_worker);
46478 +}
46479 +
46480 +static struct xenbus_watch target_watch =
46481 +{
46482 +       .node = "memory/target"
46483 +};
46484 +
46485 +/* React to a change in the target key */
46486 +static void watch_target(struct xenbus_watch *watch,
46487 +                        const char **vec, unsigned int len)
46488 +{
46489 +       unsigned long long new_target;
46490 +       int err;
46491 +
46492 +       err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
46493 +       if (err != 1) {
46494 +               /* This is ok (for domain0 at least) - so just return */
46495 +               return;
46496 +       }
46497 +
46498 +       /* The given memory/target value is in KiB, so it needs converting to
46499 +        * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
46500 +        */
46501 +       set_new_target(new_target >> (PAGE_SHIFT - 10));
46502 +}
46503 +
46504 +static int balloon_init_watcher(struct notifier_block *notifier,
46505 +                               unsigned long event,
46506 +                               void *data)
46507 +{
46508 +       int err;
46509 +
46510 +       err = register_xenbus_watch(&target_watch);
46511 +       if (err)
46512 +               printk(KERN_ERR "Failed to set balloon watcher\n");
46513 +
46514 +       return NOTIFY_DONE;
46515 +}
46516 +
46517 +#ifdef CONFIG_PROC_FS
46518 +static int balloon_write(struct file *file, const char __user *buffer,
46519 +                        unsigned long count, void *data)
46520 +{
46521 +       char memstring[64], *endchar;
46522 +       unsigned long long target_bytes;
46523 +
46524 +       if (!capable(CAP_SYS_ADMIN))
46525 +               return -EPERM;
46526 +
46527 +       if (count <= 1)
46528 +               return -EBADMSG; /* runt */
46529 +       if (count > sizeof(memstring))
46530 +               return -EFBIG;   /* too long */
46531 +
46532 +       if (copy_from_user(memstring, buffer, count))
46533 +               return -EFAULT;
46534 +       memstring[sizeof(memstring)-1] = '\0';
46535 +
46536 +       target_bytes = memparse(memstring, &endchar);
46537 +       set_new_target(target_bytes >> PAGE_SHIFT);
46538 +
46539 +       return count;
46540 +}
46541 +
46542 +static int balloon_read(char *page, char **start, off_t off,
46543 +                       int count, int *eof, void *data)
46544 +{
46545 +       int len;
46546 +
46547 +       len = sprintf(
46548 +               page,
46549 +               "Current allocation: %8lu kB\n"
46550 +               "Requested target:   %8lu kB\n"
46551 +               "Low-mem balloon:    %8lu kB\n"
46552 +               "High-mem balloon:   %8lu kB\n"
46553 +               "Xen hard limit:     ",
46554 +               PAGES2KB(current_pages), PAGES2KB(target_pages), 
46555 +               PAGES2KB(balloon_low), PAGES2KB(balloon_high));
46556 +
46557 +       if (hard_limit != ~0UL) {
46558 +               len += sprintf(
46559 +                       page + len, 
46560 +                       "%8lu kB (inc. %8lu kB driver headroom)\n",
46561 +                       PAGES2KB(hard_limit), PAGES2KB(driver_pages));
46562 +       } else {
46563 +               len += sprintf(
46564 +                       page + len,
46565 +                       "     ??? kB\n");
46566 +       }
46567 +
46568 +       *eof = 1;
46569 +       return len;
46570 +}
46571 +#endif
46572 +
46573 +static struct notifier_block xenstore_notifier;
46574 +
46575 +static int __init balloon_init(void)
46576 +{
46577 +       unsigned long pfn;
46578 +       struct page *page;
46579 +
46580 +       if (!is_running_on_xen())
46581 +               return -ENODEV;
46582 +
46583 +       IPRINTK("Initialising balloon driver.\n");
46584 +
46585 +       current_pages = min(xen_start_info->nr_pages, max_pfn);
46586 +       totalram_pages = current_pages;
46587 +       target_pages  = current_pages;
46588 +       balloon_low   = 0;
46589 +       balloon_high  = 0;
46590 +       driver_pages  = 0UL;
46591 +       hard_limit    = ~0UL;
46592 +
46593 +       init_timer(&balloon_timer);
46594 +       balloon_timer.data = 0;
46595 +       balloon_timer.function = balloon_alarm;
46596 +    
46597 +#ifdef CONFIG_PROC_FS
46598 +       if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
46599 +               WPRINTK("Unable to create /proc/xen/balloon.\n");
46600 +               return -1;
46601 +       }
46602 +
46603 +       balloon_pde->read_proc  = balloon_read;
46604 +       balloon_pde->write_proc = balloon_write;
46605 +#endif
46606 +    
46607 +       /* Initialise the balloon with excess memory space. */
46608 +       for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
46609 +               page = pfn_to_page(pfn);
46610 +               if (!PageReserved(page))
46611 +                       balloon_append(page);
46612 +       }
46613 +
46614 +       target_watch.callback = watch_target;
46615 +       xenstore_notifier.notifier_call = balloon_init_watcher;
46616 +
46617 +       register_xenstore_notifier(&xenstore_notifier);
46618 +    
46619 +       return 0;
46620 +}
46621 +
46622 +subsys_initcall(balloon_init);
46623 +
46624 +void balloon_update_driver_allowance(long delta)
46625 +{
46626 +       unsigned long flags;
46627 +
46628 +       balloon_lock(flags);
46629 +       driver_pages += delta;
46630 +       balloon_unlock(flags);
46631 +}
46632 +
46633 +static int dealloc_pte_fn(
46634 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
46635 +{
46636 +       unsigned long mfn = pte_mfn(*pte);
46637 +       int ret;
46638 +       struct xen_memory_reservation reservation = {
46639 +               .nr_extents   = 1,
46640 +               .extent_order = 0,
46641 +               .domid        = DOMID_SELF
46642 +       };
46643 +       set_xen_guest_handle(reservation.extent_start, &mfn);
46644 +       set_pte_at(&init_mm, addr, pte, __pte_ma(0));
46645 +       set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
46646 +       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
46647 +       BUG_ON(ret != 1);
46648 +       return 0;
46649 +}
46650 +
46651 +struct page *balloon_alloc_empty_page_range(unsigned long nr_pages)
46652 +{
46653 +       unsigned long vstart, flags;
46654 +       unsigned int  order = get_order(nr_pages * PAGE_SIZE);
46655 +       int ret;
46656 +       unsigned long i;
46657 +       struct page *page;
46658 +
46659 +       vstart = __get_free_pages(GFP_KERNEL, order);
46660 +       if (vstart == 0)
46661 +               return NULL;
46662 +
46663 +       scrub_pages(vstart, 1 << order);
46664 +
46665 +       balloon_lock(flags);
46666 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
46667 +               unsigned long gmfn = __pa(vstart) >> PAGE_SHIFT;
46668 +               struct xen_memory_reservation reservation = {
46669 +                       .nr_extents   = 1,
46670 +                       .extent_order = order,
46671 +                       .domid        = DOMID_SELF
46672 +               };
46673 +               set_xen_guest_handle(reservation.extent_start, &gmfn);
46674 +               ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
46675 +                                          &reservation);
46676 +               BUG_ON(ret != 1);
46677 +       } else {
46678 +               ret = apply_to_page_range(&init_mm, vstart, PAGE_SIZE << order,
46679 +                                         dealloc_pte_fn, NULL);
46680 +               BUG_ON(ret);
46681 +       }
46682 +       current_pages -= 1UL << order;
46683 +       totalram_pages = current_pages;
46684 +       balloon_unlock(flags);
46685 +
46686 +       schedule_work(&balloon_worker);
46687 +
46688 +       flush_tlb_all();
46689 +
46690 +       page = virt_to_page(vstart);
46691 +
46692 +       for (i = 0; i < (1UL << order); i++)
46693 +               init_page_count(page + i);
46694 +
46695 +       return page;
46696 +}
46697 +
46698 +void balloon_dealloc_empty_page_range(
46699 +       struct page *page, unsigned long nr_pages)
46700 +{
46701 +       unsigned long i, flags;
46702 +       unsigned int  order = get_order(nr_pages * PAGE_SIZE);
46703 +
46704 +       balloon_lock(flags);
46705 +       for (i = 0; i < (1UL << order); i++) {
46706 +               BUG_ON(page_count(page + i) != 1);
46707 +               balloon_append(page + i);
46708 +       }
46709 +       balloon_unlock(flags);
46710 +
46711 +       schedule_work(&balloon_worker);
46712 +}
46713 +
46714 +EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
46715 +EXPORT_SYMBOL_GPL(balloon_alloc_empty_page_range);
46716 +EXPORT_SYMBOL_GPL(balloon_dealloc_empty_page_range);
46717 +
46718 +MODULE_LICENSE("Dual BSD/GPL");
46719 diff -urNp linux-2.6/drivers/xen/balloon/Makefile new/drivers/xen/balloon/Makefile
46720 --- linux-2.6/drivers/xen/balloon/Makefile      1970-01-01 01:00:00.000000000 +0100
46721 +++ new/drivers/xen/balloon/Makefile    2006-05-09 12:34:36.000000000 +0200
46722 @@ -0,0 +1,2 @@
46723 +
46724 +obj-y += balloon.o
46725 diff -urNp linux-2.6/drivers/xen/blkback/blkback.c new/drivers/xen/blkback/blkback.c
46726 --- linux-2.6/drivers/xen/blkback/blkback.c     1970-01-01 01:00:00.000000000 +0100
46727 +++ new/drivers/xen/blkback/blkback.c   2006-06-28 14:32:14.000000000 +0200
46728 @@ -0,0 +1,568 @@
46729 +/******************************************************************************
46730 + * arch/xen/drivers/blkif/backend/main.c
46731 + * 
46732 + * Back-end of the driver for virtual block devices. This portion of the
46733 + * driver exports a 'unified' block-device interface that can be accessed
46734 + * by any operating system that implements a compatible front end. A 
46735 + * reference front-end implementation can be found in:
46736 + *  arch/xen/drivers/blkif/frontend
46737 + * 
46738 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
46739 + * Copyright (c) 2005, Christopher Clark
46740 + * 
46741 + * This program is free software; you can redistribute it and/or
46742 + * modify it under the terms of the GNU General Public License version 2
46743 + * as published by the Free Software Foundation; or, when distributed
46744 + * separately from the Linux kernel or incorporated into other
46745 + * software packages, subject to the following license:
46746 + * 
46747 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46748 + * of this source file (the "Software"), to deal in the Software without
46749 + * restriction, including without limitation the rights to use, copy, modify,
46750 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46751 + * and to permit persons to whom the Software is furnished to do so, subject to
46752 + * the following conditions:
46753 + * 
46754 + * The above copyright notice and this permission notice shall be included in
46755 + * all copies or substantial portions of the Software.
46756 + * 
46757 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46758 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46759 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46760 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46761 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46762 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46763 + * IN THE SOFTWARE.
46764 + */
46765 +
46766 +#include <linux/spinlock.h>
46767 +#include <linux/kthread.h>
46768 +#include <linux/list.h>
46769 +#include <xen/balloon.h>
46770 +#include <asm/hypervisor.h>
46771 +#include "common.h"
46772 +
46773 +/*
46774 + * These are rather arbitrary. They are fairly large because adjacent requests
46775 + * pulled from a communication ring are quite likely to end up being part of
46776 + * the same scatter/gather request at the disc.
46777 + * 
46778 + * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
46779 + * 
46780 + * This will increase the chances of being able to write whole tracks.
46781 + * 64 should be enough to keep us competitive with Linux.
46782 + */
46783 +static int blkif_reqs = 64;
46784 +module_param_named(reqs, blkif_reqs, int, 0);
46785 +MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
46786 +
46787 +static int mmap_pages;
46788 +
46789 +/* Run-time switchable: /sys/module/blkback/parameters/ */
46790 +static unsigned int log_stats = 0;
46791 +static unsigned int debug_lvl = 0;
46792 +module_param(log_stats, int, 0644);
46793 +module_param(debug_lvl, int, 0644);
46794 +
46795 +/*
46796 + * Each outstanding request that we've passed to the lower device layers has a 
46797 + * 'pending_req' allocated to it. Each buffer_head that completes decrements 
46798 + * the pendcnt towards zero. When it hits zero, the specified domain has a 
46799 + * response queued for it, with the saved 'id' passed back.
46800 + */
46801 +typedef struct {
46802 +       blkif_t       *blkif;
46803 +       unsigned long  id;
46804 +       int            nr_pages;
46805 +       atomic_t       pendcnt;
46806 +       unsigned short operation;
46807 +       int            status;
46808 +       struct list_head free_list;
46809 +} pending_req_t;
46810 +
46811 +static pending_req_t *pending_reqs;
46812 +static struct list_head pending_free;
46813 +static DEFINE_SPINLOCK(pending_free_lock);
46814 +static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
46815 +
46816 +#define BLKBACK_INVALID_HANDLE (~0)
46817 +
46818 +static unsigned long mmap_vstart;
46819 +static unsigned long *pending_vaddrs;
46820 +static grant_handle_t *pending_grant_handles;
46821 +
46822 +static inline int vaddr_pagenr(pending_req_t *req, int seg)
46823 +{
46824 +       return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
46825 +}
46826 +
46827 +static inline unsigned long vaddr(pending_req_t *req, int seg)
46828 +{
46829 +       return pending_vaddrs[vaddr_pagenr(req, seg)];
46830 +}
46831 +
46832 +#define pending_handle(_req, _seg) \
46833 +       (pending_grant_handles[vaddr_pagenr(_req, _seg)])
46834 +
46835 +
46836 +static int do_block_io_op(blkif_t *blkif);
46837 +static void dispatch_rw_block_io(blkif_t *blkif,
46838 +                                blkif_request_t *req,
46839 +                                pending_req_t *pending_req);
46840 +static void make_response(blkif_t *blkif, unsigned long id, 
46841 +                         unsigned short op, int st);
46842 +
46843 +/******************************************************************
46844 + * misc small helpers
46845 + */
46846 +static pending_req_t* alloc_req(void)
46847 +{
46848 +       pending_req_t *req = NULL;
46849 +       unsigned long flags;
46850 +
46851 +       spin_lock_irqsave(&pending_free_lock, flags);
46852 +       if (!list_empty(&pending_free)) {
46853 +               req = list_entry(pending_free.next, pending_req_t, free_list);
46854 +               list_del(&req->free_list);
46855 +       }
46856 +       spin_unlock_irqrestore(&pending_free_lock, flags);
46857 +       return req;
46858 +}
46859 +
46860 +static void free_req(pending_req_t *req)
46861 +{
46862 +       unsigned long flags;
46863 +       int was_empty;
46864 +
46865 +       spin_lock_irqsave(&pending_free_lock, flags);
46866 +       was_empty = list_empty(&pending_free);
46867 +       list_add(&req->free_list, &pending_free);
46868 +       spin_unlock_irqrestore(&pending_free_lock, flags);
46869 +       if (was_empty)
46870 +               wake_up(&pending_free_wq);
46871 +}
46872 +
46873 +static void unplug_queue(blkif_t *blkif)
46874 +{
46875 +       if (blkif->plug == NULL)
46876 +               return;
46877 +       if (blkif->plug->unplug_fn)
46878 +               blkif->plug->unplug_fn(blkif->plug);
46879 +       blk_put_queue(blkif->plug);
46880 +       blkif->plug = NULL;
46881 +}
46882 +
46883 +static void plug_queue(blkif_t *blkif, struct bio *bio)
46884 +{
46885 +       request_queue_t *q = bdev_get_queue(bio->bi_bdev);
46886 +
46887 +       if (q == blkif->plug)
46888 +               return;
46889 +       unplug_queue(blkif);
46890 +       blk_get_queue(q);
46891 +       blkif->plug = q;
46892 +}
46893 +
46894 +static void fast_flush_area(pending_req_t *req)
46895 +{
46896 +       struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
46897 +       unsigned int i, invcount = 0;
46898 +       grant_handle_t handle;
46899 +       int ret;
46900 +
46901 +       for (i = 0; i < req->nr_pages; i++) {
46902 +               handle = pending_handle(req, i);
46903 +               if (handle == BLKBACK_INVALID_HANDLE)
46904 +                       continue;
46905 +               gnttab_set_unmap_op(&unmap[i], vaddr(req, i), GNTMAP_host_map,
46906 +                                   handle);
46907 +               pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
46908 +               invcount++;
46909 +       }
46910 +
46911 +       ret = HYPERVISOR_grant_table_op(
46912 +               GNTTABOP_unmap_grant_ref, unmap, invcount);
46913 +       BUG_ON(ret);
46914 +}
46915 +
46916 +/******************************************************************
46917 + * SCHEDULER FUNCTIONS
46918 + */
46919 +
46920 +static void print_stats(blkif_t *blkif)
46921 +{
46922 +       printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
46923 +              current->comm, blkif->st_oo_req,
46924 +              blkif->st_rd_req, blkif->st_wr_req);
46925 +       blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
46926 +       blkif->st_rd_req = 0;
46927 +       blkif->st_wr_req = 0;
46928 +       blkif->st_oo_req = 0;
46929 +}
46930 +
46931 +int blkif_schedule(void *arg)
46932 +{
46933 +       blkif_t *blkif = arg;
46934 +
46935 +       blkif_get(blkif);
46936 +
46937 +       if (debug_lvl)
46938 +               printk(KERN_DEBUG "%s: started\n", current->comm);
46939 +
46940 +       while (!kthread_should_stop()) {
46941 +               wait_event_interruptible(
46942 +                       blkif->wq,
46943 +                       blkif->waiting_reqs || kthread_should_stop());
46944 +               wait_event_interruptible(
46945 +                       pending_free_wq,
46946 +                       !list_empty(&pending_free) || kthread_should_stop());
46947 +
46948 +               blkif->waiting_reqs = 0;
46949 +               smp_mb(); /* clear flag *before* checking for work */
46950 +
46951 +               if (do_block_io_op(blkif))
46952 +                       blkif->waiting_reqs = 1;
46953 +               unplug_queue(blkif);
46954 +
46955 +               if (log_stats && time_after(jiffies, blkif->st_print))
46956 +                       print_stats(blkif);
46957 +       }
46958 +
46959 +       if (log_stats)
46960 +               print_stats(blkif);
46961 +       if (debug_lvl)
46962 +               printk(KERN_DEBUG "%s: exiting\n", current->comm);
46963 +
46964 +       blkif->xenblkd = NULL;
46965 +       blkif_put(blkif);
46966 +
46967 +       return 0;
46968 +}
46969 +
46970 +/******************************************************************
46971 + * COMPLETION CALLBACK -- Called as bh->b_end_io()
46972 + */
46973 +
46974 +static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
46975 +{
46976 +       /* An error fails the entire request. */
46977 +       if (!uptodate) {
46978 +               DPRINTK("Buffer not up-to-date at end of operation\n");
46979 +               pending_req->status = BLKIF_RSP_ERROR;
46980 +       }
46981 +
46982 +       if (atomic_dec_and_test(&pending_req->pendcnt)) {
46983 +               fast_flush_area(pending_req);
46984 +               make_response(pending_req->blkif, pending_req->id,
46985 +                             pending_req->operation, pending_req->status);
46986 +               blkif_put(pending_req->blkif);
46987 +               free_req(pending_req);
46988 +       }
46989 +}
46990 +
46991 +static int end_block_io_op(struct bio *bio, unsigned int done, int error)
46992 +{
46993 +       if (bio->bi_size != 0)
46994 +               return 1;
46995 +       __end_block_io_op(bio->bi_private, !error);
46996 +       bio_put(bio);
46997 +       return error;
46998 +}
46999 +
47000 +
47001 +/******************************************************************************
47002 + * NOTIFICATION FROM GUEST OS.
47003 + */
47004 +
47005 +static void blkif_notify_work(blkif_t *blkif)
47006 +{
47007 +       blkif->waiting_reqs = 1;
47008 +       wake_up(&blkif->wq);
47009 +}
47010 +
47011 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
47012 +{
47013 +       blkif_notify_work(dev_id);
47014 +       return IRQ_HANDLED;
47015 +}
47016 +
47017 +
47018 +
47019 +/******************************************************************
47020 + * DOWNWARD CALLS -- These interface with the block-device layer proper.
47021 + */
47022 +
47023 +static int do_block_io_op(blkif_t *blkif)
47024 +{
47025 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
47026 +       blkif_request_t *req;
47027 +       pending_req_t *pending_req;
47028 +       RING_IDX rc, rp;
47029 +       int more_to_do = 0;
47030 +
47031 +       rc = blk_ring->req_cons;
47032 +       rp = blk_ring->sring->req_prod;
47033 +       rmb(); /* Ensure we see queued requests up to 'rp'. */
47034 +
47035 +       while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
47036 +
47037 +               pending_req = alloc_req();
47038 +               if (NULL == pending_req) {
47039 +                       blkif->st_oo_req++;
47040 +                       more_to_do = 1;
47041 +                       break;
47042 +               }
47043 +
47044 +               req = RING_GET_REQUEST(blk_ring, rc);
47045 +               blk_ring->req_cons = ++rc; /* before make_response() */
47046 +
47047 +               switch (req->operation) {
47048 +               case BLKIF_OP_READ:
47049 +                       blkif->st_rd_req++;
47050 +                       dispatch_rw_block_io(blkif, req, pending_req);
47051 +                       break;
47052 +               case BLKIF_OP_WRITE:
47053 +                       blkif->st_wr_req++;
47054 +                       dispatch_rw_block_io(blkif, req, pending_req);
47055 +                       break;
47056 +               default:
47057 +                       DPRINTK("error: unknown block io operation [%d]\n",
47058 +                               req->operation);
47059 +                       make_response(blkif, req->id, req->operation,
47060 +                                     BLKIF_RSP_ERROR);
47061 +                       free_req(pending_req);
47062 +                       break;
47063 +               }
47064 +       }
47065 +       return more_to_do;
47066 +}
47067 +
47068 +static void dispatch_rw_block_io(blkif_t *blkif,
47069 +                                blkif_request_t *req,
47070 +                                pending_req_t *pending_req)
47071 +{
47072 +       extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
47073 +       int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
47074 +       struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
47075 +       struct phys_req preq;
47076 +       struct { 
47077 +               unsigned long buf; unsigned int nsec;
47078 +       } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
47079 +       unsigned int nseg;
47080 +       struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
47081 +       int ret, i, nbio = 0;
47082 +
47083 +       /* Check that number of segments is sane. */
47084 +       nseg = req->nr_segments;
47085 +       if (unlikely(nseg == 0) || 
47086 +           unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
47087 +               DPRINTK("Bad number of segments in request (%d)\n", nseg);
47088 +               goto fail_response;
47089 +       }
47090 +
47091 +       preq.dev           = req->handle;
47092 +       preq.sector_number = req->sector_number;
47093 +       preq.nr_sects      = 0;
47094 +
47095 +       pending_req->blkif     = blkif;
47096 +       pending_req->id        = req->id;
47097 +       pending_req->operation = operation;
47098 +       pending_req->status    = BLKIF_RSP_OKAY;
47099 +       pending_req->nr_pages  = nseg;
47100 +
47101 +       for (i = 0; i < nseg; i++) {
47102 +               uint32_t flags;
47103 +
47104 +               seg[i].nsec = req->seg[i].last_sect -
47105 +                       req->seg[i].first_sect + 1;
47106 +
47107 +               if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
47108 +                   (seg[i].nsec <= 0))
47109 +                       goto fail_response;
47110 +               preq.nr_sects += seg[i].nsec;
47111 +
47112 +               flags = GNTMAP_host_map;
47113 +               if ( operation == WRITE )
47114 +                       flags |= GNTMAP_readonly;
47115 +               gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
47116 +                                 req->seg[i].gref, blkif->domid);
47117 +       }
47118 +
47119 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
47120 +       BUG_ON(ret);
47121 +
47122 +       for (i = 0; i < nseg; i++) {
47123 +               if (unlikely(map[i].status != 0)) {
47124 +                       DPRINTK("invalid buffer -- could not remap it\n");
47125 +                       goto fail_flush;
47126 +               }
47127 +
47128 +               pending_handle(pending_req, i) = map[i].handle;
47129 +#ifdef CONFIG_XEN_IA64_DOM0_NON_VP
47130 +               pending_vaddrs[vaddr_pagenr(pending_req, i)] =
47131 +                       (unsigned long)gnttab_map_vaddr(map[i]);
47132 +#else
47133 +               set_phys_to_machine(__pa(vaddr(
47134 +                       pending_req, i)) >> PAGE_SHIFT,
47135 +                       FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
47136 +#endif
47137 +               seg[i].buf  = map[i].dev_bus_addr | 
47138 +                       (req->seg[i].first_sect << 9);
47139 +       }
47140 +
47141 +       if (vbd_translate(&preq, blkif, operation) != 0) {
47142 +               DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
47143 +                       operation == READ ? "read" : "write",
47144 +                       preq.sector_number,
47145 +                       preq.sector_number + preq.nr_sects, preq.dev); 
47146 +               goto fail_flush;
47147 +       }
47148 +
47149 +       for (i = 0; i < nseg; i++) {
47150 +               if (((int)preq.sector_number|(int)seg[i].nsec) &
47151 +                   ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
47152 +                       DPRINTK("Misaligned I/O request from domain %d",
47153 +                               blkif->domid);
47154 +                       goto fail_put_bio;
47155 +               }
47156 +
47157 +               while ((bio == NULL) ||
47158 +                      (bio_add_page(bio,
47159 +                                    virt_to_page(vaddr(pending_req, i)),
47160 +                                    seg[i].nsec << 9,
47161 +                                    seg[i].buf & ~PAGE_MASK) == 0)) {
47162 +                       bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
47163 +                       if (unlikely(bio == NULL))
47164 +                               goto fail_put_bio;
47165 +
47166 +                       bio->bi_bdev    = preq.bdev;
47167 +                       bio->bi_private = pending_req;
47168 +                       bio->bi_end_io  = end_block_io_op;
47169 +                       bio->bi_sector  = preq.sector_number;
47170 +               }
47171 +
47172 +               preq.sector_number += seg[i].nsec;
47173 +       }
47174 +
47175 +       plug_queue(blkif, bio);
47176 +       atomic_set(&pending_req->pendcnt, nbio);
47177 +       blkif_get(blkif);
47178 +
47179 +       for (i = 0; i < nbio; i++)
47180 +               submit_bio(operation, biolist[i]);
47181 +
47182 +       return;
47183 +
47184 + fail_put_bio:
47185 +       for (i = 0; i < (nbio-1); i++)
47186 +               bio_put(biolist[i]);
47187 + fail_flush:
47188 +       fast_flush_area(pending_req);
47189 + fail_response:
47190 +       make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
47191 +       free_req(pending_req);
47192 +} 
47193 +
47194 +
47195 +
47196 +/******************************************************************
47197 + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
47198 + */
47199 +
47200 +
47201 +static void make_response(blkif_t *blkif, unsigned long id, 
47202 +                         unsigned short op, int st)
47203 +{
47204 +       blkif_response_t *resp;
47205 +       unsigned long     flags;
47206 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
47207 +       int more_to_do = 0;
47208 +       int notify;
47209 +
47210 +       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
47211 +
47212 +       /* Place on the response ring for the relevant domain. */ 
47213 +       resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
47214 +       resp->id        = id;
47215 +       resp->operation = op;
47216 +       resp->status    = st;
47217 +       blk_ring->rsp_prod_pvt++;
47218 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
47219 +
47220 +       if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
47221 +               /*
47222 +                * Tail check for pending requests. Allows frontend to avoid
47223 +                * notifications if requests are already in flight (lower
47224 +                * overheads and promotes batching).
47225 +                */
47226 +               RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
47227 +
47228 +       } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
47229 +               more_to_do = 1;
47230 +
47231 +       }
47232 +       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
47233 +
47234 +       if (more_to_do)
47235 +               blkif_notify_work(blkif);
47236 +       if (notify)
47237 +               notify_remote_via_irq(blkif->irq);
47238 +}
47239 +
47240 +static int __init blkif_init(void)
47241 +{
47242 +       struct page *page;
47243 +       int i;
47244 +
47245 +       if (!is_running_on_xen())
47246 +               return -ENODEV;
47247 +
47248 +       mmap_pages            = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
47249 +       pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
47250 +                                       blkif_reqs, GFP_KERNEL);
47251 +       pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
47252 +                                       mmap_pages, GFP_KERNEL);
47253 +       pending_vaddrs        = kmalloc(sizeof(pending_vaddrs[0]) *
47254 +                                       mmap_pages, GFP_KERNEL);
47255 +       if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) {
47256 +               kfree(pending_reqs);
47257 +               kfree(pending_grant_handles);
47258 +               kfree(pending_vaddrs);
47259 +               printk("%s: out of memory\n", __FUNCTION__);
47260 +               return -ENOMEM;
47261 +       }
47262 +
47263 +       blkif_interface_init();
47264 +       
47265 +#ifdef CONFIG_XEN_IA64_DOM0_NON_VP
47266 +       extern unsigned long alloc_empty_foreign_map_page_range(
47267 +               unsigned long pages);
47268 +       mmap_vstart = (unsigned long)
47269 +               alloc_empty_foreign_map_page_range(mmap_pages);
47270 +#else /* ! ia64 */
47271 +       page = balloon_alloc_empty_page_range(mmap_pages);
47272 +       BUG_ON(page == NULL);
47273 +       mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
47274 +#endif
47275 +       printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
47276 +              __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart);
47277 +       BUG_ON(mmap_vstart == 0);
47278 +       for (i = 0; i < mmap_pages; i++) {
47279 +               pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
47280 +               pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
47281 +       }
47282 +
47283 +       memset(pending_reqs, 0, blkif_reqs * sizeof(pending_reqs[0]));
47284 +       INIT_LIST_HEAD(&pending_free);
47285 +
47286 +       for (i = 0; i < blkif_reqs; i++)
47287 +               list_add_tail(&pending_reqs[i].free_list, &pending_free);
47288 +    
47289 +       blkif_xenbus_init();
47290 +
47291 +       return 0;
47292 +}
47293 +
47294 +module_init(blkif_init);
47295 +
47296 +MODULE_LICENSE("Dual BSD/GPL");
47297 diff -urNp linux-2.6/drivers/xen/blkback/common.h new/drivers/xen/blkback/common.h
47298 --- linux-2.6/drivers/xen/blkback/common.h      1970-01-01 01:00:00.000000000 +0100
47299 +++ new/drivers/xen/blkback/common.h    2006-07-07 15:10:03.000000000 +0200
47300 @@ -0,0 +1,134 @@
47301 +/* 
47302 + * This program is free software; you can redistribute it and/or
47303 + * modify it under the terms of the GNU General Public License version 2
47304 + * as published by the Free Software Foundation; or, when distributed
47305 + * separately from the Linux kernel or incorporated into other
47306 + * software packages, subject to the following license:
47307 + * 
47308 + * Permission is hereby granted, free of charge, to any person obtaining a copy
47309 + * of this source file (the "Software"), to deal in the Software without
47310 + * restriction, including without limitation the rights to use, copy, modify,
47311 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
47312 + * and to permit persons to whom the Software is furnished to do so, subject to
47313 + * the following conditions:
47314 + * 
47315 + * The above copyright notice and this permission notice shall be included in
47316 + * all copies or substantial portions of the Software.
47317 + * 
47318 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
47319 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47320 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47321 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
47322 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
47323 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
47324 + * IN THE SOFTWARE.
47325 + */
47326 +
47327 +#ifndef __BLKIF__BACKEND__COMMON_H__
47328 +#define __BLKIF__BACKEND__COMMON_H__
47329 +
47330 +#include <linux/config.h>
47331 +#include <linux/version.h>
47332 +#include <linux/module.h>
47333 +#include <linux/interrupt.h>
47334 +#include <linux/slab.h>
47335 +#include <linux/blkdev.h>
47336 +#include <linux/vmalloc.h>
47337 +#include <linux/wait.h>
47338 +#include <asm/io.h>
47339 +#include <asm/setup.h>
47340 +#include <asm/pgalloc.h>
47341 +#include <xen/evtchn.h>
47342 +#include <asm/hypervisor.h>
47343 +#include <xen/interface/io/blkif.h>
47344 +#include <xen/interface/io/ring.h>
47345 +#include <xen/gnttab.h>
47346 +#include <xen/driver_util.h>
47347 +
47348 +#define DPRINTK(_f, _a...)                     \
47349 +       pr_debug("(file=%s, line=%d) " _f,      \
47350 +                __FILE__ , __LINE__ , ## _a )
47351 +
47352 +struct vbd {
47353 +       blkif_vdev_t   handle;      /* what the domain refers to this vbd as */
47354 +       unsigned char  readonly;    /* Non-zero -> read-only */
47355 +       unsigned char  type;        /* VDISK_xxx */
47356 +       u32            pdevice;     /* phys device that this vbd maps to */
47357 +       struct block_device *bdev;
47358 +}; 
47359 +
47360 +struct backend_info; 
47361 +
47362 +typedef struct blkif_st {
47363 +       /* Unique identifier for this interface. */
47364 +       domid_t           domid;
47365 +       unsigned int      handle;
47366 +       /* Physical parameters of the comms window. */
47367 +       unsigned int      evtchn;
47368 +       unsigned int      irq;
47369 +       /* Comms information. */
47370 +       blkif_back_ring_t blk_ring;
47371 +       struct vm_struct *blk_ring_area;
47372 +       /* The VBD attached to this interface. */
47373 +       struct vbd        vbd;
47374 +       /* Back pointer to the backend_info. */
47375 +       struct backend_info *be; 
47376 +       /* Private fields. */
47377 +       spinlock_t       blk_ring_lock;
47378 +       atomic_t         refcnt;
47379 +
47380 +       wait_queue_head_t   wq;
47381 +       struct task_struct  *xenblkd;
47382 +       unsigned int        waiting_reqs;
47383 +       request_queue_t     *plug;
47384 +
47385 +       /* statistics */
47386 +       unsigned long       st_print;
47387 +       int                 st_rd_req;
47388 +       int                 st_wr_req;
47389 +       int                 st_oo_req;
47390 +
47391 +       wait_queue_head_t waiting_to_free;
47392 +
47393 +       grant_handle_t shmem_handle;
47394 +       grant_ref_t    shmem_ref;
47395 +} blkif_t;
47396 +
47397 +blkif_t *blkif_alloc(domid_t domid);
47398 +void blkif_disconnect(blkif_t *blkif);
47399 +void blkif_free(blkif_t *blkif);
47400 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
47401 +
47402 +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
47403 +#define blkif_put(_b)                                  \
47404 +       do {                                            \
47405 +               if (atomic_dec_and_test(&(_b)->refcnt)) \
47406 +                       wake_up(&(_b)->waiting_to_free);\
47407 +       } while (0)
47408 +
47409 +/* Create a vbd. */
47410 +int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
47411 +              unsigned minor, int readonly);
47412 +void vbd_free(struct vbd *vbd);
47413 +
47414 +unsigned long vbd_size(struct vbd *vbd);
47415 +unsigned int vbd_info(struct vbd *vbd);
47416 +unsigned long vbd_secsize(struct vbd *vbd);
47417 +
47418 +struct phys_req {
47419 +       unsigned short       dev;
47420 +       unsigned short       nr_sects;
47421 +       struct block_device *bdev;
47422 +       blkif_sector_t       sector_number;
47423 +};
47424 +
47425 +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); 
47426 +
47427 +void blkif_interface_init(void);
47428 +
47429 +void blkif_xenbus_init(void);
47430 +
47431 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
47432 +int blkif_schedule(void *arg);
47433 +
47434 +#endif /* __BLKIF__BACKEND__COMMON_H__ */
47435 diff -urNp linux-2.6/drivers/xen/blkback/interface.c new/drivers/xen/blkback/interface.c
47436 --- linux-2.6/drivers/xen/blkback/interface.c   1970-01-01 01:00:00.000000000 +0100
47437 +++ new/drivers/xen/blkback/interface.c 2006-07-07 15:10:03.000000000 +0200
47438 @@ -0,0 +1,177 @@
47439 +/******************************************************************************
47440 + * arch/xen/drivers/blkif/backend/interface.c
47441 + * 
47442 + * Block-device interface management.
47443 + * 
47444 + * Copyright (c) 2004, Keir Fraser
47445 + * 
47446 + * This program is free software; you can redistribute it and/or
47447 + * modify it under the terms of the GNU General Public License version 2
47448 + * as published by the Free Software Foundation; or, when distributed
47449 + * separately from the Linux kernel or incorporated into other
47450 + * software packages, subject to the following license:
47451 + * 
47452 + * Permission is hereby granted, free of charge, to any person obtaining a copy
47453 + * of this source file (the "Software"), to deal in the Software without
47454 + * restriction, including without limitation the rights to use, copy, modify,
47455 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
47456 + * and to permit persons to whom the Software is furnished to do so, subject to
47457 + * the following conditions:
47458 + * 
47459 + * The above copyright notice and this permission notice shall be included in
47460 + * all copies or substantial portions of the Software.
47461 + * 
47462 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
47463 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47464 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47465 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
47466 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
47467 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
47468 + * IN THE SOFTWARE.
47469 + */
47470 +
47471 +#include "common.h"
47472 +#include <xen/evtchn.h>
47473 +#include <linux/kthread.h>
47474 +
47475 +static kmem_cache_t *blkif_cachep;
47476 +
47477 +blkif_t *blkif_alloc(domid_t domid)
47478 +{
47479 +       blkif_t *blkif;
47480 +
47481 +       blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
47482 +       if (!blkif)
47483 +               return ERR_PTR(-ENOMEM);
47484 +
47485 +       memset(blkif, 0, sizeof(*blkif));
47486 +       blkif->domid = domid;
47487 +       spin_lock_init(&blkif->blk_ring_lock);
47488 +       atomic_set(&blkif->refcnt, 1);
47489 +       init_waitqueue_head(&blkif->wq);
47490 +       blkif->st_print = jiffies;
47491 +       init_waitqueue_head(&blkif->waiting_to_free);
47492 +
47493 +       return blkif;
47494 +}
47495 +
47496 +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
47497 +{
47498 +       struct gnttab_map_grant_ref op;
47499 +       int ret;
47500 +
47501 +       gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
47502 +                         GNTMAP_host_map, shared_page, blkif->domid);
47503 +
47504 +       lock_vm_area(blkif->blk_ring_area);
47505 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
47506 +       unlock_vm_area(blkif->blk_ring_area);
47507 +       BUG_ON(ret);
47508 +
47509 +       if (op.status) {
47510 +               DPRINTK(" Grant table operation failure!\n");
47511 +               return op.status;
47512 +       }
47513 +
47514 +       blkif->shmem_ref = shared_page;
47515 +       blkif->shmem_handle = op.handle;
47516 +
47517 +#ifdef CONFIG_XEN_IA64_DOM0_NON_VP
47518 +       /* On some architectures, map_grant_ref behaves like mmap, in that the
47519 +        * passed address is a hint and a different address may be returned. */
47520 +       blkif->blk_ring_area->addr = gnttab_map_vaddr(op);
47521 +#endif
47522 +
47523 +       return 0;
47524 +}
47525 +
47526 +static void unmap_frontend_page(blkif_t *blkif)
47527 +{
47528 +       struct gnttab_unmap_grant_ref op;
47529 +       int ret;
47530 +
47531 +       gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
47532 +                           GNTMAP_host_map, blkif->shmem_handle);
47533 +
47534 +       lock_vm_area(blkif->blk_ring_area);
47535 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
47536 +       unlock_vm_area(blkif->blk_ring_area);
47537 +       BUG_ON(ret);
47538 +}
47539 +
47540 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
47541 +{
47542 +       blkif_sring_t *sring;
47543 +       int err;
47544 +       struct evtchn_bind_interdomain bind_interdomain;
47545 +
47546 +       /* Already connected through? */
47547 +       if (blkif->irq)
47548 +               return 0;
47549 +
47550 +       if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
47551 +               return -ENOMEM;
47552 +
47553 +       err = map_frontend_page(blkif, shared_page);
47554 +       if (err) {
47555 +               free_vm_area(blkif->blk_ring_area);
47556 +               return err;
47557 +       }
47558 +
47559 +       bind_interdomain.remote_dom  = blkif->domid;
47560 +       bind_interdomain.remote_port = evtchn;
47561 +
47562 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
47563 +                                         &bind_interdomain);
47564 +       if (err) {
47565 +               unmap_frontend_page(blkif);
47566 +               free_vm_area(blkif->blk_ring_area);
47567 +               return err;
47568 +       }
47569 +
47570 +       blkif->evtchn = bind_interdomain.local_port;
47571 +
47572 +       sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
47573 +       BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
47574 +
47575 +       blkif->irq = bind_evtchn_to_irqhandler(
47576 +               blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
47577 +
47578 +       return 0;
47579 +}
47580 +
47581 +void blkif_disconnect(blkif_t *blkif)
47582 +{
47583 +       if (blkif->xenblkd) {
47584 +               kthread_stop(blkif->xenblkd);
47585 +               blkif->xenblkd = NULL;
47586 +       }
47587 +
47588 +       atomic_dec(&blkif->refcnt);
47589 +       wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
47590 +       atomic_inc(&blkif->refcnt);
47591 +
47592 +       if (blkif->irq) {
47593 +               unbind_from_irqhandler(blkif->irq, blkif);
47594 +               blkif->irq = 0;
47595 +       }
47596 +
47597 +       if (blkif->blk_ring.sring) {
47598 +               unmap_frontend_page(blkif);
47599 +               free_vm_area(blkif->blk_ring_area);
47600 +               blkif->blk_ring.sring = NULL;
47601 +       }
47602 +}
47603 +
47604 +void blkif_free(blkif_t *blkif)
47605 +{
47606 +       if (!atomic_dec_and_test(&blkif->refcnt))
47607 +               BUG();
47608 +       kmem_cache_free(blkif_cachep, blkif);
47609 +}
47610 +
47611 +void __init blkif_interface_init(void)
47612 +{
47613 +       blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 
47614 +                                        0, 0, NULL, NULL);
47615 +}
47616 diff -urNp linux-2.6/drivers/xen/blkback/Makefile new/drivers/xen/blkback/Makefile
47617 --- linux-2.6/drivers/xen/blkback/Makefile      1970-01-01 01:00:00.000000000 +0100
47618 +++ new/drivers/xen/blkback/Makefile    2006-05-09 12:34:36.000000000 +0200
47619 @@ -0,0 +1,3 @@
47620 +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
47621 +
47622 +blkbk-y        := blkback.o xenbus.o interface.o vbd.o
47623 diff -urNp linux-2.6/drivers/xen/blkback/vbd.c new/drivers/xen/blkback/vbd.c
47624 --- linux-2.6/drivers/xen/blkback/vbd.c 1970-01-01 01:00:00.000000000 +0100
47625 +++ new/drivers/xen/blkback/vbd.c       2006-05-23 18:42:17.000000000 +0200
47626 @@ -0,0 +1,119 @@
47627 +/******************************************************************************
47628 + * blkback/vbd.c
47629 + * 
47630 + * Routines for managing virtual block devices (VBDs).
47631 + * 
47632 + * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
47633 + * 
47634 + * This program is free software; you can redistribute it and/or
47635 + * modify it under the terms of the GNU General Public License version 2
47636 + * as published by the Free Software Foundation; or, when distributed
47637 + * separately from the Linux kernel or incorporated into other
47638 + * software packages, subject to the following license:
47639 + * 
47640 + * Permission is hereby granted, free of charge, to any person obtaining a copy
47641 + * of this source file (the "Software"), to deal in the Software without
47642 + * restriction, including without limitation the rights to use, copy, modify,
47643 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
47644 + * and to permit persons to whom the Software is furnished to do so, subject to
47645 + * the following conditions:
47646 + * 
47647 + * The above copyright notice and this permission notice shall be included in
47648 + * all copies or substantial portions of the Software.
47649 + * 
47650 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
47651 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47652 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47653 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
47654 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
47655 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
47656 + * IN THE SOFTWARE.
47657 + */
47658 +
47659 +#include "common.h"
47660 +#include <xen/xenbus.h>
47661 +
47662 +#define vbd_sz(_v)   ((_v)->bdev->bd_part ?                            \
47663 +       (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity)
47664 +
47665 +unsigned long vbd_size(struct vbd *vbd)
47666 +{
47667 +       return vbd_sz(vbd);
47668 +}
47669 +
47670 +unsigned int vbd_info(struct vbd *vbd)
47671 +{
47672 +       return vbd->type | (vbd->readonly?VDISK_READONLY:0);
47673 +}
47674 +
47675 +unsigned long vbd_secsize(struct vbd *vbd)
47676 +{
47677 +       return bdev_hardsect_size(vbd->bdev);
47678 +}
47679 +
47680 +int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
47681 +              unsigned minor, int readonly)
47682 +{
47683 +       struct vbd *vbd;
47684 +       struct block_device *bdev;
47685 +
47686 +       vbd = &blkif->vbd;
47687 +       vbd->handle   = handle; 
47688 +       vbd->readonly = readonly;
47689 +       vbd->type     = 0;
47690 +
47691 +       vbd->pdevice  = MKDEV(major, minor);
47692 +
47693 +       bdev = open_by_devnum(vbd->pdevice,
47694 +                             vbd->readonly ? FMODE_READ : FMODE_WRITE);
47695 +
47696 +       if (IS_ERR(bdev)) {
47697 +               DPRINTK("vbd_creat: device %08x could not be opened.\n",
47698 +                       vbd->pdevice);
47699 +               return -ENOENT;
47700 +       }
47701 +
47702 +       vbd->bdev = bdev;
47703 +
47704 +       if (vbd->bdev->bd_disk == NULL) {
47705 +               DPRINTK("vbd_creat: device %08x doesn't exist.\n",
47706 +                       vbd->pdevice);
47707 +               vbd_free(vbd);
47708 +               return -ENOENT;
47709 +       }
47710 +
47711 +       if (vbd->bdev->bd_disk->flags & GENHD_FL_CD)
47712 +               vbd->type |= VDISK_CDROM;
47713 +       if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
47714 +               vbd->type |= VDISK_REMOVABLE;
47715 +
47716 +       DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
47717 +               handle, blkif->domid);
47718 +       return 0;
47719 +}
47720 +
47721 +void vbd_free(struct vbd *vbd)
47722 +{
47723 +       if (vbd->bdev)
47724 +               blkdev_put(vbd->bdev);
47725 +       vbd->bdev = NULL;
47726 +}
47727 +
47728 +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
47729 +{
47730 +       struct vbd *vbd = &blkif->vbd;
47731 +       int rc = -EACCES;
47732 +
47733 +       if ((operation == WRITE) && vbd->readonly)
47734 +               goto out;
47735 +
47736 +       if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
47737 +               goto out;
47738 +
47739 +       req->dev  = vbd->pdevice;
47740 +       req->bdev = vbd->bdev;
47741 +       rc = 0;
47742 +
47743 + out:
47744 +       return rc;
47745 +}
47746 diff -urNp linux-2.6/drivers/xen/blkback/xenbus.c new/drivers/xen/blkback/xenbus.c
47747 --- linux-2.6/drivers/xen/blkback/xenbus.c      1970-01-01 01:00:00.000000000 +0100
47748 +++ new/drivers/xen/blkback/xenbus.c    2006-07-07 15:10:03.000000000 +0200
47749 @@ -0,0 +1,460 @@
47750 +/*  Xenbus code for blkif backend
47751 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
47752 +    Copyright (C) 2005 XenSource Ltd
47753 +
47754 +    This program is free software; you can redistribute it and/or modify
47755 +    it under the terms of the GNU General Public License as published by
47756 +    the Free Software Foundation; either version 2 of the License, or
47757 +    (at your option) any later version.
47758 +
47759 +    This program is distributed in the hope that it will be useful,
47760 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
47761 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
47762 +    GNU General Public License for more details.
47763 +
47764 +    You should have received a copy of the GNU General Public License
47765 +    along with this program; if not, write to the Free Software
47766 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
47767 +*/
47768 +
47769 +#include <stdarg.h>
47770 +#include <linux/module.h>
47771 +#include <linux/kthread.h>
47772 +#include <xen/xenbus.h>
47773 +#include "common.h"
47774 +
47775 +#undef DPRINTK
47776 +#define DPRINTK(fmt, args...)                          \
47777 +       pr_debug("blkback/xenbus (%s:%d) " fmt ".\n",   \
47778 +                __FUNCTION__, __LINE__, ##args)
47779 +
47780 +struct backend_info
47781 +{
47782 +       struct xenbus_device *dev;
47783 +       blkif_t *blkif;
47784 +       struct xenbus_watch backend_watch;
47785 +       unsigned major;
47786 +       unsigned minor;
47787 +       char *mode;
47788 +};
47789 +
47790 +static void connect(struct backend_info *);
47791 +static int connect_ring(struct backend_info *);
47792 +static void backend_changed(struct xenbus_watch *, const char **,
47793 +                           unsigned int);
47794 +
47795 +static void update_blkif_status(blkif_t *blkif)
47796 +{ 
47797 +       int err;
47798 +
47799 +       /* Not ready to connect? */
47800 +       if (!blkif->irq || !blkif->vbd.bdev)
47801 +               return;
47802 +
47803 +       /* Already connected? */
47804 +       if (blkif->be->dev->state == XenbusStateConnected)
47805 +               return;
47806 +
47807 +       /* Attempt to connect: exit if we fail to. */
47808 +       connect(blkif->be);
47809 +       if (blkif->be->dev->state != XenbusStateConnected)
47810 +               return;
47811 +
47812 +       blkif->xenblkd = kthread_run(blkif_schedule, blkif,
47813 +                                    "xvd %d %02x:%02x",
47814 +                                    blkif->domid,
47815 +                                    blkif->be->major, blkif->be->minor);
47816 +       if (IS_ERR(blkif->xenblkd)) {
47817 +               err = PTR_ERR(blkif->xenblkd);
47818 +               blkif->xenblkd = NULL;
47819 +               xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
47820 +       }
47821 +}
47822 +
47823 +
47824 +/****************************************************************
47825 + *  sysfs interface for VBD I/O requests
47826 + */
47827 +
47828 +#define VBD_SHOW(name, format, args...)                                        \
47829 +       static ssize_t show_##name(struct device *_dev,                 \
47830 +                                  struct device_attribute *attr,       \
47831 +                                  char *buf)                           \
47832 +       {                                                               \
47833 +               struct xenbus_device *dev = to_xenbus_device(_dev);     \
47834 +               struct backend_info *be = dev->dev.driver_data;         \
47835 +                                                                       \
47836 +               return sprintf(buf, format, ##args);                    \
47837 +       }                                                               \
47838 +       DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
47839 +
47840 +VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
47841 +VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
47842 +VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
47843 +
47844 +static struct attribute *vbdstat_attrs[] = {
47845 +       &dev_attr_oo_req.attr,
47846 +       &dev_attr_rd_req.attr,
47847 +       &dev_attr_wr_req.attr,
47848 +       NULL
47849 +};
47850 +
47851 +static struct attribute_group vbdstat_group = {
47852 +       .name = "statistics",
47853 +       .attrs = vbdstat_attrs,
47854 +};
47855 +
47856 +VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
47857 +VBD_SHOW(mode, "%s\n", be->mode);
47858 +
47859 +int xenvbd_sysfs_addif(struct xenbus_device *dev)
47860 +{
47861 +       int error;
47862 +       
47863 +       error = device_create_file(&dev->dev, &dev_attr_physical_device);
47864 +       if (error)
47865 +               goto fail1;
47866 +
47867 +       error = device_create_file(&dev->dev, &dev_attr_mode);
47868 +       if (error)
47869 +               goto fail2;
47870 +
47871 +       error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
47872 +       if (error)
47873 +               goto fail3;
47874 +
47875 +       return 0;
47876 +
47877 +fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
47878 +fail2: device_remove_file(&dev->dev, &dev_attr_mode);
47879 +fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
47880 +       return error;
47881 +}
47882 +
47883 +void xenvbd_sysfs_delif(struct xenbus_device *dev)
47884 +{
47885 +       sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
47886 +       device_remove_file(&dev->dev, &dev_attr_mode);
47887 +       device_remove_file(&dev->dev, &dev_attr_physical_device);
47888 +}
47889 +
47890 +static int blkback_remove(struct xenbus_device *dev)
47891 +{
47892 +       struct backend_info *be = dev->dev.driver_data;
47893 +
47894 +       DPRINTK("");
47895 +
47896 +       if (be->backend_watch.node) {
47897 +               unregister_xenbus_watch(&be->backend_watch);
47898 +               kfree(be->backend_watch.node);
47899 +               be->backend_watch.node = NULL;
47900 +       }
47901 +
47902 +       if (be->blkif) {
47903 +               blkif_disconnect(be->blkif);
47904 +               vbd_free(&be->blkif->vbd);
47905 +               blkif_free(be->blkif);
47906 +               be->blkif = NULL;
47907 +       }
47908 +
47909 +       if (be->major || be->minor)
47910 +               xenvbd_sysfs_delif(dev);
47911 +
47912 +       kfree(be);
47913 +       dev->dev.driver_data = NULL;
47914 +       return 0;
47915 +}
47916 +
47917 +
47918 +/**
47919 + * Entry point to this code when a new device is created.  Allocate the basic
47920 + * structures, and watch the store waiting for the hotplug scripts to tell us
47921 + * the device's physical major and minor numbers.  Switch to InitWait.
47922 + */
47923 +static int blkback_probe(struct xenbus_device *dev,
47924 +                        const struct xenbus_device_id *id)
47925 +{
47926 +       int err;
47927 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
47928 +                                         GFP_KERNEL);
47929 +       if (!be) {
47930 +               xenbus_dev_fatal(dev, -ENOMEM,
47931 +                                "allocating backend structure");
47932 +               return -ENOMEM;
47933 +       }
47934 +       be->dev = dev;
47935 +       dev->dev.driver_data = be;
47936 +
47937 +       be->blkif = blkif_alloc(dev->otherend_id);
47938 +       if (IS_ERR(be->blkif)) {
47939 +               err = PTR_ERR(be->blkif);
47940 +               be->blkif = NULL;
47941 +               xenbus_dev_fatal(dev, err, "creating block interface");
47942 +               goto fail;
47943 +       }
47944 +
47945 +       /* setup back pointer */
47946 +       be->blkif->be = be; 
47947 +
47948 +       err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
47949 +                                &be->backend_watch, backend_changed);
47950 +       if (err)
47951 +               goto fail;
47952 +
47953 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
47954 +       if (err)
47955 +               goto fail;
47956 +
47957 +       return 0;
47958 +
47959 +fail:
47960 +       DPRINTK("failed");
47961 +       blkback_remove(dev);
47962 +       return err;
47963 +}
47964 +
47965 +
47966 +/**
47967 + * Callback received when the hotplug scripts have placed the physical-device
47968 + * node.  Read it and the mode node, and create a vbd.  If the frontend is
47969 + * ready, connect.
47970 + */
47971 +static void backend_changed(struct xenbus_watch *watch,
47972 +                           const char **vec, unsigned int len)
47973 +{
47974 +       int err;
47975 +       unsigned major;
47976 +       unsigned minor;
47977 +       struct backend_info *be
47978 +               = container_of(watch, struct backend_info, backend_watch);
47979 +       struct xenbus_device *dev = be->dev;
47980 +
47981 +       DPRINTK("");
47982 +
47983 +       err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
47984 +                          &major, &minor);
47985 +       if (XENBUS_EXIST_ERR(err)) {
47986 +               /* Since this watch will fire once immediately after it is
47987 +                  registered, we expect this.  Ignore it, and wait for the
47988 +                  hotplug scripts. */
47989 +               return;
47990 +       }
47991 +       if (err != 2) {
47992 +               xenbus_dev_fatal(dev, err, "reading physical-device");
47993 +               return;
47994 +       }
47995 +
47996 +       if ((be->major || be->minor) &&
47997 +           ((be->major != major) || (be->minor != minor))) {
47998 +               printk(KERN_WARNING
47999 +                      "blkback: changing physical device (from %x:%x to "
48000 +                      "%x:%x) not supported.\n", be->major, be->minor,
48001 +                      major, minor);
48002 +               return;
48003 +       }
48004 +
48005 +       be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
48006 +       if (IS_ERR(be->mode)) {
48007 +               err = PTR_ERR(be->mode);
48008 +               be->mode = NULL;
48009 +               xenbus_dev_fatal(dev, err, "reading mode");
48010 +               return;
48011 +       }
48012 +
48013 +       if (be->major == 0 && be->minor == 0) {
48014 +               /* Front end dir is a number, which is used as the handle. */
48015 +
48016 +               char *p = strrchr(dev->otherend, '/') + 1;
48017 +               long handle = simple_strtoul(p, NULL, 0);
48018 +
48019 +               be->major = major;
48020 +               be->minor = minor;
48021 +
48022 +               err = vbd_create(be->blkif, handle, major, minor,
48023 +                                (NULL == strchr(be->mode, 'w')));
48024 +               if (err) {
48025 +                       be->major = be->minor = 0;
48026 +                       xenbus_dev_fatal(dev, err, "creating vbd structure");
48027 +                       return;
48028 +               }
48029 +
48030 +               err = xenvbd_sysfs_addif(dev);
48031 +               if (err) {
48032 +                       vbd_free(&be->blkif->vbd);
48033 +                       be->major = be->minor = 0;
48034 +                       xenbus_dev_fatal(dev, err, "creating sysfs entries");
48035 +                       return;
48036 +               }
48037 +
48038 +               /* We're potentially connected now */
48039 +               update_blkif_status(be->blkif); 
48040 +       }
48041 +}
48042 +
48043 +
48044 +/**
48045 + * Callback received when the frontend's state changes.
48046 + */
48047 +static void frontend_changed(struct xenbus_device *dev,
48048 +                            enum xenbus_state frontend_state)
48049 +{
48050 +       struct backend_info *be = dev->dev.driver_data;
48051 +       int err;
48052 +
48053 +       DPRINTK("");
48054 +
48055 +       switch (frontend_state) {
48056 +       case XenbusStateInitialising:
48057 +               break;
48058 +
48059 +       case XenbusStateInitialised:
48060 +       case XenbusStateConnected:
48061 +               /* Ensure we connect even when two watches fire in 
48062 +                  close succession and we miss the intermediate value 
48063 +                  of frontend_state. */
48064 +               if (dev->state == XenbusStateConnected)
48065 +                       break;
48066 +
48067 +               err = connect_ring(be);
48068 +               if (err)
48069 +                       break;
48070 +               update_blkif_status(be->blkif);
48071 +               break;
48072 +
48073 +       case XenbusStateClosing:
48074 +               blkif_disconnect(be->blkif);
48075 +               xenbus_switch_state(dev, XenbusStateClosing);
48076 +               break;
48077 +
48078 +       case XenbusStateClosed:
48079 +               device_unregister(&dev->dev);
48080 +               break;
48081 +
48082 +       case XenbusStateUnknown:
48083 +       case XenbusStateInitWait:
48084 +       default:
48085 +               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
48086 +                                frontend_state);
48087 +               break;
48088 +       }
48089 +}
48090 +
48091 +
48092 +/* ** Connection ** */
48093 +
48094 +
48095 +/**
48096 + * Write the physical details regarding the block device to the store, and
48097 + * switch to Connected state.
48098 + */
48099 +static void connect(struct backend_info *be)
48100 +{
48101 +       struct xenbus_transaction xbt;
48102 +       int err;
48103 +       struct xenbus_device *dev = be->dev;
48104 +
48105 +       DPRINTK("%s", dev->otherend);
48106 +
48107 +       /* Supply the information about the device the frontend needs */
48108 +again:
48109 +       err = xenbus_transaction_start(&xbt);
48110 +
48111 +       if (err) {
48112 +               xenbus_dev_fatal(dev, err, "starting transaction");
48113 +               return;
48114 +       }
48115 +
48116 +       err = xenbus_printf(xbt, dev->nodename, "sectors", "%lu",
48117 +                           vbd_size(&be->blkif->vbd));
48118 +       if (err) {
48119 +               xenbus_dev_fatal(dev, err, "writing %s/sectors",
48120 +                                dev->nodename);
48121 +               goto abort;
48122 +       }
48123 +
48124 +       /* FIXME: use a typename instead */
48125 +       err = xenbus_printf(xbt, dev->nodename, "info", "%u",
48126 +                           vbd_info(&be->blkif->vbd));
48127 +       if (err) {
48128 +               xenbus_dev_fatal(dev, err, "writing %s/info",
48129 +                                dev->nodename);
48130 +               goto abort;
48131 +       }
48132 +       err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
48133 +                           vbd_secsize(&be->blkif->vbd));
48134 +       if (err) {
48135 +               xenbus_dev_fatal(dev, err, "writing %s/sector-size",
48136 +                                dev->nodename);
48137 +               goto abort;
48138 +       }
48139 +
48140 +       err = xenbus_transaction_end(xbt, 0);
48141 +       if (err == -EAGAIN)
48142 +               goto again;
48143 +       if (err)
48144 +               xenbus_dev_fatal(dev, err, "ending transaction");
48145 +
48146 +       err = xenbus_switch_state(dev, XenbusStateConnected);
48147 +       if (err)
48148 +               xenbus_dev_fatal(dev, err, "switching to Connected state",
48149 +                                dev->nodename);
48150 +
48151 +       return;
48152 + abort:
48153 +       xenbus_transaction_end(xbt, 1);
48154 +}
48155 +
48156 +
48157 +static int connect_ring(struct backend_info *be)
48158 +{
48159 +       struct xenbus_device *dev = be->dev;
48160 +       unsigned long ring_ref;
48161 +       unsigned int evtchn;
48162 +       int err;
48163 +
48164 +       DPRINTK("%s", dev->otherend);
48165 +
48166 +       err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
48167 +                           "event-channel", "%u", &evtchn, NULL);
48168 +       if (err) {
48169 +               xenbus_dev_fatal(dev, err,
48170 +                                "reading %s/ring-ref and event-channel",
48171 +                                dev->otherend);
48172 +               return err;
48173 +       }
48174 +
48175 +       /* Map the shared frame, irq etc. */
48176 +       err = blkif_map(be->blkif, ring_ref, evtchn);
48177 +       if (err) {
48178 +               xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
48179 +                                ring_ref, evtchn);
48180 +               return err;
48181 +       }
48182 +
48183 +       return 0;
48184 +}
48185 +
48186 +
48187 +/* ** Driver Registration ** */
48188 +
48189 +
48190 +static struct xenbus_device_id blkback_ids[] = {
48191 +       { "vbd" },
48192 +       { "" }
48193 +};
48194 +
48195 +
48196 +static struct xenbus_driver blkback = {
48197 +       .name = "vbd",
48198 +       .owner = THIS_MODULE,
48199 +       .ids = blkback_ids,
48200 +       .probe = blkback_probe,
48201 +       .remove = blkback_remove,
48202 +       .otherend_changed = frontend_changed
48203 +};
48204 +
48205 +
48206 +void blkif_xenbus_init(void)
48207 +{
48208 +       xenbus_register_backend(&blkback);
48209 +}
48210 diff -urNp linux-2.6/drivers/xen/blkfront/blkfront.c new/drivers/xen/blkfront/blkfront.c
48211 --- linux-2.6/drivers/xen/blkfront/blkfront.c   1970-01-01 01:00:00.000000000 +0100
48212 +++ new/drivers/xen/blkfront/blkfront.c 2006-06-28 14:32:14.000000000 +0200
48213 @@ -0,0 +1,841 @@
48214 +/******************************************************************************
48215 + * blkfront.c
48216 + * 
48217 + * XenLinux virtual block-device driver.
48218 + * 
48219 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
48220 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
48221 + * Copyright (c) 2004, Christian Limpach
48222 + * Copyright (c) 2004, Andrew Warfield
48223 + * Copyright (c) 2005, Christopher Clark
48224 + * Copyright (c) 2005, XenSource Ltd
48225 + * 
48226 + * This program is free software; you can redistribute it and/or
48227 + * modify it under the terms of the GNU General Public License version 2
48228 + * as published by the Free Software Foundation; or, when distributed
48229 + * separately from the Linux kernel or incorporated into other
48230 + * software packages, subject to the following license:
48231 + * 
48232 + * Permission is hereby granted, free of charge, to any person obtaining a copy
48233 + * of this source file (the "Software"), to deal in the Software without
48234 + * restriction, including without limitation the rights to use, copy, modify,
48235 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
48236 + * and to permit persons to whom the Software is furnished to do so, subject to
48237 + * the following conditions:
48238 + * 
48239 + * The above copyright notice and this permission notice shall be included in
48240 + * all copies or substantial portions of the Software.
48241 + * 
48242 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
48243 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
48244 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
48245 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
48246 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
48247 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
48248 + * IN THE SOFTWARE.
48249 + */
48250 +
48251 +#include <linux/version.h>
48252 +#include "block.h"
48253 +#include <linux/cdrom.h>
48254 +#include <linux/sched.h>
48255 +#include <linux/interrupt.h>
48256 +#include <scsi/scsi.h>
48257 +#include <xen/evtchn.h>
48258 +#include <xen/xenbus.h>
48259 +#include <xen/interface/grant_table.h>
48260 +#include <xen/gnttab.h>
48261 +#include <asm/hypervisor.h>
48262 +
48263 +#define BLKIF_STATE_DISCONNECTED 0
48264 +#define BLKIF_STATE_CONNECTED    1
48265 +#define BLKIF_STATE_SUSPENDED    2
48266 +
48267 +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
48268 +    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
48269 +#define GRANT_INVALID_REF      0
48270 +
48271 +static void connect(struct blkfront_info *);
48272 +static void blkfront_closing(struct xenbus_device *);
48273 +static int blkfront_remove(struct xenbus_device *);
48274 +static int talk_to_backend(struct xenbus_device *, struct blkfront_info *);
48275 +static int setup_blkring(struct xenbus_device *, struct blkfront_info *);
48276 +
48277 +static void kick_pending_request_queues(struct blkfront_info *);
48278 +
48279 +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs);
48280 +static void blkif_restart_queue(void *arg);
48281 +static void blkif_recover(struct blkfront_info *);
48282 +static void blkif_completion(struct blk_shadow *);
48283 +static void blkif_free(struct blkfront_info *, int);
48284 +
48285 +
48286 +/**
48287 + * Entry point to this code when a new device is created.  Allocate the basic
48288 + * structures and the ring buffer for communication with the backend, and
48289 + * inform the backend of the appropriate details for those.  Switch to
48290 + * Initialised state.
48291 + */
48292 +static int blkfront_probe(struct xenbus_device *dev,
48293 +                         const struct xenbus_device_id *id)
48294 +{
48295 +       int err, vdevice, i;
48296 +       struct blkfront_info *info;
48297 +
48298 +       /* FIXME: Use dynamic device id if this is not set. */
48299 +       err = xenbus_scanf(XBT_NIL, dev->nodename,
48300 +                          "virtual-device", "%i", &vdevice);
48301 +       if (err != 1) {
48302 +               xenbus_dev_fatal(dev, err, "reading virtual-device");
48303 +               return err;
48304 +       }
48305 +
48306 +       info = kzalloc(sizeof(*info), GFP_KERNEL);
48307 +       if (!info) {
48308 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
48309 +               return -ENOMEM;
48310 +       }
48311 +
48312 +       info->xbdev = dev;
48313 +       info->vdevice = vdevice;
48314 +       info->connected = BLKIF_STATE_DISCONNECTED;
48315 +       INIT_WORK(&info->work, blkif_restart_queue, (void *)info);
48316 +
48317 +       for (i = 0; i < BLK_RING_SIZE; i++)
48318 +               info->shadow[i].req.id = i+1;
48319 +       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
48320 +
48321 +       /* Front end dir is a number, which is used as the id. */
48322 +       info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
48323 +       dev->dev.driver_data = info;
48324 +
48325 +       err = talk_to_backend(dev, info);
48326 +       if (err) {
48327 +               kfree(info);
48328 +               dev->dev.driver_data = NULL;
48329 +               return err;
48330 +       }
48331 +
48332 +       return 0;
48333 +}
48334 +
48335 +
48336 +/**
48337 + * We are reconnecting to the backend, due to a suspend/resume, or a backend
48338 + * driver restart.  We tear down our blkif structure and recreate it, but
48339 + * leave the device-layer structures intact so that this is transparent to the
48340 + * rest of the kernel.
48341 + */
48342 +static int blkfront_resume(struct xenbus_device *dev)
48343 +{
48344 +       struct blkfront_info *info = dev->dev.driver_data;
48345 +       int err;
48346 +
48347 +       DPRINTK("blkfront_resume: %s\n", dev->nodename);
48348 +
48349 +       blkif_free(info, 1);
48350 +
48351 +       err = talk_to_backend(dev, info);
48352 +       if (!err)
48353 +               blkif_recover(info);
48354 +
48355 +       return err;
48356 +}
48357 +
48358 +
48359 +/* Common code used when first setting up, and when resuming. */
48360 +static int talk_to_backend(struct xenbus_device *dev,
48361 +                          struct blkfront_info *info)
48362 +{
48363 +       const char *message = NULL;
48364 +       struct xenbus_transaction xbt;
48365 +       int err;
48366 +
48367 +       /* Create shared ring, alloc event channel. */
48368 +       err = setup_blkring(dev, info);
48369 +       if (err)
48370 +               goto out;
48371 +
48372 +again:
48373 +       err = xenbus_transaction_start(&xbt);
48374 +       if (err) {
48375 +               xenbus_dev_fatal(dev, err, "starting transaction");
48376 +               goto destroy_blkring;
48377 +       }
48378 +
48379 +       err = xenbus_printf(xbt, dev->nodename,
48380 +                           "ring-ref","%u", info->ring_ref);
48381 +       if (err) {
48382 +               message = "writing ring-ref";
48383 +               goto abort_transaction;
48384 +       }
48385 +       err = xenbus_printf(xbt, dev->nodename,
48386 +                           "event-channel", "%u", info->evtchn);
48387 +       if (err) {
48388 +               message = "writing event-channel";
48389 +               goto abort_transaction;
48390 +       }
48391 +
48392 +       err = xenbus_transaction_end(xbt, 0);
48393 +       if (err) {
48394 +               if (err == -EAGAIN)
48395 +                       goto again;
48396 +               xenbus_dev_fatal(dev, err, "completing transaction");
48397 +               goto destroy_blkring;
48398 +       }
48399 +
48400 +       xenbus_switch_state(dev, XenbusStateInitialised);
48401 +
48402 +       return 0;
48403 +
48404 + abort_transaction:
48405 +       xenbus_transaction_end(xbt, 1);
48406 +       if (message)
48407 +               xenbus_dev_fatal(dev, err, "%s", message);
48408 + destroy_blkring:
48409 +       blkif_free(info, 0);
48410 + out:
48411 +       return err;
48412 +}
48413 +
48414 +
48415 +static int setup_blkring(struct xenbus_device *dev,
48416 +                        struct blkfront_info *info)
48417 +{
48418 +       blkif_sring_t *sring;
48419 +       int err;
48420 +
48421 +       info->ring_ref = GRANT_INVALID_REF;
48422 +
48423 +       sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
48424 +       if (!sring) {
48425 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
48426 +               return -ENOMEM;
48427 +       }
48428 +       SHARED_RING_INIT(sring);
48429 +       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
48430 +
48431 +       err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
48432 +       if (err < 0) {
48433 +               free_page((unsigned long)sring);
48434 +               info->ring.sring = NULL;
48435 +               goto fail;
48436 +       }
48437 +       info->ring_ref = err;
48438 +
48439 +       err = xenbus_alloc_evtchn(dev, &info->evtchn);
48440 +       if (err)
48441 +               goto fail;
48442 +
48443 +       err = bind_evtchn_to_irqhandler(
48444 +               info->evtchn, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
48445 +       if (err <= 0) {
48446 +               xenbus_dev_fatal(dev, err,
48447 +                                "bind_evtchn_to_irqhandler failed");
48448 +               goto fail;
48449 +       }
48450 +       info->irq = err;
48451 +
48452 +       return 0;
48453 +fail:
48454 +       blkif_free(info, 0);
48455 +       return err;
48456 +}
48457 +
48458 +
48459 +/**
48460 + * Callback received when the backend's state changes.
48461 + */
48462 +static void backend_changed(struct xenbus_device *dev,
48463 +                           enum xenbus_state backend_state)
48464 +{
48465 +       struct blkfront_info *info = dev->dev.driver_data;
48466 +       struct block_device *bd;
48467 +
48468 +       DPRINTK("blkfront:backend_changed.\n");
48469 +
48470 +       switch (backend_state) {
48471 +       case XenbusStateUnknown:
48472 +       case XenbusStateInitialising:
48473 +       case XenbusStateInitWait:
48474 +       case XenbusStateInitialised:
48475 +       case XenbusStateClosed:
48476 +               break;
48477 +
48478 +       case XenbusStateConnected:
48479 +               connect(info);
48480 +               break;
48481 +
48482 +       case XenbusStateClosing:
48483 +               bd = bdget(info->dev);
48484 +               if (bd == NULL)
48485 +                       xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
48486 +
48487 +               mutex_lock(&bd->bd_mutex);
48488 +               if (info->users > 0)
48489 +                       xenbus_dev_error(dev, -EBUSY,
48490 +                                        "Device in use; refusing to close");
48491 +               else
48492 +                       blkfront_closing(dev);
48493 +               mutex_unlock(&bd->bd_mutex);
48494 +               bdput(bd);
48495 +               break;
48496 +       }
48497 +}
48498 +
48499 +
48500 +/* ** Connection ** */
48501 +
48502 +
48503 +/*
48504 + * Invoked when the backend is finally 'ready' (and has produced
48505 + * the details about the physical device - #sectors, size, etc).
48506 + */
48507 +static void connect(struct blkfront_info *info)
48508 +{
48509 +       unsigned long sectors, sector_size;
48510 +       unsigned int binfo;
48511 +       int err;
48512 +
48513 +       if ((info->connected == BLKIF_STATE_CONNECTED) ||
48514 +           (info->connected == BLKIF_STATE_SUSPENDED) )
48515 +               return;
48516 +
48517 +       DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
48518 +
48519 +       err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
48520 +                           "sectors", "%lu", &sectors,
48521 +                           "info", "%u", &binfo,
48522 +                           "sector-size", "%lu", &sector_size,
48523 +                           NULL);
48524 +       if (err) {
48525 +               xenbus_dev_fatal(info->xbdev, err,
48526 +                                "reading backend fields at %s",
48527 +                                info->xbdev->otherend);
48528 +               return;
48529 +       }
48530 +
48531 +       err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
48532 +       if (err) {
48533 +               xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
48534 +                                info->xbdev->otherend);
48535 +               return;
48536 +       }
48537 +
48538 +       (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
48539 +
48540 +       /* Kick pending requests. */
48541 +       spin_lock_irq(&blkif_io_lock);
48542 +       info->connected = BLKIF_STATE_CONNECTED;
48543 +       kick_pending_request_queues(info);
48544 +       spin_unlock_irq(&blkif_io_lock);
48545 +
48546 +       add_disk(info->gd);
48547 +}
48548 +
48549 +/**
48550 + * Handle the change of state of the backend to Closing.  We must delete our
48551 + * device-layer structures now, to ensure that writes are flushed through to
48552 + * the backend.  Once this is done, we can switch to Closed in
48553 + * acknowledgement.
48554 + */
48555 +static void blkfront_closing(struct xenbus_device *dev)
48556 +{
48557 +       struct blkfront_info *info = dev->dev.driver_data;
48558 +       unsigned long flags;
48559 +
48560 +       DPRINTK("blkfront_closing: %s removed\n", dev->nodename);
48561 +
48562 +       if (info->rq == NULL)
48563 +               return;
48564 +
48565 +       spin_lock_irqsave(&blkif_io_lock, flags);
48566 +       /* No more blkif_request(). */
48567 +       blk_stop_queue(info->rq);
48568 +       /* No more gnttab callback work. */
48569 +       gnttab_cancel_free_callback(&info->callback);
48570 +       flush_scheduled_work();
48571 +       spin_unlock_irqrestore(&blkif_io_lock, flags);
48572 +
48573 +       xlvbd_del(info);
48574 +
48575 +       xenbus_switch_state(dev, XenbusStateClosed);
48576 +}
48577 +
48578 +
48579 +static int blkfront_remove(struct xenbus_device *dev)
48580 +{
48581 +       struct blkfront_info *info = dev->dev.driver_data;
48582 +
48583 +       DPRINTK("blkfront_remove: %s removed\n", dev->nodename);
48584 +
48585 +       blkif_free(info, 0);
48586 +
48587 +       kfree(info);
48588 +
48589 +       return 0;
48590 +}
48591 +
48592 +
48593 +static inline int GET_ID_FROM_FREELIST(
48594 +       struct blkfront_info *info)
48595 +{
48596 +       unsigned long free = info->shadow_free;
48597 +       BUG_ON(free > BLK_RING_SIZE);
48598 +       info->shadow_free = info->shadow[free].req.id;
48599 +       info->shadow[free].req.id = 0x0fffffee; /* debug */
48600 +       return free;
48601 +}
48602 +
48603 +static inline void ADD_ID_TO_FREELIST(
48604 +       struct blkfront_info *info, unsigned long id)
48605 +{
48606 +       info->shadow[id].req.id  = info->shadow_free;
48607 +       info->shadow[id].request = 0;
48608 +       info->shadow_free = id;
48609 +}
48610 +
48611 +static inline void flush_requests(struct blkfront_info *info)
48612 +{
48613 +       int notify;
48614 +
48615 +       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
48616 +
48617 +       if (notify)
48618 +               notify_remote_via_irq(info->irq);
48619 +}
48620 +
48621 +static void kick_pending_request_queues(struct blkfront_info *info)
48622 +{
48623 +       if (!RING_FULL(&info->ring)) {
48624 +               /* Re-enable calldowns. */
48625 +               blk_start_queue(info->rq);
48626 +               /* Kick things off immediately. */
48627 +               do_blkif_request(info->rq);
48628 +       }
48629 +}
48630 +
48631 +static void blkif_restart_queue(void *arg)
48632 +{
48633 +       struct blkfront_info *info = (struct blkfront_info *)arg;
48634 +       spin_lock_irq(&blkif_io_lock);
48635 +       if (info->connected == BLKIF_STATE_CONNECTED)
48636 +               kick_pending_request_queues(info);
48637 +       spin_unlock_irq(&blkif_io_lock);
48638 +}
48639 +
48640 +static void blkif_restart_queue_callback(void *arg)
48641 +{
48642 +       struct blkfront_info *info = (struct blkfront_info *)arg;
48643 +       schedule_work(&info->work);
48644 +}
48645 +
48646 +int blkif_open(struct inode *inode, struct file *filep)
48647 +{
48648 +       struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
48649 +       info->users++;
48650 +       return 0;
48651 +}
48652 +
48653 +
48654 +int blkif_release(struct inode *inode, struct file *filep)
48655 +{
48656 +       struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
48657 +       info->users--;
48658 +       if (info->users == 0) {
48659 +               /* Check whether we have been instructed to close.  We will
48660 +                  have ignored this request initially, as the device was
48661 +                  still mounted. */
48662 +               struct xenbus_device * dev = info->xbdev;
48663 +               enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
48664 +
48665 +               if (state == XenbusStateClosing)
48666 +                       blkfront_closing(dev);
48667 +       }
48668 +       return 0;
48669 +}
48670 +
48671 +
48672 +int blkif_ioctl(struct inode *inode, struct file *filep,
48673 +               unsigned command, unsigned long argument)
48674 +{
48675 +       int i;
48676 +
48677 +       DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
48678 +                     command, (long)argument, inode->i_rdev);
48679 +
48680 +       switch (command) {
48681 +       case CDROMMULTISESSION:
48682 +               DPRINTK("FIXME: support multisession CDs later\n");
48683 +               for (i = 0; i < sizeof(struct cdrom_multisession); i++)
48684 +                       if (put_user(0, (char __user *)(argument + i)))
48685 +                               return -EFAULT;
48686 +               return 0;
48687 +
48688 +       default:
48689 +               /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
48690 +                 command);*/
48691 +               return -EINVAL; /* same return as native Linux */
48692 +       }
48693 +
48694 +       return 0;
48695 +}
48696 +
48697 +
48698 +int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
48699 +{
48700 +       /* We don't have real geometry info, but let's at least return
48701 +          values consistent with the size of the device */
48702 +       sector_t nsect = get_capacity(bd->bd_disk);
48703 +       sector_t cylinders = nsect;
48704 +
48705 +       hg->heads = 0xff;
48706 +       hg->sectors = 0x3f;
48707 +       sector_div(cylinders, hg->heads * hg->sectors);
48708 +       hg->cylinders = cylinders;
48709 +       if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
48710 +               hg->cylinders = 0xffff;
48711 +       return 0;
48712 +}
48713 +
48714 +
48715 +/*
48716 + * blkif_queue_request
48717 + *
48718 + * request block io
48719 + *
48720 + * id: for guest use only.
48721 + * operation: BLKIF_OP_{READ,WRITE,PROBE}
48722 + * buffer: buffer to read/write into. this should be a
48723 + *   virtual address in the guest os.
48724 + */
48725 +static int blkif_queue_request(struct request *req)
48726 +{
48727 +       struct blkfront_info *info = req->rq_disk->private_data;
48728 +       unsigned long buffer_mfn;
48729 +       blkif_request_t *ring_req;
48730 +       struct bio *bio;
48731 +       struct bio_vec *bvec;
48732 +       int idx;
48733 +       unsigned long id;
48734 +       unsigned int fsect, lsect;
48735 +       int ref;
48736 +       grant_ref_t gref_head;
48737 +
48738 +       if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
48739 +               return 1;
48740 +
48741 +       if (gnttab_alloc_grant_references(
48742 +               BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
48743 +               gnttab_request_free_callback(
48744 +                       &info->callback,
48745 +                       blkif_restart_queue_callback,
48746 +                       info,
48747 +                       BLKIF_MAX_SEGMENTS_PER_REQUEST);
48748 +               return 1;
48749 +       }
48750 +
48751 +       /* Fill out a communications ring structure. */
48752 +       ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
48753 +       id = GET_ID_FROM_FREELIST(info);
48754 +       info->shadow[id].request = (unsigned long)req;
48755 +
48756 +       ring_req->id = id;
48757 +       ring_req->operation = rq_data_dir(req) ?
48758 +               BLKIF_OP_WRITE : BLKIF_OP_READ;
48759 +       ring_req->sector_number = (blkif_sector_t)req->sector;
48760 +       ring_req->handle = info->handle;
48761 +
48762 +       ring_req->nr_segments = 0;
48763 +       rq_for_each_bio (bio, req) {
48764 +               bio_for_each_segment (bvec, bio, idx) {
48765 +                       BUG_ON(ring_req->nr_segments
48766 +                              == BLKIF_MAX_SEGMENTS_PER_REQUEST);
48767 +                       buffer_mfn = page_to_phys(bvec->bv_page) >> PAGE_SHIFT;
48768 +                       fsect = bvec->bv_offset >> 9;
48769 +                       lsect = fsect + (bvec->bv_len >> 9) - 1;
48770 +                       /* install a grant reference. */
48771 +                       ref = gnttab_claim_grant_reference(&gref_head);
48772 +                       BUG_ON(ref == -ENOSPC);
48773 +
48774 +                       gnttab_grant_foreign_access_ref(
48775 +                               ref,
48776 +                               info->xbdev->otherend_id,
48777 +                               buffer_mfn,
48778 +                               rq_data_dir(req) );
48779 +
48780 +                       info->shadow[id].frame[ring_req->nr_segments] =
48781 +                               mfn_to_pfn(buffer_mfn);
48782 +
48783 +                       ring_req->seg[ring_req->nr_segments] =
48784 +                               (struct blkif_request_segment) {
48785 +                                       .gref       = ref,
48786 +                                       .first_sect = fsect,
48787 +                                       .last_sect  = lsect };
48788 +
48789 +                       ring_req->nr_segments++;
48790 +               }
48791 +       }
48792 +
48793 +       info->ring.req_prod_pvt++;
48794 +
48795 +       /* Keep a private copy so we can reissue requests when recovering. */
48796 +       info->shadow[id].req = *ring_req;
48797 +
48798 +       gnttab_free_grant_references(gref_head);
48799 +
48800 +       return 0;
48801 +}
48802 +
48803 +/*
48804 + * do_blkif_request
48805 + *  read or write blocks; requests are taken from the request queue
48806 + */
48807 +void do_blkif_request(request_queue_t *rq)
48808 +{
48809 +       struct blkfront_info *info = NULL;
48810 +       struct request *req;
48811 +       int queued;
48812 +
48813 +       DPRINTK("Entered do_blkif_request\n");
48814 +
48815 +       queued = 0;
48816 +
48817 +       while ((req = elv_next_request(rq)) != NULL) {
48818 +               info = req->rq_disk->private_data;
48819 +               if (!blk_fs_request(req)) {
48820 +                       end_request(req, 0);
48821 +                       continue;
48822 +               }
48823 +
48824 +               if (RING_FULL(&info->ring))
48825 +                       goto wait;
48826 +
48827 +               DPRINTK("do_blk_req %p: cmd %p, sec %lx, "
48828 +                       "(%u/%li) buffer:%p [%s]\n",
48829 +                       req, req->cmd, req->sector, req->current_nr_sectors,
48830 +                       req->nr_sectors, req->buffer,
48831 +                       rq_data_dir(req) ? "write" : "read");
48832 +
48833 +
48834 +               blkdev_dequeue_request(req);
48835 +               if (blkif_queue_request(req)) {
48836 +                       blk_requeue_request(rq, req);
48837 +               wait:
48838 +                       /* Avoid pointless unplugs. */
48839 +                       blk_stop_queue(rq);
48840 +                       break;
48841 +               }
48842 +
48843 +               queued++;
48844 +       }
48845 +
48846 +       if (queued != 0)
48847 +               flush_requests(info);
48848 +}
48849 +
48850 +
48851 +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
48852 +{
48853 +       struct request *req;
48854 +       blkif_response_t *bret;
48855 +       RING_IDX i, rp;
48856 +       unsigned long flags;
48857 +       struct blkfront_info *info = (struct blkfront_info *)dev_id;
48858 +
48859 +       spin_lock_irqsave(&blkif_io_lock, flags);
48860 +
48861 +       if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
48862 +               spin_unlock_irqrestore(&blkif_io_lock, flags);
48863 +               return IRQ_HANDLED;
48864 +       }
48865 +
48866 + again:
48867 +       rp = info->ring.sring->rsp_prod;
48868 +       rmb(); /* Ensure we see queued responses up to 'rp'. */
48869 +
48870 +       for (i = info->ring.rsp_cons; i != rp; i++) {
48871 +               unsigned long id;
48872 +               int ret;
48873 +
48874 +               bret = RING_GET_RESPONSE(&info->ring, i);
48875 +               id   = bret->id;
48876 +               req  = (struct request *)info->shadow[id].request;
48877 +
48878 +               blkif_completion(&info->shadow[id]);
48879 +
48880 +               ADD_ID_TO_FREELIST(info, id);
48881 +
48882 +               switch (bret->operation) {
48883 +               case BLKIF_OP_READ:
48884 +               case BLKIF_OP_WRITE:
48885 +                       if (unlikely(bret->status != BLKIF_RSP_OKAY))
48886 +                               DPRINTK("Bad return from blkdev data "
48887 +                                       "request: %x\n", bret->status);
48888 +
48889 +                       ret = end_that_request_first(
48890 +                               req, (bret->status == BLKIF_RSP_OKAY),
48891 +                               req->hard_nr_sectors);
48892 +                       BUG_ON(ret);
48893 +                       end_that_request_last(
48894 +                               req, (bret->status == BLKIF_RSP_OKAY));
48895 +                       break;
48896 +               default:
48897 +                       BUG();
48898 +               }
48899 +       }
48900 +
48901 +       info->ring.rsp_cons = i;
48902 +
48903 +       if (i != info->ring.req_prod_pvt) {
48904 +               int more_to_do;
48905 +               RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
48906 +               if (more_to_do)
48907 +                       goto again;
48908 +       } else
48909 +               info->ring.sring->rsp_event = i + 1;
48910 +
48911 +       kick_pending_request_queues(info);
48912 +
48913 +       spin_unlock_irqrestore(&blkif_io_lock, flags);
48914 +
48915 +       return IRQ_HANDLED;
48916 +}
48917 +
48918 +static void blkif_free(struct blkfront_info *info, int suspend)
48919 +{
48920 +       /* Prevent new requests being issued until we fix things up. */
48921 +       spin_lock_irq(&blkif_io_lock);
48922 +       info->connected = suspend ?
48923 +               BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
48924 +       /* No more blkif_request(). */
48925 +       if (info->rq)
48926 +               blk_stop_queue(info->rq);
48927 +       /* No more gnttab callback work. */
48928 +       gnttab_cancel_free_callback(&info->callback);
48929 +       spin_unlock_irq(&blkif_io_lock);
48930 +       flush_scheduled_work();         /* may sleep; call with no locks held */
48931 +
48932 +       /* Free resources associated with old device channel. */
48933 +       if (info->ring_ref != GRANT_INVALID_REF) {
48934 +               gnttab_end_foreign_access(info->ring_ref, 0,
48935 +                                         (unsigned long)info->ring.sring);
48936 +               info->ring_ref = GRANT_INVALID_REF;
48937 +               info->ring.sring = NULL;
48938 +       }
48939 +       if (info->irq)
48940 +               unbind_from_irqhandler(info->irq, info);
48941 +       info->evtchn = info->irq = 0;
48942 +
48943 +}
48944 +
48945 +static void blkif_completion(struct blk_shadow *s)
48946 +{
48947 +       int i;
48948 +       for (i = 0; i < s->req.nr_segments; i++)
48949 +               gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
48950 +}
48951 +
48952 +static void blkif_recover(struct blkfront_info *info)
48953 +{
48954 +       int i;
48955 +       blkif_request_t *req;
48956 +       struct blk_shadow *copy;
48957 +       int j;
48958 +
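+       /*
+        * After suspend/resume the old ring and its grant references are
+        * gone, so every request still recorded in the shadow array must be
+        * replayed onto the fresh ring with a new id and re-granted pages.
+        */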
48959 +       /* Stage 1: Make a safe copy of the shadow state. */
48960 +       copy = kmalloc(sizeof(info->shadow), GFP_KERNEL | __GFP_NOFAIL);
48961 +       memcpy(copy, info->shadow, sizeof(info->shadow));
48962 +
48963 +       /* Stage 2: Set up free list. */
48964 +       memset(&info->shadow, 0, sizeof(info->shadow));
48965 +       for (i = 0; i < BLK_RING_SIZE; i++)
48966 +               info->shadow[i].req.id = i+1;
48967 +       info->shadow_free = info->ring.req_prod_pvt;
48968 +       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
48969 +
48970 +       /* Stage 3: Find pending requests and requeue them. */
48971 +       for (i = 0; i < BLK_RING_SIZE; i++) {
48972 +               /* Not in use? */
48973 +               if (copy[i].request == 0)
48974 +                       continue;
48975 +
48976 +               /* Grab a request slot and copy shadow state into it. */
48977 +               req = RING_GET_REQUEST(
48978 +                       &info->ring, info->ring.req_prod_pvt);
48979 +               *req = copy[i].req;
48980 +
48981 +               /* We get a new request id, and must reset the shadow state. */
48982 +               req->id = GET_ID_FROM_FREELIST(info);
48983 +               memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
48984 +
48985 +               /* Rewrite any grant references invalidated by susp/resume. */
48986 +               for (j = 0; j < req->nr_segments; j++)
48987 +                       gnttab_grant_foreign_access_ref(
48988 +                               req->seg[j].gref,
48989 +                               info->xbdev->otherend_id,
48990 +                               pfn_to_mfn(info->shadow[req->id].frame[j]),
48991 +                               rq_data_dir(
48992 +                                       (struct request *)
48993 +                                       info->shadow[req->id].request));
48994 +               info->shadow[req->id].req = *req;
48995 +
48996 +               info->ring.req_prod_pvt++;
48997 +       }
48998 +
48999 +       kfree(copy);
49000 +
49001 +       (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
49002 +
49003 +       spin_lock_irq(&blkif_io_lock);
49004 +
49005 +       /* Now safe for us to use the shared ring */
49006 +       info->connected = BLKIF_STATE_CONNECTED;
49007 +
49008 +       /* Send off requeued requests */
49009 +       flush_requests(info);
49010 +
49011 +       /* Kick any other new requests queued since we resumed */
49012 +       kick_pending_request_queues(info);
49013 +
49014 +       spin_unlock_irq(&blkif_io_lock);
49015 +}
49016 +
49017 +
49018 +/* ** Driver Registration ** */
49019 +
49020 +
49021 +static struct xenbus_device_id blkfront_ids[] = {
49022 +       { "vbd" },
49023 +       { "" }
49024 +};
49025 +
49026 +
49027 +static struct xenbus_driver blkfront = {
49028 +       .name = "vbd",
49029 +       .owner = THIS_MODULE,
49030 +       .ids = blkfront_ids,
49031 +       .probe = blkfront_probe,
49032 +       .remove = blkfront_remove,
49033 +       .resume = blkfront_resume,
49034 +       .otherend_changed = backend_changed,
49035 +};
49036 +
49037 +
49038 +static int __init xlblk_init(void)
49039 +{
49040 +       if (!is_running_on_xen())
49041 +               return -ENODEV;
49042 +
49043 +       return xenbus_register_frontend(&blkfront);
49044 +}
49045 +module_init(xlblk_init);
49046 +
49047 +
49048 +static void xlblk_exit(void)
49049 +{
49050 +       return xenbus_unregister_driver(&blkfront);
49051 +}
49052 +module_exit(xlblk_exit);
49053 +
49054 +MODULE_LICENSE("Dual BSD/GPL");
49055 diff -urNp linux-2.6/drivers/xen/blkfront/block.h new/drivers/xen/blkfront/block.h
49056 --- linux-2.6/drivers/xen/blkfront/block.h      1970-01-01 01:00:00.000000000 +0100
49057 +++ new/drivers/xen/blkfront/block.h    2006-06-28 14:32:14.000000000 +0200
49058 @@ -0,0 +1,156 @@
49059 +/******************************************************************************
49060 + * block.h
49061 + * 
49062 + * Shared definitions between all levels of XenLinux Virtual block devices.
49063 + * 
49064 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
49065 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
49066 + * Copyright (c) 2004-2005, Christian Limpach
49067 + * 
49068 + * This program is free software; you can redistribute it and/or
49069 + * modify it under the terms of the GNU General Public License version 2
49070 + * as published by the Free Software Foundation; or, when distributed
49071 + * separately from the Linux kernel or incorporated into other
49072 + * software packages, subject to the following license:
49073 + * 
49074 + * Permission is hereby granted, free of charge, to any person obtaining a copy
49075 + * of this source file (the "Software"), to deal in the Software without
49076 + * restriction, including without limitation the rights to use, copy, modify,
49077 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
49078 + * and to permit persons to whom the Software is furnished to do so, subject to
49079 + * the following conditions:
49080 + * 
49081 + * The above copyright notice and this permission notice shall be included in
49082 + * all copies or substantial portions of the Software.
49083 + * 
49084 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
49085 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
49086 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
49087 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
49088 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
49089 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
49090 + * IN THE SOFTWARE.
49091 + */
49092 +
49093 +#ifndef __XEN_DRIVERS_BLOCK_H__
49094 +#define __XEN_DRIVERS_BLOCK_H__
49095 +
49096 +#include <linux/config.h>
49097 +#include <linux/version.h>
49098 +#include <linux/module.h>
49099 +#include <linux/kernel.h>
49100 +#include <linux/sched.h>
49101 +#include <linux/slab.h>
49102 +#include <linux/string.h>
49103 +#include <linux/errno.h>
49104 +#include <linux/fs.h>
49105 +#include <linux/hdreg.h>
49106 +#include <linux/blkdev.h>
49107 +#include <linux/major.h>
49108 +#include <linux/devfs_fs_kernel.h>
49109 +#include <asm/hypervisor.h>
49110 +#include <xen/xenbus.h>
49111 +#include <xen/gnttab.h>
49112 +#include <xen/interface/xen.h>
49113 +#include <xen/interface/io/blkif.h>
49114 +#include <xen/interface/io/ring.h>
49115 +#include <asm/io.h>
49116 +#include <asm/atomic.h>
49117 +#include <asm/uaccess.h>
49118 +
49119 +#if 1
49120 +#define IPRINTK(fmt, args...)                          \
49121 +       printk(KERN_INFO "xen_blk: " fmt, ##args)
49122 +#else
49123 +#define IPRINTK(fmt, args...) ((void)0)
49124 +#endif
49125 +
49126 +#if 1
49127 +#define WPRINTK(fmt, args...)                          \
49128 +       printk(KERN_WARNING "xen_blk: " fmt, ##args)
49129 +#else
49130 +#define WPRINTK(fmt, args...) ((void)0)
49131 +#endif
49132 +
49133 +#define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
49134 +
49135 +#if 0
49136 +#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
49137 +#else
49138 +#define DPRINTK_IOCTL(_f, _a...) ((void)0)
49139 +#endif
49140 +
49141 +struct xlbd_type_info
49142 +{
49143 +       int partn_shift;
49144 +       int disks_per_major;
49145 +       char *devname;
49146 +       char *diskname;
49147 +};
49148 +
49149 +struct xlbd_major_info
49150 +{
49151 +       int major;
49152 +       int index;
49153 +       int usage;
49154 +       struct xlbd_type_info *type;
49155 +};
49156 +
49157 +struct blk_shadow {
49158 +       blkif_request_t req;
49159 +       unsigned long request;
49160 +       unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
49161 +};
49162 +
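+/* Number of ring entries (rounded down to a power of two) that fit in a
+   single shared page; this also sizes the shadow array below. */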
49163 +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
49164 +
49165 +/*
49166 + * We have one of these per vbd, whether ide, scsi or 'other'.  They
49167 + * hang in private_data off the gendisk structure. We may end up
49168 + * putting all kinds of interesting stuff here :-)
49169 + */
49170 +struct blkfront_info
49171 +{
49172 +       struct xenbus_device *xbdev;
49173 +       dev_t dev;
49174 +       struct gendisk *gd;
49175 +       int vdevice;
49176 +       blkif_vdev_t handle;
49177 +       int connected;
49178 +       int ring_ref;
49179 +       blkif_front_ring_t ring;
49180 +       unsigned int evtchn, irq;
49181 +       struct xlbd_major_info *mi;
49182 +       request_queue_t *rq;
49183 +       struct work_struct work;
49184 +       struct gnttab_free_callback callback;
49185 +       struct blk_shadow shadow[BLK_RING_SIZE];
49186 +       unsigned long shadow_free;
49187 +
49188 +       /**
49189 +        * The number of people holding this device open.  We won't allow a
49190 +        * hot-unplug unless this is 0.
49191 +        */
49192 +       int users;
49193 +};
49194 +
49195 +extern spinlock_t blkif_io_lock;
49196 +
49197 +extern int blkif_open(struct inode *inode, struct file *filep);
49198 +extern int blkif_release(struct inode *inode, struct file *filep);
49199 +extern int blkif_ioctl(struct inode *inode, struct file *filep,
49200 +                      unsigned command, unsigned long argument);
49201 +extern int blkif_getgeo(struct block_device *, struct hd_geometry *);
49202 +extern int blkif_check(dev_t dev);
49203 +extern int blkif_revalidate(dev_t dev);
49204 +extern void do_blkif_request (request_queue_t *rq);
49205 +
49206 +/* Virtual block-device subsystem. */
49207 +/* Note that xlvbd_add doesn't call add_disk for you: you're expected
49208 +   to call add_disk on info->gd once the disk is properly connected
49209 +   up. */
49210 +int xlvbd_add(blkif_sector_t capacity, int device,
49211 +             u16 vdisk_info, u16 sector_size, struct blkfront_info *info);
49212 +void xlvbd_del(struct blkfront_info *info);
49213 +
49214 +#endif /* __XEN_DRIVERS_BLOCK_H__ */
49215 diff -urNp linux-2.6/drivers/xen/blkfront/Kconfig new/drivers/xen/blkfront/Kconfig
49216 --- linux-2.6/drivers/xen/blkfront/Kconfig      1970-01-01 01:00:00.000000000 +0100
49217 +++ new/drivers/xen/blkfront/Kconfig    2006-05-09 12:34:37.000000000 +0200
49218 @@ -0,0 +1,6 @@
49219 +
49220 +config XENBLOCK
49221 +       tristate "Block device driver"
49222 +       depends on ARCH_XEN
49223 +       help
49224 +         Block device driver for Xen
49225 diff -urNp linux-2.6/drivers/xen/blkfront/Makefile new/drivers/xen/blkfront/Makefile
49226 --- linux-2.6/drivers/xen/blkfront/Makefile     1970-01-01 01:00:00.000000000 +0100
49227 +++ new/drivers/xen/blkfront/Makefile   2006-05-09 12:34:37.000000000 +0200
49228 @@ -0,0 +1,5 @@
49229 +
49230 +obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      := xenblk.o
49231 +
49232 +xenblk-objs := blkfront.o vbd.o
49233 +
49234 diff -urNp linux-2.6/drivers/xen/blkfront/vbd.c new/drivers/xen/blkfront/vbd.c
49235 --- linux-2.6/drivers/xen/blkfront/vbd.c        1970-01-01 01:00:00.000000000 +0100
49236 +++ new/drivers/xen/blkfront/vbd.c      2006-06-07 13:29:36.000000000 +0200
49237 @@ -0,0 +1,318 @@
49238 +/******************************************************************************
49239 + * vbd.c
49240 + * 
49241 + * XenLinux virtual block-device driver (xvd).
49242 + * 
49243 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
49244 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
49245 + * Copyright (c) 2004-2005, Christian Limpach
49246 + * 
49247 + * This program is free software; you can redistribute it and/or
49248 + * modify it under the terms of the GNU General Public License version 2
49249 + * as published by the Free Software Foundation; or, when distributed
49250 + * separately from the Linux kernel or incorporated into other
49251 + * software packages, subject to the following license:
49252 + * 
49253 + * Permission is hereby granted, free of charge, to any person obtaining a copy
49254 + * of this source file (the "Software"), to deal in the Software without
49255 + * restriction, including without limitation the rights to use, copy, modify,
49256 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
49257 + * and to permit persons to whom the Software is furnished to do so, subject to
49258 + * the following conditions:
49259 + * 
49260 + * The above copyright notice and this permission notice shall be included in
49261 + * all copies or substantial portions of the Software.
49262 + * 
49263 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
49264 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
49265 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
49266 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
49267 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
49268 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
49269 + * IN THE SOFTWARE.
49270 + */
49271 +
49272 +#include "block.h"
49273 +#include <linux/blkdev.h>
49274 +#include <linux/list.h>
49275 +
49276 +#define BLKIF_MAJOR(dev) ((dev)>>8)
49277 +#define BLKIF_MINOR(dev) ((dev) & 0xff)
49278 +
49279 +/*
49280 + * For convenience we distinguish between ide, scsi and 'other' (i.e.,
49281 + * potentially combinations of the two) in the naming scheme and in a few other
49282 + * places.
49283 + */
49284 +
49285 +#define NUM_IDE_MAJORS 10
49286 +#define NUM_SCSI_MAJORS 9
49287 +#define NUM_VBD_MAJORS 1
49288 +
49289 +static struct xlbd_type_info xlbd_ide_type = {
49290 +       .partn_shift = 6,
49291 +       .disks_per_major = 2,
49292 +       .devname = "ide",
49293 +       .diskname = "hd",
49294 +};
49295 +
49296 +static struct xlbd_type_info xlbd_scsi_type = {
49297 +       .partn_shift = 4,
49298 +       .disks_per_major = 16,
49299 +       .devname = "sd",
49300 +       .diskname = "sd",
49301 +};
49302 +
49303 +static struct xlbd_type_info xlbd_vbd_type = {
49304 +       .partn_shift = 4,
49305 +       .disks_per_major = 16,
49306 +       .devname = "xvd",
49307 +       .diskname = "xvd",
49308 +};
49309 +
49310 +static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS +
49311 +                                        NUM_VBD_MAJORS];
49312 +
49313 +#define XLBD_MAJOR_IDE_START   0
49314 +#define XLBD_MAJOR_SCSI_START  (NUM_IDE_MAJORS)
49315 +#define XLBD_MAJOR_VBD_START   (NUM_IDE_MAJORS + NUM_SCSI_MAJORS)
49316 +
49317 +#define XLBD_MAJOR_IDE_RANGE   XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SCSI_START - 1
49318 +#define XLBD_MAJOR_SCSI_RANGE  XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1
49319 +#define XLBD_MAJOR_VBD_RANGE   XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1
49320 +
49321 +/* Information about our VBDs. */
49322 +#define MAX_VBDS 64
49323 +static LIST_HEAD(vbds_list);
49324 +
49325 +static struct block_device_operations xlvbd_block_fops =
49326 +{
49327 +       .owner = THIS_MODULE,
49328 +       .open = blkif_open,
49329 +       .release = blkif_release,
49330 +       .ioctl  = blkif_ioctl,
49331 +       .getgeo = blkif_getgeo
49332 +};
49333 +
49334 +DEFINE_SPINLOCK(blkif_io_lock);
49335 +
49336 +static struct xlbd_major_info *
49337 +xlbd_alloc_major_info(int major, int minor, int index)
49338 +{
49339 +       struct xlbd_major_info *ptr;
49340 +
49341 +       ptr = kzalloc(sizeof(struct xlbd_major_info), GFP_KERNEL);
49342 +       if (ptr == NULL)
49343 +               return NULL;
49344 +
49345 +       ptr->major = major;
49346 +
49347 +       switch (index) {
49348 +       case XLBD_MAJOR_IDE_RANGE:
49349 +               ptr->type = &xlbd_ide_type;
49350 +               ptr->index = index - XLBD_MAJOR_IDE_START;
49351 +               break;
49352 +       case XLBD_MAJOR_SCSI_RANGE:
49353 +               ptr->type = &xlbd_scsi_type;
49354 +               ptr->index = index - XLBD_MAJOR_SCSI_START;
49355 +               break;
49356 +       case XLBD_MAJOR_VBD_RANGE:
49357 +               ptr->type = &xlbd_vbd_type;
49358 +               ptr->index = index - XLBD_MAJOR_VBD_START;
49359 +               break;
49360 +       }
49361 +
49362 +       printk("Registering block device major %i\n", ptr->major);
49363 +       if (register_blkdev(ptr->major, ptr->type->devname)) {
49364 +               WPRINTK("can't get major %d with name %s\n",
49365 +                       ptr->major, ptr->type->devname);
49366 +               kfree(ptr);
49367 +               return NULL;
49368 +       }
49369 +
49370 +       devfs_mk_dir(ptr->type->devname);
49371 +       major_info[index] = ptr;
49372 +       return ptr;
49373 +}
49374 +
49375 +static struct xlbd_major_info *
49376 +xlbd_get_major_info(int vdevice)
49377 +{
49378 +       struct xlbd_major_info *mi;
49379 +       int major, minor, index;
49380 +
49381 +       major = BLKIF_MAJOR(vdevice);
49382 +       minor = BLKIF_MINOR(vdevice);
49383 +
49384 +       switch (major) {
49385 +       case IDE0_MAJOR: index = 0; break;
49386 +       case IDE1_MAJOR: index = 1; break;
49387 +       case IDE2_MAJOR: index = 2; break;
49388 +       case IDE3_MAJOR: index = 3; break;
49389 +       case IDE4_MAJOR: index = 4; break;
49390 +       case IDE5_MAJOR: index = 5; break;
49391 +       case IDE6_MAJOR: index = 6; break;
49392 +       case IDE7_MAJOR: index = 7; break;
49393 +       case IDE8_MAJOR: index = 8; break;
49394 +       case IDE9_MAJOR: index = 9; break;
49395 +       case SCSI_DISK0_MAJOR: index = 10; break;
49396 +       case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
49397 +               index = 11 + major - SCSI_DISK1_MAJOR;
49398 +               break;
49399 +       case SCSI_CDROM_MAJOR: index = 18; break;
49400 +       default: index = 19; break;
49401 +       }
49402 +
49403 +       mi = ((major_info[index] != NULL) ? major_info[index] :
49404 +             xlbd_alloc_major_info(major, minor, index));
49405 +       if (mi)
49406 +               mi->usage++;
49407 +       return mi;
49408 +}
49409 +
49410 +static void
49411 +xlbd_put_major_info(struct xlbd_major_info *mi)
49412 +{
49413 +       mi->usage--;
49414 +       /* XXX: release major if 0 */
49415 +}
49416 +
49417 +static int
49418 +xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
49419 +{
49420 +       request_queue_t *rq;
49421 +
49422 +       rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
49423 +       if (rq == NULL)
49424 +               return -1;
49425 +
49426 +       elevator_init(rq, "noop");
49427 +
49428 +       /* Hard sector size and max sectors impersonate the equiv. hardware. */
49429 +       blk_queue_hardsect_size(rq, sector_size);
49430 +       blk_queue_max_sectors(rq, 512);
49431 +
49432 +       /* Each segment in a request is up to an aligned page in size. */
49433 +       blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
49434 +       blk_queue_max_segment_size(rq, PAGE_SIZE);
49435 +
49436 +       /* Ensure a merged request will fit in a single I/O ring slot. */
49437 +       blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
49438 +       blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
49439 +
49440 +       /* Make sure buffer addresses are sector-aligned. */
49441 +       blk_queue_dma_alignment(rq, 511);
49442 +
49443 +       gd->queue = rq;
49444 +
49445 +       return 0;
49446 +}
49447 +
49448 +static int
49449 +xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice,
49450 +                   u16 vdisk_info, u16 sector_size,
49451 +                   struct blkfront_info *info)
49452 +{
49453 +       struct gendisk *gd;
49454 +       struct xlbd_major_info *mi;
49455 +       int nr_minors = 1;
49456 +       int err = -ENODEV;
49457 +
49458 +       BUG_ON(info->gd != NULL);
49459 +       BUG_ON(info->mi != NULL);
49460 +       BUG_ON(info->rq != NULL);
49461 +
49462 +       mi = xlbd_get_major_info(vdevice);
49463 +       if (mi == NULL)
49464 +               goto out;
49465 +       info->mi = mi;
49466 +
49467 +       if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0)
49468 +               nr_minors = 1 << mi->type->partn_shift;
49469 +
49470 +       gd = alloc_disk(nr_minors);
49471 +       if (gd == NULL)
49472 +               goto out;
49473 +
49474 +       if (nr_minors > 1)
49475 +               sprintf(gd->disk_name, "%s%c", mi->type->diskname,
49476 +                       'a' + mi->index * mi->type->disks_per_major +
49477 +                       (minor >> mi->type->partn_shift));
49478 +       else
49479 +               sprintf(gd->disk_name, "%s%c%d", mi->type->diskname,
49480 +                       'a' + mi->index * mi->type->disks_per_major +
49481 +                       (minor >> mi->type->partn_shift),
49482 +                       minor & ((1 << mi->type->partn_shift) - 1));
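+       /*
+        * The names above follow the native conventions: e.g. for the IDE
+        * type (partn_shift 6, disks_per_major 2) a vdevice on IDE0_MAJOR
+        * with minor 64 is the whole disk "hdb", while minor 65 is the
+        * partition "hdb1".
+        */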
49483 +
49484 +       gd->major = mi->major;
49485 +       gd->first_minor = minor;
49486 +       gd->fops = &xlvbd_block_fops;
49487 +       gd->private_data = info;
49488 +       gd->driverfs_dev = &(info->xbdev->dev);
49489 +       set_capacity(gd, capacity);
49490 +
49491 +       if (xlvbd_init_blk_queue(gd, sector_size)) {
49492 +               del_gendisk(gd);
49493 +               goto out;
49494 +       }
49495 +
49496 +       info->rq = gd->queue;
49497 +
49498 +       if (vdisk_info & VDISK_READONLY)
49499 +               set_disk_ro(gd, 1);
49500 +
49501 +       if (vdisk_info & VDISK_REMOVABLE)
49502 +               gd->flags |= GENHD_FL_REMOVABLE;
49503 +
49504 +       if (vdisk_info & VDISK_CDROM)
49505 +               gd->flags |= GENHD_FL_CD;
49506 +
49507 +       info->gd = gd;
49508 +
49509 +       return 0;
49510 +
49511 + out:
49512 +       if (mi)
49513 +               xlbd_put_major_info(mi);
49514 +       info->mi = NULL;
49515 +       return err;
49516 +}
49517 +
49518 +int
49519 +xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
49520 +         u16 sector_size, struct blkfront_info *info)
49521 +{
49522 +       struct block_device *bd;
49523 +       int err = 0;
49524 +
49525 +       info->dev = MKDEV(BLKIF_MAJOR(vdevice), BLKIF_MINOR(vdevice));
49526 +
49527 +       bd = bdget(info->dev);
49528 +       if (bd == NULL)
49529 +               return -ENODEV;
49530 +
49531 +       err = xlvbd_alloc_gendisk(BLKIF_MINOR(vdevice), capacity, vdevice,
49532 +                                 vdisk_info, sector_size, info);
49533 +
49534 +       bdput(bd);
49535 +       return err;
49536 +}
49537 +
49538 +void
49539 +xlvbd_del(struct blkfront_info *info)
49540 +{
49541 +       if (info->mi == NULL)
49542 +               return;
49543 +
49544 +       BUG_ON(info->gd == NULL);
49545 +       del_gendisk(info->gd);
49546 +       put_disk(info->gd);
49547 +       info->gd = NULL;
49548 +
49549 +       xlbd_put_major_info(info->mi);
49550 +       info->mi = NULL;
49551 +
49552 +       BUG_ON(info->rq == NULL);
49553 +       blk_cleanup_queue(info->rq);
49554 +       info->rq = NULL;
49555 +}
49556 diff -urNp linux-2.6/drivers/xen/char/Makefile new/drivers/xen/char/Makefile
49557 --- linux-2.6/drivers/xen/char/Makefile 1970-01-01 01:00:00.000000000 +0100
49558 +++ new/drivers/xen/char/Makefile       2006-05-09 12:34:37.000000000 +0200
49559 @@ -0,0 +1,2 @@
49560 +
49561 +obj-y  := mem.o
49562 diff -urNp linux-2.6/drivers/xen/char/mem.c new/drivers/xen/char/mem.c
49563 --- linux-2.6/drivers/xen/char/mem.c    1970-01-01 01:00:00.000000000 +0100
49564 +++ new/drivers/xen/char/mem.c  2006-06-28 14:32:14.000000000 +0200
49565 @@ -0,0 +1,182 @@
49566 +/*
49567 + *  Originally from linux/drivers/char/mem.c
49568 + *
49569 + *  Copyright (C) 1991, 1992  Linus Torvalds
49570 + *
49571 + *  Added devfs support. 
49572 + *    Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
49573 + *  Shared /dev/zero mmapping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
49574 + */
49575 +
49576 +#include <linux/config.h>
49577 +#include <linux/mm.h>
49578 +#include <linux/miscdevice.h>
49579 +#include <linux/slab.h>
49580 +#include <linux/vmalloc.h>
49581 +#include <linux/mman.h>
49582 +#include <linux/random.h>
49583 +#include <linux/init.h>
49584 +#include <linux/raw.h>
49585 +#include <linux/tty.h>
49586 +#include <linux/capability.h>
49587 +#include <linux/smp_lock.h>
49588 +#include <linux/devfs_fs_kernel.h>
49589 +#include <linux/ptrace.h>
49590 +#include <linux/device.h>
49591 +#include <asm/pgalloc.h>
49592 +#include <asm/uaccess.h>
49593 +#include <asm/io.h>
49594 +#include <asm/hypervisor.h>
49595 +
49596 +static inline int uncached_access(struct file *file)
49597 +{
49598 +       if (file->f_flags & O_SYNC)
49599 +               return 1;
49600 +       /* Xen sets correct MTRR type on non-RAM for us. */
49601 +       return 0;
49602 +}
49603 +
49604 +/*
49605 + * This function reads the *physical* memory. The f_pos points directly to the 
49606 + * memory location. 
49607 + */
49608 +static ssize_t read_mem(struct file * file, char __user * buf,
49609 +                       size_t count, loff_t *ppos)
49610 +{
49611 +       unsigned long p = *ppos, ignored;
49612 +       ssize_t read = 0, sz;
49613 +       void __iomem *v;
49614 +
49615 +       while (count > 0) {
49616 +               /*
49617 +                * Handle first page in case it's not aligned
49618 +                */
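+                /*
+                 * "-p & (PAGE_SIZE - 1)" is the distance from p to the next
+                 * page boundary (0 when p is already aligned, in which case
+                 * a full page is used); e.g. with 4 KiB pages p = 0x1234
+                 * gives 0xdcc.
+                 */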
49619 +               if (-p & (PAGE_SIZE - 1))
49620 +                       sz = -p & (PAGE_SIZE - 1);
49621 +               else
49622 +                       sz = PAGE_SIZE;
49623 +
49624 +               sz = min_t(unsigned long, sz, count);
49625 +
49626 +               if ((v = ioremap(p, sz)) == NULL) {
49627 +                       /*
49628 +                        * Some programs (e.g., dmidecode) groove off into weird RAM
49629 +                        * areas where no tables can possibly exist (because Xen will
49630 +                        * have stomped on them!). These programs get rather upset if
49631 +                        * we let them know that Xen failed their access, so we fake
49632 +                        * out a read of all zeroes. :-)
49633 +                        */
49634 +                       if (clear_user(buf, count))
49635 +                               return -EFAULT;
49636 +                       read += count;
49637 +                       break;
49638 +               }
49639 +
49640 +               ignored = copy_to_user(buf, v, sz);
49641 +               iounmap(v);
49642 +               if (ignored)
49643 +                       return -EFAULT;
49644 +               buf += sz;
49645 +               p += sz;
49646 +               count -= sz;
49647 +               read += sz;
49648 +       }
49649 +
49650 +       *ppos += read;
49651 +       return read;
49652 +}
49653 +
49654 +static ssize_t write_mem(struct file * file, const char __user * buf, 
49655 +                        size_t count, loff_t *ppos)
49656 +{
49657 +       unsigned long p = *ppos, ignored;
49658 +       ssize_t written = 0, sz;
49659 +       void __iomem *v;
49660 +
49661 +       while (count > 0) {
49662 +               /*
49663 +                * Handle first page in case it's not aligned
49664 +                */
49665 +               if (-p & (PAGE_SIZE - 1))
49666 +                       sz = -p & (PAGE_SIZE - 1);
49667 +               else
49668 +                       sz = PAGE_SIZE;
49669 +
49670 +               sz = min_t(unsigned long, sz, count);
49671 +
49672 +               if ((v = ioremap(p, sz)) == NULL)
49673 +                       break;
49674 +
49675 +               ignored = copy_from_user(v, buf, sz);
49676 +               iounmap(v);
49677 +               if (ignored) {
49678 +                       written += sz - ignored;
49679 +                       if (written)
49680 +                               break;
49681 +                       return -EFAULT;
49682 +               }
49683 +               buf += sz;
49684 +               p += sz;
49685 +               count -= sz;
49686 +               written += sz;
49687 +       }
49688 +
49689 +       *ppos += written;
49690 +       return written;
49691 +}
49692 +
49693 +static int mmap_mem(struct file * file, struct vm_area_struct * vma)
49694 +{
49695 +       size_t size = vma->vm_end - vma->vm_start;
49696 +
49697 +       if (uncached_access(file))
49698 +               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
49699 +
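+       /*
+        * DOMID_IO is Xen's pseudo-domain for machine I/O memory, so the
+        * mapping is made against machine addresses rather than another
+        * guest's memory.
+        */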
49700 +       /* We want to return the real error code, not EAGAIN. */
49701 +       return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
49702 +                                     size, vma->vm_page_prot, DOMID_IO);
49703 +}
49704 +
49705 +/*
49706 + * The memory devices use the full 32/64 bits of the offset, and so we cannot
49707 + * check against negative addresses: they are ok. The return value is weird,
49708 + * though, in that case (0).
49709 + *
49710 + * also note that seeking relative to the "end of file" isn't supported:
49711 + * it has no meaning, so it returns -EINVAL.
49712 + */
49713 +static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
49714 +{
49715 +       loff_t ret;
49716 +
49717 +       mutex_lock(&file->f_dentry->d_inode->i_mutex);
49718 +       switch (orig) {
49719 +               case 0:
49720 +                       file->f_pos = offset;
49721 +                       ret = file->f_pos;
49722 +                       force_successful_syscall_return();
49723 +                       break;
49724 +               case 1:
49725 +                       file->f_pos += offset;
49726 +                       ret = file->f_pos;
49727 +                       force_successful_syscall_return();
49728 +                       break;
49729 +               default:
49730 +                       ret = -EINVAL;
49731 +       }
49732 +       mutex_unlock(&file->f_dentry->d_inode->i_mutex);
49733 +       return ret;
49734 +}
49735 +
49736 +static int open_mem(struct inode * inode, struct file * filp)
49737 +{
49738 +       return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
49739 +}
49740 +
49741 +struct file_operations mem_fops = {
49742 +       .llseek         = memory_lseek,
49743 +       .read           = read_mem,
49744 +       .write          = write_mem,
49745 +       .mmap           = mmap_mem,
49746 +       .open           = open_mem,
49747 +};
49748 diff -urNp linux-2.6/drivers/xen/console/console.c new/drivers/xen/console/console.c
49749 --- linux-2.6/drivers/xen/console/console.c     1970-01-01 01:00:00.000000000 +0100
49750 +++ new/drivers/xen/console/console.c   2006-06-28 14:32:14.000000000 +0200
49751 @@ -0,0 +1,640 @@
49752 +/******************************************************************************
49753 + * console.c
49754 + * 
49755 + * Virtual console driver.
49756 + * 
49757 + * Copyright (c) 2002-2004, K A Fraser.
49758 + * 
49759 + * This program is free software; you can redistribute it and/or
49760 + * modify it under the terms of the GNU General Public License version 2
49761 + * as published by the Free Software Foundation; or, when distributed
49762 + * separately from the Linux kernel or incorporated into other
49763 + * software packages, subject to the following license:
49764 + * 
49765 + * Permission is hereby granted, free of charge, to any person obtaining a copy
49766 + * of this source file (the "Software"), to deal in the Software without
49767 + * restriction, including without limitation the rights to use, copy, modify,
49768 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
49769 + * and to permit persons to whom the Software is furnished to do so, subject to
49770 + * the following conditions:
49771 + * 
49772 + * The above copyright notice and this permission notice shall be included in
49773 + * all copies or substantial portions of the Software.
49774 + * 
49775 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
49776 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
49777 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
49778 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
49779 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
49780 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
49781 + * IN THE SOFTWARE.
49782 + */
49783 +
49784 +#include <linux/config.h>
49785 +#include <linux/version.h>
49786 +#include <linux/module.h>
49787 +#include <linux/errno.h>
49788 +#include <linux/signal.h>
49789 +#include <linux/sched.h>
49790 +#include <linux/interrupt.h>
49791 +#include <linux/tty.h>
49792 +#include <linux/tty_flip.h>
49793 +#include <linux/serial.h>
49794 +#include <linux/major.h>
49795 +#include <linux/ptrace.h>
49796 +#include <linux/ioport.h>
49797 +#include <linux/mm.h>
49798 +#include <linux/slab.h>
49799 +#include <linux/init.h>
49800 +#include <linux/console.h>
49801 +#include <linux/bootmem.h>
49802 +#include <linux/sysrq.h>
49803 +#include <asm/io.h>
49804 +#include <asm/irq.h>
49805 +#include <asm/uaccess.h>
49806 +#include <xen/interface/xen.h>
49807 +#include <xen/interface/event_channel.h>
49808 +#include <asm/hypervisor.h>
49809 +#include <xen/evtchn.h>
49810 +#include <xen/xencons.h>
49811 +
49812 +/*
49813 + * Modes:
49814 + *  'xencons=off'  [XC_OFF]:     Console is disabled.
49815 + *  'xencons=tty'  [XC_TTY]:     Console attached to '/dev/tty[0-9]+'.
49816 + *  'xencons=ttyS' [XC_SERIAL]:  Console attached to '/dev/ttyS[0-9]+'.
49817 + *                 [XC_DEFAULT]: DOM0 -> XC_SERIAL ; all others -> XC_TTY.
49818 + * 
49819 + * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses
49820 + * warnings from standard distro startup scripts.
49821 + */
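+/* An optional index selects the device: e.g. 'xencons=ttyS1' attaches the
+   console to /dev/ttyS1 and 'xencons=tty2' to /dev/tty2 (see xencons_setup). */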
49822 +static enum { XC_OFF, XC_DEFAULT, XC_TTY, XC_SERIAL } xc_mode = XC_DEFAULT;
49823 +static int xc_num = -1;
49824 +
49825 +#ifdef CONFIG_MAGIC_SYSRQ
49826 +static unsigned long sysrq_requested;
49827 +extern int sysrq_enabled;
49828 +#endif
49829 +
49830 +static int __init xencons_setup(char *str)
49831 +{
49832 +       char *q;
49833 +       int n;
49834 +
49835 +       if (!strncmp(str, "ttyS", 4))
49836 +               xc_mode = XC_SERIAL;
49837 +       else if (!strncmp(str, "tty", 3))
49838 +               xc_mode = XC_TTY;
49839 +       else if (!strncmp(str, "off", 3))
49840 +               xc_mode = XC_OFF;
49841 +
49842 +       switch (xc_mode) {
49843 +       case XC_SERIAL:
49844 +               n = simple_strtol(str+4, &q, 10);
49845 +               if (q > (str + 4))
49846 +                       xc_num = n;
49847 +               break;
49848 +       case XC_TTY:
49849 +               n = simple_strtol(str+3, &q, 10);
49850 +               if (q > (str + 3))
49851 +                       xc_num = n;
49852 +               break;
49853 +       default:
49854 +               break;
49855 +       }
49856 +
49857 +       return 1;
49858 +}
49859 +__setup("xencons=", xencons_setup);
49860 +
49861 +/* The kernel and user-land drivers share a common transmit buffer. */
49862 +static unsigned int wbuf_size = 4096;
49863 +#define WBUF_MASK(_i) ((_i)&(wbuf_size-1))
49864 +static char *wbuf;
49865 +static unsigned int wc, wp; /* write_cons, write_prod */
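+/*
+ * wc and wp are free-running indices: (wp - wc) is the number of bytes
+ * pending transmission and WBUF_MASK() maps an index into the buffer.
+ * This relies on wbuf_size being a power of two (the default 4096, or a
+ * value rounded up in xencons_bufsz_setup below).
+ */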
49866 +
49867 +static int __init xencons_bufsz_setup(char *str)
49868 +{
49869 +       unsigned int goal;
49870 +       goal = simple_strtoul(str, NULL, 0);
49871 +       if (goal) {
49872 +               goal = roundup_pow_of_two(goal);
49873 +               if (wbuf_size < goal)
49874 +                       wbuf_size = goal;
49875 +       }
49876 +       return 1;
49877 +}
49878 +__setup("xencons_bufsz=", xencons_bufsz_setup);
49879 +
49880 +/* This lock protects accesses to the common transmit buffer. */
49881 +static DEFINE_SPINLOCK(xencons_lock);
49882 +
49883 +/* Common transmit-kick routine. */
49884 +static void __xencons_tx_flush(void);
49885 +
49886 +static struct tty_driver *xencons_driver;
49887 +
49888 +/******************** Kernel console driver ********************************/
49889 +
49890 +static void kcons_write(struct console *c, const char *s, unsigned int count)
49891 +{
49892 +       int           i = 0;
49893 +       unsigned long flags;
49894 +
49895 +       spin_lock_irqsave(&xencons_lock, flags);
49896 +
49897 +       while (i < count) {
49898 +               for (; i < count; i++) {
49899 +                       if ((wp - wc) >= (wbuf_size - 1))
49900 +                               break;
49901 +                       if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n')
49902 +                               wbuf[WBUF_MASK(wp++)] = '\r';
49903 +               }
49904 +
49905 +               __xencons_tx_flush();
49906 +       }
49907 +
49908 +       spin_unlock_irqrestore(&xencons_lock, flags);
49909 +}
49910 +
49911 +static void kcons_write_dom0(struct console *c, const char *s, unsigned int count)
49912 +{
49913 +
49914 +       while (count > 0) {
49915 +               int rc;
49916 +               rc = HYPERVISOR_console_io( CONSOLEIO_write, count, (char *)s);
49917 +               if (rc <= 0)
49918 +                       break;
49919 +               count -= rc;
49920 +               s += rc;
49921 +       }
49922 +}
49923 +
49924 +static struct tty_driver *kcons_device(struct console *c, int *index)
49925 +{
49926 +       *index = 0;
49927 +       return xencons_driver;
49928 +}
49929 +
49930 +static struct console kcons_info = {
49931 +       .device = kcons_device,
49932 +       .flags  = CON_PRINTBUFFER,
49933 +       .index  = -1,
49934 +};
49935 +
49936 +#define __RETCODE 0
49937 +static int __init xen_console_init(void)
49938 +{
49939 +       if (!is_running_on_xen())
49940 +               return __RETCODE;
49941 +
49942 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
49943 +               if (xc_mode == XC_DEFAULT)
49944 +                       xc_mode = XC_SERIAL;
49945 +               kcons_info.write = kcons_write_dom0;
49946 +               if (xc_mode == XC_SERIAL)
49947 +                       kcons_info.flags |= CON_ENABLED;
49948 +       } else {
49949 +               if (xc_mode == XC_DEFAULT)
49950 +                       xc_mode = XC_TTY;
49951 +               kcons_info.write = kcons_write;
49952 +       }
49953 +
49954 +       switch (xc_mode) {
49955 +       case XC_SERIAL:
49956 +               strcpy(kcons_info.name, "ttyS");
49957 +               if (xc_num == -1)
49958 +                       xc_num = 0;
49959 +               break;
49960 +
49961 +       case XC_TTY:
49962 +               strcpy(kcons_info.name, "tty");
49963 +               if (xc_num == -1)
49964 +                       xc_num = 1;
49965 +               break;
49966 +
49967 +       default:
49968 +               return __RETCODE;
49969 +       }
49970 +
49971 +       wbuf = alloc_bootmem(wbuf_size);
49972 +
49973 +       register_console(&kcons_info);
49974 +
49975 +       return __RETCODE;
49976 +}
49977 +console_initcall(xen_console_init);
49978 +
49979 +/*** Useful function for console debugging -- goes straight to Xen. ***/
49980 +asmlinkage int xprintk(const char *fmt, ...)
49981 +{
49982 +       va_list args;
49983 +       int printk_len;
49984 +       static char printk_buf[1024];
49985 +
49986 +       /* Emit the output into the temporary buffer */
49987 +       va_start(args, fmt);
49988 +       printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args);
49989 +       va_end(args);
49990 +
49991 +       /* Send the processed output directly to Xen. */
49992 +       kcons_write_dom0(NULL, printk_buf, printk_len);
49993 +
49994 +       return 0;
49995 +}
49996 +
49997 +/*** Forcibly flush console data before dying. ***/
49998 +void xencons_force_flush(void)
49999 +{
50000 +       int sz;
50001 +
50002 +       /* Emergency console is synchronous, so there's nothing to flush. */
50003 +       if (xen_start_info->flags & SIF_INITDOMAIN)
50004 +               return;
50005 +
50006 +       /* Spin until console data is flushed through to the daemon. */
50007 +       while (wc != wp) {
50008 +               int sent = 0;
50009 +               if ((sz = wp - wc) == 0)
50010 +                       continue;
50011 +               sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
50012 +               if (sent > 0)
50013 +                       wc += sent;
50014 +       }
50015 +}
50016 +
50017 +
50018 +/******************** User-space console driver (/dev/console) ************/
50019 +
50020 +#define DRV(_d)         (_d)
50021 +#define TTY_INDEX(_tty) ((_tty)->index)
50022 +
50023 +static struct termios *xencons_termios[MAX_NR_CONSOLES];
50024 +static struct termios *xencons_termios_locked[MAX_NR_CONSOLES];
50025 +static struct tty_struct *xencons_tty;
50026 +static int xencons_priv_irq;
50027 +static char x_char;
50028 +
50029 +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs)
50030 +{
50031 +       int           i;
50032 +       unsigned long flags;
50033 +
50034 +       spin_lock_irqsave(&xencons_lock, flags);
50035 +       if (xencons_tty == NULL)
50036 +               goto out;
50037 +
50038 +       for (i = 0; i < len; i++) {
50039 +#ifdef CONFIG_MAGIC_SYSRQ
50040 +               if (sysrq_enabled) {
50041 +                       if (buf[i] == '\x0f') { /* ^O */
50042 +                               sysrq_requested = jiffies;
50043 +                               continue; /* don't print the sysrq key */
50044 +                       } else if (sysrq_requested) {
50045 +                               unsigned long sysrq_timeout =
50046 +                                       sysrq_requested + HZ*2;
50047 +                               sysrq_requested = 0;
50048 +                               if (time_before(jiffies, sysrq_timeout)) {
50049 +                                       spin_unlock_irqrestore(
50050 +                                               &xencons_lock, flags);
50051 +                                       handle_sysrq(
50052 +                                               buf[i], regs, xencons_tty);
50053 +                                       spin_lock_irqsave(
50054 +                                               &xencons_lock, flags);
50055 +                                       continue;
50056 +                               }
50057 +                       }
50058 +               }
50059 +#endif
50060 +               tty_insert_flip_char(xencons_tty, buf[i], 0);
50061 +       }
50062 +       tty_flip_buffer_push(xencons_tty);
50063 +
50064 + out:
50065 +       spin_unlock_irqrestore(&xencons_lock, flags);
50066 +}
50067 +
50068 +static void __xencons_tx_flush(void)
50069 +{
50070 +       int sent, sz, work_done = 0;
50071 +
50072 +       if (x_char) {
50073 +               if (xen_start_info->flags & SIF_INITDOMAIN)
50074 +                       kcons_write_dom0(NULL, &x_char, 1);
50075 +               else
50076 +                       while (x_char)
50077 +                               if (xencons_ring_send(&x_char, 1) == 1)
50078 +                                       break;
50079 +               x_char = 0;
50080 +               work_done = 1;
50081 +       }
50082 +
50083 +       while (wc != wp) {
50084 +               sz = wp - wc;
50085 +               if (sz > (wbuf_size - WBUF_MASK(wc)))
50086 +                       sz = wbuf_size - WBUF_MASK(wc);
50087 +               if (xen_start_info->flags & SIF_INITDOMAIN) {
50088 +                       kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz);
50089 +                       wc += sz;
50090 +               } else {
50091 +                       sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
50092 +                       if (sent == 0)
50093 +                               break;
50094 +                       wc += sent;
50095 +               }
50096 +               work_done = 1;
50097 +       }
50098 +
50099 +       if (work_done && (xencons_tty != NULL)) {
50100 +               wake_up_interruptible(&xencons_tty->write_wait);
50101 +               if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
50102 +                   (xencons_tty->ldisc.write_wakeup != NULL))
50103 +                       (xencons_tty->ldisc.write_wakeup)(xencons_tty);
50104 +       }
50105 +}
50106 +
50107 +void xencons_tx(void)
50108 +{
50109 +       unsigned long flags;
50110 +
50111 +       spin_lock_irqsave(&xencons_lock, flags);
50112 +       __xencons_tx_flush();
50113 +       spin_unlock_irqrestore(&xencons_lock, flags);
50114 +}
50115 +
50116 +/* Privileged receive callback and transmit kicker. */
50117 +static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id,
50118 +                                         struct pt_regs *regs)
50119 +{
50120 +       static char rbuf[16];
50121 +       int         l;
50122 +
50123 +       while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0)
50124 +               xencons_rx(rbuf, l, regs);
50125 +
50126 +       xencons_tx();
50127 +
50128 +       return IRQ_HANDLED;
50129 +}
50130 +
50131 +static int xencons_write_room(struct tty_struct *tty)
50132 +{
50133 +       return wbuf_size - (wp - wc);
50134 +}
50135 +
50136 +static int xencons_chars_in_buffer(struct tty_struct *tty)
50137 +{
50138 +       return wp - wc;
50139 +}
50140 +
50141 +static void xencons_send_xchar(struct tty_struct *tty, char ch)
50142 +{
50143 +       unsigned long flags;
50144 +
50145 +       if (TTY_INDEX(tty) != 0)
50146 +               return;
50147 +
50148 +       spin_lock_irqsave(&xencons_lock, flags);
50149 +       x_char = ch;
50150 +       __xencons_tx_flush();
50151 +       spin_unlock_irqrestore(&xencons_lock, flags);
50152 +}
50153 +
50154 +static void xencons_throttle(struct tty_struct *tty)
50155 +{
50156 +       if (TTY_INDEX(tty) != 0)
50157 +               return;
50158 +
50159 +       if (I_IXOFF(tty))
50160 +               xencons_send_xchar(tty, STOP_CHAR(tty));
50161 +}
50162 +
50163 +static void xencons_unthrottle(struct tty_struct *tty)
50164 +{
50165 +       if (TTY_INDEX(tty) != 0)
50166 +               return;
50167 +
50168 +       if (I_IXOFF(tty)) {
50169 +               if (x_char != 0)
50170 +                       x_char = 0;
50171 +               else
50172 +                       xencons_send_xchar(tty, START_CHAR(tty));
50173 +       }
50174 +}
50175 +
50176 +static void xencons_flush_buffer(struct tty_struct *tty)
50177 +{
50178 +       unsigned long flags;
50179 +
50180 +       if (TTY_INDEX(tty) != 0)
50181 +               return;
50182 +
50183 +       spin_lock_irqsave(&xencons_lock, flags);
50184 +       wc = wp = 0;
50185 +       spin_unlock_irqrestore(&xencons_lock, flags);
50186 +}
50187 +
50188 +static inline int __xencons_put_char(int ch)
50189 +{
50190 +       char _ch = (char)ch;
50191 +       if ((wp - wc) == wbuf_size)
50192 +               return 0;
50193 +       wbuf[WBUF_MASK(wp++)] = _ch;
50194 +       return 1;
50195 +}
50196 +
50197 +static int xencons_write(
50198 +       struct tty_struct *tty,
50199 +       const unsigned char *buf,
50200 +       int count)
50201 +{
50202 +       int i;
50203 +       unsigned long flags;
50204 +
50205 +       if (TTY_INDEX(tty) != 0)
50206 +               return count;
50207 +
50208 +       spin_lock_irqsave(&xencons_lock, flags);
50209 +
50210 +       for (i = 0; i < count; i++)
50211 +               if (!__xencons_put_char(buf[i]))
50212 +                       break;
50213 +
50214 +       if (i != 0)
50215 +               __xencons_tx_flush();
50216 +
50217 +       spin_unlock_irqrestore(&xencons_lock, flags);
50218 +
50219 +       return i;
50220 +}
50221 +
50222 +static void xencons_put_char(struct tty_struct *tty, u_char ch)
50223 +{
50224 +       unsigned long flags;
50225 +
50226 +       if (TTY_INDEX(tty) != 0)
50227 +               return;
50228 +
50229 +       spin_lock_irqsave(&xencons_lock, flags);
50230 +       (void)__xencons_put_char(ch);
50231 +       spin_unlock_irqrestore(&xencons_lock, flags);
50232 +}
50233 +
50234 +static void xencons_flush_chars(struct tty_struct *tty)
50235 +{
50236 +       unsigned long flags;
50237 +
50238 +       if (TTY_INDEX(tty) != 0)
50239 +               return;
50240 +
50241 +       spin_lock_irqsave(&xencons_lock, flags);
50242 +       __xencons_tx_flush();
50243 +       spin_unlock_irqrestore(&xencons_lock, flags);
50244 +}
50245 +
50246 +static void xencons_wait_until_sent(struct tty_struct *tty, int timeout)
50247 +{
50248 +       unsigned long orig_jiffies = jiffies;
50249 +
50250 +       if (TTY_INDEX(tty) != 0)
50251 +               return;
50252 +
50253 +       while (DRV(tty->driver)->chars_in_buffer(tty)) {
50254 +               set_current_state(TASK_INTERRUPTIBLE);
50255 +               schedule_timeout(1);
50256 +               if (signal_pending(current))
50257 +                       break;
50258 +               if (timeout && time_after(jiffies, orig_jiffies + timeout))
50259 +                       break;
50260 +       }
50261 +
50262 +       set_current_state(TASK_RUNNING);
50263 +}
50264 +
50265 +static int xencons_open(struct tty_struct *tty, struct file *filp)
50266 +{
50267 +       unsigned long flags;
50268 +
50269 +       if (TTY_INDEX(tty) != 0)
50270 +               return 0;
50271 +
50272 +       spin_lock_irqsave(&xencons_lock, flags);
50273 +       tty->driver_data = NULL;
50274 +       if (xencons_tty == NULL)
50275 +               xencons_tty = tty;
50276 +       __xencons_tx_flush();
50277 +       spin_unlock_irqrestore(&xencons_lock, flags);
50278 +
50279 +       return 0;
50280 +}
50281 +
50282 +static void xencons_close(struct tty_struct *tty, struct file *filp)
50283 +{
50284 +       unsigned long flags;
50285 +
50286 +       if (TTY_INDEX(tty) != 0)
50287 +               return;
50288 +
50289 +       if (tty->count == 1) {
50290 +               tty->closing = 1;
50291 +               tty_wait_until_sent(tty, 0);
50292 +               if (DRV(tty->driver)->flush_buffer != NULL)
50293 +                       DRV(tty->driver)->flush_buffer(tty);
50294 +               if (tty->ldisc.flush_buffer != NULL)
50295 +                       tty->ldisc.flush_buffer(tty);
50296 +               tty->closing = 0;
50297 +               spin_lock_irqsave(&xencons_lock, flags);
50298 +               xencons_tty = NULL;
50299 +               spin_unlock_irqrestore(&xencons_lock, flags);
50300 +       }
50301 +}
50302 +
50303 +static struct tty_operations xencons_ops = {
50304 +       .open = xencons_open,
50305 +       .close = xencons_close,
50306 +       .write = xencons_write,
50307 +       .write_room = xencons_write_room,
50308 +       .put_char = xencons_put_char,
50309 +       .flush_chars = xencons_flush_chars,
50310 +       .chars_in_buffer = xencons_chars_in_buffer,
50311 +       .send_xchar = xencons_send_xchar,
50312 +       .flush_buffer = xencons_flush_buffer,
50313 +       .throttle = xencons_throttle,
50314 +       .unthrottle = xencons_unthrottle,
50315 +       .wait_until_sent = xencons_wait_until_sent,
50316 +};
50317 +
50318 +static int __init xencons_init(void)
50319 +{
50320 +       int rc;
50321 +
50322 +       if (!is_running_on_xen())
50323 +               return -ENODEV;
50324 +
50325 +       if (xc_mode == XC_OFF)
50326 +               return 0;
50327 +
50328 +       xencons_ring_init();
50329 +
50330 +       xencons_driver = alloc_tty_driver((xc_mode == XC_SERIAL) ?
50331 +                                         1 : MAX_NR_CONSOLES);
50332 +       if (xencons_driver == NULL)
50333 +               return -ENOMEM;
50334 +
50335 +       DRV(xencons_driver)->name            = "xencons";
50336 +       DRV(xencons_driver)->major           = TTY_MAJOR;
50337 +       DRV(xencons_driver)->type            = TTY_DRIVER_TYPE_SERIAL;
50338 +       DRV(xencons_driver)->subtype         = SERIAL_TYPE_NORMAL;
50339 +       DRV(xencons_driver)->init_termios    = tty_std_termios;
50340 +       DRV(xencons_driver)->flags           =
50341 +               TTY_DRIVER_REAL_RAW |
50342 +               TTY_DRIVER_RESET_TERMIOS |
50343 +               TTY_DRIVER_NO_DEVFS;
50344 +       DRV(xencons_driver)->termios         = xencons_termios;
50345 +       DRV(xencons_driver)->termios_locked  = xencons_termios_locked;
50346 +
50347 +       if (xc_mode == XC_SERIAL) {
50348 +               DRV(xencons_driver)->name        = "ttyS";
50349 +               DRV(xencons_driver)->minor_start = 64 + xc_num;
50350 +               DRV(xencons_driver)->name_base   = 0 + xc_num;
50351 +       } else {
50352 +               DRV(xencons_driver)->name        = "tty";
50353 +               DRV(xencons_driver)->minor_start = xc_num;
50354 +               DRV(xencons_driver)->name_base   = xc_num;
50355 +       }
50356 +
50357 +       tty_set_operations(xencons_driver, &xencons_ops);
50358 +
50359 +       if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) {
50360 +               printk(KERN_WARNING "Failed to register Xen virtual "
50361 +                      "console driver as '%s%d'\n",
50362 +                      DRV(xencons_driver)->name,
50363 +                      DRV(xencons_driver)->name_base);
50364 +               put_tty_driver(xencons_driver);
50365 +               xencons_driver = NULL;
50366 +               return rc;
50367 +       }
50368 +
50369 +       tty_register_device(xencons_driver, 0, NULL);
50370 +
50371 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
50372 +               xencons_priv_irq = bind_virq_to_irqhandler(
50373 +                       VIRQ_CONSOLE,
50374 +                       0,
50375 +                       xencons_priv_interrupt,
50376 +                       0,
50377 +                       "console",
50378 +                       NULL);
50379 +               BUG_ON(xencons_priv_irq < 0);
50380 +       }
50381 +
50382 +       printk(KERN_INFO "Xen virtual console successfully installed as %s%d\n",
50383 +              DRV(xencons_driver)->name,
50384 +              DRV(xencons_driver)->name_base);
50385 +
50386 +       return 0;
50387 +}
50388 +
50389 +module_init(xencons_init);
50390 +
50391 +MODULE_LICENSE("Dual BSD/GPL");
50392 diff -urNp linux-2.6/drivers/xen/console/Makefile new/drivers/xen/console/Makefile
50393 --- linux-2.6/drivers/xen/console/Makefile      1970-01-01 01:00:00.000000000 +0100
50394 +++ new/drivers/xen/console/Makefile    2006-05-09 12:34:37.000000000 +0200
50395 @@ -0,0 +1,2 @@
50396 +
50397 +obj-y  := console.o xencons_ring.o
50398 diff -urNp linux-2.6/drivers/xen/console/xencons_ring.c new/drivers/xen/console/xencons_ring.c
50399 --- linux-2.6/drivers/xen/console/xencons_ring.c        1970-01-01 01:00:00.000000000 +0100
50400 +++ new/drivers/xen/console/xencons_ring.c      2006-05-23 18:42:17.000000000 +0200
50401 @@ -0,0 +1,141 @@
50402 +/* 
50403 + * This program is free software; you can redistribute it and/or
50404 + * modify it under the terms of the GNU General Public License version 2
50405 + * as published by the Free Software Foundation; or, when distributed
50406 + * separately from the Linux kernel or incorporated into other
50407 + * software packages, subject to the following license:
50408 + * 
50409 + * Permission is hereby granted, free of charge, to any person obtaining a copy
50410 + * of this source file (the "Software"), to deal in the Software without
50411 + * restriction, including without limitation the rights to use, copy, modify,
50412 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
50413 + * and to permit persons to whom the Software is furnished to do so, subject to
50414 + * the following conditions:
50415 + * 
50416 + * The above copyright notice and this permission notice shall be included in
50417 + * all copies or substantial portions of the Software.
50418 + * 
50419 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50420 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50421 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
50422 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50423 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50424 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
50425 + * IN THE SOFTWARE.
50426 + */
50427 +
50428 +#include <linux/version.h>
50429 +#include <linux/module.h>
50430 +#include <linux/errno.h>
50431 +#include <linux/signal.h>
50432 +#include <linux/sched.h>
50433 +#include <linux/interrupt.h>
50434 +#include <linux/tty.h>
50435 +#include <linux/tty_flip.h>
50436 +#include <linux/serial.h>
50437 +#include <linux/major.h>
50438 +#include <linux/ptrace.h>
50439 +#include <linux/ioport.h>
50440 +#include <linux/mm.h>
50441 +#include <linux/slab.h>
50442 +
50443 +#include <asm/hypervisor.h>
50444 +#include <xen/evtchn.h>
50445 +#include <xen/xencons.h>
50446 +#include <linux/wait.h>
50447 +#include <linux/interrupt.h>
50448 +#include <linux/sched.h>
50449 +#include <linux/err.h>
50450 +#include <xen/interface/io/console.h>
50451 +
50452 +static int xencons_irq;
50453 +
50454 +static inline struct xencons_interface *xencons_interface(void)
50455 +{
50456 +       return mfn_to_virt(xen_start_info->console_mfn);
50457 +}
50458 +
50459 +static inline void notify_daemon(void)
50460 +{
50461 +       /* Use evtchn: this is called early, before irq is set up. */
50462 +       notify_remote_via_evtchn(xen_start_info->console_evtchn);
50463 +}
50464 +
50465 +int xencons_ring_send(const char *data, unsigned len)
50466 +{
50467 +       int sent = 0;
50468 +       struct xencons_interface *intf = xencons_interface();
50469 +       XENCONS_RING_IDX cons, prod;
50470 +
50471 +       cons = intf->out_cons;
50472 +       prod = intf->out_prod;
50473 +       mb();
50474 +       BUG_ON((prod - cons) > sizeof(intf->out));
50475 +
50476 +       while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
50477 +               intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
50478 +
50479 +       wmb();
50480 +       intf->out_prod = prod;
50481 +
50482 +       notify_daemon();
50483 +
50484 +       return sent;
50485 +}
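+
+/*
+ * The shared-ring protocol above in a nutshell: out_cons and out_prod are
+ * free-running indices advanced by the console daemon and by this guest
+ * respectively, and MASK_XENCONS_IDX() in effect reduces them modulo the
+ * size of intf->out[].  Data bytes are written first, wmb() orders those
+ * writes before out_prod is published, and notify_daemon() then kicks the
+ * backend via the console event channel.
+ */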
50486 +
50487 +static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs)
50488 +{
50489 +       struct xencons_interface *intf = xencons_interface();
50490 +       XENCONS_RING_IDX cons, prod;
50491 +
50492 +       cons = intf->in_cons;
50493 +       prod = intf->in_prod;
50494 +       mb();
50495 +       BUG_ON((prod - cons) > sizeof(intf->in));
50496 +
50497 +       while (cons != prod) {
50498 +               xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs);
50499 +               cons++;
50500 +       }
50501 +
50502 +       mb();
50503 +       intf->in_cons = cons;
50504 +
50505 +       notify_daemon();
50506 +
50507 +       xencons_tx();
50508 +
50509 +       return IRQ_HANDLED;
50510 +}
50511 +
50512 +int xencons_ring_init(void)
50513 +{
50514 +       int err;
50515 +
50516 +       if (xencons_irq)
50517 +               unbind_from_irqhandler(xencons_irq, NULL);
50518 +       xencons_irq = 0;
50519 +
50520 +       if (!xen_start_info->console_evtchn)
50521 +               return 0;
50522 +
50523 +       err = bind_evtchn_to_irqhandler(
50524 +               xen_start_info->console_evtchn,
50525 +               handle_input, 0, "xencons", NULL);
50526 +       if (err <= 0) {
50527 +               printk(KERN_ERR "XEN console request irq failed %i\n", err);
50528 +               return err;
50529 +       }
50530 +
50531 +       xencons_irq = err;
50532 +
50533 +       /* In case we have in-flight data after save/restore... */
50534 +       notify_daemon();
50535 +
50536 +       return 0;
50537 +}
50538 +
50539 +void xencons_resume(void)
50540 +{
50541 +       (void)xencons_ring_init();
50542 +}
50543 diff -urNp linux-2.6/drivers/xen/core/cpu_hotplug.c new/drivers/xen/core/cpu_hotplug.c
50544 --- linux-2.6/drivers/xen/core/cpu_hotplug.c    1970-01-01 01:00:00.000000000 +0100
50545 +++ new/drivers/xen/core/cpu_hotplug.c  2006-07-07 15:10:03.000000000 +0200
50546 @@ -0,0 +1,188 @@
50547 +#include <linux/config.h>
50548 +#include <linux/init.h>
50549 +#include <linux/kernel.h>
50550 +#include <linux/sched.h>
50551 +#include <linux/notifier.h>
50552 +#include <linux/cpu.h>
50553 +#include <xen/cpu_hotplug.h>
50554 +#include <xen/xenbus.h>
50555 +
50556 +/*
50557 + * Set of CPUs that remote admin software will allow us to bring online.
50558 + * Notified to us via xenbus.
50559 + */
50560 +static cpumask_t xenbus_allowed_cpumask;
50561 +
50562 +/* Set of CPUs that local admin will allow us to bring online. */
50563 +static cpumask_t local_allowed_cpumask = CPU_MASK_ALL;
50564 +
50565 +static int local_cpu_hotplug_request(void)
50566 +{
50567 +       /*
50568 +        * We assume a CPU hotplug request comes from local admin if it is made
50569 +        * via a userspace process (i.e., one with a real mm_struct).
50570 +        */
50571 +       return (current->mm != NULL);
50572 +}
50573 +
50574 +static void vcpu_hotplug(unsigned int cpu)
50575 +{
50576 +       int err;
50577 +       char dir[32], state[32];
50578 +
50579 +       if ((cpu >= NR_CPUS) || !cpu_possible(cpu))
50580 +               return;
50581 +
50582 +       sprintf(dir, "cpu/%d", cpu);
50583 +       err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state);
50584 +       if (err != 1) {
50585 +               printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
50586 +               return;
50587 +       }
50588 +
50589 +       if (strcmp(state, "online") == 0) {
50590 +               cpu_set(cpu, xenbus_allowed_cpumask);
50591 +               (void)cpu_up(cpu);
50592 +       } else if (strcmp(state, "offline") == 0) {
50593 +               cpu_clear(cpu, xenbus_allowed_cpumask);
50594 +               (void)cpu_down(cpu);
50595 +       } else {
50596 +               printk(KERN_ERR "XENBUS: unknown state (%s) on CPU%d\n",
50597 +                      state, cpu);
50598 +       }
50599 +}
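+
+/*
+ * Remote admin software (see the comment at the top of this file) writes
+ * the desired state to the xenstore node cpu/<N>/availability as "online"
+ * or "offline"; vcpu_hotplug() above reads that node and brings the VCPU
+ * up or down accordingly.  The watch registered on "cpu" below fires
+ * handle_vcpu_hotplug_event() whenever anything under that key changes.
+ */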
50600 +
50601 +static void handle_vcpu_hotplug_event(
50602 +       struct xenbus_watch *watch, const char **vec, unsigned int len)
50603 +{
50604 +       int cpu;
50605 +       char *cpustr;
50606 +       const char *node = vec[XS_WATCH_PATH];
50607 +
50608 +       if ((cpustr = strstr(node, "cpu/")) != NULL) {
50609 +               sscanf(cpustr, "cpu/%d", &cpu);
50610 +               vcpu_hotplug(cpu);
50611 +       }
50612 +}
50613 +
50614 +static int smpboot_cpu_notify(struct notifier_block *notifier,
50615 +                             unsigned long action, void *hcpu)
50616 +{
50617 +       int cpu = (long)hcpu;
50618 +
50619 +       /*
50620 +        * We do this in a callback notifier rather than __cpu_disable()
50621 +        * because local_cpu_hotplug_request() does not work in the latter
50622 +        * as it's always executed from within a stopmachine kthread.
50623 +        */
50624 +       if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request())
50625 +               cpu_clear(cpu, local_allowed_cpumask);
50626 +
50627 +       return NOTIFY_OK;
50628 +}
50629 +
50630 +static int setup_cpu_watcher(struct notifier_block *notifier,
50631 +                             unsigned long event, void *data)
50632 +{
50633 +       int i;
50634 +
50635 +       static struct xenbus_watch cpu_watch = {
50636 +               .node = "cpu",
50637 +               .callback = handle_vcpu_hotplug_event,
50638 +               .flags = XBWF_new_thread };
50639 +       (void)register_xenbus_watch(&cpu_watch);
50640 +
50641 +       if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
50642 +               for_each_cpu(i)
50643 +                       vcpu_hotplug(i);
50644 +               printk(KERN_INFO "Brought up %ld CPUs\n",
50645 +                      (long)num_online_cpus());
50646 +       }
50647 +
50648 +       return NOTIFY_DONE;
50649 +}
50650 +
50651 +static int __init setup_vcpu_hotplug_event(void)
50652 +{
50653 +       static struct notifier_block hotplug_cpu = {
50654 +               .notifier_call = smpboot_cpu_notify };
50655 +       static struct notifier_block xsn_cpu = {
50656 +               .notifier_call = setup_cpu_watcher };
50657 +
50658 +       if (!is_running_on_xen())
50659 +               return -ENODEV;
50660 +
50661 +       register_cpu_notifier(&hotplug_cpu);
50662 +       register_xenstore_notifier(&xsn_cpu);
50663 +
50664 +       return 0;
50665 +}
50666 +
50667 +arch_initcall(setup_vcpu_hotplug_event);
50668 +
50669 +int smp_suspend(void)
50670 +{
50671 +       int i, err;
50672 +
50673 +       lock_cpu_hotplug();
50674 +
50675 +       /*
50676 +        * Take all other CPUs offline. We hold the hotplug mutex to
50677 +        * avoid other processes bringing up CPUs under our feet.
50678 +        */
50679 +       while (num_online_cpus() > 1) {
50680 +               unlock_cpu_hotplug();
50681 +               for_each_online_cpu(i) {
50682 +                       if (i == 0)
50683 +                               continue;
50684 +                       err = cpu_down(i);
50685 +                       if (err) {
50686 +                               printk(KERN_CRIT "Failed to take all CPUs "
50687 +                                      "down: %d.\n", err);
50688 +                               for_each_cpu(i)
50689 +                                       vcpu_hotplug(i);
50690 +                               return err;
50691 +                       }
50692 +               }
50693 +               lock_cpu_hotplug();
50694 +       }
50695 +
50696 +       return 0;
50697 +}
50698 +
50699 +void smp_resume(void)
50700 +{
50701 +       int cpu;
50702 +
50703 +       for_each_cpu(cpu)
50704 +               cpu_initialize_context(cpu);
50705 +
50706 +       unlock_cpu_hotplug();
50707 +
50708 +       for_each_cpu(cpu)
50709 +               vcpu_hotplug(cpu);
50710 +}
50711 +
50712 +int cpu_up_check(unsigned int cpu)
50713 +{
50714 +       int rc = 0;
50715 +
50716 +       if (local_cpu_hotplug_request()) {
50717 +               cpu_set(cpu, local_allowed_cpumask);
50718 +               if (!cpu_isset(cpu, xenbus_allowed_cpumask)) {
50719 +               printk(KERN_WARNING "%s: attempt to bring up CPU %u disallowed by "
50720 +                              "remote admin.\n", __FUNCTION__, cpu);
50721 +                       rc = -EBUSY;
50722 +               }
50723 +       } else if (!cpu_isset(cpu, local_allowed_cpumask) ||
50724 +                  !cpu_isset(cpu, xenbus_allowed_cpumask)) {
50725 +               rc = -EBUSY;
50726 +       }
50727 +
50728 +       return rc;
50729 +}
50730 +
50731 +void init_xenbus_allowed_cpumask(void)
50732 +{
50733 +       xenbus_allowed_cpumask = cpu_present_map;
50734 +}
50735 diff -urNp linux-2.6/drivers/xen/core/evtchn.c new/drivers/xen/core/evtchn.c
50736 --- linux-2.6/drivers/xen/core/evtchn.c 1970-01-01 01:00:00.000000000 +0100
50737 +++ new/drivers/xen/core/evtchn.c       2006-07-07 15:10:03.000000000 +0200
50738 @@ -0,0 +1,853 @@
50739 +/******************************************************************************
50740 + * evtchn.c
50741 + * 
50742 + * Communication via Xen event channels.
50743 + * 
50744 + * Copyright (c) 2002-2005, K A Fraser
50745 + * 
50746 + * This program is free software; you can redistribute it and/or
50747 + * modify it under the terms of the GNU General Public License version 2
50748 + * as published by the Free Software Foundation; or, when distributed
50749 + * separately from the Linux kernel or incorporated into other
50750 + * software packages, subject to the following license:
50751 + * 
50752 + * Permission is hereby granted, free of charge, to any person obtaining a copy
50753 + * of this source file (the "Software"), to deal in the Software without
50754 + * restriction, including without limitation the rights to use, copy, modify,
50755 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
50756 + * and to permit persons to whom the Software is furnished to do so, subject to
50757 + * the following conditions:
50758 + * 
50759 + * The above copyright notice and this permission notice shall be included in
50760 + * all copies or substantial portions of the Software.
50761 + * 
50762 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50763 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50764 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
50765 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50766 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50767 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
50768 + * IN THE SOFTWARE.
50769 + */
50770 +
50771 +#include <linux/config.h>
50772 +#include <linux/module.h>
50773 +#include <linux/irq.h>
50774 +#include <linux/interrupt.h>
50775 +#include <linux/sched.h>
50776 +#include <linux/kernel_stat.h>
50777 +#include <linux/version.h>
50778 +#include <asm/atomic.h>
50779 +#include <asm/system.h>
50780 +#include <asm/ptrace.h>
50781 +#include <asm/synch_bitops.h>
50782 +#include <xen/evtchn.h>
50783 +#include <xen/interface/event_channel.h>
50784 +#include <xen/interface/physdev.h>
50785 +#include <asm/hypervisor.h>
50786 +#include <linux/mc146818rtc.h> /* RTC_IRQ */
50787 +
50788 +/*
50789 + * This lock protects updates to the following mapping and reference-count
50790 + * arrays. The lock does not need to be acquired to read the mapping tables.
50791 + */
50792 +static DEFINE_SPINLOCK(irq_mapping_update_lock);
50793 +
50794 +/* IRQ <-> event-channel mappings. */
50795 +static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
50796 +       [0 ...  NR_EVENT_CHANNELS-1] = -1 };
50797 +
50798 +/* Packed IRQ information: binding type, sub-type index, and event channel. */
50799 +static u32 irq_info[NR_IRQS];
50800 +
50801 +/* Binding types. */
50802 +enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
50803 +
50804 +/* Constructor for packed IRQ information. */
50805 +static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
50806 +{
50807 +       return ((type << 24) | (index << 16) | evtchn);
50808 +}
50809 +
50810 +/* Convenient shorthand for packed representation of an unbound IRQ. */
50811 +#define IRQ_UNBOUND    mk_irq_info(IRQT_UNBOUND, 0, 0)
50812 +
50813 +/*
50814 + * Accessors for packed IRQ information.
50815 + */
50816 +
50817 +static inline unsigned int evtchn_from_irq(int irq)
50818 +{
50819 +       return (u16)(irq_info[irq]);
50820 +}
50821 +
50822 +static inline unsigned int index_from_irq(int irq)
50823 +{
50824 +       return (u8)(irq_info[irq] >> 16);
50825 +}
50826 +
50827 +static inline unsigned int type_from_irq(int irq)
50828 +{
50829 +       return (u8)(irq_info[irq] >> 24);
50830 +}
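+
+/*
+ * Worked example of the packing used by mk_irq_info() and the accessors
+ * above: mk_irq_info(IRQT_VIRQ, 3, 17) yields
+ * (IRQT_VIRQ << 24) | (3 << 16) | 17, from which evtchn_from_irq()
+ * recovers 17, index_from_irq() recovers 3 and type_from_irq() recovers
+ * IRQT_VIRQ.
+ */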
50831 +
50832 +/* IRQ <-> VIRQ mapping. */
50833 +DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
50834 +
50835 +/* IRQ <-> IPI mapping. */
50836 +#ifndef NR_IPIS
50837 +#define NR_IPIS 1
50838 +#endif
50839 +DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1};
50840 +
50841 +/* Reference counts for bindings to IRQs. */
50842 +static int irq_bindcount[NR_IRQS];
50843 +
50844 +/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
50845 +static unsigned long pirq_needs_eoi[NR_PIRQS/sizeof(unsigned long)];
50846 +
50847 +#ifdef CONFIG_SMP
50848 +
50849 +static u8 cpu_evtchn[NR_EVENT_CHANNELS];
50850 +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
50851 +
50852 +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
50853 +                                          unsigned int idx)
50854 +{
50855 +       return (sh->evtchn_pending[idx] &
50856 +               cpu_evtchn_mask[cpu][idx] &
50857 +               ~sh->evtchn_mask[idx]);
50858 +}
50859 +
50860 +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
50861 +{
50862 +       int irq = evtchn_to_irq[chn];
50863 +
50864 +       BUG_ON(irq == -1);
50865 +       set_native_irq_info(irq, cpumask_of_cpu(cpu));
50866 +
50867 +       clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
50868 +       set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
50869 +       cpu_evtchn[chn] = cpu;
50870 +}
50871 +
50872 +static void init_evtchn_cpu_bindings(void)
50873 +{
50874 +       int i;
50875 +
50876 +       /* By default all event channels notify CPU#0. */
50877 +       for (i = 0; i < NR_IRQS; i++)
50878 +               set_native_irq_info(i, cpumask_of_cpu(0));
50879 +
50880 +       memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
50881 +       memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
50882 +}
50883 +
50884 +static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
50885 +{
50886 +       return cpu_evtchn[evtchn];
50887 +}
50888 +
50889 +#else
50890 +
50891 +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
50892 +                                          unsigned int idx)
50893 +{
50894 +       return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]);
50895 +}
50896 +
50897 +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
50898 +{
50899 +}
50900 +
50901 +static void init_evtchn_cpu_bindings(void)
50902 +{
50903 +}
50904 +
50905 +static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
50906 +{
50907 +       return 0;
50908 +}
50909 +
50910 +#endif
50911 +
50912 +/* Upcall to generic IRQ layer. */
50913 +#ifdef CONFIG_X86
50914 +extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
50915 +void __init xen_init_IRQ(void);
50916 +void __init init_IRQ(void)
50917 +{
50918 +       irq_ctx_init(0);
50919 +       xen_init_IRQ();
50920 +}
50921 +#if defined (__i386__)
50922 +static inline void exit_idle(void) {}
50923 +#define IRQ_REG orig_eax
50924 +#elif defined (__x86_64__)
50925 +#include <asm/idle.h>
50926 +#define IRQ_REG orig_rax
50927 +#endif
50928 +#define do_IRQ(irq, regs) do {         \
50929 +       (regs)->IRQ_REG = ~(irq);       \
50930 +       do_IRQ((regs));                 \
50931 +} while (0)
50932 +#endif
50933 +
50934 +/* Xen will never allocate port zero for any purpose. */
50935 +#define VALID_EVTCHN(chn)      ((chn) != 0)
50936 +
50937 +/*
50938 + * Force a proper event-channel callback from Xen after clearing the
50939 + * callback mask. We do this in a very simple manner, by making a call
50940 + * down into Xen. The pending flag will be checked by Xen on return.
50941 + */
50942 +void force_evtchn_callback(void)
50943 +{
50944 +       (void)HYPERVISOR_xen_version(0, NULL);
50945 +}
50946 +/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */
50947 +EXPORT_SYMBOL(force_evtchn_callback);
50948 +
50949 +/* NB. Interrupts are disabled on entry. */
50950 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
50951 +{
50952 +       unsigned long  l1, l2;
50953 +       unsigned int   l1i, l2i, port;
50954 +       int            irq, cpu = smp_processor_id();
50955 +       shared_info_t *s = HYPERVISOR_shared_info;
50956 +       vcpu_info_t   *vcpu_info = &s->vcpu_info[cpu];
50957 +
50958 +       vcpu_info->evtchn_upcall_pending = 0;
50959 +
50960 +#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
50961 +       /* Clear master pending flag /before/ clearing selector flag. */
50962 +       rmb();
50963 +#endif
50964 +       l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
50965 +       while (l1 != 0) {
50966 +               l1i = __ffs(l1);
50967 +               l1 &= ~(1UL << l1i);
50968 +
50969 +               while ((l2 = active_evtchns(cpu, s, l1i)) != 0) {
50970 +                       l2i = __ffs(l2);
50971 +
50972 +                       port = (l1i * BITS_PER_LONG) + l2i;
50973 +                       if ((irq = evtchn_to_irq[port]) != -1)
50974 +                               do_IRQ(irq, regs);
50975 +                       else {
50976 +                               exit_idle();
50977 +                               evtchn_device_upcall(port);
50978 +                       }
50979 +               }
50980 +       }
50981 +}
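+
+/*
+ * To illustrate the two-level scan above: evtchn_pending_sel is a selector
+ * word in which bit l1i means "some event is pending in word l1i of
+ * evtchn_pending[]".  Each selected word is filtered by active_evtchns()
+ * (pending, unmasked and, on SMP, routed to this CPU), and every set bit
+ * l2i corresponds to port l1i * BITS_PER_LONG + l2i.
+ */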
50982 +
50983 +static int find_unbound_irq(void)
50984 +{
50985 +       int irq;
50986 +
50987 +       /* Only allocate from dynirq range */
50988 +       for (irq = DYNIRQ_BASE; irq < NR_IRQS; irq++)
50989 +               if (irq_bindcount[irq] == 0)
50990 +                       break;
50991 +
50992 +       if (irq == NR_IRQS)
50993 +               panic("No available IRQ to bind to: increase NR_IRQS!\n");
50994 +
50995 +       return irq;
50996 +}
50997 +
50998 +static int bind_evtchn_to_irq(unsigned int evtchn)
50999 +{
51000 +       int irq;
51001 +
51002 +       spin_lock(&irq_mapping_update_lock);
51003 +
51004 +       if ((irq = evtchn_to_irq[evtchn]) == -1) {
51005 +               irq = find_unbound_irq();
51006 +               evtchn_to_irq[evtchn] = irq;
51007 +               irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
51008 +       }
51009 +
51010 +       irq_bindcount[irq]++;
51011 +
51012 +       spin_unlock(&irq_mapping_update_lock);
51013 +
51014 +       return irq;
51015 +}
51016 +
51017 +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
51018 +{
51019 +       struct evtchn_bind_virq bind_virq;
51020 +       int evtchn, irq;
51021 +
51022 +       spin_lock(&irq_mapping_update_lock);
51023 +
51024 +       if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
51025 +               bind_virq.virq = virq;
51026 +               bind_virq.vcpu = cpu;
51027 +               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
51028 +                                               &bind_virq) != 0)
51029 +                       BUG();
51030 +               evtchn = bind_virq.port;
51031 +
51032 +               irq = find_unbound_irq();
51033 +               evtchn_to_irq[evtchn] = irq;
51034 +               irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
51035 +
51036 +               per_cpu(virq_to_irq, cpu)[virq] = irq;
51037 +
51038 +               bind_evtchn_to_cpu(evtchn, cpu);
51039 +       }
51040 +
51041 +       irq_bindcount[irq]++;
51042 +
51043 +       spin_unlock(&irq_mapping_update_lock);
51044 +
51045 +       return irq;
51046 +}
51047 +
51048 +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
51049 +{
51050 +       struct evtchn_bind_ipi bind_ipi;
51051 +       int evtchn, irq;
51052 +
51053 +       spin_lock(&irq_mapping_update_lock);
51054 +
51055 +       if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) {
51056 +               bind_ipi.vcpu = cpu;
51057 +               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
51058 +                                               &bind_ipi) != 0)
51059 +                       BUG();
51060 +               evtchn = bind_ipi.port;
51061 +
51062 +               irq = find_unbound_irq();
51063 +               evtchn_to_irq[evtchn] = irq;
51064 +               irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
51065 +
51066 +               per_cpu(ipi_to_irq, cpu)[ipi] = irq;
51067 +
51068 +               bind_evtchn_to_cpu(evtchn, cpu);
51069 +       }
51070 +
51071 +       irq_bindcount[irq]++;
51072 +
51073 +       spin_unlock(&irq_mapping_update_lock);
51074 +
51075 +       return irq;
51076 +}
51077 +
51078 +static void unbind_from_irq(unsigned int irq)
51079 +{
51080 +       struct evtchn_close close;
51081 +       int evtchn = evtchn_from_irq(irq);
51082 +
51083 +       spin_lock(&irq_mapping_update_lock);
51084 +
51085 +       if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) {
51086 +               close.port = evtchn;
51087 +               if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
51088 +                       BUG();
51089 +
51090 +               switch (type_from_irq(irq)) {
51091 +               case IRQT_VIRQ:
51092 +                       per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
51093 +                               [index_from_irq(irq)] = -1;
51094 +                       break;
51095 +               case IRQT_IPI:
51096 +                       per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
51097 +                               [index_from_irq(irq)] = -1;
51098 +                       break;
51099 +               default:
51100 +                       break;
51101 +               }
51102 +
51103 +               /* Closed ports are implicitly re-bound to VCPU0. */
51104 +               bind_evtchn_to_cpu(evtchn, 0);
51105 +
51106 +               evtchn_to_irq[evtchn] = -1;
51107 +               irq_info[irq] = IRQ_UNBOUND;
51108 +       }
51109 +
51110 +       spin_unlock(&irq_mapping_update_lock);
51111 +}
51112 +
51113 +int bind_evtchn_to_irqhandler(
51114 +       unsigned int evtchn,
51115 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
51116 +       unsigned long irqflags,
51117 +       const char *devname,
51118 +       void *dev_id)
51119 +{
51120 +       unsigned int irq;
51121 +       int retval;
51122 +
51123 +       irq = bind_evtchn_to_irq(evtchn);
51124 +       retval = request_irq(irq, handler, irqflags, devname, dev_id);
51125 +       if (retval != 0) {
51126 +               unbind_from_irq(irq);
51127 +               return retval;
51128 +       }
51129 +
51130 +       return irq;
51131 +}
51132 +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
51133 +
51134 +int bind_virq_to_irqhandler(
51135 +       unsigned int virq,
51136 +       unsigned int cpu,
51137 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
51138 +       unsigned long irqflags,
51139 +       const char *devname,
51140 +       void *dev_id)
51141 +{
51142 +       unsigned int irq;
51143 +       int retval;
51144 +
51145 +       irq = bind_virq_to_irq(virq, cpu);
51146 +       retval = request_irq(irq, handler, irqflags, devname, dev_id);
51147 +       if (retval != 0) {
51148 +               unbind_from_irq(irq);
51149 +               return retval;
51150 +       }
51151 +
51152 +       return irq;
51153 +}
51154 +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
51155 +
51156 +int bind_ipi_to_irqhandler(
51157 +       unsigned int ipi,
51158 +       unsigned int cpu,
51159 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
51160 +       unsigned long irqflags,
51161 +       const char *devname,
51162 +       void *dev_id)
51163 +{
51164 +       unsigned int irq;
51165 +       int retval;
51166 +
51167 +       irq = bind_ipi_to_irq(ipi, cpu);
51168 +       retval = request_irq(irq, handler, irqflags, devname, dev_id);
51169 +       if (retval != 0) {
51170 +               unbind_from_irq(irq);
51171 +               return retval;
51172 +       }
51173 +
51174 +       return irq;
51175 +}
51176 +EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler);
51177 +
51178 +void unbind_from_irqhandler(unsigned int irq, void *dev_id)
51179 +{
51180 +       free_irq(irq, dev_id);
51181 +       unbind_from_irq(irq);
51182 +}
51183 +EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
51184 +
51185 +/* Rebind an evtchn so that it gets delivered to a specific cpu */
51186 +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
51187 +{
51188 +       struct evtchn_bind_vcpu bind_vcpu;
51189 +       int evtchn = evtchn_from_irq(irq);
51190 +
51191 +       if (!VALID_EVTCHN(evtchn))
51192 +               return;
51193 +
51194 +       /* Send future instances of this interrupt to other vcpu. */
51195 +       bind_vcpu.port = evtchn;
51196 +       bind_vcpu.vcpu = tcpu;
51197 +
51198 +       /*
51199 +        * If this fails, it usually just indicates that we're dealing with a 
51200 +        * virq or IPI channel, which don't actually need to be rebound. Ignore
51201 +        * it, but don't do the xenlinux-level rebind in that case.
51202 +        */
51203 +       if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
51204 +               bind_evtchn_to_cpu(evtchn, tcpu);
51205 +}
51206 +
51207 +
51208 +static void set_affinity_irq(unsigned irq, cpumask_t dest)
51209 +{
51210 +       unsigned tcpu = first_cpu(dest);
51211 +       rebind_irq_to_cpu(irq, tcpu);
51212 +}
51213 +
51214 +/*
51215 + * Interface to generic handling in irq.c
51216 + */
51217 +
51218 +static unsigned int startup_dynirq(unsigned int irq)
51219 +{
51220 +       int evtchn = evtchn_from_irq(irq);
51221 +
51222 +       if (VALID_EVTCHN(evtchn))
51223 +               unmask_evtchn(evtchn);
51224 +       return 0;
51225 +}
51226 +
51227 +static void shutdown_dynirq(unsigned int irq)
51228 +{
51229 +       int evtchn = evtchn_from_irq(irq);
51230 +
51231 +       if (VALID_EVTCHN(evtchn))
51232 +               mask_evtchn(evtchn);
51233 +}
51234 +
51235 +static void enable_dynirq(unsigned int irq)
51236 +{
51237 +       int evtchn = evtchn_from_irq(irq);
51238 +
51239 +       if (VALID_EVTCHN(evtchn))
51240 +               unmask_evtchn(evtchn);
51241 +}
51242 +
51243 +static void disable_dynirq(unsigned int irq)
51244 +{
51245 +       int evtchn = evtchn_from_irq(irq);
51246 +
51247 +       if (VALID_EVTCHN(evtchn))
51248 +               mask_evtchn(evtchn);
51249 +}
51250 +
51251 +static void ack_dynirq(unsigned int irq)
51252 +{
51253 +       int evtchn = evtchn_from_irq(irq);
51254 +
51255 +       move_native_irq(irq);
51256 +
51257 +       if (VALID_EVTCHN(evtchn)) {
51258 +               mask_evtchn(evtchn);
51259 +               clear_evtchn(evtchn);
51260 +       }
51261 +}
51262 +
51263 +static void end_dynirq(unsigned int irq)
51264 +{
51265 +       int evtchn = evtchn_from_irq(irq);
51266 +
51267 +       if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED))
51268 +               unmask_evtchn(evtchn);
51269 +}
51270 +
51271 +static struct hw_interrupt_type dynirq_type = {
51272 +       "Dynamic-irq",
51273 +       startup_dynirq,
51274 +       shutdown_dynirq,
51275 +       enable_dynirq,
51276 +       disable_dynirq,
51277 +       ack_dynirq,
51278 +       end_dynirq,
51279 +       set_affinity_irq
51280 +};
51281 +
51282 +static inline void pirq_unmask_notify(int pirq)
51283 +{
51284 +       struct physdev_eoi eoi = { .irq = pirq };
51285 +       if (unlikely(test_bit(pirq, &pirq_needs_eoi[0])))
51286 +               (void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
51287 +}
51288 +
51289 +static inline void pirq_query_unmask(int pirq)
51290 +{
51291 +       struct physdev_irq_status_query irq_status;
51292 +       irq_status.irq = pirq;
51293 +       (void)HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status);
51294 +       clear_bit(pirq, &pirq_needs_eoi[0]);
51295 +       if (irq_status.flags & XENIRQSTAT_needs_eoi)
51296 +               set_bit(pirq, &pirq_needs_eoi[0]);
51297 +}
51298 +
51299 +/*
51300 + * On startup, if there is no action associated with the IRQ then we are
51301 + * probing. In this case we should not share with others as it will confuse us.
51302 + */
51303 +#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
51304 +
51305 +static unsigned int startup_pirq(unsigned int irq)
51306 +{
51307 +       struct evtchn_bind_pirq bind_pirq;
51308 +       int evtchn = evtchn_from_irq(irq);
51309 +
51310 +       if (VALID_EVTCHN(evtchn))
51311 +               goto out;
51312 +
51313 +       bind_pirq.pirq  = irq;
51314 +       /* NB. We are happy to share unless we are probing. */
51315 +       bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
51316 +       if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
51317 +               if (!probing_irq(irq))
51318 +                       printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
51319 +                              irq);
51320 +               return 0;
51321 +       }
51322 +       evtchn = bind_pirq.port;
51323 +
51324 +       pirq_query_unmask(irq_to_pirq(irq));
51325 +
51326 +       evtchn_to_irq[evtchn] = irq;
51327 +       bind_evtchn_to_cpu(evtchn, 0);
51328 +       irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, evtchn);
51329 +
51330 + out:
51331 +       unmask_evtchn(evtchn);
51332 +       pirq_unmask_notify(irq_to_pirq(irq));
51333 +
51334 +       return 0;
51335 +}
51336 +
51337 +static void shutdown_pirq(unsigned int irq)
51338 +{
51339 +       struct evtchn_close close;
51340 +       int evtchn = evtchn_from_irq(irq);
51341 +
51342 +       if (!VALID_EVTCHN(evtchn))
51343 +               return;
51344 +
51345 +       mask_evtchn(evtchn);
51346 +
51347 +       close.port = evtchn;
51348 +       if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
51349 +               BUG();
51350 +
51351 +       bind_evtchn_to_cpu(evtchn, 0);
51352 +       evtchn_to_irq[evtchn] = -1;
51353 +       irq_info[irq] = IRQ_UNBOUND;
51354 +}
51355 +
51356 +static void enable_pirq(unsigned int irq)
51357 +{
51358 +       int evtchn = evtchn_from_irq(irq);
51359 +
51360 +       if (VALID_EVTCHN(evtchn)) {
51361 +               unmask_evtchn(evtchn);
51362 +               pirq_unmask_notify(irq_to_pirq(irq));
51363 +       }
51364 +}
51365 +
51366 +static void disable_pirq(unsigned int irq)
51367 +{
51368 +       int evtchn = evtchn_from_irq(irq);
51369 +
51370 +       if (VALID_EVTCHN(evtchn))
51371 +               mask_evtchn(evtchn);
51372 +}
51373 +
51374 +static void ack_pirq(unsigned int irq)
51375 +{
51376 +       int evtchn = evtchn_from_irq(irq);
51377 +
51378 +       move_native_irq(irq);
51379 +
51380 +       if (VALID_EVTCHN(evtchn)) {
51381 +               mask_evtchn(evtchn);
51382 +               clear_evtchn(evtchn);
51383 +       }
51384 +}
51385 +
51386 +static void end_pirq(unsigned int irq)
51387 +{
51388 +       int evtchn = evtchn_from_irq(irq);
51389 +
51390 +       if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED)) {
51391 +               unmask_evtchn(evtchn);
51392 +               pirq_unmask_notify(irq_to_pirq(irq));
51393 +       }
51394 +}
51395 +
51396 +static struct hw_interrupt_type pirq_type = {
51397 +       "Phys-irq",
51398 +       startup_pirq,
51399 +       shutdown_pirq,
51400 +       enable_pirq,
51401 +       disable_pirq,
51402 +       ack_pirq,
51403 +       end_pirq,
51404 +       set_affinity_irq
51405 +};
51406 +
51407 +int irq_ignore_unhandled(unsigned int irq)
51408 +{
51409 +       struct physdev_irq_status_query irq_status = { .irq = irq };
51410 +
51411 +       if (!is_running_on_xen())
51412 +               return 0;
51413 +
51414 +       (void)HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status);
51415 +       return !!(irq_status.flags & XENIRQSTAT_shared);
51416 +}
51417 +
51418 +void resend_irq_on_evtchn(struct hw_interrupt_type *h, unsigned int i)
51419 +{
51420 +       int evtchn = evtchn_from_irq(i);
51421 +       shared_info_t *s = HYPERVISOR_shared_info;
51422 +       if (!VALID_EVTCHN(evtchn))
51423 +               return;
51424 +       BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
51425 +       synch_set_bit(evtchn, &s->evtchn_pending[0]);
51426 +}
51427 +
51428 +void notify_remote_via_irq(int irq)
51429 +{
51430 +       int evtchn = evtchn_from_irq(irq);
51431 +
51432 +       if (VALID_EVTCHN(evtchn))
51433 +               notify_remote_via_evtchn(evtchn);
51434 +}
51435 +EXPORT_SYMBOL_GPL(notify_remote_via_irq);
51436 +
51437 +void mask_evtchn(int port)
51438 +{
51439 +       shared_info_t *s = HYPERVISOR_shared_info;
51440 +       synch_set_bit(port, &s->evtchn_mask[0]);
51441 +}
51442 +EXPORT_SYMBOL_GPL(mask_evtchn);
51443 +
51444 +void unmask_evtchn(int port)
51445 +{
51446 +       shared_info_t *s = HYPERVISOR_shared_info;
51447 +       unsigned int cpu = smp_processor_id();
51448 +       vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
51449 +
51450 +       BUG_ON(!irqs_disabled());
51451 +
51452 +       /* Slow path (hypercall) if this is a non-local port. */
51453 +       if (unlikely(cpu != cpu_from_evtchn(port))) {
51454 +               struct evtchn_unmask unmask = { .port = port };
51455 +               (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
51456 +               return;
51457 +       }
51458 +
51459 +       synch_clear_bit(port, &s->evtchn_mask[0]);
51460 +
51461 +       /*
51462 +        * The following is basically the equivalent of 'hw_resend_irq'. Just
51463 +        * like a real IO-APIC we 'lose the interrupt edge' if the channel is
51464 +        * masked.
51465 +        */
51466 +       if (synch_test_bit(port, &s->evtchn_pending[0]) &&
51467 +           !synch_test_and_set_bit(port / BITS_PER_LONG,
51468 +                                   &vcpu_info->evtchn_pending_sel))
51469 +               vcpu_info->evtchn_upcall_pending = 1;
51470 +}
51471 +EXPORT_SYMBOL_GPL(unmask_evtchn);
51472 +
51473 +void irq_resume(void)
51474 +{
51475 +       struct evtchn_bind_virq bind_virq;
51476 +       struct evtchn_bind_ipi  bind_ipi;
51477 +       int cpu, pirq, virq, ipi, irq, evtchn;
51478 +
51479 +       init_evtchn_cpu_bindings();
51480 +
51481 +       /* New event-channel space is not 'live' yet. */
51482 +       for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
51483 +               mask_evtchn(evtchn);
51484 +
51485 +       /* Check that no PIRQs are still bound. */
51486 +       for (pirq = 0; pirq < NR_PIRQS; pirq++)
51487 +               BUG_ON(irq_info[pirq_to_irq(pirq)] != IRQ_UNBOUND);
51488 +
51489 +       /* Secondary CPUs must have no VIRQ or IPI bindings. */
51490 +       for_each_possible_cpu(cpu) {
51491 +               if (cpu == 0)
51492 +                       continue;
51493 +               for (virq = 0; virq < NR_VIRQS; virq++)
51494 +                       BUG_ON(per_cpu(virq_to_irq, cpu)[virq] != -1);
51495 +               for (ipi = 0; ipi < NR_IPIS; ipi++)
51496 +                       BUG_ON(per_cpu(ipi_to_irq, cpu)[ipi] != -1);
51497 +       }
51498 +
51499 +       /* No IRQ <-> event-channel mappings. */
51500 +       for (irq = 0; irq < NR_IRQS; irq++)
51501 +               irq_info[irq] &= ~0xFFFF; /* zap event-channel binding */
51502 +       for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
51503 +               evtchn_to_irq[evtchn] = -1;
51504 +
51505 +       /* Primary CPU: rebind VIRQs automatically. */
51506 +       for (virq = 0; virq < NR_VIRQS; virq++) {
51507 +               if ((irq = per_cpu(virq_to_irq, 0)[virq]) == -1)
51508 +                       continue;
51509 +
51510 +               BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0));
51511 +
51512 +               /* Get a new binding from Xen. */
51513 +               bind_virq.virq = virq;
51514 +               bind_virq.vcpu = 0;
51515 +               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
51516 +                                               &bind_virq) != 0)
51517 +                       BUG();
51518 +               evtchn = bind_virq.port;
51519 +
51520 +               /* Record the new mapping. */
51521 +               evtchn_to_irq[evtchn] = irq;
51522 +               irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
51523 +
51524 +               /* Ready for use. */
51525 +               unmask_evtchn(evtchn);
51526 +       }
51527 +
51528 +       /* Primary CPU: rebind IPIs automatically. */
51529 +       for (ipi = 0; ipi < NR_IPIS; ipi++) {
51530 +               if ((irq = per_cpu(ipi_to_irq, 0)[ipi]) == -1)
51531 +                       continue;
51532 +
51533 +               BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0));
51534 +
51535 +               /* Get a new binding from Xen. */
51536 +               bind_ipi.vcpu = 0;
51537 +               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
51538 +                                               &bind_ipi) != 0)
51539 +                       BUG();
51540 +               evtchn = bind_ipi.port;
51541 +
51542 +               /* Record the new mapping. */
51543 +               evtchn_to_irq[evtchn] = irq;
51544 +               irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
51545 +
51546 +               /* Ready for use. */
51547 +               unmask_evtchn(evtchn);
51548 +       }
51549 +}
51550 +
51551 +void __init xen_init_IRQ(void)
51552 +{
51553 +       int i;
51554 +
51555 +       init_evtchn_cpu_bindings();
51556 +
51557 +       /* No event channels are 'live' right now. */
51558 +       for (i = 0; i < NR_EVENT_CHANNELS; i++)
51559 +               mask_evtchn(i);
51560 +
51561 +       /* No IRQ -> event-channel mappings. */
51562 +       for (i = 0; i < NR_IRQS; i++)
51563 +               irq_info[i] = IRQ_UNBOUND;
51564 +
51565 +       /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
51566 +       for (i = 0; i < NR_DYNIRQS; i++) {
51567 +               irq_bindcount[dynirq_to_irq(i)] = 0;
51568 +
51569 +               irq_desc[dynirq_to_irq(i)].status  = IRQ_DISABLED;
51570 +               irq_desc[dynirq_to_irq(i)].action  = NULL;
51571 +               irq_desc[dynirq_to_irq(i)].depth   = 1;
51572 +               irq_desc[dynirq_to_irq(i)].handler = &dynirq_type;
51573 +       }
51574 +
51575 +       /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
51576 +       for (i = 0; i < NR_PIRQS; i++) {
51577 +               irq_bindcount[pirq_to_irq(i)] = 1;
51578 +
51579 +#ifdef RTC_IRQ
51580 +               /* If not domain 0, force our RTC driver to fail its probe. */
51581 +               if ((i == RTC_IRQ) &&
51582 +                   !(xen_start_info->flags & SIF_INITDOMAIN))
51583 +                       continue;
51584 +#endif
51585 +
51586 +               irq_desc[pirq_to_irq(i)].status  = IRQ_DISABLED;
51587 +               irq_desc[pirq_to_irq(i)].action  = NULL;
51588 +               irq_desc[pirq_to_irq(i)].depth   = 1;
51589 +               irq_desc[pirq_to_irq(i)].handler = &pirq_type;
51590 +       }
51591 +}
51592 diff -urNp linux-2.6/drivers/xen/core/features.c new/drivers/xen/core/features.c
51593 --- linux-2.6/drivers/xen/core/features.c       1970-01-01 01:00:00.000000000 +0100
51594 +++ new/drivers/xen/core/features.c     2006-05-09 12:34:37.000000000 +0200
51595 @@ -0,0 +1,30 @@
51596 +/******************************************************************************
51597 + * features.c
51598 + *
51599 + * Xen feature flags.
51600 + *
51601 + * Copyright (c) 2006, Ian Campbell, XenSource Inc.
51602 + */
51603 +#include <linux/types.h>
51604 +#include <linux/cache.h>
51605 +#include <linux/module.h>
51606 +#include <asm/hypervisor.h>
51607 +#include <xen/features.h>
51608 +
51609 +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
51610 +/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */
51611 +EXPORT_SYMBOL(xen_features);
51612 +
51613 +void setup_xen_features(void)
51614 +{
51615 +       xen_feature_info_t fi;
51616 +       int i, j;
51617 +
51618 +       for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
51619 +               fi.submap_idx = i;
51620 +               if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
51621 +                       break;
51622 +               for (j = 0; j < 32; j++)
51623 +                       xen_features[i*32+j] = !!(fi.submap & (1<<j));
51624 +       }
51625 +}
51626 diff -urNp linux-2.6/drivers/xen/core/gnttab.c new/drivers/xen/core/gnttab.c
51627 --- linux-2.6/drivers/xen/core/gnttab.c 1970-01-01 01:00:00.000000000 +0100
51628 +++ new/drivers/xen/core/gnttab.c       2006-07-07 15:10:03.000000000 +0200
51629 @@ -0,0 +1,442 @@
51630 +/******************************************************************************
51631 + * gnttab.c
51632 + *
51633 + * Granting foreign access to our memory reservation.
51634 + *
51635 + * Copyright (c) 2005, Christopher Clark
51636 + * Copyright (c) 2004-2005, K A Fraser
51637 + *
51638 + * This program is free software; you can redistribute it and/or
51639 + * modify it under the terms of the GNU General Public License version 2
51640 + * as published by the Free Software Foundation; or, when distributed
51641 + * separately from the Linux kernel or incorporated into other
51642 + * software packages, subject to the following license:
51643 + *
51644 + * Permission is hereby granted, free of charge, to any person obtaining a copy
51645 + * of this source file (the "Software"), to deal in the Software without
51646 + * restriction, including without limitation the rights to use, copy, modify,
51647 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
51648 + * and to permit persons to whom the Software is furnished to do so, subject to
51649 + * the following conditions:
51650 + *
51651 + * The above copyright notice and this permission notice shall be included in
51652 + * all copies or substantial portions of the Software.
51653 + *
51654 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51655 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51656 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51657 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51658 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
51659 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51660 + * IN THE SOFTWARE.
51661 + */
51662 +
51663 +#include <linux/config.h>
51664 +#include <linux/module.h>
51665 +#include <linux/sched.h>
51666 +#include <linux/mm.h>
51667 +#include <linux/vmalloc.h>
51668 +#include <xen/interface/xen.h>
51669 +#include <xen/gnttab.h>
51670 +#include <asm/pgtable.h>
51671 +#include <asm/uaccess.h>
51672 +#include <asm/synch_bitops.h>
51673 +
51674 +/* External tools reserve first few grant table entries. */
51675 +#define NR_RESERVED_ENTRIES 8
51676 +
51677 +#define NR_GRANT_ENTRIES \
51678 +       (NR_GRANT_FRAMES * PAGE_SIZE / sizeof(struct grant_entry))
51679 +#define GNTTAB_LIST_END (NR_GRANT_ENTRIES + 1)
51680 +
51681 +static grant_ref_t gnttab_list[NR_GRANT_ENTRIES];
51682 +static int gnttab_free_count;
51683 +static grant_ref_t gnttab_free_head;
51684 +static DEFINE_SPINLOCK(gnttab_list_lock);
51685 +
51686 +static struct grant_entry *shared;
51687 +
51688 +static struct gnttab_free_callback *gnttab_free_callback_list;
51689 +
51690 +static int get_free_entries(int count)
51691 +{
51692 +       unsigned long flags;
51693 +       int ref;
51694 +       grant_ref_t head;
51695 +       spin_lock_irqsave(&gnttab_list_lock, flags);
51696 +       if (gnttab_free_count < count) {
51697 +               spin_unlock_irqrestore(&gnttab_list_lock, flags);
51698 +               return -1;
51699 +       }
51700 +       ref = head = gnttab_free_head;
51701 +       gnttab_free_count -= count;
51702 +       while (count-- > 1)
51703 +               head = gnttab_list[head];
51704 +       gnttab_free_head = gnttab_list[head];
51705 +       gnttab_list[head] = GNTTAB_LIST_END;
51706 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
51707 +       return ref;
51708 +}
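+
+/*
+ * gnttab_list[] doubles as a singly linked free list: gnttab_free_head
+ * names the first free grant reference and gnttab_list[ref] names the
+ * next one, terminated by GNTTAB_LIST_END.  get_free_entries(n) above
+ * unlinks the first n references in one go and returns the head of the
+ * detached chain, whose tail is re-terminated with GNTTAB_LIST_END.
+ */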
51709 +
51710 +#define get_free_entry() get_free_entries(1)
51711 +
51712 +static void do_free_callbacks(void)
51713 +{
51714 +       struct gnttab_free_callback *callback, *next;
51715 +
51716 +       callback = gnttab_free_callback_list;
51717 +       gnttab_free_callback_list = NULL;
51718 +
51719 +       while (callback != NULL) {
51720 +               next = callback->next;
51721 +               if (gnttab_free_count >= callback->count) {
51722 +                       callback->next = NULL;
51723 +                       callback->fn(callback->arg);
51724 +               } else {
51725 +                       callback->next = gnttab_free_callback_list;
51726 +                       gnttab_free_callback_list = callback;
51727 +               }
51728 +               callback = next;
51729 +       }
51730 +}
51731 +
51732 +static inline void check_free_callbacks(void)
51733 +{
51734 +       if (unlikely(gnttab_free_callback_list))
51735 +               do_free_callbacks();
51736 +}
51737 +
51738 +static void put_free_entry(grant_ref_t ref)
51739 +{
51740 +       unsigned long flags;
51741 +       spin_lock_irqsave(&gnttab_list_lock, flags);
51742 +       gnttab_list[ref] = gnttab_free_head;
51743 +       gnttab_free_head = ref;
51744 +       gnttab_free_count++;
51745 +       check_free_callbacks();
51746 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
51747 +}
51748 +
51749 +/*
51750 + * Public grant-issuing interface functions
51751 + */
51752 +
51753 +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
51754 +                               int readonly)
51755 +{
51756 +       int ref;
51757 +
51758 +       if (unlikely((ref = get_free_entry()) == -1))
51759 +               return -ENOSPC;
51760 +
51761 +       shared[ref].frame = frame;
51762 +       shared[ref].domid = domid;
51763 +       wmb();
51764 +       shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
51765 +
51766 +       return ref;
51767 +}
51768 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
51769 +
51770 +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
51771 +                                    unsigned long frame, int readonly)
51772 +{
51773 +       shared[ref].frame = frame;
51774 +       shared[ref].domid = domid;
51775 +       wmb();
51776 +       shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
51777 +}
51778 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
51779 +
51780 +
51781 +int gnttab_query_foreign_access(grant_ref_t ref)
51782 +{
51783 +       u16 nflags;
51784 +
51785 +       nflags = shared[ref].flags;
51786 +
51787 +       return (nflags & (GTF_reading|GTF_writing));
51788 +}
51789 +EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
51790 +
51791 +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
51792 +{
51793 +       u16 flags, nflags;
51794 +
51795 +       nflags = shared[ref].flags;
51796 +       do {
51797 +               if ((flags = nflags) & (GTF_reading|GTF_writing)) {
51798 +                       printk(KERN_ALERT "WARNING: g.e. still in use!\n");
51799 +                       return 0;
51800 +               }
51801 +       } while ((nflags = synch_cmpxchg_subword(&shared[ref].flags, flags, 0)) !=
51802 +                flags);
51803 +
51804 +       return 1;
51805 +}
51806 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
51807 +
51808 +void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
51809 +                              unsigned long page)
51810 +{
51811 +       if (gnttab_end_foreign_access_ref(ref, readonly)) {
51812 +               put_free_entry(ref);
51813 +               if (page != 0)
51814 +                       free_page(page);
51815 +       } else {
51816 +               /* XXX This needs to be fixed so that the ref and page are
51817 +                  placed on a list to be freed up later. */
51818 +               printk(KERN_WARNING
51819 +                      "WARNING: leaking g.e. and page still in use!\n");
51820 +       }
51821 +}
51822 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
51823 +
51824 +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
51825 +{
51826 +       int ref;
51827 +
51828 +       if (unlikely((ref = get_free_entry()) == -1))
51829 +               return -ENOSPC;
51830 +       gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
51831 +
51832 +       return ref;
51833 +}
51834 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
51835 +
51836 +void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
51837 +                                      unsigned long pfn)
51838 +{
51839 +       shared[ref].frame = pfn;
51840 +       shared[ref].domid = domid;
51841 +       wmb();
51842 +       shared[ref].flags = GTF_accept_transfer;
51843 +}
51844 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
51845 +
51846 +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
51847 +{
51848 +       unsigned long frame;
51849 +       u16           flags;
51850 +
51851 +       /*
51852 +        * If the transfer has not even started yet, try to reclaim the grant
51853 +        * reference and return failure (== 0).
51854 +        */
51855 +       while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
51856 +               if (synch_cmpxchg_subword(&shared[ref].flags, flags, 0) == flags)
51857 +                       return 0;
51858 +               cpu_relax();
51859 +       }
51860 +
51861 +       /* If a transfer is in progress then wait until it is completed. */
51862 +       while (!(flags & GTF_transfer_completed)) {
51863 +               flags = shared[ref].flags;
51864 +               cpu_relax();
51865 +       }
51866 +
51867 +       /* Read the frame number /after/ reading completion status. */
51868 +       rmb();
51869 +       frame = shared[ref].frame;
51870 +       BUG_ON(frame == 0);
51871 +
51872 +       return frame;
51873 +}
51874 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
51875 +
51876 +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
51877 +{
51878 +       unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
51879 +       put_free_entry(ref);
51880 +       return frame;
51881 +}
51882 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
51883 +
51884 +void gnttab_free_grant_reference(grant_ref_t ref)
51885 +{
51886 +       put_free_entry(ref);
51887 +}
51888 +EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
51889 +
51890 +void gnttab_free_grant_references(grant_ref_t head)
51891 +{
51892 +       grant_ref_t ref;
51893 +       unsigned long flags;
51894 +       int count = 1;
51895 +       if (head == GNTTAB_LIST_END)
51896 +               return;
51897 +       spin_lock_irqsave(&gnttab_list_lock, flags);
51898 +       ref = head;
51899 +       while (gnttab_list[ref] != GNTTAB_LIST_END) {
51900 +               ref = gnttab_list[ref];
51901 +               count++;
51902 +       }
51903 +       gnttab_list[ref] = gnttab_free_head;
51904 +       gnttab_free_head = head;
51905 +       gnttab_free_count += count;
51906 +       check_free_callbacks();
51907 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
51908 +}
51909 +EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
51910 +
51911 +int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
51912 +{
51913 +       int h = get_free_entries(count);
51914 +
51915 +       if (h == -1)
51916 +               return -ENOSPC;
51917 +
51918 +       *head = h;
51919 +
51920 +       return 0;
51921 +}
51922 +EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
51923 +
51924 +int gnttab_empty_grant_references(const grant_ref_t *private_head)
51925 +{
51926 +       return (*private_head == GNTTAB_LIST_END);
51927 +}
51928 +EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
51929 +
51930 +int gnttab_claim_grant_reference(grant_ref_t *private_head)
51931 +{
51932 +       grant_ref_t g = *private_head;
51933 +       if (unlikely(g == GNTTAB_LIST_END))
51934 +               return -ENOSPC;
51935 +       *private_head = gnttab_list[g];
51936 +       return g;
51937 +}
51938 +EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
51939 +
51940 +void gnttab_release_grant_reference(grant_ref_t *private_head,
51941 +                                   grant_ref_t release)
51942 +{
51943 +       gnttab_list[release] = *private_head;
51944 +       *private_head = release;
51945 +}
51946 +EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
51947 +
51948 +void gnttab_request_free_callback(struct gnttab_free_callback *callback,
51949 +                                 void (*fn)(void *), void *arg, u16 count)
51950 +{
51951 +       unsigned long flags;
51952 +       spin_lock_irqsave(&gnttab_list_lock, flags);
51953 +       if (callback->next)
51954 +               goto out;
51955 +       callback->fn = fn;
51956 +       callback->arg = arg;
51957 +       callback->count = count;
51958 +       callback->next = gnttab_free_callback_list;
51959 +       gnttab_free_callback_list = callback;
51960 +       check_free_callbacks();
51961 +out:
51962 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
51963 +}
51964 +EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
51965 +
51966 +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
51967 +{
51968 +       struct gnttab_free_callback **pcb;
51969 +       unsigned long flags;
51970 +
51971 +       spin_lock_irqsave(&gnttab_list_lock, flags);
51972 +       for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) {
51973 +               if (*pcb == callback) {
51974 +                       *pcb = callback->next;
51975 +                       break;
51976 +               }
51977 +       }
51978 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
51979 +}
51980 +EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
51981 +
51982 +#ifndef __ia64__
51983 +static int map_pte_fn(pte_t *pte, struct page *pmd_page,
51984 +                     unsigned long addr, void *data)
51985 +{
51986 +       unsigned long **frames = (unsigned long **)data;
51987 +
51988 +       set_pte_at(&init_mm, addr, pte, pfn_pte_ma((*frames)[0], PAGE_KERNEL));
51989 +       (*frames)++;
51990 +       return 0;
51991 +}
51992 +
51993 +static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
51994 +                       unsigned long addr, void *data)
51995 +{
51996 +
51997 +       set_pte_at(&init_mm, addr, pte, __pte(0));
51998 +       return 0;
51999 +}
52000 +#endif
52001 +
52002 +int gnttab_resume(void)
52003 +{
52004 +       struct gnttab_setup_table setup;
52005 +       unsigned long frames[NR_GRANT_FRAMES];
52006 +       int rc;
52007 +#ifndef __ia64__
52008 +       void *pframes = frames;
52009 +       struct vm_struct *area;
52010 +#endif
52011 +
52012 +       setup.dom        = DOMID_SELF;
52013 +       setup.nr_frames  = NR_GRANT_FRAMES;
52014 +       set_xen_guest_handle(setup.frame_list, frames);
52015 +
52016 +       rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
52017 +       if (rc == -ENOSYS)
52018 +               return -ENOSYS;
52019 +
52020 +       BUG_ON(rc || setup.status);
52021 +
52022 +#ifndef __ia64__
52023 +       if (shared == NULL) {
52024 +               area = get_vm_area(PAGE_SIZE * NR_GRANT_FRAMES, VM_IOREMAP);
52025 +               BUG_ON(area == NULL);
52026 +               shared = area->addr;
52027 +       }
52028 +       rc = apply_to_page_range(&init_mm, (unsigned long)shared,
52029 +                                PAGE_SIZE * NR_GRANT_FRAMES,
52030 +                                map_pte_fn, &pframes);
52031 +       BUG_ON(rc);
52032 +#else
52033 +       shared = __va(frames[0] << PAGE_SHIFT);
52034 +       printk("grant table at %p\n", shared);
52035 +#endif
52036 +
52037 +       return 0;
52038 +}
52039 +
52040 +int gnttab_suspend(void)
52041 +{
52042 +
52043 +#ifndef __ia64__
52044 +       apply_to_page_range(&init_mm, (unsigned long)shared,
52045 +                           PAGE_SIZE * NR_GRANT_FRAMES,
52046 +                           unmap_pte_fn, NULL);
52047 +#endif
52048 +
52049 +       return 0;
52050 +}
52051 +
52052 +static int __init gnttab_init(void)
52053 +{
52054 +       int i;
52055 +
52056 +       if (!is_running_on_xen())
52057 +               return -ENODEV;
52058 +
52059 +       if (gnttab_resume() < 0)
52060 +               return -ENODEV;
52061 +
52062 +       for (i = NR_RESERVED_ENTRIES; i < NR_GRANT_ENTRIES; i++)
52063 +               gnttab_list[i] = i + 1;
52064 +       gnttab_free_count = NR_GRANT_ENTRIES - NR_RESERVED_ENTRIES;
52065 +       gnttab_free_head  = NR_RESERVED_ENTRIES;
52066 +
52067 +       printk("Grant table initialized\n");
52068 +       return 0;
52069 +}
52070 +
52071 +core_initcall(gnttab_init);
52072 diff -urNp linux-2.6/drivers/xen/core/hypervisor_sysfs.c new/drivers/xen/core/hypervisor_sysfs.c
52073 --- linux-2.6/drivers/xen/core/hypervisor_sysfs.c       1970-01-01 01:00:00.000000000 +0100
52074 +++ new/drivers/xen/core/hypervisor_sysfs.c     2006-05-23 18:42:17.000000000 +0200
52075 @@ -0,0 +1,60 @@
52076 +/*
52077 + *  copyright (c) 2006 IBM Corporation
52078 + *  Authored by: Mike D. Day <ncmike@us.ibm.com>
52079 + *
52080 + *  This program is free software; you can redistribute it and/or modify
52081 + *  it under the terms of the GNU General Public License version 2 as
52082 + *  published by the Free Software Foundation.
52083 + */
52084 +
52085 +#include <linux/config.h>
52086 +#include <linux/kernel.h>
52087 +#include <linux/module.h>
52088 +#include <linux/kobject.h>
52089 +#include <xen/hypervisor_sysfs.h>
52090 +
52091 +decl_subsys(hypervisor, NULL, NULL);
52092 +
52093 +static ssize_t hyp_sysfs_show(struct kobject *kobj,
52094 +                             struct attribute *attr,
52095 +                             char *buffer)
52096 +{
52097 +       struct hyp_sysfs_attr *hyp_attr;
52098 +       hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
52099 +       if (hyp_attr->show)
52100 +               return hyp_attr->show(hyp_attr, buffer);
52101 +       return 0;
52102 +}
52103 +
52104 +static ssize_t hyp_sysfs_store(struct kobject *kobj,
52105 +                              struct attribute *attr,
52106 +                              const char *buffer,
52107 +                              size_t len)
52108 +{
52109 +       struct hyp_sysfs_attr *hyp_attr;
52110 +       hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
52111 +       if (hyp_attr->store)
52112 +               return hyp_attr->store(hyp_attr, buffer, len);
52113 +       return 0;
52114 +}
52115 +
52116 +struct sysfs_ops hyp_sysfs_ops = {
52117 +       .show = hyp_sysfs_show,
52118 +       .store = hyp_sysfs_store,
52119 +};
52120 +
52121 +static struct kobj_type hyp_sysfs_kobj_type = {
52122 +       .sysfs_ops = &hyp_sysfs_ops,
52123 +};
52124 +
52125 +static int __init hypervisor_subsys_init(void)
52126 +{
52127 +       if (!is_running_on_xen())
52128 +               return -ENODEV;
52129 +
52130 +       hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type;
52131 +       return subsystem_register(&hypervisor_subsys);
52132 +}
52133 +
52134 +device_initcall(hypervisor_subsys_init);
52135 +EXPORT_SYMBOL_GPL(hypervisor_subsys);
52136 diff -urNp linux-2.6/drivers/xen/core/Makefile new/drivers/xen/core/Makefile
52137 --- linux-2.6/drivers/xen/core/Makefile 1970-01-01 01:00:00.000000000 +0100
52138 +++ new/drivers/xen/core/Makefile       2006-06-28 14:32:14.000000000 +0200
52139 @@ -0,0 +1,13 @@
52140 +#
52141 +# Makefile for the linux kernel.
52142 +#
52143 +
52144 +obj-y := evtchn.o gnttab.o features.o
52145 +
52146 +obj-$(CONFIG_PROC_FS)          += xen_proc.o
52147 +obj-$(CONFIG_SYSFS)            += hypervisor_sysfs.o
52148 +obj-$(CONFIG_HOTPLUG_CPU)      += cpu_hotplug.o
52149 +obj-$(CONFIG_XEN_SYSFS)                += xen_sysfs.o
52150 +obj-$(CONFIG_XEN_SKBUFF)       += skbuff.o
52151 +obj-$(CONFIG_XEN_REBOOT)       += reboot.o
52152 +obj-$(CONFIG_XEN_SMPBOOT)      += smpboot.o
52153 diff -urNp linux-2.6/drivers/xen/core/reboot.c new/drivers/xen/core/reboot.c
52154 --- linux-2.6/drivers/xen/core/reboot.c 1970-01-01 01:00:00.000000000 +0100
52155 +++ new/drivers/xen/core/reboot.c       2006-06-28 14:32:14.000000000 +0200
52156 @@ -0,0 +1,364 @@
52157 +#define __KERNEL_SYSCALLS__
52158 +#include <linux/version.h>
52159 +#include <linux/kernel.h>
52160 +#include <linux/mm.h>
52161 +#include <linux/unistd.h>
52162 +#include <linux/module.h>
52163 +#include <linux/reboot.h>
52164 +#include <linux/sysrq.h>
52165 +#include <linux/stringify.h>
52166 +#include <asm/irq.h>
52167 +#include <asm/mmu_context.h>
52168 +#include <xen/evtchn.h>
52169 +#include <asm/hypervisor.h>
52170 +#include <xen/interface/dom0_ops.h>
52171 +#include <xen/xenbus.h>
52172 +#include <linux/cpu.h>
52173 +#include <linux/kthread.h>
52174 +#include <xen/gnttab.h>
52175 +#include <xen/xencons.h>
52176 +#include <xen/cpu_hotplug.h>
52177 +
52178 +#if defined(__i386__) || defined(__x86_64__)
52179 +/*
52180 + * Power off function, if any
52181 + */
52182 +void (*pm_power_off)(void);
52183 +EXPORT_SYMBOL(pm_power_off);
52184 +#endif
52185 +
52186 +extern void ctrl_alt_del(void);
52187 +
52188 +#define SHUTDOWN_INVALID  -1
52189 +#define SHUTDOWN_POWEROFF  0
52190 +#define SHUTDOWN_SUSPEND   2
52191 +/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
52192 + * report a crash, not be instructed to crash!
52193 + * HALT is the same as POWEROFF, as far as we're concerned.  The tools use
52194 + * the distinction when we return the reason code to them.
52195 + */
52196 +#define SHUTDOWN_HALT      4
52197 +
52198 +void machine_emergency_restart(void)
52199 +{
52200 +       /* We really want to get pending console data out before we die. */
52201 +       xencons_force_flush();
52202 +       HYPERVISOR_shutdown(SHUTDOWN_reboot);
52203 +}
52204 +
52205 +void machine_restart(char * __unused)
52206 +{
52207 +       machine_emergency_restart();
52208 +}
52209 +
52210 +void machine_halt(void)
52211 +{
52212 +       machine_power_off();
52213 +}
52214 +
52215 +void machine_power_off(void)
52216 +{
52217 +       /* We really want to get pending console data out before we die. */
52218 +       xencons_force_flush();
52219 +#if defined(__i386__) || defined(__x86_64__)
52220 +       if (pm_power_off)
52221 +               pm_power_off();
52222 +#endif
52223 +       HYPERVISOR_shutdown(SHUTDOWN_poweroff);
52224 +}
52225 +
52226 +int reboot_thru_bios = 0;      /* for dmi_scan.c */
52227 +EXPORT_SYMBOL(machine_restart);
52228 +EXPORT_SYMBOL(machine_halt);
52229 +EXPORT_SYMBOL(machine_power_off);
52230 +
52231 +
52232 +/******************************************************************************
52233 + * Stop/pickle callback handling.
52234 + */
52235 +
52236 +/* Ignore multiple shutdown requests. */
52237 +static int shutting_down = SHUTDOWN_INVALID;
52238 +static void __shutdown_handler(void *unused);
52239 +static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
52240 +
52241 +/* Ensure we run on the idle task page tables so that we will
52242 +   switch page tables before running user space. This is needed
52243 +   on architectures with separate kernel and user page tables
52244 +   because the user page table pointer is not saved/restored. */
52245 +static void switch_idle_mm(void)
52246 +{
52247 +       struct mm_struct *mm = current->active_mm;
52248 +
52249 +       if (mm == &init_mm)
52250 +               return;
52251 +
52252 +       atomic_inc(&init_mm.mm_count);
52253 +       switch_mm(mm, &init_mm, current);
52254 +       current->active_mm = &init_mm;
52255 +       mmdrop(mm);
52256 +}
52257 +
52258 +static int __do_suspend(void *ignore)
52259 +{
52260 +       int i, j, k, fpp, err;
52261 +
52262 +       extern unsigned long max_pfn;
52263 +       extern unsigned long *pfn_to_mfn_frame_list_list;
52264 +       extern unsigned long *pfn_to_mfn_frame_list[];
52265 +
52266 +       extern void time_resume(void);
52267 +
52268 +       BUG_ON(smp_processor_id() != 0);
52269 +       BUG_ON(in_interrupt());
52270 +
52271 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
52272 +               printk(KERN_WARNING "Cannot suspend in "
52273 +                      "auto_translated_physmap mode.\n");
52274 +               return -EOPNOTSUPP;
52275 +       }
52276 +
52277 +       err = smp_suspend();
52278 +       if (err)
52279 +               return err;
52280 +
52281 +       xenbus_suspend();
52282 +
52283 +       preempt_disable();
52284 +
52285 +#ifdef __i386__
52286 +       kmem_cache_shrink(pgd_cache);
52287 +#endif
52288 +       mm_pin_all();
52289 +
52290 +       __cli();
52291 +       preempt_enable();
52292 +
52293 +       gnttab_suspend();
52294 +
52295 +       HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
52296 +       clear_fixmap(FIX_SHARED_INFO);
52297 +
52298 +       xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
52299 +       xen_start_info->console_mfn = mfn_to_pfn(xen_start_info->console_mfn);
52300 +
52301 +       /*
52302 +        * We'll stop somewhere inside this hypercall. When it returns,
52303 +        * we'll start resuming after the restore.
52304 +        */
52305 +       HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
52306 +
52307 +       shutting_down = SHUTDOWN_INVALID;
52308 +
52309 +       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
52310 +
52311 +       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
52312 +
52313 +       memset(empty_zero_page, 0, PAGE_SIZE);
52314 +
52315 +       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
52316 +               virt_to_mfn(pfn_to_mfn_frame_list_list);
52317 +
52318 +       fpp = PAGE_SIZE/sizeof(unsigned long);
52319 +       for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
52320 +               if ((j % fpp) == 0) {
52321 +                       k++;
52322 +                       pfn_to_mfn_frame_list_list[k] =
52323 +                               virt_to_mfn(pfn_to_mfn_frame_list[k]);
52324 +                       j = 0;
52325 +               }
52326 +               pfn_to_mfn_frame_list[k][j] =
52327 +                       virt_to_mfn(&phys_to_machine_mapping[i]);
52328 +       }
52329 +       HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
52330 +
52331 +       gnttab_resume();
52332 +
52333 +       irq_resume();
52334 +
52335 +       time_resume();
52336 +
52337 +       switch_idle_mm();
52338 +
52339 +       __sti();
52340 +
52341 +       xencons_resume();
52342 +
52343 +       xenbus_resume();
52344 +
52345 +       smp_resume();
52346 +
52347 +       return err;
52348 +}
52349 +
52350 +static int shutdown_process(void *__unused)
52351 +{
52352 +       static char *envp[] = { "HOME=/", "TERM=linux",
52353 +                               "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
52354 +       static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
52355 +
52356 +       extern asmlinkage long sys_reboot(int magic1, int magic2,
52357 +                                         unsigned int cmd, void *arg);
52358 +
52359 +       if ((shutting_down == SHUTDOWN_POWEROFF) ||
52360 +           (shutting_down == SHUTDOWN_HALT)) {
52361 +               if (execve("/sbin/poweroff", poweroff_argv, envp) < 0) {
52362 +                       sys_reboot(LINUX_REBOOT_MAGIC1,
52363 +                                  LINUX_REBOOT_MAGIC2,
52364 +                                  LINUX_REBOOT_CMD_POWER_OFF,
52365 +                                  NULL);
52366 +               }
52367 +       }
52368 +
52369 +       shutting_down = SHUTDOWN_INVALID; /* could try again */
52370 +
52371 +       return 0;
52372 +}
52373 +
52374 +static int kthread_create_on_cpu(int (*f)(void *arg),
52375 +                                void *arg,
52376 +                                const char *name,
52377 +                                int cpu)
52378 +{
52379 +       struct task_struct *p;
52380 +       p = kthread_create(f, arg, name);
52381 +       if (IS_ERR(p))
52382 +               return PTR_ERR(p);
52383 +       kthread_bind(p, cpu);
52384 +       wake_up_process(p);
52385 +       return 0;
52386 +}
52387 +
52388 +static void __shutdown_handler(void *unused)
52389 +{
52390 +       int err;
52391 +
52392 +       if (shutting_down != SHUTDOWN_SUSPEND)
52393 +               err = kernel_thread(shutdown_process, NULL,
52394 +                                   CLONE_FS | CLONE_FILES);
52395 +       else
52396 +               err = kthread_create_on_cpu(__do_suspend, NULL, "suspend", 0);
52397 +
52398 +       if (err < 0) {
52399 +               printk(KERN_WARNING "Error creating shutdown process (%d): "
52400 +                      "retrying...\n", -err);
52401 +               schedule_delayed_work(&shutdown_work, HZ/2);
52402 +       }
52403 +}
52404 +
52405 +static void shutdown_handler(struct xenbus_watch *watch,
52406 +                            const char **vec, unsigned int len)
52407 +{
52408 +       char *str;
52409 +       struct xenbus_transaction xbt;
52410 +       int err;
52411 +
52412 +       if (shutting_down != SHUTDOWN_INVALID)
52413 +               return;
52414 +
52415 + again:
52416 +       err = xenbus_transaction_start(&xbt);
52417 +       if (err)
52418 +               return;
52419 +       str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
52420 +       /* Ignore read errors and empty reads. */
52421 +       if (XENBUS_IS_ERR_READ(str)) {
52422 +               xenbus_transaction_end(xbt, 1);
52423 +               return;
52424 +       }
52425 +
52426 +       xenbus_write(xbt, "control", "shutdown", "");
52427 +
52428 +       err = xenbus_transaction_end(xbt, 0);
52429 +       if (err == -EAGAIN) {
52430 +               kfree(str);
52431 +               goto again;
52432 +       }
52433 +
52434 +       if (strcmp(str, "poweroff") == 0)
52435 +               shutting_down = SHUTDOWN_POWEROFF;
52436 +       else if (strcmp(str, "reboot") == 0)
52437 +               ctrl_alt_del();
52438 +       else if (strcmp(str, "suspend") == 0)
52439 +               shutting_down = SHUTDOWN_SUSPEND;
52440 +       else if (strcmp(str, "halt") == 0)
52441 +               shutting_down = SHUTDOWN_HALT;
52442 +       else {
52443 +               printk("Ignoring shutdown request: %s\n", str);
52444 +               shutting_down = SHUTDOWN_INVALID;
52445 +       }
52446 +
52447 +       if (shutting_down != SHUTDOWN_INVALID)
52448 +               schedule_work(&shutdown_work);
52449 +
52450 +       kfree(str);
52451 +}
52452 +
52453 +static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
52454 +                         unsigned int len)
52455 +{
52456 +       char sysrq_key = '\0';
52457 +       struct xenbus_transaction xbt;
52458 +       int err;
52459 +
52460 + again:
52461 +       err = xenbus_transaction_start(&xbt);
52462 +       if (err)
52463 +               return;
52464 +       if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
52465 +               printk(KERN_ERR "Unable to read sysrq code in "
52466 +                      "control/sysrq\n");
52467 +               xenbus_transaction_end(xbt, 1);
52468 +               return;
52469 +       }
52470 +
52471 +       if (sysrq_key != '\0')
52472 +               xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
52473 +
52474 +       err = xenbus_transaction_end(xbt, 0);
52475 +       if (err == -EAGAIN)
52476 +               goto again;
52477 +
52478 +#ifdef CONFIG_MAGIC_SYSRQ
52479 +       if (sysrq_key != '\0')
52480 +               handle_sysrq(sysrq_key, NULL, NULL);
52481 +#endif
52482 +}
52483 +
52484 +static struct xenbus_watch shutdown_watch = {
52485 +       .node = "control/shutdown",
52486 +       .callback = shutdown_handler
52487 +};
52488 +
52489 +static struct xenbus_watch sysrq_watch = {
52490 +       .node = "control/sysrq",
52491 +       .callback = sysrq_handler
52492 +};
52493 +
52494 +static int setup_shutdown_watcher(struct notifier_block *notifier,
52495 +                                 unsigned long event,
52496 +                                 void *data)
52497 +{
52498 +       int err;
52499 +
52500 +       err = register_xenbus_watch(&shutdown_watch);
52501 +       if (err)
52502 +               printk(KERN_ERR "Failed to set shutdown watcher\n");
52503 +
52504 +       err = register_xenbus_watch(&sysrq_watch);
52505 +       if (err)
52506 +               printk(KERN_ERR "Failed to set sysrq watcher\n");
52507 +
52508 +       return NOTIFY_DONE;
52509 +}
52510 +
52511 +static int __init setup_shutdown_event(void)
52512 +{
52513 +       static struct notifier_block xenstore_notifier = {
52514 +               .notifier_call = setup_shutdown_watcher
52515 +       };
52516 +       register_xenstore_notifier(&xenstore_notifier);
52517 +       return 0;
52518 +}
52519 +
52520 +subsys_initcall(setup_shutdown_event);
52521 diff -urNp linux-2.6/drivers/xen/core/skbuff.c new/drivers/xen/core/skbuff.c
52522 --- linux-2.6/drivers/xen/core/skbuff.c 1970-01-01 01:00:00.000000000 +0100
52523 +++ new/drivers/xen/core/skbuff.c       2006-06-28 14:32:14.000000000 +0200
52524 @@ -0,0 +1,141 @@
52525 +
52526 +#include <linux/config.h>
52527 +#include <linux/module.h>
52528 +#include <linux/version.h>
52529 +#include <linux/kernel.h>
52530 +#include <linux/sched.h>
52531 +#include <linux/slab.h>
52532 +#include <linux/netdevice.h>
52533 +#include <linux/inetdevice.h>
52534 +#include <linux/etherdevice.h>
52535 +#include <linux/skbuff.h>
52536 +#include <linux/init.h>
52537 +#include <asm/io.h>
52538 +#include <asm/page.h>
52539 +#include <asm/hypervisor.h>
52540 +
52541 +/* Referenced in netback.c. */
52542 +/*static*/ kmem_cache_t *skbuff_cachep;
52543 +EXPORT_SYMBOL(skbuff_cachep);
52544 +
52545 +#define MAX_SKBUFF_ORDER 4
52546 +static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1];
52547 +
52548 +static struct {
52549 +       int size;
52550 +       kmem_cache_t *cachep;
52551 +} skbuff_small[] = { { 512, NULL }, { 2048, NULL } };
52552 +
52553 +struct sk_buff *__alloc_skb(unsigned int length, gfp_t gfp_mask,
52554 +                           int fclone)
52555 +{
52556 +       int order, i;
52557 +       kmem_cache_t *cachep;
52558 +
52559 +       length = SKB_DATA_ALIGN(length) + sizeof(struct skb_shared_info);
52560 +
52561 +       if (length <= skbuff_small[ARRAY_SIZE(skbuff_small)-1].size) {
52562 +               for (i = 0; skbuff_small[i].size < length; i++)
52563 +                       continue;
52564 +               cachep = skbuff_small[i].cachep;
52565 +       } else {
52566 +               order = get_order(length);
52567 +               if (order > MAX_SKBUFF_ORDER) {
52568 +                       printk(KERN_ALERT "Attempt to allocate order %d "
52569 +                              "skbuff. Increase MAX_SKBUFF_ORDER.\n", order);
52570 +                       return NULL;
52571 +               }
52572 +               cachep = skbuff_order_cachep[order];
52573 +       }
52574 +
52575 +       length -= sizeof(struct skb_shared_info);
52576 +
52577 +       return alloc_skb_from_cache(cachep, length, gfp_mask, fclone);
52578 +}
52579 +
52580 +struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask)
52581 +{
52582 +       struct sk_buff *skb;
52583 +       int order;
52584 +
52585 +       length = SKB_DATA_ALIGN(length + 16);
52586 +       order = get_order(length + sizeof(struct skb_shared_info));
52587 +       if (order > MAX_SKBUFF_ORDER) {
52588 +               printk(KERN_ALERT "Attempt to allocate order %d skbuff. "
52589 +                      "Increase MAX_SKBUFF_ORDER.\n", order);
52590 +               return NULL;
52591 +       }
52592 +
52593 +       skb = alloc_skb_from_cache(
52594 +               skbuff_order_cachep[order], length, gfp_mask, 0);
52595 +       if (skb != NULL)
52596 +               skb_reserve(skb, 16);
52597 +
52598 +       return skb;
52599 +}
52600 +
52601 +static void skbuff_ctor(void *buf, kmem_cache_t *cachep, unsigned long unused)
52602 +{
52603 +       int order = 0;
52604 +
52605 +       while (skbuff_order_cachep[order] != cachep)
52606 +               order++;
52607 +
52608 +       /* Do our best to allocate contiguous memory but fall back to IOMMU. */
52609 +       if (order != 0)
52610 +               (void)xen_create_contiguous_region(
52611 +                       (unsigned long)buf, order, 0);
52612 +
52613 +       scrub_pages(buf, 1 << order);
52614 +}
52615 +
52616 +static void skbuff_dtor(void *buf, kmem_cache_t *cachep, unsigned long unused)
52617 +{
52618 +       int order = 0;
52619 +
52620 +       while (skbuff_order_cachep[order] != cachep)
52621 +               order++;
52622 +
52623 +       if (order != 0)
52624 +               xen_destroy_contiguous_region((unsigned long)buf, order);
52625 +}
52626 +
52627 +static int __init skbuff_init(void)
52628 +{
52629 +       static char name[MAX_SKBUFF_ORDER + 1][20];
52630 +       static char small_name[ARRAY_SIZE(skbuff_small)][20];
52631 +       unsigned long size;
52632 +       int i, order;
52633 +
52634 +       for (i = 0; i < ARRAY_SIZE(skbuff_small); i++) {
52635 +               size = skbuff_small[i].size;
52636 +               sprintf(small_name[i], "xen-skb-%lu", size);
52637 +               /*
52638 +                * No ctor/dtor: objects do not span page boundaries, and they
52639 +                * are only used on the transmit path so no need for scrubbing.
52640 +                */
52641 +               skbuff_small[i].cachep = kmem_cache_create(
52642 +                       small_name[i], size, size, 0, NULL, NULL);
52643 +       }
52644 +
52645 +       for (order = 0; order <= MAX_SKBUFF_ORDER; order++) {
52646 +               size = PAGE_SIZE << order;
52647 +               sprintf(name[order], "xen-skb-%lu", size);
52648 +               if (is_running_on_xen() &&
52649 +                   (xen_start_info->flags & SIF_PRIVILEGED))
52650 +                       skbuff_order_cachep[order] = kmem_cache_create(
52651 +                               name[order], size, size, 0,
52652 +                               skbuff_ctor, skbuff_dtor);
52653 +               else
52654 +                       skbuff_order_cachep[order] = kmem_cache_create(
52655 +                               name[order], size, size, 0, NULL, NULL);
52656 +
52657 +       }
52658 +
52659 +       skbuff_cachep = skbuff_order_cachep[0];
52660 +
52661 +       return 0;
52662 +}
52663 +core_initcall(skbuff_init);
52664 +
52665 +EXPORT_SYMBOL(__dev_alloc_skb);
52666 diff -urNp linux-2.6/drivers/xen/core/smpboot.c new/drivers/xen/core/smpboot.c
52667 --- linux-2.6/drivers/xen/core/smpboot.c        1970-01-01 01:00:00.000000000 +0100
52668 +++ new/drivers/xen/core/smpboot.c      2006-06-07 13:15:16.000000000 +0200
52669 @@ -0,0 +1,433 @@
52670 +/*
52671 + *     Xen SMP booting functions
52672 + *
52673 + *     See arch/i386/kernel/smpboot.c for copyright and credits for derived
52674 + *     portions of this file.
52675 + */
52676 +
52677 +#include <linux/module.h>
52678 +#include <linux/config.h>
52679 +#include <linux/init.h>
52680 +#include <linux/kernel.h>
52681 +#include <linux/mm.h>
52682 +#include <linux/sched.h>
52683 +#include <linux/kernel_stat.h>
52684 +#include <linux/smp_lock.h>
52685 +#include <linux/irq.h>
52686 +#include <linux/bootmem.h>
52687 +#include <linux/notifier.h>
52688 +#include <linux/cpu.h>
52689 +#include <linux/percpu.h>
52690 +#include <asm/desc.h>
52691 +#include <asm/arch_hooks.h>
52692 +#include <asm/pgalloc.h>
52693 +#include <xen/evtchn.h>
52694 +#include <xen/interface/vcpu.h>
52695 +#include <xen/cpu_hotplug.h>
52696 +#include <xen/xenbus.h>
52697 +
52698 +extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
52699 +extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
52700 +
52701 +extern void local_setup_timer(unsigned int cpu);
52702 +extern void local_teardown_timer(unsigned int cpu);
52703 +
52704 +extern void hypervisor_callback(void);
52705 +extern void failsafe_callback(void);
52706 +extern void system_call(void);
52707 +extern void smp_trap_init(trap_info_t *);
52708 +
52709 +/* Number of siblings per CPU package */
52710 +int smp_num_siblings = 1;
52711 +int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
52712 +EXPORT_SYMBOL(phys_proc_id);
52713 +int cpu_core_id[NR_CPUS]; /* Core ID of each logical CPU */
52714 +EXPORT_SYMBOL(cpu_core_id);
52715 +#if defined(__i386__)
52716 +int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
52717 +#elif defined(__x86_64__)
52718 +u8 cpu_llc_id[NR_CPUS] __cpuinitdata  = {[0 ... NR_CPUS-1] = BAD_APICID};
52719 +#endif
52720 +
52721 +cpumask_t cpu_online_map;
52722 +EXPORT_SYMBOL(cpu_online_map);
52723 +cpumask_t cpu_possible_map;
52724 +EXPORT_SYMBOL(cpu_possible_map);
52725 +
52726 +struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
52727 +EXPORT_SYMBOL(cpu_data);
52728 +
52729 +#ifdef CONFIG_HOTPLUG_CPU
52730 +DEFINE_PER_CPU(int, cpu_state) = { 0 };
52731 +#endif
52732 +
52733 +static DEFINE_PER_CPU(int, resched_irq);
52734 +static DEFINE_PER_CPU(int, callfunc_irq);
52735 +static char resched_name[NR_CPUS][15];
52736 +static char callfunc_name[NR_CPUS][15];
52737 +
52738 +u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
52739 +
52740 +void *xquad_portio;
52741 +
52742 +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
52743 +cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
52744 +EXPORT_SYMBOL(cpu_core_map);
52745 +
52746 +#if defined(__i386__)
52747 +u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff };
52748 +EXPORT_SYMBOL(x86_cpu_to_apicid);
52749 +#elif !defined(CONFIG_X86_IO_APIC)
52750 +unsigned int maxcpus = NR_CPUS;
52751 +#endif
52752 +
52753 +void __init prefill_possible_map(void)
52754 +{
52755 +       int i, rc;
52756 +
52757 +       for (i = 0; i < NR_CPUS; i++) {
52758 +               rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
52759 +               if (rc >= 0)
52760 +                       cpu_set(i, cpu_possible_map);
52761 +       }
52762 +}
52763 +
52764 +void __init smp_alloc_memory(void)
52765 +{
52766 +}
52767 +
52768 +static inline void
52769 +set_cpu_sibling_map(int cpu)
52770 +{
52771 +       phys_proc_id[cpu] = cpu;
52772 +       cpu_core_id[cpu]  = 0;
52773 +
52774 +       cpu_sibling_map[cpu] = cpumask_of_cpu(cpu);
52775 +       cpu_core_map[cpu]    = cpumask_of_cpu(cpu);
52776 +
52777 +       cpu_data[cpu].booted_cores = 1;
52778 +}
52779 +
52780 +static void xen_smp_intr_init(unsigned int cpu)
52781 +{
52782 +       sprintf(resched_name[cpu], "resched%d", cpu);
52783 +       per_cpu(resched_irq, cpu) =
52784 +               bind_ipi_to_irqhandler(
52785 +                       RESCHEDULE_VECTOR,
52786 +                       cpu,
52787 +                       smp_reschedule_interrupt,
52788 +                       SA_INTERRUPT,
52789 +                       resched_name[cpu],
52790 +                       NULL);
52791 +       BUG_ON(per_cpu(resched_irq, cpu) < 0);
52792 +
52793 +       sprintf(callfunc_name[cpu], "callfunc%d", cpu);
52794 +       per_cpu(callfunc_irq, cpu) =
52795 +               bind_ipi_to_irqhandler(
52796 +                       CALL_FUNCTION_VECTOR,
52797 +                       cpu,
52798 +                       smp_call_function_interrupt,
52799 +                       SA_INTERRUPT,
52800 +                       callfunc_name[cpu],
52801 +                       NULL);
52802 +       BUG_ON(per_cpu(callfunc_irq, cpu) < 0);
52803 +
52804 +       if (cpu != 0)
52805 +               local_setup_timer(cpu);
52806 +}
52807 +
52808 +#ifdef CONFIG_HOTPLUG_CPU
52809 +static void xen_smp_intr_exit(unsigned int cpu)
52810 +{
52811 +       if (cpu != 0)
52812 +               local_teardown_timer(cpu);
52813 +
52814 +       unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
52815 +       unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
52816 +}
52817 +#endif
52818 +
52819 +void cpu_bringup(void)
52820 +{
52821 +       cpu_init();
52822 +       touch_softlockup_watchdog();
52823 +       preempt_disable();
52824 +       local_irq_enable();
52825 +}
52826 +
52827 +static void cpu_bringup_and_idle(void)
52828 +{
52829 +       cpu_bringup();
52830 +       cpu_idle();
52831 +}
52832 +
52833 +void cpu_initialize_context(unsigned int cpu)
52834 +{
52835 +       vcpu_guest_context_t ctxt;
52836 +       struct task_struct *idle = idle_task(cpu);
52837 +#ifdef __x86_64__
52838 +       struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu];
52839 +#else
52840 +       struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
52841 +#endif
52842 +
52843 +       if (cpu == 0)
52844 +               return;
52845 +
52846 +       memset(&ctxt, 0, sizeof(ctxt));
52847 +
52848 +       ctxt.flags = VGCF_IN_KERNEL;
52849 +       ctxt.user_regs.ds = __USER_DS;
52850 +       ctxt.user_regs.es = __USER_DS;
52851 +       ctxt.user_regs.fs = 0;
52852 +       ctxt.user_regs.gs = 0;
52853 +       ctxt.user_regs.ss = __KERNEL_DS;
52854 +       ctxt.user_regs.eip = (unsigned long)cpu_bringup_and_idle;
52855 +       ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
52856 +
52857 +       memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
52858 +
52859 +       smp_trap_init(ctxt.trap_ctxt);
52860 +
52861 +       ctxt.ldt_ents = 0;
52862 +
52863 +       ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
52864 +       ctxt.gdt_ents      = gdt_descr->size / 8;
52865 +
52866 +#ifdef __i386__
52867 +       ctxt.user_regs.cs = __KERNEL_CS;
52868 +       ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
52869 +
52870 +       ctxt.kernel_ss = __KERNEL_DS;
52871 +       ctxt.kernel_sp = idle->thread.esp0;
52872 +
52873 +       ctxt.event_callback_cs     = __KERNEL_CS;
52874 +       ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
52875 +       ctxt.failsafe_callback_cs  = __KERNEL_CS;
52876 +       ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
52877 +
52878 +       ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
52879 +#else /* __x86_64__ */
52880 +       ctxt.user_regs.cs = __KERNEL_CS;
52881 +       ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
52882 +
52883 +       ctxt.kernel_ss = __KERNEL_DS;
52884 +       ctxt.kernel_sp = idle->thread.rsp0;
52885 +
52886 +       ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
52887 +       ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
52888 +       ctxt.syscall_callback_eip  = (unsigned long)system_call;
52889 +
52890 +       ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
52891 +
52892 +       ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu));
52893 +#endif
52894 +
52895 +       BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt));
52896 +}
52897 +
52898 +void __init smp_prepare_cpus(unsigned int max_cpus)
52899 +{
52900 +       int cpu;
52901 +       struct task_struct *idle;
52902 +#ifdef __x86_64__
52903 +       struct desc_ptr *gdt_descr;
52904 +#else
52905 +       struct Xgt_desc_struct *gdt_descr;
52906 +#endif
52907 +
52908 +       boot_cpu_data.apicid = 0;
52909 +       cpu_data[0] = boot_cpu_data;
52910 +
52911 +       cpu_2_logical_apicid[0] = 0;
52912 +       x86_cpu_to_apicid[0] = 0;
52913 +
52914 +       current_thread_info()->cpu = 0;
52915 +
52916 +       for (cpu = 0; cpu < NR_CPUS; cpu++) {
52917 +               cpus_clear(cpu_sibling_map[cpu]);
52918 +               cpus_clear(cpu_core_map[cpu]);
52919 +       }
52920 +
52921 +       set_cpu_sibling_map(0);
52922 +
52923 +       xen_smp_intr_init(0);
52924 +
52925 +       for_each_cpu_mask (cpu, cpu_possible_map) {
52926 +               if (cpu == 0)
52927 +                       continue;
52928 +
52929 +#ifdef __x86_64__
52930 +               gdt_descr = &cpu_gdt_descr[cpu];
52931 +#else
52932 +               gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
52933 +#endif
52934 +               gdt_descr->address = get_zeroed_page(GFP_KERNEL);
52935 +               if (unlikely(!gdt_descr->address)) {
52936 +                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
52937 +                       continue;
52938 +               }
52939 +               gdt_descr->size = GDT_SIZE;
52940 +               memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
52941 +               make_page_readonly(
52942 +                       (void *)gdt_descr->address,
52943 +                       XENFEAT_writable_descriptor_tables);
52944 +
52945 +               cpu_data[cpu] = boot_cpu_data;
52946 +               cpu_data[cpu].apicid = cpu;
52947 +
52948 +               cpu_2_logical_apicid[cpu] = cpu;
52949 +               x86_cpu_to_apicid[cpu] = cpu;
52950 +
52951 +               idle = fork_idle(cpu);
52952 +               if (IS_ERR(idle))
52953 +                       panic("failed fork for CPU %d", cpu);
52954 +
52955 +#ifdef __x86_64__
52956 +               cpu_pda(cpu)->pcurrent = idle;
52957 +               cpu_pda(cpu)->cpunumber = cpu;
52958 +               clear_ti_thread_flag(idle->thread_info, TIF_FORK);
52959 +#endif
52960 +
52961 +               irq_ctx_init(cpu);
52962 +
52963 +#ifdef CONFIG_HOTPLUG_CPU
52964 +               if (xen_start_info->flags & SIF_INITDOMAIN)
52965 +                       cpu_set(cpu, cpu_present_map);
52966 +#else
52967 +               cpu_set(cpu, cpu_present_map);
52968 +#endif
52969 +
52970 +               cpu_initialize_context(cpu);
52971 +       }
52972 +
52973 +       init_xenbus_allowed_cpumask();
52974 +
52975 +       /* Currently, Xen gives no dynamic NUMA/HT info. */
52976 +       for (cpu = 1; cpu < NR_CPUS; cpu++) {
52977 +               cpu_sibling_map[cpu] = cpumask_of_cpu(cpu);
52978 +               cpu_core_map[cpu]    = cpumask_of_cpu(cpu);
52979 +       }
52980 +
52981 +#ifdef CONFIG_X86_IO_APIC
52982 +       /*
52983 +        * Here we can be sure that there is an IO-APIC in the system. Let's
52984 +        * go and set it up:
52985 +        */
52986 +       if (!skip_ioapic_setup && nr_ioapics)
52987 +               setup_IO_APIC();
52988 +#endif
52989 +}
52990 +
52991 +void __devinit smp_prepare_boot_cpu(void)
52992 +{
52993 +}
52994 +
52995 +#ifdef CONFIG_HOTPLUG_CPU
52996 +
52997 +/*
52998 + * Initialize cpu_present_map late to skip SMP boot code in init/main.c.
52999 + * But do it early enough to catch critical for_each_present_cpu() loops
53000 + * in i386-specific code.
53001 + */
53002 +static int __init initialize_cpu_present_map(void)
53003 +{
53004 +       cpu_present_map = cpu_possible_map;
53005 +       return 0;
53006 +}
53007 +core_initcall(initialize_cpu_present_map);
53008 +
53009 +static void
53010 +remove_siblinginfo(int cpu)
53011 +{
53012 +       phys_proc_id[cpu] = BAD_APICID;
53013 +       cpu_core_id[cpu]  = BAD_APICID;
53014 +
53015 +       cpus_clear(cpu_sibling_map[cpu]);
53016 +       cpus_clear(cpu_core_map[cpu]);
53017 +
53018 +       cpu_data[cpu].booted_cores = 0;
53019 +}
53020 +
53021 +int __cpu_disable(void)
53022 +{
53023 +       cpumask_t map = cpu_online_map;
53024 +       int cpu = smp_processor_id();
53025 +
53026 +       if (cpu == 0)
53027 +               return -EBUSY;
53028 +
53029 +       remove_siblinginfo(cpu);
53030 +
53031 +       cpu_clear(cpu, map);
53032 +       fixup_irqs(map);
53033 +       cpu_clear(cpu, cpu_online_map);
53034 +
53035 +       return 0;
53036 +}
53037 +
53038 +void __cpu_die(unsigned int cpu)
53039 +{
53040 +       while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
53041 +               current->state = TASK_UNINTERRUPTIBLE;
53042 +               schedule_timeout(HZ/10);
53043 +       }
53044 +
53045 +       xen_smp_intr_exit(cpu);
53046 +
53047 +#ifdef __i386__
53048 +       if (num_online_cpus() == 1)
53049 +               alternatives_smp_switch(0);
53050 +#endif
53051 +}
53052 +
53053 +#else /* !CONFIG_HOTPLUG_CPU */
53054 +
53055 +int __cpu_disable(void)
53056 +{
53057 +       return -ENOSYS;
53058 +}
53059 +
53060 +void __cpu_die(unsigned int cpu)
53061 +{
53062 +       BUG();
53063 +}
53064 +
53065 +#endif /* CONFIG_HOTPLUG_CPU */
53066 +
53067 +int __devinit __cpu_up(unsigned int cpu)
53068 +{
53069 +       int rc;
53070 +
53071 +       rc = cpu_up_check(cpu);
53072 +       if (rc)
53073 +               return rc;
53074 +
53075 +#ifdef __i386__
53076 +       if (num_online_cpus() == 1)
53077 +               alternatives_smp_switch(1);
53078 +#endif
53079 +
53080 +       /* This must be done before setting cpu_online_map */
53081 +       set_cpu_sibling_map(cpu);
53082 +       wmb();
53083 +
53084 +       xen_smp_intr_init(cpu);
53085 +       cpu_set(cpu, cpu_online_map);
53086 +
53087 +       rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
53088 +       BUG_ON(rc);
53089 +
53090 +       return 0;
53091 +}
53092 +
53093 +void __init smp_cpus_done(unsigned int max_cpus)
53094 +{
53095 +}
53096 +
53097 +#ifndef CONFIG_X86_LOCAL_APIC
53098 +int setup_profiling_timer(unsigned int multiplier)
53099 +{
53100 +       return -EINVAL;
53101 +}
53102 +#endif
53103 diff -urNp linux-2.6/drivers/xen/core/xen_proc.c new/drivers/xen/core/xen_proc.c
53104 --- linux-2.6/drivers/xen/core/xen_proc.c       1970-01-01 01:00:00.000000000 +0100
53105 +++ new/drivers/xen/core/xen_proc.c     2006-05-23 18:42:17.000000000 +0200
53106 @@ -0,0 +1,19 @@
53107 +
53108 +#include <linux/config.h>
53109 +#include <linux/proc_fs.h>
53110 +#include <xen/xen_proc.h>
53111 +
53112 +static struct proc_dir_entry *xen_base;
53113 +
53114 +struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
53115 +{
53116 +       if ( xen_base == NULL )
53117 +               if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL )
53118 +                       panic("Couldn't create /proc/xen");
53119 +       return create_proc_entry(name, mode, xen_base);
53120 +}
53121 +
53122 +void remove_xen_proc_entry(const char *name)
53123 +{
53124 +       remove_proc_entry(name, xen_base);
53125 +}
53126 diff -urNp linux-2.6/drivers/xen/core/xen_sysfs.c new/drivers/xen/core/xen_sysfs.c
53127 --- linux-2.6/drivers/xen/core/xen_sysfs.c      1970-01-01 01:00:00.000000000 +0100
53128 +++ new/drivers/xen/core/xen_sysfs.c    2006-07-07 15:10:03.000000000 +0200
53129 @@ -0,0 +1,379 @@
53130 +/*
53131 + *  copyright (c) 2006 IBM Corporation
53132 + *  Authored by: Mike D. Day <ncmike@us.ibm.com>
53133 + *
53134 + *  This program is free software; you can redistribute it and/or modify
53135 + *  it under the terms of the GNU General Public License version 2 as
53136 + *  published by the Free Software Foundation.
53137 + */
53138 +
53139 +#include <linux/config.h>
53140 +#include <linux/err.h>
53141 +#include <linux/kernel.h>
53142 +#include <linux/module.h>
53143 +#include <linux/init.h>
53144 +#include <asm/hypervisor.h>
53145 +#include <xen/features.h>
53146 +#include <xen/hypervisor_sysfs.h>
53147 +#include <xen/xenbus.h>
53148 +
53149 +MODULE_LICENSE("GPL");
53150 +MODULE_AUTHOR("Mike D. Day <ncmike@us.ibm.com>");
53151 +
53152 +static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
53153 +{
53154 +       return sprintf(buffer, "xen\n");
53155 +}
53156 +
53157 +HYPERVISOR_ATTR_RO(type);
53158 +
53159 +static int __init xen_sysfs_type_init(void)
53160 +{
53161 +       return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
53162 +}
53163 +
53164 +static void xen_sysfs_type_destroy(void)
53165 +{
53166 +       sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
53167 +}
53168 +
53169 +/* xen version attributes */
53170 +static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
53171 +{
53172 +       int version = HYPERVISOR_xen_version(XENVER_version, NULL);
53173 +       if (version)
53174 +               return sprintf(buffer, "%d\n", version >> 16);
53175 +       return -ENODEV;
53176 +}
53177 +
53178 +HYPERVISOR_ATTR_RO(major);
53179 +
53180 +static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
53181 +{
53182 +       int version = HYPERVISOR_xen_version(XENVER_version, NULL);
53183 +       if (version)
53184 +               return sprintf(buffer, "%d\n", version & 0xff);
53185 +       return -ENODEV;
53186 +}
53187 +
53188 +HYPERVISOR_ATTR_RO(minor);
53189 +
53190 +static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
53191 +{
53192 +       int ret = -ENOMEM;
53193 +       char *extra;
53194 +
53195 +       extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
53196 +       if (extra) {
53197 +               ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
53198 +               if (!ret)
53199 +                       ret = sprintf(buffer, "%s\n", extra);
53200 +               kfree(extra);
53201 +       }
53202 +
53203 +       return ret;
53204 +}
53205 +
53206 +HYPERVISOR_ATTR_RO(extra);
53207 +
53208 +static struct attribute *version_attrs[] = {
53209 +       &major_attr.attr,
53210 +       &minor_attr.attr,
53211 +       &extra_attr.attr,
53212 +       NULL
53213 +};
53214 +
53215 +static struct attribute_group version_group = {
53216 +       .name = "version",
53217 +       .attrs = version_attrs,
53218 +};
53219 +
53220 +static int __init xen_sysfs_version_init(void)
53221 +{
53222 +       return sysfs_create_group(&hypervisor_subsys.kset.kobj,
53223 +                                 &version_group);
53224 +}
53225 +
53226 +static void xen_sysfs_version_destroy(void)
53227 +{
53228 +       sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group);
53229 +}
53230 +
53231 +/* UUID */
53232 +
53233 +static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
53234 +{
53235 +       char *vm, *val;
53236 +       int ret;
53237 +
53238 +       vm = xenbus_read(XBT_NIL, "vm", "", NULL);
53239 +       if (IS_ERR(vm))
53240 +               return PTR_ERR(vm);
53241 +       val = xenbus_read(XBT_NIL, vm, "uuid", NULL);
53242 +       kfree(vm);
53243 +       if (IS_ERR(val))
53244 +               return PTR_ERR(val);
53245 +       ret = sprintf(buffer, "%s\n", val);
53246 +       kfree(val);
53247 +       return ret;
53248 +}
53249 +
53250 +HYPERVISOR_ATTR_RO(uuid);
53251 +
53252 +static int __init xen_sysfs_uuid_init(void)
53253 +{
53254 +       return sysfs_create_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
53255 +}
53256 +
53257 +static void xen_sysfs_uuid_destroy(void)
53258 +{
53259 +       sysfs_remove_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
53260 +}
53261 +
53262 +/* xen compilation attributes */
53263 +
53264 +static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
53265 +{
53266 +       int ret = -ENOMEM;
53267 +       struct xen_compile_info *info;
53268 +
53269 +       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
53270 +       if (info) {
53271 +               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
53272 +               if (!ret)
53273 +                       ret = sprintf(buffer, "%s\n", info->compiler);
53274 +               kfree(info);
53275 +       }
53276 +
53277 +       return ret;
53278 +}
53279 +
53280 +HYPERVISOR_ATTR_RO(compiler);
53281 +
53282 +static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
53283 +{
53284 +       int ret = -ENOMEM;
53285 +       struct xen_compile_info *info;
53286 +
53287 +       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
53288 +       if (info) {
53289 +               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
53290 +               if (!ret)
53291 +                       ret = sprintf(buffer, "%s\n", info->compile_by);
53292 +               kfree(info);
53293 +       }
53294 +
53295 +       return ret;
53296 +}
53297 +
53298 +HYPERVISOR_ATTR_RO(compiled_by);
53299 +
53300 +static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
53301 +{
53302 +       int ret = -ENOMEM;
53303 +       struct xen_compile_info *info;
53304 +
53305 +       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
53306 +       if (info) {
53307 +               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
53308 +               if (!ret)
53309 +                       ret = sprintf(buffer, "%s\n", info->compile_date);
53310 +               kfree(info);
53311 +       }
53312 +
53313 +       return ret;
53314 +}
53315 +
53316 +HYPERVISOR_ATTR_RO(compile_date);
53317 +
53318 +static struct attribute *xen_compile_attrs[] = {
53319 +       &compiler_attr.attr,
53320 +       &compiled_by_attr.attr,
53321 +       &compile_date_attr.attr,
53322 +       NULL
53323 +};
53324 +
53325 +static struct attribute_group xen_compilation_group = {
53326 +       .name = "compilation",
53327 +       .attrs = xen_compile_attrs,
53328 +};
53329 +
53330 +static int __init xen_compilation_init(void)
53331 +{
53332 +       return sysfs_create_group(&hypervisor_subsys.kset.kobj,
53333 +                                 &xen_compilation_group);
53334 +}
53335 +
53336 +static void xen_compilation_destroy(void)
53337 +{
53338 +       sysfs_remove_group(&hypervisor_subsys.kset.kobj,
53339 +                          &xen_compilation_group);
53340 +}
53341 +
53342 +/* xen properties info */
53343 +
53344 +static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
53345 +{
53346 +       int ret = -ENOMEM;
53347 +       char *caps;
53348 +
53349 +       caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
53350 +       if (caps) {
53351 +               ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
53352 +               if (!ret)
53353 +                       ret = sprintf(buffer, "%s\n", caps);
53354 +               kfree(caps);
53355 +       }
53356 +
53357 +       return ret;
53358 +}
53359 +
53360 +HYPERVISOR_ATTR_RO(capabilities);
53361 +
53362 +static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
53363 +{
53364 +       int ret = -ENOMEM;
53365 +       char *cset;
53366 +
53367 +       cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
53368 +       if (cset) {
53369 +               ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
53370 +               if (!ret)
53371 +                       ret = sprintf(buffer, "%s\n", cset);
53372 +               kfree(cset);
53373 +       }
53374 +
53375 +       return ret;
53376 +}
53377 +
53378 +HYPERVISOR_ATTR_RO(changeset);
53379 +
53380 +static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
53381 +{
53382 +       int ret = -ENOMEM;
53383 +       struct xen_platform_parameters *parms;
53384 +
53385 +       parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
53386 +       if (parms) {
53387 +               ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
53388 +                                            parms);
53389 +               if (!ret)
53390 +                       ret = sprintf(buffer, "%lx\n", parms->virt_start);
53391 +               kfree(parms);
53392 +       }
53393 +
53394 +       return ret;
53395 +}
53396 +
53397 +HYPERVISOR_ATTR_RO(virtual_start);
53398 +
53399 +static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
53400 +{
53401 +       int ret;
53402 +
53403 +       ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
53404 +       if (ret > 0)
53405 +               ret = sprintf(buffer, "%x\n", ret);
53406 +
53407 +       return ret;
53408 +}
53409 +
53410 +HYPERVISOR_ATTR_RO(pagesize);
53411 +
53412 +/* eventually there will be several more features to export */
53413 +static ssize_t xen_feature_show(int index, char *buffer)
53414 +{
53415 +       int ret = -ENOMEM;
53416 +       struct xen_feature_info *info;
53417 +
53418 +       info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL);
53419 +       if (info) {
53420 +               info->submap_idx = index;
53421 +               ret = HYPERVISOR_xen_version(XENVER_get_features, info);
53422 +               if (!ret)
53423 +                       ret = sprintf(buffer, "%d\n", info->submap);
53424 +               kfree(info);
53425 +       }
53426 +
53427 +       return ret;
53428 +}
53429 +
53430 +static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
53431 +{
53432 +       return xen_feature_show(XENFEAT_writable_page_tables, buffer);
53433 +}
53434 +
53435 +HYPERVISOR_ATTR_RO(writable_pt);
53436 +
53437 +static struct attribute *xen_properties_attrs[] = {
53438 +       &capabilities_attr.attr,
53439 +       &changeset_attr.attr,
53440 +       &virtual_start_attr.attr,
53441 +       &pagesize_attr.attr,
53442 +       &writable_pt_attr.attr,
53443 +       NULL
53444 +};
53445 +
53446 +static struct attribute_group xen_properties_group = {
53447 +       .name = "properties",
53448 +       .attrs = xen_properties_attrs,
53449 +};
53450 +
53451 +static int __init xen_properties_init(void)
53452 +{
53453 +       return sysfs_create_group(&hypervisor_subsys.kset.kobj,
53454 +                                 &xen_properties_group);
53455 +}
53456 +
53457 +static void xen_properties_destroy(void)
53458 +{
53459 +       sysfs_remove_group(&hypervisor_subsys.kset.kobj,
53460 +                          &xen_properties_group);
53461 +}
53462 +
53463 +static int __init hyper_sysfs_init(void)
53464 +{
53465 +       int ret;
53466 +
53467 +       if (!is_running_on_xen())
53468 +               return -ENODEV;
53469 +
53470 +       ret = xen_sysfs_type_init();
53471 +       if (ret)
53472 +               goto out;
53473 +       ret = xen_sysfs_version_init();
53474 +       if (ret)
53475 +               goto version_out;
53476 +       ret = xen_compilation_init();
53477 +       if (ret)
53478 +               goto comp_out;
53479 +       ret = xen_sysfs_uuid_init();
53480 +       if (ret)
53481 +               goto uuid_out;
53482 +       ret = xen_properties_init();
53483 +       if (!ret)
53484 +               goto out;
53485 +
53486 +       xen_sysfs_uuid_destroy();
53487 +uuid_out:
53488 +       xen_compilation_destroy();
53489 +comp_out:
53490 +       xen_sysfs_version_destroy();
53491 +version_out:
53492 +       xen_sysfs_type_destroy();
53493 +out:
53494 +       return ret;
53495 +}
53496 +
53497 +static void hyper_sysfs_exit(void)
53498 +{
53499 +       xen_properties_destroy();
53500 +       xen_compilation_destroy();
53501 +       xen_sysfs_uuid_destroy();
53502 +       xen_sysfs_version_destroy();
53503 +       xen_sysfs_type_destroy();
53504 +
53505 +}
53506 +
53507 +module_init(hyper_sysfs_init);
53508 +module_exit(hyper_sysfs_exit);
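[Editorial illustration, not part of the patch: the attribute groups registered above surface under /sys/hypervisor/ (see the XEN_SYSFS option later in this patch). The sketch below is a minimal userspace reader for the "properties" group created by xen_properties_init(); the file names follow the attribute names in the code above, and the /sys/hypervisor mount point is assumed from the XEN_SYSFS help text.]

/* Illustrative userspace sketch (not part of the patch): read the
 * /sys/hypervisor/properties files created by xen_properties_init(). */
#include <stdio.h>

int main(void)
{
	const char *files[] = {
		"/sys/hypervisor/properties/capabilities",
		"/sys/hypervisor/properties/changeset",
		"/sys/hypervisor/properties/virtual_start",
		"/sys/hypervisor/properties/pagesize",
		"/sys/hypervisor/properties/writable_pt",
	};
	char buf[256];
	unsigned int i;

	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		FILE *f = fopen(files[i], "r");

		if (!f)
			continue;	/* attribute absent or not running on Xen */
		if (fgets(buf, sizeof(buf), f))
			printf("%s: %s", files[i], buf);
		fclose(f);
	}
	return 0;
}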
53509 diff -urNp linux-2.6/drivers/xen/evtchn/evtchn.c new/drivers/xen/evtchn/evtchn.c
53510 --- linux-2.6/drivers/xen/evtchn/evtchn.c       1970-01-01 01:00:00.000000000 +0100
53511 +++ new/drivers/xen/evtchn/evtchn.c     2006-06-28 14:32:14.000000000 +0200
53512 @@ -0,0 +1,458 @@
53513 +/******************************************************************************
53514 + * evtchn.c
53515 + * 
53516 + * Driver for receiving and demuxing event-channel signals.
53517 + * 
53518 + * Copyright (c) 2004-2005, K A Fraser
53519 + * Multi-process extensions Copyright (c) 2004, Steven Smith
53520 + * 
53521 + * This program is free software; you can redistribute it and/or
53522 + * modify it under the terms of the GNU General Public License version 2
53523 + * as published by the Free Software Foundation; or, when distributed
53524 + * separately from the Linux kernel or incorporated into other
53525 + * software packages, subject to the following license:
53526 + * 
53527 + * Permission is hereby granted, free of charge, to any person obtaining a copy
53528 + * of this source file (the "Software"), to deal in the Software without
53529 + * restriction, including without limitation the rights to use, copy, modify,
53530 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
53531 + * and to permit persons to whom the Software is furnished to do so, subject to
53532 + * the following conditions:
53533 + * 
53534 + * The above copyright notice and this permission notice shall be included in
53535 + * all copies or substantial portions of the Software.
53536 + * 
53537 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
53538 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
53539 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
53540 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53541 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
53542 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
53543 + * IN THE SOFTWARE.
53544 + */
53545 +
53546 +#include <linux/config.h>
53547 +#include <linux/module.h>
53548 +#include <linux/kernel.h>
53549 +#include <linux/sched.h>
53550 +#include <linux/slab.h>
53551 +#include <linux/string.h>
53552 +#include <linux/errno.h>
53553 +#include <linux/fs.h>
53554 +#include <linux/errno.h>
53555 +#include <linux/miscdevice.h>
53556 +#include <linux/major.h>
53557 +#include <linux/proc_fs.h>
53558 +#include <linux/stat.h>
53559 +#include <linux/poll.h>
53560 +#include <linux/irq.h>
53561 +#include <linux/init.h>
53562 +#include <linux/gfp.h>
53563 +#include <xen/evtchn.h>
53564 +#include <xen/public/evtchn.h>
53565 +
53566 +struct per_user_data {
53567 +       /* Notification ring, accessed via /dev/xen/evtchn. */
53568 +#define EVTCHN_RING_SIZE     (PAGE_SIZE / sizeof(evtchn_port_t))
53569 +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
53570 +       evtchn_port_t *ring;
53571 +       unsigned int ring_cons, ring_prod, ring_overflow;
53572 +
53573 +       /* Processes wait on this queue when ring is empty. */
53574 +       wait_queue_head_t evtchn_wait;
53575 +       struct fasync_struct *evtchn_async_queue;
53576 +};
53577 +
53578 +/* Who's bound to each port? */
53579 +static struct per_user_data *port_user[NR_EVENT_CHANNELS];
53580 +static spinlock_t port_user_lock;
53581 +
53582 +void evtchn_device_upcall(int port)
53583 +{
53584 +       struct per_user_data *u;
53585 +
53586 +       spin_lock(&port_user_lock);
53587 +
53588 +       mask_evtchn(port);
53589 +       clear_evtchn(port);
53590 +
53591 +       if ((u = port_user[port]) != NULL) {
53592 +               if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
53593 +                       u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
53594 +                       if (u->ring_cons == u->ring_prod++) {
53595 +                               wake_up_interruptible(&u->evtchn_wait);
53596 +                               kill_fasync(&u->evtchn_async_queue,
53597 +                                           SIGIO, POLL_IN);
53598 +                       }
53599 +               } else {
53600 +                       u->ring_overflow = 1;
53601 +               }
53602 +       }
53603 +
53604 +       spin_unlock(&port_user_lock);
53605 +}
53606 +
53607 +static ssize_t evtchn_read(struct file *file, char __user *buf,
53608 +                          size_t count, loff_t *ppos)
53609 +{
53610 +       int rc;
53611 +       unsigned int c, p, bytes1 = 0, bytes2 = 0;
53612 +       struct per_user_data *u = file->private_data;
53613 +
53614 +       /* Whole number of ports. */
53615 +       count &= ~(sizeof(evtchn_port_t)-1);
53616 +
53617 +       if (count == 0)
53618 +               return 0;
53619 +
53620 +       if (count > PAGE_SIZE)
53621 +               count = PAGE_SIZE;
53622 +
53623 +       for (;;) {
53624 +               if (u->ring_overflow)
53625 +                       return -EFBIG;
53626 +
53627 +               if ((c = u->ring_cons) != (p = u->ring_prod))
53628 +                       break;
53629 +
53630 +               if (file->f_flags & O_NONBLOCK)
53631 +                       return -EAGAIN;
53632 +
53633 +               rc = wait_event_interruptible(
53634 +                       u->evtchn_wait, u->ring_cons != u->ring_prod);
53635 +               if (rc)
53636 +                       return rc;
53637 +       }
53638 +
53639 +       /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
53640 +       if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
53641 +               bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
53642 +                       sizeof(evtchn_port_t);
53643 +               bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
53644 +       } else {
53645 +               bytes1 = (p - c) * sizeof(evtchn_port_t);
53646 +               bytes2 = 0;
53647 +       }
53648 +
53649 +       /* Truncate chunks according to caller's maximum byte count. */
53650 +       if (bytes1 > count) {
53651 +               bytes1 = count;
53652 +               bytes2 = 0;
53653 +       } else if ((bytes1 + bytes2) > count) {
53654 +               bytes2 = count - bytes1;
53655 +       }
53656 +
53657 +       if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
53658 +           ((bytes2 != 0) &&
53659 +            copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
53660 +               return -EFAULT;
53661 +
53662 +       u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
53663 +
53664 +       return bytes1 + bytes2;
53665 +}
53666 +
53667 +static ssize_t evtchn_write(struct file *file, const char __user *buf,
53668 +                           size_t count, loff_t *ppos)
53669 +{
53670 +       int  rc, i;
53671 +       evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
53672 +       struct per_user_data *u = file->private_data;
53673 +
53674 +       if (kbuf == NULL)
53675 +               return -ENOMEM;
53676 +
53677 +       /* Whole number of ports. */
53678 +       count &= ~(sizeof(evtchn_port_t)-1);
53679 +
53680 +       if (count == 0) {
53681 +               rc = 0;
53682 +               goto out;
53683 +       }
53684 +
53685 +       if (count > PAGE_SIZE)
53686 +               count = PAGE_SIZE;
53687 +
53688 +       if (copy_from_user(kbuf, buf, count) != 0) {
53689 +               rc = -EFAULT;
53690 +               goto out;
53691 +       }
53692 +
53693 +       spin_lock_irq(&port_user_lock);
53694 +       for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
53695 +               if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
53696 +                       unmask_evtchn(kbuf[i]);
53697 +       spin_unlock_irq(&port_user_lock);
53698 +
53699 +       rc = count;
53700 +
53701 + out:
53702 +       free_page((unsigned long)kbuf);
53703 +       return rc;
53704 +}
53705 +
53706 +static void evtchn_bind_to_user(struct per_user_data *u, int port)
53707 +{
53708 +       spin_lock_irq(&port_user_lock);
53709 +       BUG_ON(port_user[port] != NULL);
53710 +       port_user[port] = u;
53711 +       unmask_evtchn(port);
53712 +       spin_unlock_irq(&port_user_lock);
53713 +}
53714 +
53715 +static int evtchn_ioctl(struct inode *inode, struct file *file,
53716 +                       unsigned int cmd, unsigned long arg)
53717 +{
53718 +       int rc;
53719 +       struct per_user_data *u = file->private_data;
53720 +       void __user *uarg = (void __user *) arg;
53721 +
53722 +       switch (cmd) {
53723 +       case IOCTL_EVTCHN_BIND_VIRQ: {
53724 +               struct ioctl_evtchn_bind_virq bind;
53725 +               struct evtchn_bind_virq bind_virq;
53726 +
53727 +               rc = -EFAULT;
53728 +               if (copy_from_user(&bind, uarg, sizeof(bind)))
53729 +                       break;
53730 +
53731 +               bind_virq.virq = bind.virq;
53732 +               bind_virq.vcpu = 0;
53733 +               rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
53734 +                                                &bind_virq);
53735 +               if (rc != 0)
53736 +                       break;
53737 +
53738 +               rc = bind_virq.port;
53739 +               evtchn_bind_to_user(u, rc);
53740 +               break;
53741 +       }
53742 +
53743 +       case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
53744 +               struct ioctl_evtchn_bind_interdomain bind;
53745 +               struct evtchn_bind_interdomain bind_interdomain;
53746 +
53747 +               rc = -EFAULT;
53748 +               if (copy_from_user(&bind, uarg, sizeof(bind)))
53749 +                       break;
53750 +
53751 +               bind_interdomain.remote_dom  = bind.remote_domain;
53752 +               bind_interdomain.remote_port = bind.remote_port;
53753 +               rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
53754 +                                                &bind_interdomain);
53755 +               if (rc != 0)
53756 +                       break;
53757 +
53758 +               rc = bind_interdomain.local_port;
53759 +               evtchn_bind_to_user(u, rc);
53760 +               break;
53761 +       }
53762 +
53763 +       case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
53764 +               struct ioctl_evtchn_bind_unbound_port bind;
53765 +               struct evtchn_alloc_unbound alloc_unbound;
53766 +
53767 +               rc = -EFAULT;
53768 +               if (copy_from_user(&bind, uarg, sizeof(bind)))
53769 +                       break;
53770 +
53771 +               alloc_unbound.dom        = DOMID_SELF;
53772 +               alloc_unbound.remote_dom = bind.remote_domain;
53773 +               rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
53774 +                                                &alloc_unbound);
53775 +               if (rc != 0)
53776 +                       break;
53777 +
53778 +               rc = alloc_unbound.port;
53779 +               evtchn_bind_to_user(u, rc);
53780 +               break;
53781 +       }
53782 +
53783 +       case IOCTL_EVTCHN_UNBIND: {
53784 +               struct ioctl_evtchn_unbind unbind;
53785 +               struct evtchn_close close;
53786 +               int ret;
53787 +
53788 +               rc = -EFAULT;
53789 +               if (copy_from_user(&unbind, uarg, sizeof(unbind)))
53790 +                       break;
53791 +
53792 +               rc = -EINVAL;
53793 +               if (unbind.port >= NR_EVENT_CHANNELS)
53794 +                       break;
53795 +
53796 +               spin_lock_irq(&port_user_lock);
53797 +    
53798 +               rc = -ENOTCONN;
53799 +               if (port_user[unbind.port] != u) {
53800 +                       spin_unlock_irq(&port_user_lock);
53801 +                       break;
53802 +               }
53803 +
53804 +               port_user[unbind.port] = NULL;
53805 +               mask_evtchn(unbind.port);
53806 +
53807 +               spin_unlock_irq(&port_user_lock);
53808 +
53809 +               close.port = unbind.port;
53810 +               ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
53811 +               BUG_ON(ret);
53812 +
53813 +               rc = 0;
53814 +               break;
53815 +       }
53816 +
53817 +       case IOCTL_EVTCHN_NOTIFY: {
53818 +               struct ioctl_evtchn_notify notify;
53819 +
53820 +               rc = -EFAULT;
53821 +               if (copy_from_user(&notify, uarg, sizeof(notify)))
53822 +                       break;
53823 +
53824 +               if (notify.port >= NR_EVENT_CHANNELS) {
53825 +                       rc = -EINVAL;
53826 +               } else if (port_user[notify.port] != u) {
53827 +                       rc = -ENOTCONN;
53828 +               } else {
53829 +                       notify_remote_via_evtchn(notify.port);
53830 +                       rc = 0;
53831 +               }
53832 +               break;
53833 +       }
53834 +
53835 +       case IOCTL_EVTCHN_RESET: {
53836 +               /* Initialise the ring to empty. Clear errors. */
53837 +               spin_lock_irq(&port_user_lock);
53838 +               u->ring_cons = u->ring_prod = u->ring_overflow = 0;
53839 +               spin_unlock_irq(&port_user_lock);
53840 +               rc = 0;
53841 +               break;
53842 +       }
53843 +
53844 +       default:
53845 +               rc = -ENOSYS;
53846 +               break;
53847 +       }
53848 +
53849 +       return rc;
53850 +}
53851 +
53852 +static unsigned int evtchn_poll(struct file *file, poll_table *wait)
53853 +{
53854 +       unsigned int mask = POLLOUT | POLLWRNORM;
53855 +       struct per_user_data *u = file->private_data;
53856 +
53857 +       poll_wait(file, &u->evtchn_wait, wait);
53858 +       if (u->ring_cons != u->ring_prod)
53859 +               mask |= POLLIN | POLLRDNORM;
53860 +       if (u->ring_overflow)
53861 +               mask = POLLERR;
53862 +       return mask;
53863 +}
53864 +
53865 +static int evtchn_fasync(int fd, struct file *filp, int on)
53866 +{
53867 +       struct per_user_data *u = filp->private_data;
53868 +       return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
53869 +}
53870 +
53871 +static int evtchn_open(struct inode *inode, struct file *filp)
53872 +{
53873 +       struct per_user_data *u;
53874 +
53875 +       if ((u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL)
53876 +               return -ENOMEM;
53877 +
53878 +       memset(u, 0, sizeof(*u));
53879 +       init_waitqueue_head(&u->evtchn_wait);
53880 +
53881 +       u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
53882 +       if (u->ring == NULL) {
53883 +               kfree(u);
53884 +               return -ENOMEM;
53885 +       }
53886 +
53887 +       filp->private_data = u;
53888 +
53889 +       return 0;
53890 +}
53891 +
53892 +static int evtchn_release(struct inode *inode, struct file *filp)
53893 +{
53894 +       int i;
53895 +       struct per_user_data *u = filp->private_data;
53896 +       struct evtchn_close close;
53897 +
53898 +       spin_lock_irq(&port_user_lock);
53899 +
53900 +       free_page((unsigned long)u->ring);
53901 +
53902 +       for (i = 0; i < NR_EVENT_CHANNELS; i++) {
53903 +               int ret;
53904 +               if (port_user[i] != u)
53905 +                       continue;
53906 +
53907 +               port_user[i] = NULL;
53908 +               mask_evtchn(i);
53909 +
53910 +               close.port = i;
53911 +               ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
53912 +               BUG_ON(ret);
53913 +       }
53914 +
53915 +       spin_unlock_irq(&port_user_lock);
53916 +
53917 +       kfree(u);
53918 +
53919 +       return 0;
53920 +}
53921 +
53922 +static struct file_operations evtchn_fops = {
53923 +       .owner   = THIS_MODULE,
53924 +       .read    = evtchn_read,
53925 +       .write   = evtchn_write,
53926 +       .ioctl   = evtchn_ioctl,
53927 +       .poll    = evtchn_poll,
53928 +       .fasync  = evtchn_fasync,
53929 +       .open    = evtchn_open,
53930 +       .release = evtchn_release,
53931 +};
53932 +
53933 +static struct miscdevice evtchn_miscdev = {
53934 +       .minor        = EVTCHN_MINOR,
53935 +       .name         = "evtchn",
53936 +       .fops         = &evtchn_fops,
53937 +       .devfs_name   = "misc/evtchn",
53938 +};
53939 +
53940 +static int __init evtchn_init(void)
53941 +{
53942 +       int err;
53943 +
53944 +       if (!is_running_on_xen())
53945 +               return -ENODEV;
53946 +
53947 +       spin_lock_init(&port_user_lock);
53948 +       memset(port_user, 0, sizeof(port_user));
53949 +
53950 +       /* Create '/dev/misc/evtchn'. */
53951 +       err = misc_register(&evtchn_miscdev);
53952 +       if (err != 0) {
53953 +               printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
53954 +               return err;
53955 +       }
53956 +
53957 +       printk("Event-channel device installed.\n");
53958 +
53959 +       return 0;
53960 +}
53961 +
53962 +static void evtchn_cleanup(void)
53963 +{
53964 +       misc_deregister(&evtchn_miscdev);
53965 +}
53966 +
53967 +module_init(evtchn_init);
53968 +module_exit(evtchn_cleanup);
53969 +
53970 +MODULE_LICENSE("Dual BSD/GPL");
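[Editorial illustration, not part of the patch: a rough sketch of how a userspace process could drive the device above — bind an unbound port, block in read() for pending ports, then write() them back to re-enable delivery. The ioctl structure and field names come from the IOCTL_EVTCHN_BIND_UNBOUND_PORT handler in this file; the userspace visibility and location of xen/public/evtchn.h, and the /dev/xen/evtchn node name (taken from the ring comment above), are assumptions that vary by installation.]

/* Illustrative userspace sketch (not part of the patch). */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xen/public/evtchn.h>	/* assumed visible to userspace; path may differ */

int main(void)
{
	struct ioctl_evtchn_bind_unbound_port bind = { .remote_domain = 0 };
	unsigned int port;	/* evtchn_port_t is a 32-bit port number */
	int fd, rc;

	fd = open("/dev/xen/evtchn", O_RDWR);
	if (fd < 0)
		return 1;

	/* Allocate a new unbound port; the ioctl returns the port number. */
	rc = ioctl(fd, IOCTL_EVTCHN_BIND_UNBOUND_PORT, &bind);
	if (rc < 0)
		goto out;
	printf("bound port %d\n", rc);

	/* Block until an event arrives; read() yields pending port numbers. */
	if (read(fd, &port, sizeof(port)) == sizeof(port)) {
		/* Writing a port back unmasks it so further events are delivered. */
		(void)write(fd, &port, sizeof(port));
	}
out:
	close(fd);
	return 0;
}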
53971 diff -urNp linux-2.6/drivers/xen/evtchn/Makefile new/drivers/xen/evtchn/Makefile
53972 --- linux-2.6/drivers/xen/evtchn/Makefile       1970-01-01 01:00:00.000000000 +0100
53973 +++ new/drivers/xen/evtchn/Makefile     2006-05-09 12:34:37.000000000 +0200
53974 @@ -0,0 +1,2 @@
53975 +
53976 +obj-y  := evtchn.o
53977 diff -urNp linux-2.6/drivers/xen/Kconfig new/drivers/xen/Kconfig
53978 --- linux-2.6/drivers/xen/Kconfig       1970-01-01 01:00:00.000000000 +0100
53979 +++ new/drivers/xen/Kconfig     2006-06-28 14:32:14.000000000 +0200
53980 @@ -0,0 +1,248 @@
53981 +#
53982 +# This Kconfig describes Xen options
53983 +#
53984 +
53985 +mainmenu "Xen Configuration"
53986 +
53987 +config XEN
53988 +       bool
53989 +       default y if X86_XEN || X86_64_XEN
53990 +       help
53991 +         This is the Linux Xen port.
53992 +
53993 +if XEN
53994 +config XEN_INTERFACE_VERSION
53995 +       hex
53996 +       default 0x00030202
53997 +
53998 +menu "XEN"
53999 +
54000 +config XEN_PRIVILEGED_GUEST
54001 +       bool "Privileged Guest (domain 0)"
54002 +       depends on XEN
54003 +       default n
54004 +       help
54005 +         Support for privileged operation (domain 0)
54006 +
54007 +config XEN_UNPRIVILEGED_GUEST
54008 +       bool
54009 +       default !XEN_PRIVILEGED_GUEST
54010 +
54011 +config XEN_PRIVCMD
54012 +       bool
54013 +       depends on PROC_FS
54014 +       default y
54015 +
54016 +config XEN_BACKEND
54017 +        tristate "Backend driver support"
54018 +        default y
54019 +        help
54020 +          Support for backend device drivers that provide I/O services
54021 +          to other virtual machines.
54022 +
54023 +config XEN_PCIDEV_BACKEND
54024 +       tristate "PCI device backend driver"
54025 +       depends on PCI && XEN_BACKEND
54026 +       default XEN_PRIVILEGED_GUEST
54027 +       help
54028 +         The PCI device backend driver allows the kernel to export arbitrary
54029 +         PCI devices to other guests. If you select this to be a module, you
54030 +         will need to make sure no other driver has bound to the device(s)
54031 +         you want to make visible to other guests.
54032 +
54033 +choice
54034 +       prompt "PCI Backend Mode"
54035 +       depends on XEN_PCIDEV_BACKEND
54036 +       default XEN_PCIDEV_BACKEND_VPCI
54037 +
54038 +config XEN_PCIDEV_BACKEND_VPCI
54039 +       bool "Virtual PCI"
54040 +       ---help---
54041 +         This PCI Backend hides the true PCI topology and makes the frontend
54042 +         think there is a single PCI bus with only the exported devices on it.
54043 +         For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
54044 +         second device at 02:1a.0 will be re-assigned to 00:01.0.
54045 +
54046 +config XEN_PCIDEV_BACKEND_PASS
54047 +       bool "Passthrough"
54048 +       ---help---
54049 +         This PCI Backend provides a real view of the PCI topology to the
54050 +         frontend (for example, a device at 06:01.b will still appear at
54051 +         06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
54052 +         PCI devices to its driver domains. This may be required for drivers
54053 +         which depend on finding their hardware in certain bus/slot
54054 +         locations.
54055 +
54056 +endchoice
54057 +
54058 +config XEN_PCIDEV_BE_DEBUG
54059 +       bool "PCI Backend Debugging"
54060 +       depends on XEN_PCIDEV_BACKEND
54061 +       default n
54062 +
54063 +config XEN_BLKDEV_BACKEND
54064 +       tristate "Block-device backend driver"
54065 +        depends on XEN_BACKEND
54066 +       default y
54067 +       help
54068 +         The block-device backend driver allows the kernel to export its
54069 +         block devices to other guests via a high-performance shared-memory
54070 +         interface.
54071 +
54072 +config XEN_XENBUS_DEV
54073 +       bool
54074 +       depends on PROC_FS
54075 +       default y
54076 +
54077 +config XEN_NETDEV_BACKEND
54078 +       tristate "Network-device backend driver"
54079 +        depends on XEN_BACKEND && NET
54080 +       default y
54081 +       help
54082 +         The network-device backend driver allows the kernel to export its
54083 +         network devices to other guests via a high-performance shared-memory
54084 +         interface.
54085 +
54086 +config XEN_NETDEV_PIPELINED_TRANSMITTER
54087 +       bool "Pipelined transmitter (DANGEROUS)"
54088 +       depends on XEN_NETDEV_BACKEND
54089 +       default n
54090 +       help
54091 +         If the net backend is a dumb domain, such as a transparent Ethernet
54092 +         bridge with no local IP interface, it is safe to say Y here to get
54093 +         slightly lower network overhead.
54094 +         If the backend has a local IP interface; or may be doing smart things
54095 +         like reassembling packets to perform firewall filtering; or if you
54096 +         are unsure; or if you experience network hangs when this option is
54097 +         enabled; then you must say N here.
54098 +
54099 +config XEN_NETDEV_LOOPBACK
54100 +       tristate "Network-device loopback driver"
54101 +       depends on XEN_NETDEV_BACKEND
54102 +       default y
54103 +       help
54104 +         A two-interface loopback device to emulate a local netfront-netback
54105 +         connection.
54106 +
54107 +config XEN_TPMDEV_BACKEND
54108 +       tristate "TPM-device backend driver"
54109 +        depends on XEN_BACKEND
54110 +       default n
54111 +       help
54112 +         The TPM-device backend driver allows other guests to access a
54113 +         virtual TPM.
54113 +
54114 +config XEN_TPMDEV_CLOSE_IF_VTPM_FAILS
54115 +       bool "TPM backend closes upon vTPM failure"
54116 +       depends on XEN_TPMDEV_BACKEND
54117 +       default n
54118 +       help
54119 +         The TPM backend closes the channel if the vTPM in userspace indicates
54120 +         a failure. The corresponding domain's channel will be closed.
54121 +         Say Y if you want this feature.
54122 +
54123 +config XEN_BLKDEV_FRONTEND
54124 +       tristate "Block-device frontend driver"
54125 +       depends on XEN
54126 +       default y
54127 +       help
54128 +         The block-device frontend driver allows the kernel to access block
54129 +         devices mounted within another guest OS. Unless you are building a
54130 +         dedicated device-driver domain, or your master control domain
54131 +         (domain 0), you almost certainly want to say Y here.
54132 +
54133 +config XEN_NETDEV_FRONTEND
54134 +       tristate "Network-device frontend driver"
54135 +       depends on XEN && NET
54136 +       default y
54137 +       help
54138 +         The network-device frontend driver allows the kernel to access
54139 +         network interfaces within another guest OS. Unless you are building a
54140 +         dedicated device-driver domain, or your master control domain
54141 +         (domain 0), you almost certainly want to say Y here.
54142 +
54143 +config XEN_SCRUB_PAGES
54144 +       bool "Scrub memory before freeing it to Xen"
54145 +       default y
54146 +       help
54147 +         Erase memory contents before freeing it back to Xen's global
54148 +         pool. This ensures that any secrets contained within that
54149 +         memory (e.g., private keys) cannot be found by other guests that
54150 +         may be running on the machine. Most people will want to say Y here.
54151 +         If security is not a concern then you may increase performance by
54152 +         saying N.
54153 +
54154 +config XEN_DISABLE_SERIAL
54155 +       bool "Disable serial port drivers"
54156 +       default y
54157 +       help
54158 +         Disable serial port drivers, allowing the Xen console driver
54159 +         to provide a serial console at ttyS0.
54160 +
54161 +config XEN_SYSFS
54162 +       tristate "Export Xen attributes in sysfs"
54163 +       depends on SYSFS
54164 +       default y
54165 +       help
54166 +               Xen hypervisor attributes will show up under /sys/hypervisor/.
54167 +
54168 +choice
54169 +       prompt "Xen version compatibility"
54170 +       default XEN_COMPAT_030002_AND_LATER
54171 +
54172 +       config XEN_COMPAT_030002_AND_LATER
54173 +               bool "3.0.2 and later"
54174 +
54175 +       config XEN_COMPAT_LATEST_ONLY
54176 +               bool "no compatibility code"
54177 +
54178 +endchoice
54179 +
54180 +config XEN_COMPAT_030002
54181 +       bool
54182 +       default XEN_COMPAT_030002_AND_LATER
54183 +
54184 +endmenu
54185 +
54186 +config HAVE_ARCH_ALLOC_SKB
54187 +       bool
54188 +       default y
54189 +
54190 +config HAVE_ARCH_DEV_ALLOC_SKB
54191 +       bool
54192 +       default y
54193 +
54194 +config HAVE_IRQ_IGNORE_UNHANDLED
54195 +       bool
54196 +       default y
54197 +
54198 +config NO_IDLE_HZ
54199 +       bool
54200 +       default y
54201 +
54202 +config XEN_UTIL
54203 +       bool
54204 +       default y
54205 +
54206 +config XEN_BALLOON
54207 +       bool
54208 +       default y
54209 +
54210 +config XEN_DEVMEM
54211 +       bool
54212 +       default y
54213 +
54214 +config XEN_SKBUFF
54215 +       bool
54216 +       default y
54217 +       depends on NET
54218 +
54219 +config XEN_REBOOT
54220 +       bool
54221 +       default y
54222 +
54223 +config XEN_SMPBOOT
54224 +       bool
54225 +       default y
54226 +       depends on SMP
54227 +
54228 +endif
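[Editorial illustration, not part of the patch: a plausible .config fragment that the options above could yield for an unprivileged (domU) guest. All symbols appear in the Kconfig above; which ones are actually emitted depends on the rest of the tree's configuration.]

# Illustrative .config fragment (assumption, not part of the patch)
CONFIG_XEN=y
CONFIG_XEN_INTERFACE_VERSION=0x00030202
# CONFIG_XEN_PRIVILEGED_GUEST is not set
CONFIG_XEN_UNPRIVILEGED_GUEST=y
CONFIG_XEN_BLKDEV_FRONTEND=y
CONFIG_XEN_NETDEV_FRONTEND=y
CONFIG_XEN_SCRUB_PAGES=y
CONFIG_XEN_DISABLE_SERIAL=y
CONFIG_XEN_SYSFS=y
CONFIG_XEN_COMPAT_030002_AND_LATER=y
CONFIG_XEN_COMPAT_030002=y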
54229 diff -urNp linux-2.6/drivers/xen/Makefile new/drivers/xen/Makefile
54230 --- linux-2.6/drivers/xen/Makefile      1970-01-01 01:00:00.000000000 +0100
54231 +++ new/drivers/xen/Makefile    2006-06-28 14:32:14.000000000 +0200
54232 @@ -0,0 +1,16 @@
54233 +obj-y  += core/
54234 +obj-y  += console/
54235 +obj-y  += evtchn/
54236 +obj-y  += privcmd/
54237 +obj-y  += xenbus/
54238 +
54239 +obj-$(CONFIG_XEN_UTIL)                 += util.o
54240 +obj-$(CONFIG_XEN_BALLOON)              += balloon/
54241 +obj-$(CONFIG_XEN_DEVMEM)               += char/
54242 +obj-$(CONFIG_XEN_BLKDEV_BACKEND)       += blkback/
54243 +obj-$(CONFIG_XEN_NETDEV_BACKEND)       += netback/
54244 +obj-$(CONFIG_XEN_TPMDEV_BACKEND)       += tpmback/
54245 +obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += blkfront/
54246 +obj-$(CONFIG_XEN_NETDEV_FRONTEND)      += netfront/
54247 +obj-$(CONFIG_XEN_PCIDEV_BACKEND)       += pciback/
54248 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront/
54249 diff -urNp linux-2.6/drivers/xen/netback/common.h new/drivers/xen/netback/common.h
54250 --- linux-2.6/drivers/xen/netback/common.h      1970-01-01 01:00:00.000000000 +0100
54251 +++ new/drivers/xen/netback/common.h    2006-06-28 14:32:14.000000000 +0200
54252 @@ -0,0 +1,124 @@
54253 +/******************************************************************************
54254 + * arch/xen/drivers/netif/backend/common.h
54255 + * 
54256 + * This program is free software; you can redistribute it and/or
54257 + * modify it under the terms of the GNU General Public License version 2
54258 + * as published by the Free Software Foundation; or, when distributed
54259 + * separately from the Linux kernel or incorporated into other
54260 + * software packages, subject to the following license:
54261 + * 
54262 + * Permission is hereby granted, free of charge, to any person obtaining a copy
54263 + * of this source file (the "Software"), to deal in the Software without
54264 + * restriction, including without limitation the rights to use, copy, modify,
54265 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
54266 + * and to permit persons to whom the Software is furnished to do so, subject to
54267 + * the following conditions:
54268 + * 
54269 + * The above copyright notice and this permission notice shall be included in
54270 + * all copies or substantial portions of the Software.
54271 + * 
54272 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
54273 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
54274 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
54275 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
54276 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
54277 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
54278 + * IN THE SOFTWARE.
54279 + */
54280 +
54281 +#ifndef __NETIF__BACKEND__COMMON_H__
54282 +#define __NETIF__BACKEND__COMMON_H__
54283 +
54284 +#include <linux/config.h>
54285 +#include <linux/version.h>
54286 +#include <linux/module.h>
54287 +#include <linux/interrupt.h>
54288 +#include <linux/slab.h>
54289 +#include <linux/ip.h>
54290 +#include <linux/in.h>
54291 +#include <linux/netdevice.h>
54292 +#include <linux/etherdevice.h>
54293 +#include <linux/wait.h>
54294 +#include <xen/evtchn.h>
54295 +#include <xen/interface/io/netif.h>
54296 +#include <asm/io.h>
54297 +#include <asm/pgalloc.h>
54298 +#include <xen/interface/grant_table.h>
54299 +#include <xen/gnttab.h>
54300 +#include <xen/driver_util.h>
54301 +
54302 +#define DPRINTK(_f, _a...)                     \
54303 +       pr_debug("(file=%s, line=%d) " _f,      \
54304 +                __FILE__ , __LINE__ , ## _a )
54305 +#define IPRINTK(fmt, args...)                          \
54306 +       printk(KERN_INFO "xen_net: " fmt, ##args)
54307 +#define WPRINTK(fmt, args...)                          \
54308 +       printk(KERN_WARNING "xen_net: " fmt, ##args)
54309 +
54310 +typedef struct netif_st {
54311 +       /* Unique identifier for this interface. */
54312 +       domid_t          domid;
54313 +       unsigned int     handle;
54314 +
54315 +       u8               fe_dev_addr[6];
54316 +
54317 +       /* Physical parameters of the comms window. */
54318 +       grant_handle_t   tx_shmem_handle;
54319 +       grant_ref_t      tx_shmem_ref; 
54320 +       grant_handle_t   rx_shmem_handle;
54321 +       grant_ref_t      rx_shmem_ref; 
54322 +       unsigned int     evtchn;
54323 +       unsigned int     irq;
54324 +
54325 +       /* The shared rings and indexes. */
54326 +       netif_tx_back_ring_t tx;
54327 +       netif_rx_back_ring_t rx;
54328 +       struct vm_struct *tx_comms_area;
54329 +       struct vm_struct *rx_comms_area;
54330 +
54331 +       /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
54332 +       RING_IDX rx_req_cons_peek;
54333 +
54334 +       /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
54335 +       unsigned long   credit_bytes;
54336 +       unsigned long   credit_usec;
54337 +       unsigned long   remaining_credit;
54338 +       struct timer_list credit_timeout;
54339 +
54340 +       /* Miscellaneous private stuff. */
54341 +       enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
54342 +       int active;
54343 +       struct list_head list;  /* scheduling list */
54344 +       atomic_t         refcnt;
54345 +       struct net_device *dev;
54346 +       struct net_device_stats stats;
54347 +
54348 +       wait_queue_head_t waiting_to_free;
54349 +} netif_t;
54350 +
54351 +#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
54352 +#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
54353 +
54354 +void netif_disconnect(netif_t *netif);
54355 +
54356 +netif_t *netif_alloc(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]);
54357 +int netif_map(netif_t *netif, unsigned long tx_ring_ref,
54358 +             unsigned long rx_ring_ref, unsigned int evtchn);
54359 +
54360 +#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
54361 +#define netif_put(_b)                                          \
54362 +       do {                                                    \
54363 +               if ( atomic_dec_and_test(&(_b)->refcnt) )       \
54364 +                       wake_up(&(_b)->waiting_to_free);        \
54365 +       } while (0)
54366 +
54367 +void netif_xenbus_init(void);
54368 +
54369 +void netif_schedule_work(netif_t *netif);
54370 +void netif_deschedule_work(netif_t *netif);
54371 +
54372 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
54373 +struct net_device_stats *netif_be_get_stats(struct net_device *dev);
54374 +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
54375 +
54376 +#endif /* __NETIF__BACKEND__COMMON_H__ */
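[Editorial illustration, not part of the patch: the credit_bytes/credit_usec/remaining_credit fields above implement transmit shaping. The sketch below shows, under stated assumptions, the kind of token-bucket check those fields suggest; the function name is hypothetical and the real replenish/defer logic lives in netback.c and is more involved.]

/* Illustrative sketch (not part of the patch): using the credit fields above
 * as a simple token bucket.  tx_credit_example() is a hypothetical name; the
 * actual logic is in netback.c. */
static int tx_credit_example(netif_t *netif, unsigned int packet_bytes)
{
	if (netif->remaining_credit < packet_bytes)
		return 0;	/* out of credit: defer until credit_timeout refills */
	netif->remaining_credit -= packet_bytes;
	return 1;	/* within the credit_bytes-per-credit_usec allowance */
}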
54377 diff -urNp linux-2.6/drivers/xen/netback/interface.c new/drivers/xen/netback/interface.c
54378 --- linux-2.6/drivers/xen/netback/interface.c   1970-01-01 01:00:00.000000000 +0100
54379 +++ new/drivers/xen/netback/interface.c 2006-07-07 15:10:03.000000000 +0200
54380 @@ -0,0 +1,314 @@
54381 +/******************************************************************************
54382 + * arch/xen/drivers/netif/backend/interface.c
54383 + * 
54384 + * Network-device interface management.
54385 + * 
54386 + * Copyright (c) 2004-2005, Keir Fraser
54387 + * 
54388 + * This program is free software; you can redistribute it and/or
54389 + * modify it under the terms of the GNU General Public License version 2
54390 + * as published by the Free Software Foundation; or, when distributed
54391 + * separately from the Linux kernel or incorporated into other
54392 + * software packages, subject to the following license:
54393 + * 
54394 + * Permission is hereby granted, free of charge, to any person obtaining a copy
54395 + * of this source file (the "Software"), to deal in the Software without
54396 + * restriction, including without limitation the rights to use, copy, modify,
54397 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
54398 + * and to permit persons to whom the Software is furnished to do so, subject to
54399 + * the following conditions:
54400 + * 
54401 + * The above copyright notice and this permission notice shall be included in
54402 + * all copies or substantial portions of the Software.
54403 + * 
54404 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
54405 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
54406 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
54407 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
54408 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
54409 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
54410 + * IN THE SOFTWARE.
54411 + */
54412 +
54413 +#include "common.h"
54414 +#include <linux/ethtool.h>
54415 +#include <linux/rtnetlink.h>
54416 +
54417 +static void __netif_up(netif_t *netif)
54418 +{
54419 +       struct net_device *dev = netif->dev;
54420 +       netif_tx_lock_bh(dev);
54421 +       netif->active = 1;
54422 +       netif_tx_unlock_bh(dev);
54423 +       enable_irq(netif->irq);
54424 +       netif_schedule_work(netif);
54425 +}
54426 +
54427 +static void __netif_down(netif_t *netif)
54428 +{
54429 +       struct net_device *dev = netif->dev;
54430 +       disable_irq(netif->irq);
54431 +       netif_tx_lock_bh(dev);
54432 +       netif->active = 0;
54433 +       netif_tx_unlock_bh(dev);
54434 +       netif_deschedule_work(netif);
54435 +}
54436 +
54437 +static int net_open(struct net_device *dev)
54438 +{
54439 +       netif_t *netif = netdev_priv(dev);
54440 +       if (netif->status == CONNECTED)
54441 +               __netif_up(netif);
54442 +       netif_start_queue(dev);
54443 +       return 0;
54444 +}
54445 +
54446 +static int net_close(struct net_device *dev)
54447 +{
54448 +       netif_t *netif = netdev_priv(dev);
54449 +       netif_stop_queue(dev);
54450 +       if (netif->status == CONNECTED)
54451 +               __netif_down(netif);
54452 +       return 0;
54453 +}
54454 +
54455 +static struct ethtool_ops network_ethtool_ops =
54456 +{
54457 +       .get_tx_csum = ethtool_op_get_tx_csum,
54458 +       .set_tx_csum = ethtool_op_set_tx_csum,
54459 +};
54460 +
54461 +netif_t *netif_alloc(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN])
54462 +{
54463 +       int err = 0, i;
54464 +       struct net_device *dev;
54465 +       netif_t *netif;
54466 +       char name[IFNAMSIZ] = {};
54467 +
54468 +       snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
54469 +       dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
54470 +       if (dev == NULL) {
54471 +               DPRINTK("Could not create netif: out of memory\n");
54472 +               return ERR_PTR(-ENOMEM);
54473 +       }
54474 +
54475 +       netif = netdev_priv(dev);
54476 +       memset(netif, 0, sizeof(*netif));
54477 +       netif->domid  = domid;
54478 +       netif->handle = handle;
54479 +       netif->status = DISCONNECTED;
54480 +       atomic_set(&netif->refcnt, 1);
54481 +       init_waitqueue_head(&netif->waiting_to_free);
54482 +       netif->dev = dev;
54483 +
54484 +       netif->credit_bytes = netif->remaining_credit = ~0UL;
54485 +       netif->credit_usec  = 0UL;
54486 +       init_timer(&netif->credit_timeout);
54487 +
54488 +       dev->hard_start_xmit = netif_be_start_xmit;
54489 +       dev->get_stats       = netif_be_get_stats;
54490 +       dev->open            = net_open;
54491 +       dev->stop            = net_close;
54492 +       dev->features        = NETIF_F_IP_CSUM;
54493 +
54494 +       SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
54495 +
54496 +       /* Disable queuing. */
54497 +       dev->tx_queue_len = 0;
54498 +
54499 +       for (i = 0; i < ETH_ALEN; i++)
54500 +               if (be_mac[i] != 0)
54501 +                       break;
54502 +       if (i == ETH_ALEN) {
54503 +               /*
54504 +                * Initialise a dummy MAC address. We choose the numerically
54505 +                * largest non-broadcast address to prevent the address getting
54506 +                * stolen by an Ethernet bridge for STP purposes.
54507 +                * (FE:FF:FF:FF:FF:FF)
54508 +                */ 
54509 +               memset(dev->dev_addr, 0xFF, ETH_ALEN);
54510 +               dev->dev_addr[0] &= ~0x01;
54511 +       } else
54512 +               memcpy(dev->dev_addr, be_mac, ETH_ALEN);
54513 +
54514 +       rtnl_lock();
54515 +       err = register_netdevice(dev);
54516 +       rtnl_unlock();
54517 +       if (err) {
54518 +               DPRINTK("Could not register new net device %s: err=%d\n",
54519 +                       dev->name, err);
54520 +               free_netdev(dev);
54521 +               return ERR_PTR(err);
54522 +       }
54523 +
54524 +       DPRINTK("Successfully created netif\n");
54525 +       return netif;
54526 +}
54527 +
54528 +static int map_frontend_pages(
54529 +       netif_t *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
54530 +{
54531 +       struct gnttab_map_grant_ref op;
54532 +       int ret;
54533 +
54534 +       gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr,
54535 +                         GNTMAP_host_map, tx_ring_ref, netif->domid);
54536 +    
54537 +       lock_vm_area(netif->tx_comms_area);
54538 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
54539 +       unlock_vm_area(netif->tx_comms_area);
54540 +       BUG_ON(ret);
54541 +
54542 +       if (op.status) { 
54543 +               DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
54544 +               return op.status;
54545 +       }
54546 +
54547 +       netif->tx_shmem_ref    = tx_ring_ref;
54548 +       netif->tx_shmem_handle = op.handle;
54549 +
54550 +       gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr,
54551 +                         GNTMAP_host_map, rx_ring_ref, netif->domid);
54552 +
54553 +       lock_vm_area(netif->rx_comms_area);
54554 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
54555 +       unlock_vm_area(netif->rx_comms_area);
54556 +       BUG_ON(ret);
54557 +
54558 +       if (op.status) {
54559 +               DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
54560 +               return op.status;
54561 +       }
54562 +
54563 +       netif->rx_shmem_ref    = rx_ring_ref;
54564 +       netif->rx_shmem_handle = op.handle;
54565 +
54566 +       return 0;
54567 +}
54568 +
54569 +static void unmap_frontend_pages(netif_t *netif)
54570 +{
54571 +       struct gnttab_unmap_grant_ref op;
54572 +       int ret;
54573 +
54574 +       gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
54575 +                           GNTMAP_host_map, netif->tx_shmem_handle);
54576 +
54577 +       lock_vm_area(netif->tx_comms_area);
54578 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
54579 +       unlock_vm_area(netif->tx_comms_area);
54580 +       BUG_ON(ret);
54581 +
54582 +       gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
54583 +                           GNTMAP_host_map, netif->rx_shmem_handle);
54584 +
54585 +       lock_vm_area(netif->rx_comms_area);
54586 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
54587 +       unlock_vm_area(netif->rx_comms_area);
54588 +       BUG_ON(ret);
54589 +}
54590 +
54591 +int netif_map(netif_t *netif, unsigned long tx_ring_ref,
54592 +             unsigned long rx_ring_ref, unsigned int evtchn)
54593 +{
54594 +       int err = -ENOMEM;
54595 +       netif_tx_sring_t *txs;
54596 +       netif_rx_sring_t *rxs;
54597 +       struct evtchn_bind_interdomain bind_interdomain;
54598 +
54599 +       /* Already connected through? */
54600 +       if (netif->irq)
54601 +               return 0;
54602 +
54603 +       netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
54604 +       if (netif->tx_comms_area == NULL)
54605 +               return -ENOMEM;
54606 +       netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
54607 +       if (netif->rx_comms_area == NULL)
54608 +               goto err_rx;
54609 +
54610 +       err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
54611 +       if (err)
54612 +               goto err_map;
54613 +
54614 +       bind_interdomain.remote_dom = netif->domid;
54615 +       bind_interdomain.remote_port = evtchn;
54616 +
54617 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
54618 +                                         &bind_interdomain);
54619 +       if (err)
54620 +               goto err_hypervisor;
54621 +
54622 +       netif->evtchn = bind_interdomain.local_port;
54623 +
54624 +       netif->irq = bind_evtchn_to_irqhandler(
54625 +               netif->evtchn, netif_be_int, 0, netif->dev->name, netif);
54626 +       disable_irq(netif->irq);
54627 +
54628 +       txs = (netif_tx_sring_t *)netif->tx_comms_area->addr;
54629 +       BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
54630 +
54631 +       rxs = (netif_rx_sring_t *)
54632 +               ((char *)netif->rx_comms_area->addr);
54633 +       BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
54634 +
54635 +       netif->rx_req_cons_peek = 0;
54636 +
54637 +       netif_get(netif);
54638 +       wmb(); /* Other CPUs see new state before interface is started. */
54639 +
54640 +       rtnl_lock();
54641 +       netif->status = CONNECTED;
54642 +       wmb();
54643 +       if (netif_running(netif->dev))
54644 +               __netif_up(netif);
54645 +       rtnl_unlock();
54646 +
54647 +       return 0;
54648 +err_hypervisor:
54649 +       unmap_frontend_pages(netif);
54650 +err_map:
54651 +       free_vm_area(netif->rx_comms_area);
54652 +err_rx:
54653 +       free_vm_area(netif->tx_comms_area);
54654 +       return err;
54655 +}
54656 +
54657 +static void netif_free(netif_t *netif)
54658 +{
54659 +       atomic_dec(&netif->refcnt);
54660 +       wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
54661 +
54662 +       if (netif->irq)
54663 +               unbind_from_irqhandler(netif->irq, netif);
54664 +       
54665 +       unregister_netdev(netif->dev);
54666 +
54667 +       if (netif->tx.sring) {
54668 +               unmap_frontend_pages(netif);
54669 +               free_vm_area(netif->tx_comms_area);
54670 +               free_vm_area(netif->rx_comms_area);
54671 +       }
54672 +
54673 +       free_netdev(netif->dev);
54674 +}
54675 +
54676 +void netif_disconnect(netif_t *netif)
54677 +{
54678 +       switch (netif->status) {
54679 +       case CONNECTED:
54680 +               rtnl_lock();
54681 +               netif->status = DISCONNECTING;
54682 +               wmb();
54683 +               if (netif_running(netif->dev))
54684 +                       __netif_down(netif);
54685 +               rtnl_unlock();
54686 +               netif_put(netif);
54687 +               /* fall through */
54688 +       case DISCONNECTED:
54689 +               netif_free(netif);
54690 +               break;
54691 +       default:
54692 +               BUG();
54693 +       }
54694 +}
54695 diff -urNp linux-2.6/drivers/xen/netback/loopback.c new/drivers/xen/netback/loopback.c
54696 --- linux-2.6/drivers/xen/netback/loopback.c    1970-01-01 01:00:00.000000000 +0100
54697 +++ new/drivers/xen/netback/loopback.c  2006-07-07 15:10:03.000000000 +0200
54698 @@ -0,0 +1,260 @@
54699 +/******************************************************************************
54700 + * netback/loopback.c
54701 + * 
54702 + * A two-interface loopback device to emulate a local netfront-netback
54703 + * connection. This ensures that local packet delivery looks identical
54704 + * to inter-domain delivery. Most importantly, packets delivered locally
54705 + * originating from other domains will get *copied* when they traverse this
54706 + * driver. This prevents unbounded delays in socket-buffer queues from
54707 + * causing the netback driver to "seize up".
54708 + * 
54709 + * This driver creates a symmetric pair of loopback interfaces with names
54710 + * vif0.0 and veth0. The intention is that 'vif0.0' is bound to an Ethernet
54711 + * bridge, just like a proper netback interface, while a local IP interface
54712 + * is configured on 'veth0'.
54713 + * 
54714 + * As with a real netback interface, vif0.0 is configured with a suitable
54715 + * dummy MAC address. No default is provided for veth0: a reasonable strategy
54716 + * is to transfer eth0's MAC address to veth0, and give eth0 a dummy address
54717 + * (to avoid confusing the Etherbridge).
54718 + * 
54719 + * Copyright (c) 2005 K A Fraser
54720 + * 
54721 + * This program is free software; you can redistribute it and/or
54722 + * modify it under the terms of the GNU General Public License version 2
54723 + * as published by the Free Software Foundation; or, when distributed
54724 + * separately from the Linux kernel or incorporated into other
54725 + * software packages, subject to the following license:
54726 + * 
54727 + * Permission is hereby granted, free of charge, to any person obtaining a copy
54728 + * of this source file (the "Software"), to deal in the Software without
54729 + * restriction, including without limitation the rights to use, copy, modify,
54730 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
54731 + * and to permit persons to whom the Software is furnished to do so, subject to
54732 + * the following conditions:
54733 + * 
54734 + * The above copyright notice and this permission notice shall be included in
54735 + * all copies or substantial portions of the Software.
54736 + * 
54737 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
54738 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
54739 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
54740 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
54741 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
54742 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
54743 + * IN THE SOFTWARE.
54744 + */
54745 +
54746 +#include <linux/config.h>
54747 +#include <linux/module.h>
54748 +#include <linux/netdevice.h>
54749 +#include <linux/inetdevice.h>
54750 +#include <linux/etherdevice.h>
54751 +#include <linux/skbuff.h>
54752 +#include <linux/ethtool.h>
54753 +#include <net/dst.h>
54754 +
54755 +static int nloopbacks = 8;
54756 +module_param(nloopbacks, int, 0);
54757 +MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create");
54758 +
54759 +struct net_private {
54760 +       struct net_device *loopback_dev;
54761 +       struct net_device_stats stats;
54762 +};
54763 +
54764 +static int loopback_open(struct net_device *dev)
54765 +{
54766 +       struct net_private *np = netdev_priv(dev);
54767 +       memset(&np->stats, 0, sizeof(np->stats));
54768 +       netif_start_queue(dev);
54769 +       return 0;
54770 +}
54771 +
54772 +static int loopback_close(struct net_device *dev)
54773 +{
54774 +       netif_stop_queue(dev);
54775 +       return 0;
54776 +}
54777 +
54778 +static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev)
54779 +{
54780 +       struct net_private *np = netdev_priv(dev);
54781 +
54782 +       dst_release(skb->dst);
54783 +       skb->dst = NULL;
54784 +
54785 +       skb_orphan(skb);
54786 +
54787 +       np->stats.tx_bytes += skb->len;
54788 +       np->stats.tx_packets++;
54789 +
54790 +       /* Switch to loopback context. */
54791 +       dev = np->loopback_dev;
54792 +       np  = netdev_priv(dev);
54793 +
54794 +       np->stats.rx_bytes += skb->len;
54795 +       np->stats.rx_packets++;
54796 +
54797 +       if (skb->ip_summed == CHECKSUM_HW) {
54798 +               /* Defer checksum calculation. */
54799 +               skb->proto_csum_blank = 1;
54800 +               /* Must be a local packet: assert its integrity. */
54801 +               skb->proto_data_valid = 1;
54802 +       }
54803 +
54804 +       skb->ip_summed = skb->proto_data_valid ?
54805 +               CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
54806 +
54807 +       skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
54808 +       skb->protocol = eth_type_trans(skb, dev);
54809 +       skb->dev      = dev;
54810 +       dev->last_rx  = jiffies;
54811 +       netif_rx(skb);
54812 +
54813 +       return 0;
54814 +}
54815 +
54816 +static struct net_device_stats *loopback_get_stats(struct net_device *dev)
54817 +{
54818 +       struct net_private *np = netdev_priv(dev);
54819 +       return &np->stats;
54820 +}
54821 +
54822 +static struct ethtool_ops network_ethtool_ops =
54823 +{
54824 +       .get_tx_csum = ethtool_op_get_tx_csum,
54825 +       .set_tx_csum = ethtool_op_set_tx_csum,
54826 +       .get_sg = ethtool_op_get_sg,
54827 +       .set_sg = ethtool_op_set_sg,
54828 +       .get_tso = ethtool_op_get_tso,
54829 +       .set_tso = ethtool_op_set_tso,
54830 +};
54831 +
54832 +/*
54833 + * Nothing to do here. Virtual interface is point-to-point and the
54834 + * physical interface is probably promiscuous anyway.
54835 + */
54836 +static void loopback_set_multicast_list(struct net_device *dev)
54837 +{
54838 +}
54839 +
54840 +static void loopback_construct(struct net_device *dev, struct net_device *lo)
54841 +{
54842 +       struct net_private *np = netdev_priv(dev);
54843 +
54844 +       np->loopback_dev     = lo;
54845 +
54846 +       dev->open            = loopback_open;
54847 +       dev->stop            = loopback_close;
54848 +       dev->hard_start_xmit = loopback_start_xmit;
54849 +       dev->get_stats       = loopback_get_stats;
54850 +       dev->set_multicast_list = loopback_set_multicast_list;
54851 +       dev->change_mtu      = NULL; /* allow arbitrary mtu */
54852 +
54853 +       dev->tx_queue_len    = 0;
54854 +
54855 +       dev->features        = (NETIF_F_HIGHDMA |
54856 +                               NETIF_F_LLTX |
54857 +                               NETIF_F_TSO |
54858 +                               NETIF_F_SG |
54859 +                               NETIF_F_IP_CSUM);
54860 +
54861 +       SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
54862 +
54863 +       /*
54864 +        * We do not set a jumbo MTU on the interface. Otherwise the network
54865 +        * stack will try to send large packets that will get dropped by the
54866 +        * Ethernet bridge (unless the physical Ethernet interface is
54867 +        * configured to transfer jumbo packets). If a larger MTU is desired
54868 +        * then the system administrator can specify it using the 'ifconfig'
54869 +        * command.
54870 +        */
54871 +       /*dev->mtu             = 16*1024;*/
54872 +}
54873 +
54874 +static int __init make_loopback(int i)
54875 +{
54876 +       struct net_device *dev1, *dev2;
54877 +       char dev_name[IFNAMSIZ];
54878 +       int err = -ENOMEM;
54879 +
54880 +       sprintf(dev_name, "vif0.%d", i);
54881 +       dev1 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
54882 +       if (!dev1)
54883 +               return err;
54884 +
54885 +       sprintf(dev_name, "veth%d", i);
54886 +       dev2 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
54887 +       if (!dev2)
54888 +               goto fail_netdev2;
54889 +
54890 +       loopback_construct(dev1, dev2);
54891 +       loopback_construct(dev2, dev1);
54892 +
54893 +       /*
54894 +        * Initialise a dummy MAC address for the 'dummy backend' interface. We
54895 +        * choose the numerically largest non-broadcast address to prevent the
54896 +        * address getting stolen by an Ethernet bridge for STP purposes.
54897 +        */
54898 +       memset(dev1->dev_addr, 0xFF, ETH_ALEN);
54899 +       dev1->dev_addr[0] &= ~0x01;
54900 +
54901 +       if ((err = register_netdev(dev1)) != 0)
54902 +               goto fail;
54903 +
54904 +       if ((err = register_netdev(dev2)) != 0) {
54905 +               unregister_netdev(dev1);
54906 +               goto fail;
54907 +       }
54908 +
54909 +       return 0;
54910 +
54911 + fail:
54912 +       free_netdev(dev2);
54913 + fail_netdev2:
54914 +       free_netdev(dev1);
54915 +       return err;
54916 +}
54917 +
54918 +static void __init clean_loopback(int i)
54919 +{
54920 +       struct net_device *dev1, *dev2;
54921 +       char dev_name[IFNAMSIZ];
54922 +
54923 +       sprintf(dev_name, "vif0.%d", i);
54924 +       dev1 = dev_get_by_name(dev_name);
54925 +       sprintf(dev_name, "veth%d", i);
54926 +       dev2 = dev_get_by_name(dev_name);
54927 +       if (dev1 && dev2) {
54928 +               unregister_netdev(dev2);
54929 +               unregister_netdev(dev1);
54930 +               free_netdev(dev2);
54931 +               free_netdev(dev1);
54932 +       }
54933 +}
54934 +
54935 +static int __init loopback_init(void)
54936 +{
54937 +       int i, err = 0;
54938 +
54939 +       for (i = 0; i < nloopbacks; i++)
54940 +               if ((err = make_loopback(i)) != 0)
54941 +                       break;
54942 +
54943 +       return err;
54944 +}
54945 +
54946 +module_init(loopback_init);
54947 +
54948 +static void __exit loopback_exit(void)
54949 +{
54950 +       int i;
54951 +
54952 +       for (i = nloopbacks; i-- > 0; )
54953 +               clean_loopback(i);
54954 +}
54955 +
54956 +module_exit(loopback_exit);
54957 +
54958 +MODULE_LICENSE("Dual BSD/GPL");
54959 diff -urNp linux-2.6/drivers/xen/netback/Makefile new/drivers/xen/netback/Makefile
54960 --- linux-2.6/drivers/xen/netback/Makefile      1970-01-01 01:00:00.000000000 +0100
54961 +++ new/drivers/xen/netback/Makefile    2006-05-09 12:34:37.000000000 +0200
54962 @@ -0,0 +1,5 @@
54963 +obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
54964 +obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
54965 +
54966 +netbk-y   := netback.o xenbus.o interface.o
54967 +netloop-y := loopback.o
54968 diff -urNp linux-2.6/drivers/xen/netback/netback.c new/drivers/xen/netback/netback.c
54969 --- linux-2.6/drivers/xen/netback/netback.c     1970-01-01 01:00:00.000000000 +0100
54970 +++ new/drivers/xen/netback/netback.c   2006-07-07 16:05:52.000000000 +0200
54971 @@ -0,0 +1,1107 @@
54972 +/******************************************************************************
54973 + * drivers/xen/netback/netback.c
54974 + * 
54975 + * Back-end of the driver for virtual network devices. This portion of the
54976 + * driver exports a 'unified' network-device interface that can be accessed
54977 + * by any operating system that implements a compatible front end. A 
54978 + * reference front-end implementation can be found in:
54979 + *  drivers/xen/netfront/netfront.c
54980 + * 
54981 + * Copyright (c) 2002-2005, K A Fraser
54982 + * 
54983 + * This program is free software; you can redistribute it and/or
54984 + * modify it under the terms of the GNU General Public License version 2
54985 + * as published by the Free Software Foundation; or, when distributed
54986 + * separately from the Linux kernel or incorporated into other
54987 + * software packages, subject to the following license:
54988 + * 
54989 + * Permission is hereby granted, free of charge, to any person obtaining a copy
54990 + * of this source file (the "Software"), to deal in the Software without
54991 + * restriction, including without limitation the rights to use, copy, modify,
54992 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
54993 + * and to permit persons to whom the Software is furnished to do so, subject to
54994 + * the following conditions:
54995 + * 
54996 + * The above copyright notice and this permission notice shall be included in
54997 + * all copies or substantial portions of the Software.
54998 + * 
54999 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
55000 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
55001 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
55002 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
55003 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
55004 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
55005 + * IN THE SOFTWARE.
55006 + */
55007 +
55008 +#include "common.h"
55009 +#include <xen/balloon.h>
55010 +#include <xen/interface/memory.h>
55011 +
55012 +/*#define NETBE_DEBUG_INTERRUPT*/
55013 +
55014 +static void netif_idx_release(u16 pending_idx);
55015 +static void netif_page_release(struct page *page);
55016 +static void make_tx_response(netif_t *netif, 
55017 +                            netif_tx_request_t *txp,
55018 +                            s8       st);
55019 +static int  make_rx_response(netif_t *netif, 
55020 +                            u16      id, 
55021 +                            s8       st,
55022 +                            u16      offset,
55023 +                            u16      size,
55024 +                            u16      flags);
55025 +
55026 +static void net_tx_action(unsigned long unused);
55027 +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
55028 +
55029 +static void net_rx_action(unsigned long unused);
55030 +static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
55031 +
55032 +static struct timer_list net_timer;
55033 +
55034 +#define MAX_PENDING_REQS 256
55035 +
55036 +static struct sk_buff_head rx_queue;
55037 +static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1];
55038 +static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
55039 +static gnttab_transfer_t grant_rx_op[NET_RX_RING_SIZE];
55040 +static unsigned char rx_notify[NR_IRQS];
55041 +
55042 +static unsigned long mmap_vstart;
55043 +#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
55044 +
55045 +#define PKT_PROT_LEN 64
55046 +
55047 +static struct {
55048 +       netif_tx_request_t req;
55049 +       netif_t *netif;
55050 +} pending_tx_info[MAX_PENDING_REQS];
55051 +static u16 pending_ring[MAX_PENDING_REQS];
55052 +typedef unsigned int PEND_RING_IDX;
55053 +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
55054 +static PEND_RING_IDX pending_prod, pending_cons;
55055 +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
55056 +
55057 +/* Freed TX SKBs get batched on this ring before return to pending_ring. */
55058 +static u16 dealloc_ring[MAX_PENDING_REQS];
55059 +static PEND_RING_IDX dealloc_prod, dealloc_cons;
55060 +
55061 +static struct sk_buff_head tx_queue;
55062 +
55063 +static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
55064 +static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
55065 +static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
55066 +
55067 +static struct list_head net_schedule_list;
55068 +static spinlock_t net_schedule_list_lock;
55069 +
55070 +#define MAX_MFN_ALLOC 64
55071 +static unsigned long mfn_list[MAX_MFN_ALLOC];
55072 +static unsigned int alloc_index = 0;
55073 +static DEFINE_SPINLOCK(mfn_lock);
55074 +
55075 +static unsigned long alloc_mfn(void)
55076 +{
55077 +       unsigned long mfn = 0, flags;
55078 +       struct xen_memory_reservation reservation = {
55079 +               .nr_extents   = MAX_MFN_ALLOC,
55080 +               .extent_order = 0,
55081 +               .domid        = DOMID_SELF
55082 +       };
55083 +       set_xen_guest_handle(reservation.extent_start, mfn_list);
55084 +       spin_lock_irqsave(&mfn_lock, flags);
55085 +       if ( unlikely(alloc_index == 0) )
55086 +               alloc_index = HYPERVISOR_memory_op(
55087 +                       XENMEM_increase_reservation, &reservation);
55088 +       if ( alloc_index != 0 )
55089 +               mfn = mfn_list[--alloc_index];
55090 +       spin_unlock_irqrestore(&mfn_lock, flags);
55091 +       return mfn;
55092 +}
55093 +
55094 +static inline void maybe_schedule_tx_action(void)
55095 +{
55096 +       smp_mb();
55097 +       if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
55098 +           !list_empty(&net_schedule_list))
55099 +               tasklet_schedule(&net_tx_tasklet);
55100 +}
55101 +
55102 +/*
55103 + * A gross way of confirming the origin of an skb data page. The slab
55104 + * allocator abuses a field in the page struct to cache the kmem_cache_t ptr.
55105 + */
55106 +static inline int is_xen_skb(struct sk_buff *skb)
55107 +{
55108 +       extern kmem_cache_t *skbuff_cachep;
55109 +       kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next;
55110 +       return (cp == skbuff_cachep);
55111 +}
55112 +
55113 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
55114 +{
55115 +       netif_t *netif = netdev_priv(dev);
55116 +
55117 +       BUG_ON(skb->dev != dev);
55118 +
55119 +       /* Drop the packet if the target domain has no receive buffers. */
55120 +       if (!netif->active || 
55121 +           (netif->rx_req_cons_peek == netif->rx.sring->req_prod) ||
55122 +           ((netif->rx_req_cons_peek - netif->rx.rsp_prod_pvt) ==
55123 +            NET_RX_RING_SIZE))
55124 +               goto drop;
55125 +
55126 +       /*
55127 +        * We do not copy the packet unless:
55128 +        *  1. The data is shared; or
55129 +        *  2. The data is not allocated from our special cache.
55130 +        * NB. We also cannot cope with fragmented packets, but we won't get
55131 +        *     any because we do not advertise the NETIF_F_SG feature.
55132 +        */
55133 +       if (skb_shared(skb) || skb_cloned(skb) || !is_xen_skb(skb)) {
55134 +               int hlen = skb->data - skb->head;
55135 +               int ret;
55136 +               struct sk_buff *nskb = dev_alloc_skb(hlen + skb->len);
55137 +               if ( unlikely(nskb == NULL) )
55138 +                       goto drop;
55139 +               skb_reserve(nskb, hlen);
55140 +               __skb_put(nskb, skb->len);
55141 +               ret = skb_copy_bits(skb, -hlen, nskb->data - hlen,
55142 +                                    skb->len + hlen);
55143 +               BUG_ON(ret);
55144 +               /* Copy only the header fields we use in this driver. */
55145 +               nskb->dev = skb->dev;
55146 +               nskb->ip_summed = skb->ip_summed;
55147 +               nskb->proto_data_valid = skb->proto_data_valid;
55148 +               dev_kfree_skb(skb);
55149 +               skb = nskb;
55150 +       }
55151 +
55152 +       netif->rx_req_cons_peek++;
55153 +       netif_get(netif);
55154 +
55155 +       skb_queue_tail(&rx_queue, skb);
55156 +       tasklet_schedule(&net_rx_tasklet);
55157 +
55158 +       return 0;
55159 +
55160 + drop:
55161 +       netif->stats.tx_dropped++;
55162 +       dev_kfree_skb(skb);
55163 +       return 0;
55164 +}
55165 +
55166 +#if 0
55167 +static void xen_network_done_notify(void)
55168 +{
55169 +       static struct net_device *eth0_dev = NULL;
55170 +       if (unlikely(eth0_dev == NULL))
55171 +               eth0_dev = __dev_get_by_name("eth0");
55172 +       netif_rx_schedule(eth0_dev);
55173 +}
55174 +/* 
55175 + * Add the following to the poll() function in a NAPI driver (Tigon3 is an example):
55176 + *  if ( xen_network_done() )
55177 + *      tg3_enable_ints(tp); 
55178 + */
55179 +int xen_network_done(void)
55180 +{
55181 +       return skb_queue_empty(&rx_queue);
55182 +}
55183 +#endif
55184 +
55185 +static void net_rx_action(unsigned long unused)
55186 +{
55187 +       netif_t *netif = NULL; 
55188 +       s8 status;
55189 +       u16 size, id, irq, flags;
55190 +       multicall_entry_t *mcl;
55191 +       mmu_update_t *mmu;
55192 +       gnttab_transfer_t *gop;
55193 +       unsigned long vdata, old_mfn, new_mfn;
55194 +       struct sk_buff_head rxq;
55195 +       struct sk_buff *skb;
55196 +       u16 notify_list[NET_RX_RING_SIZE];
55197 +       int notify_nr = 0;
55198 +       int ret;
55199 +
55200 +       skb_queue_head_init(&rxq);
55201 +
55202 +       mcl = rx_mcl;
55203 +       mmu = rx_mmu;
55204 +       gop = grant_rx_op;
55205 +
55206 +       while ((skb = skb_dequeue(&rx_queue)) != NULL) {
55207 +               netif   = netdev_priv(skb->dev);
55208 +               vdata   = (unsigned long)skb->data;
55209 +               old_mfn = virt_to_mfn(vdata);
55210 +
55211 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
55212 +                       /* Memory squeeze? Back off for an arbitrary while. */
55213 +                       if ((new_mfn = alloc_mfn()) == 0) {
55214 +                               if ( net_ratelimit() )
55215 +                                       WPRINTK("Memory squeeze in netback "
55216 +                                               "driver.\n");
55217 +                               mod_timer(&net_timer, jiffies + HZ);
55218 +                               skb_queue_head(&rx_queue, skb);
55219 +                               break;
55220 +                       }
55221 +                       /*
55222 +                        * Set the new P2M table entry before reassigning
55223 +                        * the old data page. Heed the comment in
55224 +                        * pgtable-2level.h:pte_page(). :-)
55225 +                        */
55226 +                       set_phys_to_machine(
55227 +                               __pa(skb->data) >> PAGE_SHIFT,
55228 +                               new_mfn);
55229 +
55230 +                       MULTI_update_va_mapping(mcl, vdata,
55231 +                                               pfn_pte_ma(new_mfn,
55232 +                                                          PAGE_KERNEL), 0);
55233 +                       mcl++;
55234 +
55235 +                       mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
55236 +                               MMU_MACHPHYS_UPDATE;
55237 +                       mmu->val = __pa(vdata) >> PAGE_SHIFT;
55238 +                       mmu++;
55239 +               }
55240 +
55241 +               gop->mfn = old_mfn;
55242 +               gop->domid = netif->domid;
55243 +               gop->ref = RING_GET_REQUEST(
55244 +                       &netif->rx, netif->rx.req_cons)->gref;
55245 +               netif->rx.req_cons++;
55246 +               gop++;
55247 +
55248 +               __skb_queue_tail(&rxq, skb);
55249 +
55250 +               /* Filled the batch queue? */
55251 +               if ((gop - grant_rx_op) == ARRAY_SIZE(grant_rx_op))
55252 +                       break;
55253 +       }
55254 +
55255 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
55256 +               if (mcl == rx_mcl)
55257 +                       return;
55258 +
55259 +               mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
55260 +
55261 +               if (mmu - rx_mmu) {
55262 +                       mcl->op = __HYPERVISOR_mmu_update;
55263 +                       mcl->args[0] = (unsigned long)rx_mmu;
55264 +                       mcl->args[1] = mmu - rx_mmu;
55265 +                       mcl->args[2] = 0;
55266 +                       mcl->args[3] = DOMID_SELF;
55267 +                       mcl++;
55268 +               }
55269 +
55270 +               ret = HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
55271 +               BUG_ON(ret != 0);
55272 +       }
55273 +
55274 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_transfer, grant_rx_op, 
55275 +                                       gop - grant_rx_op);
55276 +       BUG_ON(ret != 0);
55277 +
55278 +       mcl = rx_mcl;
55279 +       gop = grant_rx_op;
55280 +       while ((skb = __skb_dequeue(&rxq)) != NULL) {
55281 +               netif   = netdev_priv(skb->dev);
55282 +               size    = skb->tail - skb->data;
55283 +
55284 +               atomic_set(&(skb_shinfo(skb)->dataref), 1);
55285 +               skb_shinfo(skb)->nr_frags = 0;
55286 +               skb_shinfo(skb)->frag_list = NULL;
55287 +
55288 +               netif->stats.tx_bytes += size;
55289 +               netif->stats.tx_packets++;
55290 +
55291 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
55292 +                       /* The update_va_mapping() must not fail. */
55293 +                       BUG_ON(mcl->result != 0);
55294 +                       mcl++;
55295 +               }
55296 +
55297 +               /* Check the reassignment error code. */
55298 +               status = NETIF_RSP_OKAY;
55299 +               if (gop->status != 0) { 
55300 +                       DPRINTK("Bad status %d from grant transfer to DOM%u\n",
55301 +                               gop->status, netif->domid);
55302 +                       /*
55303 +                        * Page no longer belongs to us unless GNTST_bad_page,
55304 +                        * but that should be a fatal error anyway.
55305 +                        */
55306 +                       BUG_ON(gop->status == GNTST_bad_page);
55307 +                       status = NETIF_RSP_ERROR; 
55308 +               }
55309 +               irq = netif->irq;
55310 +               id = RING_GET_REQUEST(&netif->rx, netif->rx.rsp_prod_pvt)->id;
55311 +               flags = 0;
55312 +               if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
55313 +                       flags |= NETRXF_csum_blank | NETRXF_data_validated;
55314 +               else if (skb->proto_data_valid) /* remote but checksummed? */
55315 +                       flags |= NETRXF_data_validated;
55316 +               if (make_rx_response(netif, id, status,
55317 +                                    (unsigned long)skb->data & ~PAGE_MASK,
55318 +                                    size, flags) &&
55319 +                   (rx_notify[irq] == 0)) {
55320 +                       rx_notify[irq] = 1;
55321 +                       notify_list[notify_nr++] = irq;
55322 +               }
55323 +
55324 +               netif_put(netif);
55325 +               dev_kfree_skb(skb);
55326 +               gop++;
55327 +       }
55328 +
55329 +       while (notify_nr != 0) {
55330 +               irq = notify_list[--notify_nr];
55331 +               rx_notify[irq] = 0;
55332 +               notify_remote_via_irq(irq);
55333 +       }
55334 +
55335 +       /* More work to do? */
55336 +       if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
55337 +               tasklet_schedule(&net_rx_tasklet);
55338 +#if 0
55339 +       else
55340 +               xen_network_done_notify();
55341 +#endif
55342 +}
55343 +
55344 +static void net_alarm(unsigned long unused)
55345 +{
55346 +       tasklet_schedule(&net_rx_tasklet);
55347 +}
55348 +
55349 +struct net_device_stats *netif_be_get_stats(struct net_device *dev)
55350 +{
55351 +       netif_t *netif = netdev_priv(dev);
55352 +       return &netif->stats;
55353 +}
55354 +
55355 +static int __on_net_schedule_list(netif_t *netif)
55356 +{
55357 +       return netif->list.next != NULL;
55358 +}
55359 +
55360 +static void remove_from_net_schedule_list(netif_t *netif)
55361 +{
55362 +       spin_lock_irq(&net_schedule_list_lock);
55363 +       if (likely(__on_net_schedule_list(netif))) {
55364 +               list_del(&netif->list);
55365 +               netif->list.next = NULL;
55366 +               netif_put(netif);
55367 +       }
55368 +       spin_unlock_irq(&net_schedule_list_lock);
55369 +}
55370 +
55371 +static void add_to_net_schedule_list_tail(netif_t *netif)
55372 +{
55373 +       if (__on_net_schedule_list(netif))
55374 +               return;
55375 +
55376 +       spin_lock_irq(&net_schedule_list_lock);
55377 +       if (!__on_net_schedule_list(netif) && netif->active) {
55378 +               list_add_tail(&netif->list, &net_schedule_list);
55379 +               netif_get(netif);
55380 +       }
55381 +       spin_unlock_irq(&net_schedule_list_lock);
55382 +}
55383 +
55384 +/*
55385 + * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
55386 + * If this driver is pipelining transmit requests then we can be very
55387 + * aggressive in avoiding new-packet notifications -- the frontend only needs to
55388 + * send a notification if there are no outstanding unreceived responses.
55389 + * If we may be buffering transmit requests for any reason then we must be rather
55390 + * more conservative and treat this as the final check for pending work.
55391 + */
55392 +void netif_schedule_work(netif_t *netif)
55393 +{
55394 +       int more_to_do;
55395 +
55396 +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
55397 +       more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
55398 +#else
55399 +       RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
55400 +#endif
55401 +
55402 +       if (more_to_do) {
55403 +               add_to_net_schedule_list_tail(netif);
55404 +               maybe_schedule_tx_action();
55405 +       }
55406 +}
55407 +
55408 +void netif_deschedule_work(netif_t *netif)
55409 +{
55410 +       remove_from_net_schedule_list(netif);
55411 +}
55412 +
55413 +
55414 +static void tx_credit_callback(unsigned long data)
55415 +{
55416 +       netif_t *netif = (netif_t *)data;
55417 +       netif->remaining_credit = netif->credit_bytes;
55418 +       netif_schedule_work(netif);
55419 +}
55420 +
55421 +static inline void net_tx_action_dealloc(void)
55422 +{
55423 +       gnttab_unmap_grant_ref_t *gop;
55424 +       u16 pending_idx;
55425 +       PEND_RING_IDX dc, dp;
55426 +       netif_t *netif;
55427 +       int ret;
55428 +
55429 +       dc = dealloc_cons;
55430 +       dp = dealloc_prod;
55431 +
55432 +       /* Ensure we see all indexes enqueued by netif_idx_release(). */
55433 +       smp_rmb();
55434 +
55435 +       /*
55436 +        * Free up any grants we have finished using.
55437 +        */
55438 +       gop = tx_unmap_ops;
55439 +       while (dc != dp) {
55440 +               pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
55441 +               gnttab_set_unmap_op(gop, MMAP_VADDR(pending_idx),
55442 +                                   GNTMAP_host_map,
55443 +                                   grant_tx_handle[pending_idx]);
55444 +               gop++;
55445 +       }
55446 +       ret = HYPERVISOR_grant_table_op(
55447 +               GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
55448 +       BUG_ON(ret);
55449 +
55450 +       while (dealloc_cons != dp) {
55451 +               pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
55452 +
55453 +               netif = pending_tx_info[pending_idx].netif;
55454 +
55455 +               make_tx_response(netif, &pending_tx_info[pending_idx].req, 
55456 +                                NETIF_RSP_OKAY);
55457 +
55458 +               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
55459 +
55460 +               netif_put(netif);
55461 +       }
55462 +}
55463 +
55464 +static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end)
55465 +{
55466 +       RING_IDX cons = netif->tx.req_cons;
55467 +
55468 +       do {
55469 +               make_tx_response(netif, txp, NETIF_RSP_ERROR);
55470 +               if (++cons >= end)
55471 +                       break;
55472 +               txp = RING_GET_REQUEST(&netif->tx, cons);
55473 +       } while (1);
55474 +       netif->tx.req_cons = cons;
55475 +       netif_schedule_work(netif);
55476 +       netif_put(netif);
55477 +}
55478 +
55479 +static int netbk_count_requests(netif_t *netif, netif_tx_request_t *txp,
55480 +                               int work_to_do)
55481 +{
55482 +       netif_tx_request_t *first = txp;
55483 +       RING_IDX cons = netif->tx.req_cons;
55484 +       int frags = 0;
55485 +
55486 +       while (txp->flags & NETTXF_more_data) {
55487 +               if (frags >= work_to_do) {
55488 +                       DPRINTK("Need more frags\n");
55489 +                       return -frags;
55490 +               }
55491 +
55492 +               txp = RING_GET_REQUEST(&netif->tx, cons + frags);
55493 +               if (txp->size > first->size) {
55494 +                       DPRINTK("Frags galore\n");
55495 +                       return -frags;
55496 +               }
55497 +
55498 +               first->size -= txp->size;
55499 +               frags++;
55500 +
55501 +               if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
55502 +                       DPRINTK("txp->offset: %x, size: %u\n",
55503 +                               txp->offset, txp->size);
55504 +                       return -frags;
55505 +               }
55506 +       }
55507 +
55508 +       return frags;
55509 +}
55510 +
55511 +static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
55512 +                                                 struct sk_buff *skb,
55513 +                                                 gnttab_map_grant_ref_t *mop)
55514 +{
55515 +       struct skb_shared_info *shinfo = skb_shinfo(skb);
55516 +       skb_frag_t *frags = shinfo->frags;
55517 +       netif_tx_request_t *txp;
55518 +       unsigned long pending_idx = *((u16 *)skb->data);
55519 +       RING_IDX cons = netif->tx.req_cons;
55520 +       int i, start;
55521 +
55522 +       /* Skip first skb fragment if it is on same page as header fragment. */
55523 +       start = ((unsigned long)shinfo->frags[0].page == pending_idx);
55524 +
55525 +       for (i = start; i < shinfo->nr_frags; i++) {
55526 +               txp = RING_GET_REQUEST(&netif->tx, cons++);
55527 +               pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
55528 +
55529 +               gnttab_set_map_op(mop++, MMAP_VADDR(pending_idx),
55530 +                                 GNTMAP_host_map | GNTMAP_readonly,
55531 +                                 txp->gref, netif->domid);
55532 +
55533 +               memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
55534 +               netif_get(netif);
55535 +               pending_tx_info[pending_idx].netif = netif;
55536 +               frags[i].page = (void *)pending_idx;
55537 +       }
55538 +
55539 +       return mop;
55540 +}
55541 +
55542 +static int netbk_tx_check_mop(struct sk_buff *skb,
55543 +                              gnttab_map_grant_ref_t **mopp)
55544 +{
55545 +       gnttab_map_grant_ref_t *mop = *mopp;
55546 +       int pending_idx = *((u16 *)skb->data);
55547 +       netif_t *netif = pending_tx_info[pending_idx].netif;
55548 +       netif_tx_request_t *txp;
55549 +       struct skb_shared_info *shinfo = skb_shinfo(skb);
55550 +       int nr_frags = shinfo->nr_frags;
55551 +       int i, err, start;
55552 +
55553 +       /* Check status of header. */
55554 +       err = mop->status;
55555 +       if (unlikely(err)) {
55556 +               txp = &pending_tx_info[pending_idx].req;
55557 +               make_tx_response(netif, txp, NETIF_RSP_ERROR);
55558 +               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
55559 +               netif_put(netif);
55560 +       } else {
55561 +               set_phys_to_machine(
55562 +                       __pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT,
55563 +                       FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
55564 +               grant_tx_handle[pending_idx] = mop->handle;
55565 +       }
55566 +
55567 +       /* Skip first skb fragment if it is on same page as header fragment. */
55568 +       start = ((unsigned long)shinfo->frags[0].page == pending_idx);
55569 +
55570 +       for (i = start; i < nr_frags; i++) {
55571 +               int j, newerr;
55572 +
55573 +               pending_idx = (unsigned long)shinfo->frags[i].page;
55574 +
55575 +               /* Check error status: if okay then remember grant handle. */
55576 +               newerr = (++mop)->status;
55577 +               if (likely(!newerr)) {
55578 +                       set_phys_to_machine(
55579 +                               __pa(MMAP_VADDR(pending_idx))>>PAGE_SHIFT,
55580 +                               FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
55581 +                       grant_tx_handle[pending_idx] = mop->handle;
55582 +                       /* Had a previous error? Invalidate this fragment. */
55583 +                       if (unlikely(err))
55584 +                               netif_idx_release(pending_idx);
55585 +                       continue;
55586 +               }
55587 +
55588 +               /* Error on this fragment: respond to client with an error. */
55589 +               txp = &pending_tx_info[pending_idx].req;
55590 +               make_tx_response(netif, txp, NETIF_RSP_ERROR);
55591 +               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
55592 +               netif_put(netif);
55593 +
55594 +               /* Not the first error? Preceding frags already invalidated. */
55595 +               if (err)
55596 +                       continue;
55597 +
55598 +               /* First error: invalidate header and preceding fragments. */
55599 +               pending_idx = *((u16 *)skb->data);
55600 +               netif_idx_release(pending_idx);
55601 +               for (j = start; j < i; j++) {
55602 +                       pending_idx = (unsigned long)shinfo->frags[j].page;
55603 +                       netif_idx_release(pending_idx);
55604 +               }
55605 +
55606 +               /* Remember the error: invalidate all subsequent fragments. */
55607 +               err = newerr;
55608 +       }
55609 +
55610 +       *mopp = mop + 1;
55611 +       return err;
55612 +}
55613 +
55614 +static void netbk_fill_frags(struct sk_buff *skb)
55615 +{
55616 +       struct skb_shared_info *shinfo = skb_shinfo(skb);
55617 +       int nr_frags = shinfo->nr_frags;
55618 +       int i;
55619 +
55620 +       for (i = 0; i < nr_frags; i++) {
55621 +               skb_frag_t *frag = shinfo->frags + i;
55622 +               netif_tx_request_t *txp;
55623 +               unsigned long pending_idx;
55624 +
55625 +               pending_idx = (unsigned long)frag->page;
55626 +               txp = &pending_tx_info[pending_idx].req;
55627 +               frag->page = virt_to_page(MMAP_VADDR(pending_idx));
55628 +               frag->size = txp->size;
55629 +               frag->page_offset = txp->offset;
55630 +
55631 +               skb->len += txp->size;
55632 +               skb->data_len += txp->size;
55633 +               skb->truesize += txp->size;
55634 +       }
55635 +}
55636 +
55637 +int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
55638 +                    int work_to_do)
55639 +{
55640 +       struct netif_extra_info *extra;
55641 +       RING_IDX cons = netif->tx.req_cons;
55642 +
55643 +       do {
55644 +               if (unlikely(work_to_do-- <= 0)) {
55645 +                       DPRINTK("Missing extra info\n");
55646 +                       return -EBADR;
55647 +               }
55648 +
55649 +               extra = (struct netif_extra_info *)
55650 +                       RING_GET_REQUEST(&netif->tx, cons);
55651 +               if (unlikely(!extra->type ||
55652 +                            extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
55653 +                       netif->tx.req_cons = ++cons;
55654 +                       DPRINTK("Invalid extra type: %d\n", extra->type);
55655 +                       return -EINVAL;
55656 +               }
55657 +
55658 +               memcpy(&extras[extra->type - 1], extra, sizeof(*extra));
55659 +               netif->tx.req_cons = ++cons;
55660 +       } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
55661 +
55662 +       return work_to_do;
55663 +}
55664 +
55665 +/* Called after netfront has transmitted */
55666 +static void net_tx_action(unsigned long unused)
55667 +{
55668 +       struct list_head *ent;
55669 +       struct sk_buff *skb;
55670 +       netif_t *netif;
55671 +       netif_tx_request_t txreq;
55672 +       struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
55673 +       u16 pending_idx;
55674 +       RING_IDX i;
55675 +       gnttab_map_grant_ref_t *mop;
55676 +       unsigned int data_len;
55677 +       int ret, work_to_do;
55678 +
55679 +       if (dealloc_cons != dealloc_prod)
55680 +               net_tx_action_dealloc();
55681 +
55682 +       mop = tx_map_ops;
55683 +       while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
55684 +               !list_empty(&net_schedule_list)) {
55685 +               /* Get a netif from the list with work to do. */
55686 +               ent = net_schedule_list.next;
55687 +               netif = list_entry(ent, netif_t, list);
55688 +               netif_get(netif);
55689 +               remove_from_net_schedule_list(netif);
55690 +
55691 +               RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
55692 +               if (!work_to_do) {
55693 +                       netif_put(netif);
55694 +                       continue;
55695 +               }
55696 +
55697 +               i = netif->tx.req_cons;
55698 +               rmb(); /* Ensure that we see the request before we copy it. */
55699 +               memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
55700 +               /* Credit-based scheduling. */
55701 +               if (txreq.size > netif->remaining_credit) {
55702 +                       unsigned long now = jiffies;
55703 +                       unsigned long next_credit = 
55704 +                               netif->credit_timeout.expires +
55705 +                               msecs_to_jiffies(netif->credit_usec / 1000);
55706 +
55707 +                       /* Timer could already be pending in rare cases. */
55708 +                       if (timer_pending(&netif->credit_timeout))
55709 +                               break;
55710 +
55711 +                       /* Passed the point where we can replenish credit? */
55712 +                       if (time_after_eq(now, next_credit)) {
55713 +                               netif->credit_timeout.expires = now;
55714 +                               netif->remaining_credit = netif->credit_bytes;
55715 +                       }
55716 +
55717 +                       /* Still too big to send right now? Set a callback. */
55718 +                       if (txreq.size > netif->remaining_credit) {
55719 +                               netif->remaining_credit = 0;
55720 +                               netif->credit_timeout.data     =
55721 +                                       (unsigned long)netif;
55722 +                               netif->credit_timeout.function =
55723 +                                       tx_credit_callback;
55724 +                               __mod_timer(&netif->credit_timeout,
55725 +                                           next_credit);
55726 +                               break;
55727 +                       }
55728 +               }
55729 +               netif->remaining_credit -= txreq.size;
55730 +
55731 +               work_to_do--;
55732 +               netif->tx.req_cons = ++i;
55733 +
55734 +               memset(extras, 0, sizeof(extras));
55735 +               if (txreq.flags & NETTXF_extra_info) {
55736 +                       work_to_do = netbk_get_extras(netif, extras,
55737 +                                                     work_to_do);
55738 +                       if (unlikely(work_to_do < 0)) {
55739 +                               netbk_tx_err(netif, &txreq, 0);
55740 +                               continue;
55741 +                       }
55742 +                       i = netif->tx.req_cons;
55743 +               }
55744 +
55745 +               ret = netbk_count_requests(netif, &txreq, work_to_do);
55746 +               if (unlikely(ret < 0)) {
55747 +                       netbk_tx_err(netif, &txreq, i - ret);
55748 +                       continue;
55749 +               }
55750 +               i += ret;
55751 +
55752 +               if (unlikely(ret > MAX_SKB_FRAGS)) {
55753 +                       DPRINTK("Too many frags\n");
55754 +                       netbk_tx_err(netif, &txreq, i);
55755 +                       continue;
55756 +               }
55757 +
55758 +               if (unlikely(txreq.size < ETH_HLEN)) {
55759 +                       DPRINTK("Bad packet size: %d\n", txreq.size);
55760 +                       netbk_tx_err(netif, &txreq, i);
55761 +                       continue; 
55762 +               }
55763 +
55764 +               /* The payload must not cross a page boundary, as it cannot be fragmented. */
55765 +               if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
55766 +                       DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", 
55767 +                               txreq.offset, txreq.size, 
55768 +                               (txreq.offset &~PAGE_MASK) + txreq.size);
55769 +                       netbk_tx_err(netif, &txreq, i);
55770 +                       continue;
55771 +               }
55772 +
55773 +               pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
55774 +
55775 +               data_len = (txreq.size > PKT_PROT_LEN &&
55776 +                           ret < MAX_SKB_FRAGS) ?
55777 +                       PKT_PROT_LEN : txreq.size;
55778 +
55779 +               skb = alloc_skb(data_len+16, GFP_ATOMIC);
55780 +               if (unlikely(skb == NULL)) {
55781 +                       DPRINTK("Can't allocate a skb in start_xmit.\n");
55782 +                       netbk_tx_err(netif, &txreq, i);
55783 +                       break;
55784 +               }
55785 +
55786 +               /* Packets passed to netif_rx() must have some headroom. */
55787 +               skb_reserve(skb, 16);
55788 +
55789 +               if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
55790 +                       struct netif_extra_info *gso;
55791 +                       gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
55792 +
55793 +                       /* Currently only TCPv4 segmentation offload is supported. */
55794 +                       if (gso->u.gso.type != XEN_NETIF_GSO_TCPV4) {
55795 +                               DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
55796 +                               kfree_skb(skb);
55797 +                               netbk_tx_err(netif, &txreq, i);
55798 +                               break;
55799 +                       }
55800 +
55801 +                       skb_shinfo(skb)->gso_size = gso->u.gso.size;
55802 +                       skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
55803 +
55804 +                       /* Header must be checked, and gso_segs computed. */
55805 +                       skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
55806 +                       skb_shinfo(skb)->gso_segs = 0;
55807 +               }
55808 +
55809 +               gnttab_set_map_op(mop, MMAP_VADDR(pending_idx),
55810 +                                 GNTMAP_host_map | GNTMAP_readonly,
55811 +                                 txreq.gref, netif->domid);
55812 +               mop++;
55813 +
55814 +               memcpy(&pending_tx_info[pending_idx].req,
55815 +                      &txreq, sizeof(txreq));
55816 +               pending_tx_info[pending_idx].netif = netif;
55817 +               *((u16 *)skb->data) = pending_idx;
55818 +
55819 +               __skb_put(skb, data_len);
55820 +
55821 +               skb_shinfo(skb)->nr_frags = ret;
55822 +               if (data_len < txreq.size) {
55823 +                       skb_shinfo(skb)->nr_frags++;
55824 +                       skb_shinfo(skb)->frags[0].page =
55825 +                               (void *)(unsigned long)pending_idx;
55826 +               }
55827 +
55828 +               __skb_queue_tail(&tx_queue, skb);
55829 +
55830 +               pending_cons++;
55831 +
55832 +               mop = netbk_get_requests(netif, skb, mop);
55833 +
55834 +               netif->tx.req_cons = i;
55835 +               netif_schedule_work(netif);
55836 +
55837 +               if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
55838 +                       break;
55839 +       }
55840 +
55841 +       if (mop == tx_map_ops)
55842 +               return;
55843 +
55844 +       ret = HYPERVISOR_grant_table_op(
55845 +               GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
55846 +       BUG_ON(ret);
55847 +
55848 +       mop = tx_map_ops;
55849 +       while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
55850 +               netif_tx_request_t *txp;
55851 +
55852 +               pending_idx = *((u16 *)skb->data);
55853 +               netif       = pending_tx_info[pending_idx].netif;
55854 +               txp         = &pending_tx_info[pending_idx].req;
55855 +
55856 +               /* Check the remap error code. */
55857 +               if (unlikely(netbk_tx_check_mop(skb, &mop))) {
55858 +                       printk(KERN_ALERT "#### netback grant fails\n");
55859 +                       skb_shinfo(skb)->nr_frags = 0;
55860 +                       kfree_skb(skb);
55861 +                       continue;
55862 +               }
55863 +
55864 +               data_len = skb->len;
55865 +               memcpy(skb->data, 
55866 +                      (void *)(MMAP_VADDR(pending_idx)|txp->offset),
55867 +                      data_len);
55868 +               if (data_len < txp->size) {
55869 +                       /* Append the packet payload as a fragment. */
55870 +                       txp->offset += data_len;
55871 +                       txp->size -= data_len;
55872 +               } else {
55873 +                       /* Schedule a response immediately. */
55874 +                       netif_idx_release(pending_idx);
55875 +               }
55876 +
55877 +               /*
55878 +                * Old frontends do not assert data_validated but we
55879 +                * can infer it from csum_blank so test both flags.
55880 +                */
55881 +               if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
55882 +                       skb->ip_summed = CHECKSUM_UNNECESSARY;
55883 +                       skb->proto_data_valid = 1;
55884 +               } else {
55885 +                       skb->ip_summed = CHECKSUM_NONE;
55886 +                       skb->proto_data_valid = 0;
55887 +               }
55888 +               skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank);
55889 +
55890 +               netbk_fill_frags(skb);
55891 +
55892 +               skb->dev      = netif->dev;
55893 +               skb->protocol = eth_type_trans(skb, skb->dev);
55894 +
55895 +               netif->stats.rx_bytes += skb->len;
55896 +               netif->stats.rx_packets++;
55897 +
55898 +               netif_rx(skb);
55899 +               netif->dev->last_rx = jiffies;
55900 +       }
55901 +}
55902 +
55903 +static void netif_idx_release(u16 pending_idx)
55904 +{
55905 +       static DEFINE_SPINLOCK(_lock);
55906 +       unsigned long flags;
55907 +
55908 +       spin_lock_irqsave(&_lock, flags);
55909 +       dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx;
55910 +       /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
55911 +       smp_wmb();
55912 +       dealloc_prod++;
55913 +       spin_unlock_irqrestore(&_lock, flags);
55914 +
55915 +       tasklet_schedule(&net_tx_tasklet);
55916 +}
55917 +
55918 +static void netif_page_release(struct page *page)
55919 +{
55920 +       u16 pending_idx = page - virt_to_page(mmap_vstart);
55921 +
55922 +       /* Ready for next use. */
55923 +       init_page_count(page);
55924 +
55925 +       netif_idx_release(pending_idx);
55926 +}
55927 +
55928 +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
55929 +{
55930 +       netif_t *netif = dev_id;
55931 +       add_to_net_schedule_list_tail(netif);
55932 +       maybe_schedule_tx_action();
55933 +       return IRQ_HANDLED;
55934 +}
55935 +
55936 +static void make_tx_response(netif_t *netif, 
55937 +                            netif_tx_request_t *txp,
55938 +                            s8       st)
55939 +{
55940 +       RING_IDX i = netif->tx.rsp_prod_pvt;
55941 +       netif_tx_response_t *resp;
55942 +       int notify;
55943 +
55944 +       resp = RING_GET_RESPONSE(&netif->tx, i);
55945 +       resp->id     = txp->id;
55946 +       resp->status = st;
55947 +
55948 +       if (txp->flags & NETTXF_extra_info)
55949 +               RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
55950 +
55951 +       netif->tx.rsp_prod_pvt = ++i;
55952 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
55953 +       if (notify)
55954 +               notify_remote_via_irq(netif->irq);
55955 +
55956 +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
55957 +       if (i == netif->tx.req_cons) {
55958 +               int more_to_do;
55959 +               RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
55960 +               if (more_to_do)
55961 +                       add_to_net_schedule_list_tail(netif);
55962 +       }
55963 +#endif
55964 +}
55965 +
55966 +static int make_rx_response(netif_t *netif, 
55967 +                           u16      id, 
55968 +                           s8       st,
55969 +                           u16      offset,
55970 +                           u16      size,
55971 +                           u16      flags)
55972 +{
55973 +       RING_IDX i = netif->rx.rsp_prod_pvt;
55974 +       netif_rx_response_t *resp;
55975 +       int notify;
55976 +
55977 +       resp = RING_GET_RESPONSE(&netif->rx, i);
55978 +       resp->offset     = offset;
55979 +       resp->flags      = flags;
55980 +       resp->id         = id;
55981 +       resp->status     = (s16)size;
55982 +       if (st < 0)
55983 +               resp->status = (s16)st;
55984 +
55985 +       netif->rx.rsp_prod_pvt = ++i;
55986 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, notify);
55987 +
55988 +       return notify;
55989 +}
55990 +
55991 +#ifdef NETBE_DEBUG_INTERRUPT
55992 +static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
55993 +{
55994 +       struct list_head *ent;
55995 +       netif_t *netif;
55996 +       int i = 0;
55997 +
55998 +       printk(KERN_ALERT "netif_schedule_list:\n");
55999 +       spin_lock_irq(&net_schedule_list_lock);
56000 +
56001 +       list_for_each (ent, &net_schedule_list) {
56002 +               netif = list_entry(ent, netif_t, list);
56003 +               printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
56004 +                      "rx_resp_prod=%08x\n",
56005 +                      i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
56006 +               printk(KERN_ALERT "   tx_req_cons=%08x tx_resp_prod=%08x)\n",
56007 +                      netif->tx.req_cons, netif->tx.rsp_prod_pvt);
56008 +               printk(KERN_ALERT "   shared(rx_req_prod=%08x "
56009 +                      "rx_resp_prod=%08x\n",
56010 +                      netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
56011 +               printk(KERN_ALERT "   rx_event=%08x tx_req_prod=%08x\n",
56012 +                      netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
56013 +               printk(KERN_ALERT "   tx_resp_prod=%08x, tx_event=%08x)\n",
56014 +                      netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
56015 +               i++;
56016 +       }
56017 +
56018 +       spin_unlock_irq(&net_schedule_list_lock);
56019 +       printk(KERN_ALERT " ** End of netif_schedule_list **\n");
56020 +
56021 +       return IRQ_HANDLED;
56022 +}
56023 +#endif
56024 +
56025 +static int __init netback_init(void)
56026 +{
56027 +       int i;
56028 +       struct page *page;
56029 +
56030 +       if (!is_running_on_xen())
56031 +               return -ENODEV;
56032 +
56033 +       /* We can increase reservation by this much in net_rx_action(). */
56034 +       balloon_update_driver_allowance(NET_RX_RING_SIZE);
56035 +
56036 +       skb_queue_head_init(&rx_queue);
56037 +       skb_queue_head_init(&tx_queue);
56038 +
56039 +       init_timer(&net_timer);
56040 +       net_timer.data = 0;
56041 +       net_timer.function = net_alarm;
56042 +
56043 +       page = balloon_alloc_empty_page_range(MAX_PENDING_REQS);
56044 +       BUG_ON(page == NULL);
56045 +       mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
56046 +
56047 +       for (i = 0; i < MAX_PENDING_REQS; i++) {
56048 +               page = virt_to_page(MMAP_VADDR(i));
56049 +               init_page_count(page);
56050 +               SetPageForeign(page, netif_page_release);
56051 +       }
56052 +
56053 +       pending_cons = 0;
56054 +       pending_prod = MAX_PENDING_REQS;
56055 +       for (i = 0; i < MAX_PENDING_REQS; i++)
56056 +               pending_ring[i] = i;
56057 +
56058 +       spin_lock_init(&net_schedule_list_lock);
56059 +       INIT_LIST_HEAD(&net_schedule_list);
56060 +
56061 +       netif_xenbus_init();
56062 +
56063 +#ifdef NETBE_DEBUG_INTERRUPT
56064 +       (void)bind_virq_to_irqhandler(
56065 +               VIRQ_DEBUG,
56066 +               0,
56067 +               netif_be_dbg,
56068 +               SA_SHIRQ, 
56069 +               "net-be-dbg",
56070 +               &netif_be_dbg);
56071 +#endif
56072 +
56073 +       return 0;
56074 +}
56075 +
56076 +module_init(netback_init);
56077 +
56078 +MODULE_LICENSE("Dual BSD/GPL");
56079 diff -urNp linux-2.6/drivers/xen/netback/xenbus.c new/drivers/xen/netback/xenbus.c
56080 --- linux-2.6/drivers/xen/netback/xenbus.c      1970-01-01 01:00:00.000000000 +0100
56081 +++ new/drivers/xen/netback/xenbus.c    2006-07-07 15:10:03.000000000 +0200
56082 @@ -0,0 +1,404 @@
56083 +/*  Xenbus code for netif backend
56084 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
56085 +    Copyright (C) 2005 XenSource Ltd
56086 +
56087 +    This program is free software; you can redistribute it and/or modify
56088 +    it under the terms of the GNU General Public License as published by
56089 +    the Free Software Foundation; either version 2 of the License, or
56090 +    (at your option) any later version.
56091 +
56092 +    This program is distributed in the hope that it will be useful,
56093 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
56094 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
56095 +    GNU General Public License for more details.
56096 +
56097 +    You should have received a copy of the GNU General Public License
56098 +    along with this program; if not, write to the Free Software
56099 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
56100 +*/
56101 +
56102 +#include <stdarg.h>
56103 +#include <linux/module.h>
56104 +#include <xen/xenbus.h>
56105 +#include "common.h"
56106 +
56107 +#if 0
56108 +#undef DPRINTK
56109 +#define DPRINTK(fmt, args...) \
56110 +    printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
56111 +#endif
56112 +
56113 +struct backend_info
56114 +{
56115 +       struct xenbus_device *dev;
56116 +       netif_t *netif;
56117 +       struct xenbus_watch backend_watch;
56118 +       enum xenbus_state frontend_state;
56119 +};
56120 +
56121 +static int connect_rings(struct backend_info *);
56122 +static void connect(struct backend_info *);
56123 +static void maybe_connect(struct backend_info *);
56124 +static void backend_changed(struct xenbus_watch *, const char **,
56125 +                           unsigned int);
56126 +
56127 +static int netback_remove(struct xenbus_device *dev)
56128 +{
56129 +       struct backend_info *be = dev->dev.driver_data;
56130 +
56131 +       if (be->backend_watch.node) {
56132 +               unregister_xenbus_watch(&be->backend_watch);
56133 +               kfree(be->backend_watch.node);
56134 +               be->backend_watch.node = NULL;
56135 +       }
56136 +       if (be->netif) {
56137 +               netif_disconnect(be->netif);
56138 +               be->netif = NULL;
56139 +       }
56140 +       kfree(be);
56141 +       dev->dev.driver_data = NULL;
56142 +       return 0;
56143 +}
56144 +
56145 +
56146 +/**
56147 + * Entry point to this code when a new device is created.  Allocate the basic
56148 + * structures, and watch the store waiting for the hotplug scripts to tell us
56149 + * the device's handle.  Switch to InitWait.
56150 + */
56151 +static int netback_probe(struct xenbus_device *dev,
56152 +                        const struct xenbus_device_id *id)
56153 +{
56154 +       const char *message;
56155 +       struct xenbus_transaction xbt;
56156 +       int err;
56157 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
56158 +                                         GFP_KERNEL);
56159 +       if (!be) {
56160 +               xenbus_dev_fatal(dev, -ENOMEM,
56161 +                                "allocating backend structure");
56162 +               return -ENOMEM;
56163 +       }
56164 +
56165 +       be->dev = dev;
56166 +       dev->dev.driver_data = be;
56167 +
56168 +       err = xenbus_watch_path2(dev, dev->nodename, "handle",
56169 +                                &be->backend_watch, backend_changed);
56170 +       if (err)
56171 +               goto fail;
56172 +
56173 +       do {
56174 +               err = xenbus_transaction_start(&xbt);
56175 +               if (err) {
56176 +                       xenbus_dev_fatal(dev, err, "starting transaction");
56177 +                       goto fail;
56178 +               }
56179 +
56180 +               err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
56181 +               if (err) {
56182 +                       message = "writing feature-sg";
56183 +                       goto abort_transaction;
56184 +               }
56185 +
56186 +#if 0 /* KAF: After the protocol is finalised. */
56187 +               err = xenbus_printf(xbt, dev->nodename, "feature-tso", "%d", 1);
56188 +               if (err) {
56189 +                       message = "writing feature-tso";
56190 +                       goto abort_transaction;
56191 +               }
56192 +#endif
56193 +
56194 +               err = xenbus_transaction_end(xbt, 0);
56195 +       } while (err == -EAGAIN);
56196 +
56197 +       if (err) {
56198 +               xenbus_dev_fatal(dev, err, "completing transaction");
56199 +               goto fail;
56200 +       }
56201 +
56202 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
56203 +       if (err) {
56204 +               goto fail;
56205 +       }
56206 +
56207 +       return 0;
56208 +
56209 +abort_transaction:
56210 +       xenbus_transaction_end(xbt, 1);
56211 +       xenbus_dev_fatal(dev, err, "%s", message);
56212 +fail:
56213 +       DPRINTK("failed");
56214 +       netback_remove(dev);
56215 +       return err;
56216 +}
56217 +
56218 +
56219 +/**
56220 + * Handle the creation of the hotplug script environment.  We add the script
56221 + * and vif variables to the environment, for the benefit of the vif-* hotplug
56222 + * scripts.
56223 + */
56224 +static int netback_uevent(struct xenbus_device *xdev, char **envp,
56225 +                         int num_envp, char *buffer, int buffer_size)
56226 +{
56227 +       struct backend_info *be = xdev->dev.driver_data;
56228 +       netif_t *netif = be->netif;
56229 +       int i = 0, length = 0;
56230 +       char *val;
56231 +
56232 +       DPRINTK("netback_uevent");
56233 +
56234 +       val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
56235 +       if (IS_ERR(val)) {
56236 +               int err = PTR_ERR(val);
56237 +               xenbus_dev_fatal(xdev, err, "reading script");
56238 +               return err;
56239 +       }
56240 +       else {
56241 +               add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
56242 +                              &length, "script=%s", val);
56243 +               kfree(val);
56244 +       }
56245 +
56246 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
56247 +                      "vif=%s", netif->dev->name);
56248 +
56249 +       envp[i] = NULL;
56250 +
56251 +       return 0;
56252 +}
56253 +
56254 +
56255 +/**
56256 + * Callback received when the hotplug scripts have placed the handle node.
56257 + * Read it, and create a netif structure.  If the frontend is ready, connect.
56258 + */
56259 +static void backend_changed(struct xenbus_watch *watch,
56260 +                           const char **vec, unsigned int len)
56261 +{
56262 +       int err;
56263 +       long handle;
56264 +       struct backend_info *be
56265 +               = container_of(watch, struct backend_info, backend_watch);
56266 +       struct xenbus_device *dev = be->dev;
56267 +
56268 +       DPRINTK("");
56269 +
56270 +       err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
56271 +       if (XENBUS_EXIST_ERR(err)) {
56272 +               /* Since this watch will fire once immediately after it is
56273 +                  registered, we expect this.  Ignore it, and wait for the
56274 +                  hotplug scripts. */
56275 +               return;
56276 +       }
56277 +       if (err != 1) {
56278 +               xenbus_dev_fatal(dev, err, "reading handle");
56279 +               return;
56280 +       }
56281 +
56282 +       if (be->netif == NULL) {
56283 +               u8 be_mac[ETH_ALEN] = { 0, 0, 0, 0, 0, 0 };
56284 +
56285 +               be->netif = netif_alloc(dev->otherend_id, handle, be_mac);
56286 +               if (IS_ERR(be->netif)) {
56287 +                       err = PTR_ERR(be->netif);
56288 +                       be->netif = NULL;
56289 +                       xenbus_dev_fatal(dev, err, "creating interface");
56290 +                       return;
56291 +               }
56292 +
56293 +               kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
56294 +
56295 +               maybe_connect(be);
56296 +       }
56297 +}
56298 +
56299 +
56300 +/**
56301 + * Callback received when the frontend's state changes.
56302 + */
56303 +static void frontend_changed(struct xenbus_device *dev,
56304 +                            enum xenbus_state frontend_state)
56305 +{
56306 +       struct backend_info *be = dev->dev.driver_data;
56307 +
56308 +       DPRINTK("");
56309 +
56310 +       be->frontend_state = frontend_state;
56311 +
56312 +       switch (frontend_state) {
56313 +       case XenbusStateInitialising:
56314 +       case XenbusStateInitialised:
56315 +               break;
56316 +
56317 +       case XenbusStateConnected:
56318 +               maybe_connect(be);
56319 +               break;
56320 +
56321 +       case XenbusStateClosing:
56322 +               xenbus_switch_state(dev, XenbusStateClosing);
56323 +               break;
56324 +
56325 +       case XenbusStateClosed:
56326 +               if (be->netif != NULL)
56327 +                       kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
56328 +               device_unregister(&dev->dev);
56329 +               break;
56330 +
56331 +       case XenbusStateUnknown:
56332 +       case XenbusStateInitWait:
56333 +       default:
56334 +               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
56335 +                                frontend_state);
56336 +               break;
56337 +       }
56338 +}
56339 +
56340 +
56341 +/* ** Connection ** */
56342 +
56343 +
56344 +static void maybe_connect(struct backend_info *be)
56345 +{
56346 +       if (be->netif && (be->frontend_state == XenbusStateConnected))
56347 +               connect(be);
56348 +}
56349 +
56350 +static void xen_net_read_rate(struct xenbus_device *dev,
56351 +                             unsigned long *bytes, unsigned long *usec)
56352 +{
56353 +       char *s, *e;
56354 +       unsigned long b, u;
56355 +       char *ratestr;
56356 +
56357 +       /* Default to unlimited bandwidth. */
56358 +       *bytes = ~0UL;
56359 +       *usec = 0;
56360 +
56361 +       ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
56362 +       if (IS_ERR(ratestr))
56363 +               return;
56364 +
56365 +       s = ratestr;
56366 +       b = simple_strtoul(s, &e, 10);
56367 +       if ((s == e) || (*e != ','))
56368 +               goto fail;
56369 +
56370 +       s = e + 1;
56371 +       u = simple_strtoul(s, &e, 10);
56372 +       if ((s == e) || (*e != '\0'))
56373 +               goto fail;
56374 +
56375 +       *bytes = b;
56376 +       *usec = u;
56377 +
56378 +       kfree(ratestr);
56379 +       return;
56380 +
56381 + fail:
56382 +       WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
56383 +       kfree(ratestr);
56384 +}
56385 +
56386 +static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
56387 +{
56388 +       char *s, *e, *macstr;
56389 +       int i;
56390 +
56391 +       macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
56392 +       if (IS_ERR(macstr))
56393 +               return PTR_ERR(macstr);
56394 +
56395 +       for (i = 0; i < ETH_ALEN; i++) {
56396 +               mac[i] = simple_strtoul(s, &e, 16);
56397 +               if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
56398 +                       kfree(macstr);
56399 +                       return -ENOENT;
56400 +               }
56401 +               s = e+1;
56402 +       }
56403 +
56404 +       kfree(macstr);
56405 +       return 0;
56406 +}
56407 +
56408 +static void connect(struct backend_info *be)
56409 +{
56410 +       int err;
56411 +       struct xenbus_device *dev = be->dev;
56412 +
56413 +       err = connect_rings(be);
56414 +       if (err)
56415 +               return;
56416 +
56417 +       err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
56418 +       if (err) {
56419 +               xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
56420 +               return;
56421 +       }
56422 +
56423 +       xen_net_read_rate(dev, &be->netif->credit_bytes,
56424 +                         &be->netif->credit_usec);
56425 +       be->netif->remaining_credit = be->netif->credit_bytes;
56426 +
56427 +       xenbus_switch_state(dev, XenbusStateConnected);
56428 +}
56429 +
56430 +
56431 +static int connect_rings(struct backend_info *be)
56432 +{
56433 +       struct xenbus_device *dev = be->dev;
56434 +       unsigned long tx_ring_ref, rx_ring_ref;
56435 +       unsigned int evtchn;
56436 +       int err;
56437 +
56438 +       DPRINTK("");
56439 +
56440 +       err = xenbus_gather(XBT_NIL, dev->otherend,
56441 +                           "tx-ring-ref", "%lu", &tx_ring_ref,
56442 +                           "rx-ring-ref", "%lu", &rx_ring_ref,
56443 +                           "event-channel", "%u", &evtchn, NULL);
56444 +       if (err) {
56445 +               xenbus_dev_fatal(dev, err,
56446 +                                "reading %s/ring-ref and event-channel",
56447 +                                dev->otherend);
56448 +               return err;
56449 +       }
56450 +
56451 +       /* Map the shared frame, irq etc. */
56452 +       err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
56453 +       if (err) {
56454 +               xenbus_dev_fatal(dev, err,
56455 +                                "mapping shared-frames %lu/%lu port %u",
56456 +                                tx_ring_ref, rx_ring_ref, evtchn);
56457 +               return err;
56458 +       }
56459 +       return 0;
56460 +}
56461 +
56462 +
56463 +/* ** Driver Registration ** */
56464 +
56465 +
56466 +static struct xenbus_device_id netback_ids[] = {
56467 +       { "vif" },
56468 +       { "" }
56469 +};
56470 +
56471 +
56472 +static struct xenbus_driver netback = {
56473 +       .name = "vif",
56474 +       .owner = THIS_MODULE,
56475 +       .ids = netback_ids,
56476 +       .probe = netback_probe,
56477 +       .remove = netback_remove,
56478 +       .uevent = netback_uevent,
56479 +       .otherend_changed = frontend_changed,
56480 +};
56481 +
56482 +
56483 +void netif_xenbus_init(void)
56484 +{
56485 +       xenbus_register_backend(&netback);
56486 +}
56487 diff -urNp linux-2.6/drivers/xen/netfront/Kconfig new/drivers/xen/netfront/Kconfig
56488 --- linux-2.6/drivers/xen/netfront/Kconfig      1970-01-01 01:00:00.000000000 +0100
56489 +++ new/drivers/xen/netfront/Kconfig    2006-05-09 12:34:38.000000000 +0200
56490 @@ -0,0 +1,6 @@
56491 +
56492 +config XENNET
56493 +       tristate "Xen network driver"
56494 +       depends on NETDEVICES && ARCH_XEN
56495 +       help
56496 +         Network driver for Xen
56497 diff -urNp linux-2.6/drivers/xen/netfront/Makefile new/drivers/xen/netfront/Makefile
56498 --- linux-2.6/drivers/xen/netfront/Makefile     1970-01-01 01:00:00.000000000 +0100
56499 +++ new/drivers/xen/netfront/Makefile   2006-05-09 12:34:38.000000000 +0200
56500 @@ -0,0 +1,4 @@
56501 +
56502 +obj-$(CONFIG_XEN_NETDEV_FRONTEND)      := xennet.o
56503 +
56504 +xennet-objs := netfront.o
56505 diff -urNp linux-2.6/drivers/xen/netfront/netfront.c new/drivers/xen/netfront/netfront.c
56506 --- linux-2.6/drivers/xen/netfront/netfront.c   1970-01-01 01:00:00.000000000 +0100
56507 +++ new/drivers/xen/netfront/netfront.c 2006-07-07 15:10:03.000000000 +0200
56508 @@ -0,0 +1,1599 @@
56509 +/******************************************************************************
56510 + * Virtual network driver for conversing with remote driver backends.
56511 + *
56512 + * Copyright (c) 2002-2005, K A Fraser
56513 + * Copyright (c) 2005, XenSource Ltd
56514 + *
56515 + * This program is free software; you can redistribute it and/or
56516 + * modify it under the terms of the GNU General Public License version 2
56517 + * as published by the Free Software Foundation; or, when distributed
56518 + * separately from the Linux kernel or incorporated into other
56519 + * software packages, subject to the following license:
56520 + *
56521 + * Permission is hereby granted, free of charge, to any person obtaining a copy
56522 + * of this source file (the "Software"), to deal in the Software without
56523 + * restriction, including without limitation the rights to use, copy, modify,
56524 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
56525 + * and to permit persons to whom the Software is furnished to do so, subject to
56526 + * the following conditions:
56527 + *
56528 + * The above copyright notice and this permission notice shall be included in
56529 + * all copies or substantial portions of the Software.
56530 + *
56531 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
56532 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
56533 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
56534 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
56535 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
56536 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
56537 + * IN THE SOFTWARE.
56538 + */
56539 +
56540 +#include <linux/config.h>
56541 +#include <linux/module.h>
56542 +#include <linux/version.h>
56543 +#include <linux/kernel.h>
56544 +#include <linux/sched.h>
56545 +#include <linux/slab.h>
56546 +#include <linux/string.h>
56547 +#include <linux/errno.h>
56548 +#include <linux/netdevice.h>
56549 +#include <linux/inetdevice.h>
56550 +#include <linux/etherdevice.h>
56551 +#include <linux/skbuff.h>
56552 +#include <linux/init.h>
56553 +#include <linux/bitops.h>
56554 +#include <linux/ethtool.h>
56555 +#include <linux/in.h>
56556 +#include <linux/if_ether.h>
56557 +#include <net/sock.h>
56558 +#include <net/pkt_sched.h>
56559 +#include <net/arp.h>
56560 +#include <net/route.h>
56561 +#include <asm/io.h>
56562 +#include <asm/uaccess.h>
56563 +#include <xen/evtchn.h>
56564 +#include <xen/xenbus.h>
56565 +#include <xen/interface/io/netif.h>
56566 +#include <xen/interface/memory.h>
56567 +#include <xen/balloon.h>
56568 +#include <asm/page.h>
56569 +#include <asm/uaccess.h>
56570 +#include <xen/interface/grant_table.h>
56571 +#include <xen/gnttab.h>
56572 +
56573 +#define GRANT_INVALID_REF      0
56574 +
56575 +#define NET_TX_RING_SIZE __RING_SIZE((struct netif_tx_sring *)0, PAGE_SIZE)
56576 +#define NET_RX_RING_SIZE __RING_SIZE((struct netif_rx_sring *)0, PAGE_SIZE)
56577 +
56578 +static inline void init_skb_shinfo(struct sk_buff *skb)
56579 +{
56580 +       atomic_set(&(skb_shinfo(skb)->dataref), 1);
56581 +       skb_shinfo(skb)->nr_frags = 0;
56582 +       skb_shinfo(skb)->frag_list = NULL;
56583 +}
56584 +
56585 +struct netfront_info {
56586 +       struct list_head list;
56587 +       struct net_device *netdev;
56588 +
56589 +       struct net_device_stats stats;
56590 +
56591 +       struct netif_tx_front_ring tx;
56592 +       struct netif_rx_front_ring rx;
56593 +
56594 +       spinlock_t   tx_lock;
56595 +       spinlock_t   rx_lock;
56596 +
56597 +       unsigned int handle;
56598 +       unsigned int evtchn, irq;
56599 +
56600 +       /* Receive-ring batched refills. */
56601 +#define RX_MIN_TARGET 8
56602 +#define RX_DFL_MIN_TARGET 64
56603 +#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
56604 +       unsigned rx_min_target, rx_max_target, rx_target;
56605 +       struct sk_buff_head rx_batch;
56606 +
56607 +       struct timer_list rx_refill_timer;
56608 +
56609 +       /*
56610 +        * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
56611 +        * array is an index into a chain of free entries.
56612 +        */
56613 +       struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
56614 +       struct sk_buff *rx_skbs[NET_RX_RING_SIZE+1];
56615 +
56616 +#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
56617 +       grant_ref_t gref_tx_head;
56618 +       grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
56619 +       grant_ref_t gref_rx_head;
56620 +       grant_ref_t grant_rx_ref[NET_RX_RING_SIZE + 1];
56621 +
56622 +       struct xenbus_device *xbdev;
56623 +       int tx_ring_ref;
56624 +       int rx_ring_ref;
56625 +       u8 mac[ETH_ALEN];
56626 +
56627 +       unsigned long rx_pfn_array[NET_RX_RING_SIZE];
56628 +       struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
56629 +       struct mmu_update rx_mmu[NET_RX_RING_SIZE];
56630 +};
56631 +
56632 +/*
56633 + * Access macros for acquiring and freeing slots in {tx,rx}_skbs[].
56634 + */
56635 +
56636 +static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id)
56637 +{
56638 +       list[id] = list[0];
56639 +       list[0]  = (void *)(unsigned long)id;
56640 +}
56641 +
56642 +static inline unsigned short get_id_from_freelist(struct sk_buff **list)
56643 +{
56644 +       unsigned int id = (unsigned int)(unsigned long)list[0];
56645 +       list[0] = list[id];
56646 +       return id;
56647 +}
56648 +
56649 +#define DPRINTK(fmt, args...)                          \
56650 +       pr_debug("netfront (%s:%d) " fmt,               \
56651 +                __FUNCTION__, __LINE__, ##args)
56652 +#define IPRINTK(fmt, args...)                          \
56653 +       printk(KERN_INFO "netfront: " fmt, ##args)
56654 +#define WPRINTK(fmt, args...)                          \
56655 +       printk(KERN_WARNING "netfront: " fmt, ##args)
56656 +
56657 +static int talk_to_backend(struct xenbus_device *, struct netfront_info *);
56658 +static int setup_device(struct xenbus_device *, struct netfront_info *);
56659 +static struct net_device *create_netdev(int, struct xenbus_device *);
56660 +
56661 +static void netfront_closing(struct xenbus_device *);
56662 +
56663 +static void end_access(int, void *);
56664 +static void netif_disconnect_backend(struct netfront_info *);
56665 +static void close_netdev(struct netfront_info *);
56666 +static void netif_free(struct netfront_info *);
56667 +
56668 +static void network_connect(struct net_device *);
56669 +static void network_tx_buf_gc(struct net_device *);
56670 +static void network_alloc_rx_buffers(struct net_device *);
56671 +static int send_fake_arp(struct net_device *);
56672 +
56673 +static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs);
56674 +
56675 +#ifdef CONFIG_SYSFS
56676 +static int xennet_sysfs_addif(struct net_device *netdev);
56677 +static void xennet_sysfs_delif(struct net_device *netdev);
56678 +#else /* !CONFIG_SYSFS */
56679 +#define xennet_sysfs_addif(dev) (0)
56680 +#define xennet_sysfs_delif(dev) do { } while(0)
56681 +#endif
56682 +
56683 +static inline int xennet_can_sg(struct net_device *dev)
56684 +{
56685 +       return dev->features & NETIF_F_SG;
56686 +}
56687 +
56688 +/**
56689 + * Entry point to this code when a new device is created.  Allocate the basic
56690 + * structures and the ring buffers for communication with the backend, and
56691 + * inform the backend of the appropriate details for those.  Switch to
56692 + * Connected state.
56693 + */
56694 +static int __devinit netfront_probe(struct xenbus_device *dev,
56695 +                                   const struct xenbus_device_id *id)
56696 +{
56697 +       int err;
56698 +       struct net_device *netdev;
56699 +       struct netfront_info *info;
56700 +       unsigned int handle;
56701 +
56702 +       err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%u", &handle);
56703 +       if (err != 1) {
56704 +               xenbus_dev_fatal(dev, err, "reading handle");
56705 +               return err;
56706 +       }
56707 +
56708 +       netdev = create_netdev(handle, dev);
56709 +       if (IS_ERR(netdev)) {
56710 +               err = PTR_ERR(netdev);
56711 +               xenbus_dev_fatal(dev, err, "creating netdev");
56712 +               return err;
56713 +       }
56714 +
56715 +       info = netdev_priv(netdev);
56716 +       dev->dev.driver_data = info;
56717 +
56718 +       err = talk_to_backend(dev, info);
56719 +       if (err) {
56720 +               xennet_sysfs_delif(info->netdev);
56721 +               unregister_netdev(netdev);
56722 +               free_netdev(netdev);
56723 +               dev->dev.driver_data = NULL;
56724 +               return err;
56725 +       }
56726 +
56727 +       return 0;
56728 +}
56729 +
56730 +
56731 +/**
56732 + * We are reconnecting to the backend, due to a suspend/resume, or a backend
56733 + * driver restart.  We tear down our netif structure and recreate it, but
56734 + * leave the device-layer structures intact so that this is transparent to the
56735 + * rest of the kernel.
56736 + */
56737 +static int netfront_resume(struct xenbus_device *dev)
56738 +{
56739 +       struct netfront_info *info = dev->dev.driver_data;
56740 +
56741 +       DPRINTK("%s\n", dev->nodename);
56742 +
56743 +       netif_disconnect_backend(info);
56744 +       return talk_to_backend(dev, info);
56745 +}
56746 +
56747 +static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
56748 +{
56749 +       char *s, *e, *macstr;
56750 +       int i;
56751 +
56752 +       macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
56753 +       if (IS_ERR(macstr))
56754 +               return PTR_ERR(macstr);
56755 +
56756 +       for (i = 0; i < ETH_ALEN; i++) {
56757 +               mac[i] = simple_strtoul(s, &e, 16);
56758 +               if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
56759 +                       kfree(macstr);
56760 +                       return -ENOENT;
56761 +               }
56762 +               s = e+1;
56763 +       }
56764 +
56765 +       kfree(macstr);
56766 +       return 0;
56767 +}
56768 +
56769 +/* Common code used when first setting up, and when resuming. */
56770 +static int talk_to_backend(struct xenbus_device *dev,
56771 +                          struct netfront_info *info)
56772 +{
56773 +       const char *message;
56774 +       struct xenbus_transaction xbt;
56775 +       int err;
56776 +
56777 +       err = xen_net_read_mac(dev, info->mac);
56778 +       if (err) {
56779 +               xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
56780 +               goto out;
56781 +       }
56782 +
56783 +       /* Create shared ring, alloc event channel. */
56784 +       err = setup_device(dev, info);
56785 +       if (err)
56786 +               goto out;
56787 +
56788 +again:
56789 +       err = xenbus_transaction_start(&xbt);
56790 +       if (err) {
56791 +               xenbus_dev_fatal(dev, err, "starting transaction");
56792 +               goto destroy_ring;
56793 +       }
56794 +
56795 +       err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u",
56796 +                           info->tx_ring_ref);
56797 +       if (err) {
56798 +               message = "writing tx ring-ref";
56799 +               goto abort_transaction;
56800 +       }
56801 +       err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u",
56802 +                           info->rx_ring_ref);
56803 +       if (err) {
56804 +               message = "writing rx ring-ref";
56805 +               goto abort_transaction;
56806 +       }
56807 +       err = xenbus_printf(xbt, dev->nodename,
56808 +                           "event-channel", "%u", info->evtchn);
56809 +       if (err) {
56810 +               message = "writing event-channel";
56811 +               goto abort_transaction;
56812 +       }
56813 +
56814 +       err = xenbus_transaction_end(xbt, 0);
56815 +       if (err) {
56816 +               if (err == -EAGAIN)
56817 +                       goto again;
56818 +               xenbus_dev_fatal(dev, err, "completing transaction");
56819 +               goto destroy_ring;
56820 +       }
56821 +
56822 +       return 0;
56823 +
56824 + abort_transaction:
56825 +       xenbus_transaction_end(xbt, 1);
56826 +       xenbus_dev_fatal(dev, err, "%s", message);
56827 + destroy_ring:
56828 +       netif_free(info);
56829 + out:
56830 +       return err;
56831 +}
56832 +
56833 +
56834 +static int setup_device(struct xenbus_device *dev, struct netfront_info *info)
56835 +{
56836 +       struct netif_tx_sring *txs;
56837 +       struct netif_rx_sring *rxs;
56838 +       int err;
56839 +       struct net_device *netdev = info->netdev;
56840 +
56841 +       info->tx_ring_ref = GRANT_INVALID_REF;
56842 +       info->rx_ring_ref = GRANT_INVALID_REF;
56843 +       info->rx.sring = NULL;
56844 +       info->tx.sring = NULL;
56845 +       info->irq = 0;
56846 +
56847 +       txs = (struct netif_tx_sring *)get_zeroed_page(GFP_KERNEL);
56848 +       if (!txs) {
56849 +               err = -ENOMEM;
56850 +               xenbus_dev_fatal(dev, err, "allocating tx ring page");
56851 +               goto fail;
56852 +       }
56853 +       SHARED_RING_INIT(txs);
56854 +       FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
56855 +
56856 +       err = xenbus_grant_ring(dev, virt_to_mfn(txs));
56857 +       if (err < 0) {
56858 +               free_page((unsigned long)txs);
56859 +               goto fail;
56860 +       }
56861 +       info->tx_ring_ref = err;
56862 +
56863 +       rxs = (struct netif_rx_sring *)get_zeroed_page(GFP_KERNEL);
56864 +       if (!rxs) {
56865 +               err = -ENOMEM;
56866 +               xenbus_dev_fatal(dev, err, "allocating rx ring page");
56867 +               goto fail;
56868 +       }
56869 +       SHARED_RING_INIT(rxs);
56870 +       FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
56871 +
56872 +       err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
56873 +       if (err < 0) {
56874 +               free_page((unsigned long)rxs);
56875 +               goto fail;
56876 +       }
56877 +       info->rx_ring_ref = err;
56878 +
56879 +       err = xenbus_alloc_evtchn(dev, &info->evtchn);
56880 +       if (err)
56881 +               goto fail;
56882 +
56883 +       memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
56884 +       err = bind_evtchn_to_irqhandler(info->evtchn, netif_int,
56885 +                                       SA_SAMPLE_RANDOM, netdev->name, netdev);
56886 +       if (err < 0)
56887 +               goto fail;
56888 +       info->irq = err;
56889 +       return 0;
56890 +
56891 + fail:
56892 +       netif_free(info);
56893 +       return err;
56894 +}
56895 +
56896 +
56897 +/**
56898 + * Callback received when the backend's state changes.
56899 + */
56900 +static void backend_changed(struct xenbus_device *dev,
56901 +                           enum xenbus_state backend_state)
56902 +{
56903 +       struct netfront_info *np = dev->dev.driver_data;
56904 +       struct net_device *netdev = np->netdev;
56905 +
56906 +       DPRINTK("\n");
56907 +
56908 +       switch (backend_state) {
56909 +       case XenbusStateInitialising:
56910 +       case XenbusStateInitialised:
56911 +       case XenbusStateConnected:
56912 +       case XenbusStateUnknown:
56913 +       case XenbusStateClosed:
56914 +               break;
56915 +
56916 +       case XenbusStateInitWait:
56917 +               network_connect(netdev);
56918 +               xenbus_switch_state(dev, XenbusStateConnected);
56919 +               (void)send_fake_arp(netdev);
56920 +               break;
56921 +
56922 +       case XenbusStateClosing:
56923 +               netfront_closing(dev);
56924 +               break;
56925 +       }
56926 +}
56927 +
56928 +
56929 +/** Send a packet on a net device to encourage switches to learn the
56930 + * MAC. We send a gratuitous ARP reply.
56931 + *
56932 + * @param dev device
56933 + * @return 0 on success, error code otherwise
56934 + */
56935 +static int send_fake_arp(struct net_device *dev)
56936 +{
56937 +       struct sk_buff *skb;
56938 +       u32             src_ip, dst_ip;
56939 +
56940 +       dst_ip = INADDR_BROADCAST;
56941 +       src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
56942 +
56943 +       /* No IP? Then nothing to do. */
56944 +       if (src_ip == 0)
56945 +               return 0;
56946 +
56947 +       skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
56948 +                        dst_ip, dev, src_ip,
56949 +                        /*dst_hw*/ NULL, /*src_hw*/ NULL,
56950 +                        /*target_hw*/ dev->dev_addr);
56951 +       if (skb == NULL)
56952 +               return -ENOMEM;
56953 +
56954 +       return dev_queue_xmit(skb);
56955 +}
56956 +
56957 +
56958 +static int network_open(struct net_device *dev)
56959 +{
56960 +       struct netfront_info *np = netdev_priv(dev);
56961 +
56962 +       memset(&np->stats, 0, sizeof(np->stats));
56963 +
56964 +       network_alloc_rx_buffers(dev);
56965 +       np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
56966 +
56967 +       netif_start_queue(dev);
56968 +
56969 +       return 0;
56970 +}
56971 +
56972 +static inline int netfront_tx_slot_available(struct netfront_info *np)
56973 +{
56974 +       return RING_FREE_REQUESTS(&np->tx) >= MAX_SKB_FRAGS + 2;
56975 +}
56976 +
56977 +static inline void network_maybe_wake_tx(struct net_device *dev)
56978 +{
56979 +       struct netfront_info *np = netdev_priv(dev);
56980 +
56981 +       if (unlikely(netif_queue_stopped(dev)) &&
56982 +           netfront_tx_slot_available(np) &&
56983 +           likely(netif_running(dev)))
56984 +               netif_wake_queue(dev);
56985 +}
56986 +
56987 +static void network_tx_buf_gc(struct net_device *dev)
56988 +{
56989 +       RING_IDX cons, prod;
56990 +       unsigned short id;
56991 +       struct netfront_info *np = netdev_priv(dev);
56992 +       struct sk_buff *skb;
56993 +
56994 +       if (unlikely(!netif_carrier_ok(dev)))
56995 +               return;
56996 +
56997 +       do {
56998 +               prod = np->tx.sring->rsp_prod;
56999 +               rmb(); /* Ensure we see responses up to 'prod'. */
57000 +
57001 +               for (cons = np->tx.rsp_cons; cons != prod; cons++) {
57002 +                       struct netif_tx_response *txrsp;
57003 +
57004 +                       txrsp = RING_GET_RESPONSE(&np->tx, cons);
57005 +                       if (txrsp->status == NETIF_RSP_NULL)
57006 +                               continue;
57007 +
57008 +                       id  = txrsp->id;
57009 +                       skb = np->tx_skbs[id];
57010 +                       if (unlikely(gnttab_query_foreign_access(
57011 +                               np->grant_tx_ref[id]) != 0)) {
57012 +                               printk(KERN_ALERT "network_tx_buf_gc: warning "
57013 +                                      "-- grant still in use by backend "
57014 +                                      "domain.\n");
57015 +                               BUG();
57016 +                       }
57017 +                       gnttab_end_foreign_access_ref(
57018 +                               np->grant_tx_ref[id], GNTMAP_readonly);
57019 +                       gnttab_release_grant_reference(
57020 +                               &np->gref_tx_head, np->grant_tx_ref[id]);
57021 +                       np->grant_tx_ref[id] = GRANT_INVALID_REF;
57022 +                       add_id_to_freelist(np->tx_skbs, id);
57023 +                       dev_kfree_skb_irq(skb);
57024 +               }
57025 +
57026 +               np->tx.rsp_cons = prod;
57027 +
57028 +               /*
57029 +                * Set a new event, then check for race with update of tx_cons.
57030 +                * Note that it is essential to schedule a callback, no matter
57031 +                * how few buffers are pending. Even if there is space in the
57032 +                * transmit ring, higher layers may be blocked because too much
57033 +                * data is outstanding: in such cases notification from Xen is
57034 +                * likely to be the only kick that we'll get.
57035 +                */
57036 +               np->tx.sring->rsp_event =
57037 +                       prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
57038 +               mb();
57039 +       } while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
57040 +
57041 +       network_maybe_wake_tx(dev);
57042 +}
57043 +
57044 +
57045 +static void rx_refill_timeout(unsigned long data)
57046 +{
57047 +       struct net_device *dev = (struct net_device *)data;
57048 +       netif_rx_schedule(dev);
57049 +}
57050 +
57051 +
57052 +static void network_alloc_rx_buffers(struct net_device *dev)
57053 +{
57054 +       unsigned short id;
57055 +       struct netfront_info *np = netdev_priv(dev);
57056 +       struct sk_buff *skb;
57057 +       int i, batch_target;
57058 +       RING_IDX req_prod = np->rx.req_prod_pvt;
57059 +       struct xen_memory_reservation reservation;
57060 +       grant_ref_t ref;
57061 +
57062 +       if (unlikely(!netif_carrier_ok(dev)))
57063 +               return;
57064 +
57065 +       /*
57066 +        * Allocate skbuffs greedily, even though we batch updates to the
57067 +        * receive ring. This creates a less bursty demand on the memory
57068 +        * allocator, so should reduce the chance of failed allocation requests
57069 +        * both for ourselves and for other kernel subsystems.
57070 +        */
57071 +       batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
57072 +       for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
57073 +               /*
57074 +                * Subtract dev_alloc_skb headroom (16 bytes) and shared info
57075 +                * tailroom then round down to SKB_DATA_ALIGN boundary.
57076 +                */
57077 +               skb = __dev_alloc_skb(
57078 +                       ((PAGE_SIZE - sizeof(struct skb_shared_info)) &
57079 +                        (-SKB_DATA_ALIGN(1))) - 16,
57080 +                       GFP_ATOMIC|__GFP_NOWARN);
57081 +               if (skb == NULL) {
57082 +                       /* Any skbuffs queued for refill? Force them out. */
57083 +                       if (i != 0)
57084 +                               goto refill;
57085 +                       /* Could not allocate any skbuffs. Try again later. */
57086 +                       mod_timer(&np->rx_refill_timer,
57087 +                                 jiffies + (HZ/10));
57088 +                       return;
57089 +               }
57090 +               __skb_queue_tail(&np->rx_batch, skb);
57091 +       }
57092 +
57093 +       /* Is the batch large enough to be worthwhile? */
57094 +       if (i < (np->rx_target/2))
57095 +               return;
57096 +
57097 +       /* Adjust our fill target if we risked running out of buffers. */
57098 +       if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
57099 +           ((np->rx_target *= 2) > np->rx_max_target))
57100 +               np->rx_target = np->rx_max_target;
57101 +
57102 + refill:
57103 +       for (i = 0; ; i++) {
57104 +               if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
57105 +                       break;
57106 +
57107 +               skb->dev = dev;
57108 +
57109 +               id = get_id_from_freelist(np->rx_skbs);
57110 +
57111 +               np->rx_skbs[id] = skb;
57112 +
57113 +               RING_GET_REQUEST(&np->rx, req_prod + i)->id = id;
57114 +               ref = gnttab_claim_grant_reference(&np->gref_rx_head);
57115 +               BUG_ON((signed short)ref < 0);
57116 +               np->grant_rx_ref[id] = ref;
57117 +               gnttab_grant_foreign_transfer_ref(ref,
57118 +                                                 np->xbdev->otherend_id,
57119 +                                                 __pa(skb->head)>>PAGE_SHIFT);
57120 +               RING_GET_REQUEST(&np->rx, req_prod + i)->gref = ref;
57121 +               np->rx_pfn_array[i] = virt_to_mfn(skb->head);
57122 +
57123 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
57124 +                       /* Remove this page before passing back to Xen. */
57125 +                       set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
57126 +                                           INVALID_P2M_ENTRY);
57127 +                       MULTI_update_va_mapping(np->rx_mcl+i,
57128 +                                               (unsigned long)skb->head,
57129 +                                               __pte(0), 0);
57130 +               }
57131 +       }
57132 +
57133 +       /* Tell the balloon driver what is going on. */
57134 +       balloon_update_driver_allowance(i);
57135 +
57136 +       set_xen_guest_handle(reservation.extent_start, np->rx_pfn_array);
57137 +       reservation.nr_extents   = i;
57138 +       reservation.extent_order = 0;
57139 +       reservation.address_bits = 0;
57140 +       reservation.domid        = DOMID_SELF;
57141 +
57142 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
57143 +               /* After all PTEs have been zapped, flush the TLB. */
57144 +               np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
57145 +                       UVMF_TLB_FLUSH|UVMF_ALL;
57146 +
57147 +               /* Give away a batch of pages. */
57148 +               np->rx_mcl[i].op = __HYPERVISOR_memory_op;
57149 +               np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
57150 +               np->rx_mcl[i].args[1] = (unsigned long)&reservation;
57151 +
57152 +               /* Zap PTEs and give away pages in one big multicall. */
57153 +               (void)HYPERVISOR_multicall(np->rx_mcl, i+1);
57154 +
57155 +               /* Check return status of HYPERVISOR_memory_op(). */
57156 +               if (unlikely(np->rx_mcl[i].result != i))
57157 +                       panic("Unable to reduce memory reservation\n");
57158 +       } else
57159 +               if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
57160 +                                        &reservation) != i)
57161 +                       panic("Unable to reduce memory reservation\n");
57162 +
57163 +       /* Above is a suitable barrier to ensure the backend will see requests. */
57164 +       np->rx.req_prod_pvt = req_prod + i;
57165 +       RING_PUSH_REQUESTS(&np->rx);
57166 +}
57167 +
57168 +static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
57169 +                             struct netif_tx_request *tx)
57170 +{
57171 +       struct netfront_info *np = netdev_priv(dev);
57172 +       char *data = skb->data;
57173 +       unsigned long mfn;
57174 +       RING_IDX prod = np->tx.req_prod_pvt;
57175 +       int frags = skb_shinfo(skb)->nr_frags;
57176 +       unsigned int offset = offset_in_page(data);
57177 +       unsigned int len = skb_headlen(skb);
57178 +       unsigned int id;
57179 +       grant_ref_t ref;
57180 +       int i;
57181 +
57182 +       while (len > PAGE_SIZE - offset) {
57183 +               tx->size = PAGE_SIZE - offset;
57184 +               tx->flags |= NETTXF_more_data;
57185 +               len -= tx->size;
57186 +               data += tx->size;
57187 +               offset = 0;
57188 +
57189 +               id = get_id_from_freelist(np->tx_skbs);
57190 +               np->tx_skbs[id] = skb_get(skb);
57191 +               tx = RING_GET_REQUEST(&np->tx, prod++);
57192 +               tx->id = id;
57193 +               ref = gnttab_claim_grant_reference(&np->gref_tx_head);
57194 +               BUG_ON((signed short)ref < 0);
57195 +
57196 +               mfn = virt_to_mfn(data);
57197 +               gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
57198 +                                               mfn, GNTMAP_readonly);
57199 +
57200 +               tx->gref = np->grant_tx_ref[id] = ref;
57201 +               tx->offset = offset;
57202 +               tx->size = len;
57203 +               tx->flags = 0;
57204 +       }
57205 +
57206 +       for (i = 0; i < frags; i++) {
57207 +               skb_frag_t *frag = skb_shinfo(skb)->frags + i;
57208 +
57209 +               tx->flags |= NETTXF_more_data;
57210 +
57211 +               id = get_id_from_freelist(np->tx_skbs);
57212 +               np->tx_skbs[id] = skb_get(skb);
57213 +               tx = RING_GET_REQUEST(&np->tx, prod++);
57214 +               tx->id = id;
57215 +               ref = gnttab_claim_grant_reference(&np->gref_tx_head);
57216 +               BUG_ON((signed short)ref < 0);
57217 +
57218 +               mfn = pfn_to_mfn(page_to_pfn(frag->page));
57219 +               gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
57220 +                                               mfn, GNTMAP_readonly);
57221 +
57222 +               tx->gref = np->grant_tx_ref[id] = ref;
57223 +               tx->offset = frag->page_offset;
57224 +               tx->size = frag->size;
57225 +               tx->flags = 0;
57226 +       }
57227 +
57228 +       np->tx.req_prod_pvt = prod;
57229 +}
57230 +
57231 +static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
57232 +{
57233 +       unsigned short id;
57234 +       struct netfront_info *np = netdev_priv(dev);
57235 +       struct netif_tx_request *tx;
57236 +       struct netif_extra_info *extra;
57237 +       char *data = skb->data;
57238 +       RING_IDX i;
57239 +       grant_ref_t ref;
57240 +       unsigned long mfn;
57241 +       int notify;
57242 +       int frags = skb_shinfo(skb)->nr_frags;
57243 +       unsigned int offset = offset_in_page(data);
57244 +       unsigned int len = skb_headlen(skb);
57245 +
57246 +       frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
57247 +       if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
57248 +               printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
57249 +                      frags);
57250 +               dump_stack();
57251 +               goto drop;
57252 +       }
57253 +
57254 +       spin_lock_irq(&np->tx_lock);
57255 +
57256 +       if (unlikely(!netif_carrier_ok(dev) ||
57257 +                    (frags > 1 && !xennet_can_sg(dev)) ||
57258 +                    netif_needs_gso(dev, skb))) {
57259 +               spin_unlock_irq(&np->tx_lock);
57260 +               goto drop;
57261 +       }
57262 +
57263 +       i = np->tx.req_prod_pvt;
57264 +
57265 +       id = get_id_from_freelist(np->tx_skbs);
57266 +       np->tx_skbs[id] = skb;
57267 +
57268 +       tx = RING_GET_REQUEST(&np->tx, i);
57269 +
57270 +       tx->id   = id;
57271 +       ref = gnttab_claim_grant_reference(&np->gref_tx_head);
57272 +       BUG_ON((signed short)ref < 0);
57273 +       mfn = virt_to_mfn(data);
57274 +       gnttab_grant_foreign_access_ref(
57275 +               ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
57276 +       tx->gref = np->grant_tx_ref[id] = ref;
57277 +       tx->offset = offset;
57278 +       tx->size = len;
57279 +
57280 +       tx->flags = 0;
57281 +       extra = NULL;
57282 +
57283 +       if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
57284 +               tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
57285 +       if (skb->proto_data_valid) /* remote but checksummed? */
57286 +               tx->flags |= NETTXF_data_validated;
57287 +
57288 +       if (skb_shinfo(skb)->gso_size) {
57289 +               struct netif_extra_info *gso = (struct netif_extra_info *)
57290 +                       RING_GET_REQUEST(&np->tx, ++i);
57291 +
57292 +               if (extra)
57293 +                       extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
57294 +               else
57295 +                       tx->flags |= NETTXF_extra_info;
57296 +
57297 +               gso->u.gso.size = skb_shinfo(skb)->gso_size;
57298 +               gso->u.gso.type = XEN_NETIF_GSO_TCPV4;
57299 +
57300 +               gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
57301 +               gso->flags = 0;
57302 +               extra = gso;
57303 +       }
57304 +
57305 +       np->tx.req_prod_pvt = i + 1;
57306 +
57307 +       xennet_make_frags(skb, dev, tx);
57308 +       tx->size = skb->len;
57309 +
57310 +       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
57311 +       if (notify)
57312 +               notify_remote_via_irq(np->irq);
57313 +
57314 +       network_tx_buf_gc(dev);
57315 +
57316 +       if (!netfront_tx_slot_available(np))
57317 +               netif_stop_queue(dev);
57318 +
57319 +       spin_unlock_irq(&np->tx_lock);
57320 +
57321 +       np->stats.tx_bytes += skb->len;
57322 +       np->stats.tx_packets++;
57323 +
57324 +       return 0;
57325 +
57326 + drop:
57327 +       np->stats.tx_dropped++;
57328 +       dev_kfree_skb(skb);
57329 +       return 0;
57330 +}
57331 +
57332 +static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
57333 +{
57334 +       struct net_device *dev = dev_id;
57335 +       struct netfront_info *np = netdev_priv(dev);
57336 +       unsigned long flags;
57337 +
57338 +       spin_lock_irqsave(&np->tx_lock, flags);
57339 +       network_tx_buf_gc(dev);
57340 +       spin_unlock_irqrestore(&np->tx_lock, flags);
57341 +
57342 +       if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx) &&
57343 +           likely(netif_running(dev)))
57344 +               netif_rx_schedule(dev);
57345 +
57346 +       return IRQ_HANDLED;
57347 +}
57348 +
57349 +
57350 +static int netif_poll(struct net_device *dev, int *pbudget)
57351 +{
57352 +       struct netfront_info *np = netdev_priv(dev);
57353 +       struct sk_buff *skb, *nskb;
57354 +       struct netif_rx_response *rx;
57355 +       RING_IDX i, rp;
57356 +       struct mmu_update *mmu = np->rx_mmu;
57357 +       struct multicall_entry *mcl = np->rx_mcl;
57358 +       int work_done, budget, more_to_do = 1;
57359 +       struct sk_buff_head rxq;
57360 +       unsigned long flags;
57361 +       unsigned long mfn;
57362 +       grant_ref_t ref;
57363 +
57364 +       spin_lock(&np->rx_lock);
57365 +
57366 +       if (unlikely(!netif_carrier_ok(dev))) {
57367 +               spin_unlock(&np->rx_lock);
57368 +               return 0;
57369 +       }
57370 +
57371 +       skb_queue_head_init(&rxq);
57372 +
57373 +       if ((budget = *pbudget) > dev->quota)
57374 +               budget = dev->quota;
57375 +       rp = np->rx.sring->rsp_prod;
57376 +       rmb(); /* Ensure we see queued responses up to 'rp'. */
57377 +
57378 +       for (i = np->rx.rsp_cons, work_done = 0;
57379 +            (i != rp) && (work_done < budget);
57380 +            i++, work_done++) {
57381 +               rx = RING_GET_RESPONSE(&np->rx, i);
57382 +
57383 +               /*
57384 +                * This definitely indicates a bug, either in this driver or in
57385 +                * the backend driver. In the future this should flag the bad
57386 +                * situation to the system controller to reboot the backend.
57387 +                */
57388 +               if ((ref = np->grant_rx_ref[rx->id]) == GRANT_INVALID_REF) {
57389 +                       WPRINTK("Bad rx response id %d.\n", rx->id);
57390 +                       work_done--;
57391 +                       continue;
57392 +               }
57393 +
57394 +               /* Memory pressure, insufficient buffer headroom, ... */
57395 +               if ((mfn = gnttab_end_foreign_transfer_ref(ref)) == 0) {
57396 +                       if (net_ratelimit())
57397 +                               WPRINTK("Unfulfilled rx req (id=%d, st=%d).\n",
57398 +                                       rx->id, rx->status);
57399 +                       RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id =
57400 +                               rx->id;
57401 +                       RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref =
57402 +                               ref;
57403 +                       np->rx.req_prod_pvt++;
57404 +                       RING_PUSH_REQUESTS(&np->rx);
57405 +                       work_done--;
57406 +                       continue;
57407 +               }
57408 +
57409 +               gnttab_release_grant_reference(&np->gref_rx_head, ref);
57410 +               np->grant_rx_ref[rx->id] = GRANT_INVALID_REF;
57411 +
57412 +               skb = np->rx_skbs[rx->id];
57413 +               add_id_to_freelist(np->rx_skbs, rx->id);
57414 +
57415 +               /* NB. We handle skb overflow later. */
57416 +               skb->data = skb->head + rx->offset;
57417 +               skb->len  = rx->status;
57418 +               skb->tail = skb->data + skb->len;
57419 +
57420 +               /*
57421 +                * Old backends do not assert data_validated but we
57422 +                * can infer it from csum_blank so test both flags.
57423 +                */
57424 +               if (rx->flags & (NETRXF_data_validated|NETRXF_csum_blank)) {
57425 +                       skb->ip_summed = CHECKSUM_UNNECESSARY;
57426 +                       skb->proto_data_valid = 1;
57427 +               } else {
57428 +                       skb->ip_summed = CHECKSUM_NONE;
57429 +                       skb->proto_data_valid = 0;
57430 +               }
57431 +               skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank);
57432 +
57433 +               np->stats.rx_packets++;
57434 +               np->stats.rx_bytes += rx->status;
57435 +
57436 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
57437 +                       /* Remap the page. */
57438 +                       MULTI_update_va_mapping(mcl, (unsigned long)skb->head,
57439 +                                               pfn_pte_ma(mfn, PAGE_KERNEL),
57440 +                                               0);
57441 +                       mcl++;
57442 +                       mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
57443 +                               | MMU_MACHPHYS_UPDATE;
57444 +                       mmu->val = __pa(skb->head) >> PAGE_SHIFT;
57445 +                       mmu++;
57446 +
57447 +                       set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
57448 +                                           mfn);
57449 +               }
57450 +
57451 +               __skb_queue_tail(&rxq, skb);
57452 +       }
57453 +
57454 +       /* Some pages are no longer absent... */
57455 +       balloon_update_driver_allowance(-work_done);
57456 +
57457 +       /* Do all the remapping work, and M2P updates, in one big hypercall. */
57458 +       if (likely((mcl - np->rx_mcl) != 0)) {
57459 +               mcl->op = __HYPERVISOR_mmu_update;
57460 +               mcl->args[0] = (unsigned long)np->rx_mmu;
57461 +               mcl->args[1] = mmu - np->rx_mmu;
57462 +               mcl->args[2] = 0;
57463 +               mcl->args[3] = DOMID_SELF;
57464 +               mcl++;
57465 +               (void)HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
57466 +       }
57467 +
57468 +       while ((skb = __skb_dequeue(&rxq)) != NULL) {
57469 +               if (skb->len > (dev->mtu + ETH_HLEN + 4)) {
57470 +                       if (net_ratelimit())
57471 +                               printk(KERN_INFO "Received packet too big for "
57472 +                                      "MTU (%d > %d)\n",
57473 +                                      skb->len - ETH_HLEN - 4, dev->mtu);
57474 +                       skb->len  = 0;
57475 +                       skb->tail = skb->data;
57476 +                       init_skb_shinfo(skb);
57477 +                       dev_kfree_skb(skb);
57478 +                       continue;
57479 +               }
57480 +
57481 +               /*
57482 +                * Enough room in skbuff for the data we were passed? Also,
57483 +                * Linux expects at least 16 bytes headroom in each rx buffer.
57484 +                */
57485 +               if (unlikely(skb->tail > skb->end) ||
57486 +                   unlikely((skb->data - skb->head) < 16)) {
57487 +                       if (net_ratelimit()) {
57488 +                               if (skb->tail > skb->end)
57489 +                                       printk(KERN_INFO "Received packet "
57490 +                                              "is %zd bytes beyond tail.\n",
57491 +                                              skb->tail - skb->end);
57492 +                               else
57493 +                                       printk(KERN_INFO "Received packet "
57494 +                                              "is %zd bytes before head.\n",
57495 +                                              16 - (skb->data - skb->head));
57496 +                       }
57497 +
57498 +                       nskb = __dev_alloc_skb(skb->len + 2,
57499 +                                              GFP_ATOMIC|__GFP_NOWARN);
57500 +                       if (nskb != NULL) {
57501 +                               skb_reserve(nskb, 2);
57502 +                               skb_put(nskb, skb->len);
57503 +                               memcpy(nskb->data, skb->data, skb->len);
57504 +                               /* Copy any other fields we already set up. */
57505 +                               nskb->dev = skb->dev;
57506 +                               nskb->ip_summed = skb->ip_summed;
57507 +                               nskb->proto_data_valid = skb->proto_data_valid;
57508 +                               nskb->proto_csum_blank = skb->proto_csum_blank;
57509 +                       }
57510 +
57511 +                       /* Reinitialise and then destroy the old skbuff. */
57512 +                       skb->len  = 0;
57513 +                       skb->tail = skb->data;
57514 +                       init_skb_shinfo(skb);
57515 +                       dev_kfree_skb(skb);
57516 +
57517 +                       /* Switch old for new, if we copied the buffer. */
57518 +                       if ((skb = nskb) == NULL)
57519 +                               continue;
57520 +               }
57521 +
57522 +               /* Set the shinfo area, which is hidden behind the data. */
57523 +               init_skb_shinfo(skb);
57524 +               /* Ethernet work: Delayed to here as it peeks the header. */
57525 +               skb->protocol = eth_type_trans(skb, dev);
57526 +
57527 +               /* Pass it up. */
57528 +               netif_receive_skb(skb);
57529 +               dev->last_rx = jiffies;
57530 +       }
57531 +
57532 +       np->rx.rsp_cons = i;
57533 +
57534 +       /* If we get a callback with very few responses, reduce fill target. */
57535 +       /* NB. Note exponential increase, linear decrease. */
57536 +       if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
57537 +            ((3*np->rx_target) / 4)) &&
57538 +           (--np->rx_target < np->rx_min_target))
57539 +               np->rx_target = np->rx_min_target;
57540 +
57541 +       network_alloc_rx_buffers(dev);
57542 +
57543 +       *pbudget   -= work_done;
57544 +       dev->quota -= work_done;
57545 +
57546 +       if (work_done < budget) {
57547 +               local_irq_save(flags);
57548 +
57549 +               RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
57550 +               if (!more_to_do)
57551 +                       __netif_rx_complete(dev);
57552 +
57553 +               local_irq_restore(flags);
57554 +       }
57555 +
57556 +       spin_unlock(&np->rx_lock);
57557 +
57558 +       return more_to_do;
57559 +}
57560 +
57561 +
57562 +static int network_close(struct net_device *dev)
57563 +{
57564 +       struct netfront_info *np = netdev_priv(dev);
57565 +       netif_stop_queue(np->netdev);
57566 +       return 0;
57567 +}
57568 +
57569 +
57570 +static struct net_device_stats *network_get_stats(struct net_device *dev)
57571 +{
57572 +       struct netfront_info *np = netdev_priv(dev);
57573 +       return &np->stats;
57574 +}
57575 +
57576 +static int xennet_change_mtu(struct net_device *dev, int mtu)
57577 +{
57578 +       int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
57579 +
57580 +       if (mtu > max)
57581 +               return -EINVAL;
57582 +       dev->mtu = mtu;
57583 +       return 0;
57584 +}
57585 +
57586 +static int xennet_set_sg(struct net_device *dev, u32 data)
57587 +{
57588 +       if (data) {
57589 +               struct netfront_info *np = netdev_priv(dev);
57590 +               int val;
57591 +
57592 +               if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
57593 +                                "%d", &val) < 0)
57594 +                       val = 0;
57595 +               if (!val)
57596 +                       return -ENOSYS;
57597 +       } else if (dev->mtu > ETH_DATA_LEN)
57598 +               dev->mtu = ETH_DATA_LEN;
57599 +
57600 +       return ethtool_op_set_sg(dev, data);
57601 +}
57602 +
57603 +static int xennet_set_tso(struct net_device *dev, u32 data)
57604 +{
57605 +       if (data) {
57606 +               struct netfront_info *np = netdev_priv(dev);
57607 +               int val;
57608 +
57609 +               if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-tso",
57610 +                                "%d", &val) < 0)
57611 +                       val = 0;
57612 +#if 0 /* KAF: After the protocol is finalised. */
57613 +               if (!val)
57614 +#endif
57615 +                       return -ENOSYS;
57616 +       }
57617 +
57618 +       return ethtool_op_set_tso(dev, data);
57619 +}
57620 +
57621 +static void xennet_set_features(struct net_device *dev)
57622 +{
57623 +       if (!xennet_set_sg(dev, 1))
57624 +               xennet_set_tso(dev, 1);
57625 +}
57626 +
57627 +static void network_connect(struct net_device *dev)
57628 +{
57629 +       struct netfront_info *np = netdev_priv(dev);
57630 +       int i, requeue_idx;
57631 +       struct sk_buff *skb;
57632 +
57633 +       xennet_set_features(dev);
57634 +
57635 +       spin_lock_irq(&np->tx_lock);
57636 +       spin_lock(&np->rx_lock);
57637 +
57638 +       /*
57639 +        * Recovery procedure:
57640 +        *  NB. Freelist index entries are always going to be less than
57641 +        *  PAGE_OFFSET, whereas pointers to skbs will always be equal to or
57642 +        *  greater than PAGE_OFFSET: we use this property to distinguish
57643 +        *  them.
57644 +        */
57645 +
57646 +       /* Step 1: Discard all pending TX packet fragments. */
57647 +       for (requeue_idx = 0, i = 1; i <= NET_TX_RING_SIZE; i++) {
57648 +               if ((unsigned long)np->tx_skbs[i] < PAGE_OFFSET)
57649 +                       continue;
57650 +
57651 +               skb = np->tx_skbs[i];
57652 +               gnttab_end_foreign_access_ref(
57653 +                       np->grant_tx_ref[i], GNTMAP_readonly);
57654 +               gnttab_release_grant_reference(
57655 +                       &np->gref_tx_head, np->grant_tx_ref[i]);
57656 +               np->grant_tx_ref[i] = GRANT_INVALID_REF;
57657 +               add_id_to_freelist(np->tx_skbs, i);
57658 +               dev_kfree_skb_irq(skb);
57659 +       }
57660 +
57661 +       /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
57662 +       for (requeue_idx = 0, i = 1; i <= NET_RX_RING_SIZE; i++) {
57663 +               if ((unsigned long)np->rx_skbs[i] < PAGE_OFFSET)
57664 +                       continue;
57665 +               gnttab_grant_foreign_transfer_ref(
57666 +                       np->grant_rx_ref[i], np->xbdev->otherend_id,
57667 +                       __pa(np->rx_skbs[i]->data) >> PAGE_SHIFT);
57668 +               RING_GET_REQUEST(&np->rx, requeue_idx)->gref =
57669 +                       np->grant_rx_ref[i];
57670 +               RING_GET_REQUEST(&np->rx, requeue_idx)->id = i;
57671 +               requeue_idx++;
57672 +       }
57673 +
57674 +       np->rx.req_prod_pvt = requeue_idx;
57675 +       RING_PUSH_REQUESTS(&np->rx);
57676 +
57677 +       /*
57678 +        * Step 3: All public and private state should now be sane.  Get
57679 +        * ready to start sending and receiving packets and give the driver
57680 +        * domain a kick because we've probably just requeued some
57681 +        * packets.
57682 +        */
57683 +       netif_carrier_on(dev);
57684 +       notify_remote_via_irq(np->irq);
57685 +       network_tx_buf_gc(dev);
57686 +       network_alloc_rx_buffers(dev);
57687 +
57688 +       spin_unlock(&np->rx_lock);
57689 +       spin_unlock_irq(&np->tx_lock);
57690 +}
57691 +
57692 +static void netif_uninit(struct net_device *dev)
57693 +{
57694 +       struct netfront_info *np = netdev_priv(dev);
57695 +       gnttab_free_grant_references(np->gref_tx_head);
57696 +       gnttab_free_grant_references(np->gref_rx_head);
57697 +}
57698 +
57699 +static struct ethtool_ops network_ethtool_ops =
57700 +{
57701 +       .get_tx_csum = ethtool_op_get_tx_csum,
57702 +       .set_tx_csum = ethtool_op_set_tx_csum,
57703 +       .get_sg = ethtool_op_get_sg,
57704 +       .set_sg = xennet_set_sg,
57705 +       .get_tso = ethtool_op_get_tso,
57706 +       .set_tso = xennet_set_tso,
57707 +};
57708 +
57709 +#ifdef CONFIG_SYSFS
57710 +static ssize_t show_rxbuf_min(struct class_device *cd, char *buf)
57711 +{
57712 +       struct net_device *netdev = container_of(cd, struct net_device,
57713 +                                                class_dev);
57714 +       struct netfront_info *info = netdev_priv(netdev);
57715 +
57716 +       return sprintf(buf, "%u\n", info->rx_min_target);
57717 +}
57718 +
57719 +static ssize_t store_rxbuf_min(struct class_device *cd,
57720 +                              const char *buf, size_t len)
57721 +{
57722 +       struct net_device *netdev = container_of(cd, struct net_device,
57723 +                                                class_dev);
57724 +       struct netfront_info *np = netdev_priv(netdev);
57725 +       char *endp;
57726 +       unsigned long target;
57727 +
57728 +       if (!capable(CAP_NET_ADMIN))
57729 +               return -EPERM;
57730 +
57731 +       target = simple_strtoul(buf, &endp, 0);
57732 +       if (endp == buf)
57733 +               return -EBADMSG;
57734 +
57735 +       if (target < RX_MIN_TARGET)
57736 +               target = RX_MIN_TARGET;
57737 +       if (target > RX_MAX_TARGET)
57738 +               target = RX_MAX_TARGET;
57739 +
57740 +       spin_lock(&np->rx_lock);
57741 +       if (target > np->rx_max_target)
57742 +               np->rx_max_target = target;
57743 +       np->rx_min_target = target;
57744 +       if (target > np->rx_target)
57745 +               np->rx_target = target;
57746 +
57747 +       network_alloc_rx_buffers(netdev);
57748 +
57749 +       spin_unlock(&np->rx_lock);
57750 +       return len;
57751 +}
57752 +
57753 +static ssize_t show_rxbuf_max(struct class_device *cd, char *buf)
57754 +{
57755 +       struct net_device *netdev = container_of(cd, struct net_device,
57756 +                                                class_dev);
57757 +       struct netfront_info *info = netdev_priv(netdev);
57758 +
57759 +       return sprintf(buf, "%u\n", info->rx_max_target);
57760 +}
57761 +
57762 +static ssize_t store_rxbuf_max(struct class_device *cd,
57763 +                              const char *buf, size_t len)
57764 +{
57765 +       struct net_device *netdev = container_of(cd, struct net_device,
57766 +                                                class_dev);
57767 +       struct netfront_info *np = netdev_priv(netdev);
57768 +       char *endp;
57769 +       unsigned long target;
57770 +
57771 +       if (!capable(CAP_NET_ADMIN))
57772 +               return -EPERM;
57773 +
57774 +       target = simple_strtoul(buf, &endp, 0);
57775 +       if (endp == buf)
57776 +               return -EBADMSG;
57777 +
57778 +       if (target < RX_MIN_TARGET)
57779 +               target = RX_MIN_TARGET;
57780 +       if (target > RX_MAX_TARGET)
57781 +               target = RX_MAX_TARGET;
57782 +
57783 +       spin_lock(&np->rx_lock);
57784 +       if (target < np->rx_min_target)
57785 +               np->rx_min_target = target;
57786 +       np->rx_max_target = target;
57787 +       if (target < np->rx_target)
57788 +               np->rx_target = target;
57789 +
57790 +       network_alloc_rx_buffers(netdev);
57791 +
57792 +       spin_unlock(&np->rx_lock);
57793 +       return len;
57794 +}
57795 +
57796 +static ssize_t show_rxbuf_cur(struct class_device *cd, char *buf)
57797 +{
57798 +       struct net_device *netdev = container_of(cd, struct net_device,
57799 +                                                class_dev);
57800 +       struct netfront_info *info = netdev_priv(netdev);
57801 +
57802 +       return sprintf(buf, "%u\n", info->rx_target);
57803 +}
57804 +
57805 +static const struct class_device_attribute xennet_attrs[] = {
57806 +       __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
57807 +       __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
57808 +       __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
57809 +};
57810 +
57811 +static int xennet_sysfs_addif(struct net_device *netdev)
57812 +{
57813 +       int i;
57814 +       int error = 0;
57815 +
57816 +       for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
57817 +               error = class_device_create_file(&netdev->class_dev, 
57818 +                                                &xennet_attrs[i]);
57819 +               if (error)
57820 +                       goto fail;
57821 +       }
57822 +       return 0;
57823 +
57824 + fail:
57825 +       while (--i >= 0)
57826 +               class_device_remove_file(&netdev->class_dev,
57827 +                                        &xennet_attrs[i]);
57828 +       return error;
57829 +}
57830 +
57831 +static void xennet_sysfs_delif(struct net_device *netdev)
57832 +{
57833 +       int i;
57834 +
57835 +       for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
57836 +               class_device_remove_file(&netdev->class_dev,
57837 +                                        &xennet_attrs[i]);
57838 +       }
57839 +}
57840 +
57841 +#endif /* CONFIG_SYSFS */
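
For reference, the rxbuf_min/rxbuf_max attributes defined above are intended to be driven from userspace.  Below is a minimal C sketch of doing that, assuming the class_device attributes appear under /sys/class/net/<ifname>/ (the usual location for net_device class attributes on this kernel); the path and the program itself are illustrative only, not part of the patch.

/* Illustrative userspace helper: raise the netfront RX refill floor.
 * Assumes the rxbuf_min attribute is exposed at
 * /sys/class/net/<ifname>/rxbuf_min (see xennet_attrs above). */
#include <stdio.h>
#include <stdlib.h>

static int write_rxbuf_min(const char *ifname, unsigned int target)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/class/net/%s/rxbuf_min", ifname);
        f = fopen(path, "w");
        if (!f)
                return -1;
        /* store_rxbuf_min() clamps the value to [RX_MIN_TARGET, RX_MAX_TARGET]
         * and immediately refills the receive ring. */
        fprintf(f, "%u\n", target);
        return fclose(f);
}

int main(int argc, char **argv)
{
        if (argc != 3) {
                fprintf(stderr, "usage: %s <ifname> <target>\n", argv[0]);
                return 1;
        }
        return write_rxbuf_min(argv[1],
                               (unsigned int)strtoul(argv[2], NULL, 0)) ? 1 : 0;
}
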
57842 +
57843 +
57844 +/*
57845 + * Nothing to do here. Virtual interface is point-to-point and the
57846 + * physical interface is probably promiscuous anyway.
57847 + */
57848 +static void network_set_multicast_list(struct net_device *dev)
57849 +{
57850 +}
57851 +
57852 +/** Create a network device.
57853 + * @param handle device handle
57854 + * @param dev xenbus device with which the new interface is associated
57855 + * @return pointer to the new net_device on success, ERR_PTR() on failure
57856 + */
57857 +static struct net_device * __devinit create_netdev(int handle,
57858 +                                                  struct xenbus_device *dev)
57859 +{
57860 +       int i, err = 0;
57861 +       struct net_device *netdev = NULL;
57862 +       struct netfront_info *np = NULL;
57863 +
57864 +       netdev = alloc_etherdev(sizeof(struct netfront_info));
57865 +       if (!netdev) {
57866 +               printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
57867 +                      __FUNCTION__);
57868 +               return ERR_PTR(-ENOMEM);
57869 +       }
57870 +
57871 +       np                = netdev_priv(netdev);
57872 +       np->handle        = handle;
57873 +       np->xbdev         = dev;
57874 +
57875 +       netif_carrier_off(netdev);
57876 +
57877 +       spin_lock_init(&np->tx_lock);
57878 +       spin_lock_init(&np->rx_lock);
57879 +
57880 +       skb_queue_head_init(&np->rx_batch);
57881 +       np->rx_target     = RX_DFL_MIN_TARGET;
57882 +       np->rx_min_target = RX_DFL_MIN_TARGET;
57883 +       np->rx_max_target = RX_MAX_TARGET;
57884 +
57885 +       init_timer(&np->rx_refill_timer);
57886 +       np->rx_refill_timer.data = (unsigned long)netdev;
57887 +       np->rx_refill_timer.function = rx_refill_timeout;
57888 +
57889 +       /* Initialise {tx,rx}_skbs as a free chain containing every entry. */
57890 +       for (i = 0; i <= NET_TX_RING_SIZE; i++) {
57891 +               np->tx_skbs[i] = (void *)((unsigned long) i+1);
57892 +               np->grant_tx_ref[i] = GRANT_INVALID_REF;
57893 +       }
57894 +
57895 +       for (i = 0; i <= NET_RX_RING_SIZE; i++) {
57896 +               np->rx_skbs[i] = (void *)((unsigned long) i+1);
57897 +               np->grant_rx_ref[i] = GRANT_INVALID_REF;
57898 +       }
57899 +
57900 +       /* A grant for every tx ring slot */
57901 +       if (gnttab_alloc_grant_references(TX_MAX_TARGET,
57902 +                                         &np->gref_tx_head) < 0) {
57903 +               printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
57904 +               err = -ENOMEM;
57905 +               goto exit;
57906 +       }
57907 +       /* A grant for every rx ring slot */
57908 +       if (gnttab_alloc_grant_references(RX_MAX_TARGET,
57909 +                                         &np->gref_rx_head) < 0) {
57910 +               printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
57911 +               err = -ENOMEM;
57912 +               goto exit_free_tx;
57913 +       }
57914 +
57915 +       netdev->open            = network_open;
57916 +       netdev->hard_start_xmit = network_start_xmit;
57917 +       netdev->stop            = network_close;
57918 +       netdev->get_stats       = network_get_stats;
57919 +       netdev->poll            = netif_poll;
57920 +       netdev->set_multicast_list = network_set_multicast_list;
57921 +       netdev->uninit          = netif_uninit;
57922 +       netdev->change_mtu      = xennet_change_mtu;
57923 +       netdev->weight          = 64;
57924 +       netdev->features        = NETIF_F_IP_CSUM;
57925 +
57926 +       SET_ETHTOOL_OPS(netdev, &network_ethtool_ops);
57927 +       SET_MODULE_OWNER(netdev);
57928 +       SET_NETDEV_DEV(netdev, &dev->dev);
57929 +
57930 +       err = register_netdev(netdev);
57931 +       if (err) {
57932 +               printk(KERN_WARNING "%s> register_netdev err=%d\n",
57933 +                      __FUNCTION__, err);
57934 +               goto exit_free_rx;
57935 +       }
57936 +
57937 +       err = xennet_sysfs_addif(netdev);
57938 +       if (err) {
57939 +               /* This can be non-fatal: it only means no tuning parameters */
57940 +               printk(KERN_WARNING "%s> add sysfs failed err=%d\n",
57941 +                      __FUNCTION__, err);
57942 +       }
57943 +
57944 +       np->netdev = netdev;
57945 +
57946 +       return netdev;
57947 +
57948 +
57949 + exit_free_rx:
57950 +       gnttab_free_grant_references(np->gref_rx_head);
57951 + exit_free_tx:
57952 +       gnttab_free_grant_references(np->gref_tx_head);
57953 + exit:
57954 +       free_netdev(netdev);
57955 +       return ERR_PTR(err);
57956 +}
57957 +
57958 +/*
57959 + * We use this notifier to send out a fake ARP reply to reset switches and
57960 + * router ARP caches when an IP interface is brought up on a VIF.
57961 + */
57962 +static int
57963 +inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr)
57964 +{
57965 +       struct in_ifaddr  *ifa = (struct in_ifaddr *)ptr;
57966 +       struct net_device *dev = ifa->ifa_dev->dev;
57967 +
57968 +       /* UP event and is it one of our devices? */
57969 +       if (event == NETDEV_UP && dev->open == network_open)
57970 +               (void)send_fake_arp(dev);
57971 +
57972 +       return NOTIFY_DONE;
57973 +}
57974 +
57975 +
57976 +/* ** Close down ** */
57977 +
57978 +
57979 +/**
57980 + * Handle the change of state of the backend to Closing.  We must delete our
57981 + * device-layer structures now, to ensure that writes are flushed through to
57982 + * the backend.  Once this is done, we can switch to Closed in
57983 + * acknowledgement.
57984 + */
57985 +static void netfront_closing(struct xenbus_device *dev)
57986 +{
57987 +       struct netfront_info *info = dev->dev.driver_data;
57988 +
57989 +       DPRINTK("netfront_closing: %s removed\n", dev->nodename);
57990 +
57991 +       close_netdev(info);
57992 +
57993 +       xenbus_switch_state(dev, XenbusStateClosed);
57994 +}
57995 +
57996 +
57997 +static int __devexit netfront_remove(struct xenbus_device *dev)
57998 +{
57999 +       struct netfront_info *info = dev->dev.driver_data;
58000 +
58001 +       DPRINTK("%s\n", dev->nodename);
58002 +
58003 +       netif_disconnect_backend(info);
58004 +       free_netdev(info->netdev);
58005 +
58006 +       return 0;
58007 +}
58008 +
58009 +
58010 +static void close_netdev(struct netfront_info *info)
58011 +{
58012 +       del_timer_sync(&info->rx_refill_timer);
58013 +
58014 +       xennet_sysfs_delif(info->netdev);
58015 +       unregister_netdev(info->netdev);
58016 +}
58017 +
58018 +
58019 +static void netif_disconnect_backend(struct netfront_info *info)
58020 +{
58021 +       /* Stop old i/f to prevent errors whilst we rebuild the state. */
58022 +       spin_lock_irq(&info->tx_lock);
58023 +       spin_lock(&info->rx_lock);
58024 +       netif_carrier_off(info->netdev);
58025 +       spin_unlock(&info->rx_lock);
58026 +       spin_unlock_irq(&info->tx_lock);
58027 +
58028 +       if (info->irq)
58029 +               unbind_from_irqhandler(info->irq, info->netdev);
58030 +       info->evtchn = info->irq = 0;
58031 +
58032 +       end_access(info->tx_ring_ref, info->tx.sring);
58033 +       end_access(info->rx_ring_ref, info->rx.sring);
58034 +       info->tx_ring_ref = GRANT_INVALID_REF;
58035 +       info->rx_ring_ref = GRANT_INVALID_REF;
58036 +       info->tx.sring = NULL;
58037 +       info->rx.sring = NULL;
58038 +}
58039 +
58040 +
58041 +static void netif_free(struct netfront_info *info)
58042 +{
58043 +       close_netdev(info);
58044 +       netif_disconnect_backend(info);
58045 +       free_netdev(info->netdev);
58046 +}
58047 +
58048 +
58049 +static void end_access(int ref, void *page)
58050 +{
58051 +       if (ref != GRANT_INVALID_REF)
58052 +               gnttab_end_foreign_access(ref, 0, (unsigned long)page);
58053 +}
58054 +
58055 +
58056 +/* ** Driver registration ** */
58057 +
58058 +
58059 +static struct xenbus_device_id netfront_ids[] = {
58060 +       { "vif" },
58061 +       { "" }
58062 +};
58063 +
58064 +
58065 +static struct xenbus_driver netfront = {
58066 +       .name = "vif",
58067 +       .owner = THIS_MODULE,
58068 +       .ids = netfront_ids,
58069 +       .probe = netfront_probe,
58070 +       .remove = __devexit_p(netfront_remove),
58071 +       .resume = netfront_resume,
58072 +       .otherend_changed = backend_changed,
58073 +};
58074 +
58075 +
58076 +static struct notifier_block notifier_inetdev = {
58077 +       .notifier_call  = inetdev_notify,
58078 +       .next           = NULL,
58079 +       .priority       = 0
58080 +};
58081 +
58082 +static int __init netif_init(void)
58083 +{
58084 +       if (!is_running_on_xen())
58085 +               return -ENODEV;
58086 +
58087 +       if (xen_start_info->flags & SIF_INITDOMAIN)
58088 +               return 0;
58089 +
58090 +       IPRINTK("Initialising virtual ethernet driver.\n");
58091 +
58092 +       (void)register_inetaddr_notifier(&notifier_inetdev);
58093 +
58094 +       return xenbus_register_frontend(&netfront);
58095 +}
58096 +module_init(netif_init);
58097 +
58098 +
58099 +static void __exit netif_exit(void)
58100 +{
58101 +       unregister_inetaddr_notifier(&notifier_inetdev);
58102 +
58103 +       return xenbus_unregister_driver(&netfront);
58104 +}
58105 +module_exit(netif_exit);
58106 +
58107 +MODULE_LICENSE("Dual BSD/GPL");
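
The recovery loops in network_connect() above rely on the tx_skbs/rx_skbs arrays doubling as a free list of slot indices: a free slot stores the index of the next free slot, an occupied slot stores the sk_buff pointer, and the PAGE_OFFSET comparison tells the two apart.  The standalone sketch below illustrates that encoding; the helper names and the i386-style PAGE_OFFSET value are assumptions for the example, not the exact definitions used elsewhere in this patch.

/* Free-list sketch: slot 0 is the list head, slots 1..RING_SIZE hold either
 * a successor index (free) or a pointer >= PAGE_OFFSET (in use). */
#include <stdio.h>

#define RING_SIZE   8
#define PAGE_OFFSET 0xC0000000UL        /* assumed i386 kernel/user split */

static void *slots[RING_SIZE + 1];

static void freelist_init(void)
{
        int i;

        for (i = 0; i <= RING_SIZE; i++)
                slots[i] = (void *)(unsigned long)(i + 1);
}

static unsigned int get_id_from_freelist(void)
{
        unsigned int id = (unsigned int)(unsigned long)slots[0];

        slots[0] = slots[id];           /* pop the head of the chain */
        return id;
}

static void add_id_to_freelist(unsigned int id)
{
        slots[id] = slots[0];           /* old head becomes our successor */
        slots[0] = (void *)(unsigned long)id;
}

static int slot_holds_skb(unsigned int id)
{
        return (unsigned long)slots[id] >= PAGE_OFFSET;
}

int main(void)
{
        unsigned int id;

        freelist_init();
        id = get_id_from_freelist();
        slots[id] = (void *)(PAGE_OFFSET + 0x1000);     /* pretend skb pointer */
        printf("slot %u holds skb: %d\n", id, slot_holds_skb(id));
        add_id_to_freelist(id);
        return 0;
}
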
58108 diff -urNp linux-2.6/drivers/xen/pciback/conf_space.c new/drivers/xen/pciback/conf_space.c
58109 --- linux-2.6/drivers/xen/pciback/conf_space.c  1970-01-01 01:00:00.000000000 +0100
58110 +++ new/drivers/xen/pciback/conf_space.c        2006-05-09 12:34:38.000000000 +0200
58111 @@ -0,0 +1,403 @@
58112 +/*
58113 + * PCI Backend - Functions for creating a virtual configuration space for
58114 + *               exported PCI Devices.
58115 + *               It's dangerous to allow PCI Driver Domains to change their
58116 + *               device's resources (memory, i/o ports, interrupts). We need to
58117 + *               restrict changes to certain PCI Configuration registers:
58118 + *               BARs, INTERRUPT_PIN, most registers in the header...
58119 + *
58120 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
58121 + */
58122 +
58123 +#include <linux/kernel.h>
58124 +#include <linux/pci.h>
58125 +#include "pciback.h"
58126 +#include "conf_space.h"
58127 +
58128 +static int permissive = 0;
58129 +module_param(permissive, bool, 0644);
58130 +
58131 +#define DEFINE_PCI_CONFIG(op,size,type)                        \
58132 +int pciback_##op##_config_##size                               \
58133 +(struct pci_dev *dev, int offset, type value, void *data)      \
58134 +{                                                              \
58135 +       return pci_##op##_config_##size (dev, offset, value);   \
58136 +}
58137 +
58138 +DEFINE_PCI_CONFIG(read, byte, u8 *)
58139 +DEFINE_PCI_CONFIG(read, word, u16 *)
58140 +DEFINE_PCI_CONFIG(read, dword, u32 *)
58141 +
58142 +DEFINE_PCI_CONFIG(write, byte, u8)
58143 +DEFINE_PCI_CONFIG(write, word, u16)
58144 +DEFINE_PCI_CONFIG(write, dword, u32)
58145 +
58146 +static int conf_space_read(struct pci_dev *dev,
58147 +                          struct config_field_entry *entry, int offset,
58148 +                          u32 * value)
58149 +{
58150 +       int ret = 0;
58151 +       struct config_field *field = entry->field;
58152 +
58153 +       *value = 0;
58154 +
58155 +       switch (field->size) {
58156 +       case 1:
58157 +               if (field->u.b.read)
58158 +                       ret = field->u.b.read(dev, offset, (u8 *) value,
58159 +                                             entry->data);
58160 +               break;
58161 +       case 2:
58162 +               if (field->u.w.read)
58163 +                       ret = field->u.w.read(dev, offset, (u16 *) value,
58164 +                                             entry->data);
58165 +               break;
58166 +       case 4:
58167 +               if (field->u.dw.read)
58168 +                       ret = field->u.dw.read(dev, offset, value, entry->data);
58169 +               break;
58170 +       }
58171 +       return ret;
58172 +}
58173 +
58174 +static int conf_space_write(struct pci_dev *dev,
58175 +                           struct config_field_entry *entry, int offset,
58176 +                           u32 value)
58177 +{
58178 +       int ret = 0;
58179 +       struct config_field *field = entry->field;
58180 +
58181 +       switch (field->size) {
58182 +       case 1:
58183 +               if (field->u.b.write)
58184 +                       ret = field->u.b.write(dev, offset, (u8) value,
58185 +                                              entry->data);
58186 +               break;
58187 +       case 2:
58188 +               if (field->u.w.write)
58189 +                       ret = field->u.w.write(dev, offset, (u16) value,
58190 +                                              entry->data);
58191 +               break;
58192 +       case 4:
58193 +               if (field->u.dw.write)
58194 +                       ret = field->u.dw.write(dev, offset, value,
58195 +                                               entry->data);
58196 +               break;
58197 +       }
58198 +       return ret;
58199 +}
58200 +
58201 +static inline u32 get_mask(int size)
58202 +{
58203 +       if (size == 1)
58204 +               return 0xff;
58205 +       else if (size == 2)
58206 +               return 0xffff;
58207 +       else
58208 +               return 0xffffffff;
58209 +}
58210 +
58211 +static inline int valid_request(int offset, int size)
58212 +{
58213 +       /* Validate request (no un-aligned requests) */
58214 +       if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
58215 +               return 1;
58216 +       return 0;
58217 +}
58218 +
58219 +static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
58220 +                             int offset)
58221 +{
58222 +       if (offset >= 0) {
58223 +               new_val_mask <<= (offset * 8);
58224 +               new_val <<= (offset * 8);
58225 +       } else {
58226 +               new_val_mask >>= (offset * -8);
58227 +               new_val >>= (offset * -8);
58228 +       }
58229 +       val = (val & ~new_val_mask) | (new_val & new_val_mask);
58230 +
58231 +       return val;
58232 +}
58233 +
58234 +static int pcibios_err_to_errno(int err)
58235 +{
58236 +       switch (err) {
58237 +       case PCIBIOS_SUCCESSFUL:
58238 +               return XEN_PCI_ERR_success;
58239 +       case PCIBIOS_DEVICE_NOT_FOUND:
58240 +               return XEN_PCI_ERR_dev_not_found;
58241 +       case PCIBIOS_BAD_REGISTER_NUMBER:
58242 +               return XEN_PCI_ERR_invalid_offset;
58243 +       case PCIBIOS_FUNC_NOT_SUPPORTED:
58244 +               return XEN_PCI_ERR_not_implemented;
58245 +       case PCIBIOS_SET_FAILED:
58246 +               return XEN_PCI_ERR_access_denied;
58247 +       }
58248 +       return err;
58249 +}
58250 +
58251 +int pciback_config_read(struct pci_dev *dev, int offset, int size,
58252 +                       u32 * ret_val)
58253 +{
58254 +       int err = 0;
58255 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
58256 +       struct config_field_entry *cfg_entry;
58257 +       struct config_field *field;
58258 +       int req_start, req_end, field_start, field_end;
58259 +       /* if read fails for any reason, return 0 (as if device didn't respond) */
58260 +       u32 value = 0, tmp_val;
58261 +
58262 +       if (unlikely(verbose_request))
58263 +               printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
58264 +                      pci_name(dev), size, offset);
58265 +
58266 +       if (!valid_request(offset, size)) {
58267 +               err = XEN_PCI_ERR_invalid_offset;
58268 +               goto out;
58269 +       }
58270 +
58271 +       /* Get the real value first, then modify as appropriate */
58272 +       switch (size) {
58273 +       case 1:
58274 +               err = pci_read_config_byte(dev, offset, (u8 *) & value);
58275 +               break;
58276 +       case 2:
58277 +               err = pci_read_config_word(dev, offset, (u16 *) & value);
58278 +               break;
58279 +       case 4:
58280 +               err = pci_read_config_dword(dev, offset, &value);
58281 +               break;
58282 +       }
58283 +
58284 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
58285 +               field = cfg_entry->field;
58286 +
58287 +               req_start = offset;
58288 +               req_end = offset + size;
58289 +               field_start = OFFSET(cfg_entry);
58290 +               field_end = OFFSET(cfg_entry) + field->size;
58291 +
58292 +               if ((req_start >= field_start && req_start < field_end)
58293 +                   || (req_end > field_start && req_end <= field_end)) {
58294 +                       err = conf_space_read(dev, cfg_entry, field_start,
58295 +                                             &tmp_val);
58296 +                       if (err)
58297 +                               goto out;
58298 +
58299 +                       value = merge_value(value, tmp_val,
58300 +                                           get_mask(field->size),
58301 +                                           field_start - req_start);
58302 +               }
58303 +       }
58304 +
58305 +      out:
58306 +       if (unlikely(verbose_request))
58307 +               printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
58308 +                      pci_name(dev), size, offset, value);
58309 +
58310 +       *ret_val = value;
58311 +       return pcibios_err_to_errno(err);
58312 +}
58313 +
58314 +int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
58315 +{
58316 +       int err = 0, handled = 0;
58317 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
58318 +       struct config_field_entry *cfg_entry;
58319 +       struct config_field *field;
58320 +       u32 tmp_val;
58321 +       int req_start, req_end, field_start, field_end;
58322 +
58323 +       if (unlikely(verbose_request))
58324 +               printk(KERN_DEBUG
58325 +                      "pciback: %s: write request %d bytes at 0x%x = %x\n",
58326 +                      pci_name(dev), size, offset, value);
58327 +
58328 +       if (!valid_request(offset, size))
58329 +               return XEN_PCI_ERR_invalid_offset;
58330 +
58331 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
58332 +               field = cfg_entry->field;
58333 +
58334 +               req_start = offset;
58335 +               req_end = offset + size;
58336 +               field_start = OFFSET(cfg_entry);
58337 +               field_end = OFFSET(cfg_entry) + field->size;
58338 +
58339 +               if ((req_start >= field_start && req_start < field_end)
58340 +                   || (req_end > field_start && req_end <= field_end)) {
58341 +                       tmp_val = 0;
58342 +
58343 +                       err = pciback_config_read(dev, field_start,
58344 +                                                 field->size, &tmp_val);
58345 +                       if (err)
58346 +                               break;
58347 +
58348 +                       tmp_val = merge_value(tmp_val, value, get_mask(size),
58349 +                                             req_start - field_start);
58350 +
58351 +                       err = conf_space_write(dev, cfg_entry, field_start,
58352 +                                              tmp_val);
58353 +
58354 +                       /* handled is set true here, but not every byte
58355 +                        * may have been written! Properly detecting if
58356 +                        * every byte is handled is unnecessary as the
58357 +                        * flag is used to detect devices that need
58358 +                        * special helpers to work correctly.
58359 +                        */
58360 +                       handled = 1;
58361 +               }
58362 +       }
58363 +
58364 +       if (!handled && !err) {
58365 +               /* By default, anything not specifically handled above is
58366 +                * read-only. The permissive flag changes this behavior so
58367 +                * that anything not specifically handled above is writable.
58368 +                * This means that some fields may still be read-only because
58369 +                * they have entries in the config_field list that intercept
58370 +                * the write and do nothing. */
58371 +               if (permissive) {
58372 +                       switch (size) {
58373 +                       case 1:
58374 +                               err = pci_write_config_byte(dev, offset,
58375 +                                                           (u8)value);
58376 +                               break;
58377 +                       case 2:
58378 +                               err = pci_write_config_word(dev, offset,
58379 +                                                           (u16)value);
58380 +                               break;
58381 +                       case 4:
58382 +                               err = pci_write_config_dword(dev, offset,
58383 +                                                            (u32)value);
58384 +                               break;
58385 +                       }
58386 +               } else if (!dev_data->warned_on_write) {
58387 +                       dev_data->warned_on_write = 1;
58388 +                       dev_warn(&dev->dev, "Driver wrote to a read-only "
58389 +                                "configuration space field!\n");
58390 +                       dev_warn(&dev->dev, "Write at offset 0x%x size %d\n",
58391 +                               offset, size);
58392 +                       dev_warn(&dev->dev, "This may be harmless, but if\n");
58393 +                       dev_warn(&dev->dev, "you have problems with your "
58394 +                                "device:\n");
58395 +                       dev_warn(&dev->dev, "1) see the permissive "
58396 +                                "attribute in sysfs.\n");
58397 +                       dev_warn(&dev->dev, "2) report problems to the "
58398 +                                "xen-devel mailing list along\n");
58399 +                       dev_warn(&dev->dev, "   with details of your device "
58400 +                                "obtained from lspci.\n");
58401 +               }
58402 +       }
58403 +
58404 +       return pcibios_err_to_errno(err);
58405 +}
58406 +
58407 +void pciback_config_reset_dev(struct pci_dev *dev)
58408 +{
58409 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
58410 +       struct config_field_entry *cfg_entry;
58411 +       struct config_field *field;
58412 +
58413 +       dev_dbg(&dev->dev, "resetting virtual configuration space\n");
58414 +
58415 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
58416 +               field = cfg_entry->field;
58417 +
58418 +               if (field->reset)
58419 +                       field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
58420 +       }
58421 +}
58422 +
58423 +void pciback_config_free_dev(struct pci_dev *dev)
58424 +{
58425 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
58426 +       struct config_field_entry *cfg_entry, *t;
58427 +       struct config_field *field;
58428 +
58429 +       dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n");
58430 +
58431 +       list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
58432 +               list_del(&cfg_entry->list);
58433 +
58434 +               field = cfg_entry->field;
58435 +
58436 +               if (field->release)
58437 +                       field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
58438 +
58439 +               kfree(cfg_entry);
58440 +       }
58441 +}
58442 +
58443 +int pciback_config_add_field_offset(struct pci_dev *dev,
58444 +                                   struct config_field *field,
58445 +                                   unsigned int offset)
58446 +{
58447 +       int err = 0;
58448 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
58449 +       struct config_field_entry *cfg_entry;
58450 +       void *tmp;
58451 +
58452 +       cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
58453 +       if (!cfg_entry) {
58454 +               err = -ENOMEM;
58455 +               goto out;
58456 +       }
58457 +
58458 +       cfg_entry->data = NULL;
58459 +       cfg_entry->field = field;
58460 +       cfg_entry->base_offset = offset;
58461 +
58462 +       if (field->init) {
58463 +               tmp = field->init(dev, OFFSET(cfg_entry));
58464 +
58465 +               if (IS_ERR(tmp)) {
58466 +                       err = PTR_ERR(tmp);
58467 +                       goto out;
58468 +               }
58469 +
58470 +               cfg_entry->data = tmp;
58471 +       }
58472 +
58473 +       dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
58474 +               OFFSET(cfg_entry));
58475 +       list_add_tail(&cfg_entry->list, &dev_data->config_fields);
58476 +
58477 +      out:
58478 +       if (err)
58479 +               kfree(cfg_entry);
58480 +
58481 +       return err;
58482 +}
58483 +
58484 +/* This sets up the device's virtual configuration space to keep track of
58485 + * certain registers (like the base address registers (BARs)) so that we can
58486 + * keep the client from manipulating them directly.
58487 + */
58488 +int pciback_config_init_dev(struct pci_dev *dev)
58489 +{
58490 +       int err = 0;
58491 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
58492 +
58493 +       dev_dbg(&dev->dev, "initializing virtual configuration space\n");
58494 +
58495 +       INIT_LIST_HEAD(&dev_data->config_fields);
58496 +
58497 +       err = pciback_config_header_add_fields(dev);
58498 +       if (err)
58499 +               goto out;
58500 +
58501 +       err = pciback_config_capability_add_fields(dev);
58502 +
58503 +      out:
58504 +       return err;
58505 +}
58506 +
58507 +int pciback_config_init(void)
58508 +{
58509 +       int err;
58510 +
58511 +       err = pciback_config_capability_init();
58512 +
58513 +       return err;
58514 +}
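
The heart of pciback_config_read()/pciback_config_write() above is merge_value(), which folds a narrow (byte or word) guest access into the 32-bit window covering an intercepted field by shifting a mask to the right byte lane.  The standalone sketch below lifts that arithmetic out for illustration; it reuses the same get_mask()/merge_value() logic in plain userspace C.

#include <stdio.h>
#include <stdint.h>

static uint32_t get_mask(int size)
{
        if (size == 1)
                return 0xff;
        else if (size == 2)
                return 0xffff;
        else
                return 0xffffffff;
}

static uint32_t merge_value(uint32_t val, uint32_t new_val,
                            uint32_t new_val_mask, int offset)
{
        if (offset >= 0) {
                new_val_mask <<= (offset * 8);
                new_val <<= (offset * 8);
        } else {
                new_val_mask >>= (offset * -8);
                new_val >>= (offset * -8);
        }
        return (val & ~new_val_mask) | (new_val & new_val_mask);
}

int main(void)
{
        /* A guest writes the single byte 0xAA at offset 2 of a dword field
         * whose current value is 0x11223344. */
        uint32_t field  = 0x11223344;
        uint32_t merged = merge_value(field, 0xAA, get_mask(1), 2);

        printf("0x%08x -> 0x%08x\n", field, merged);    /* prints 0x11AA3344 */
        return 0;
}
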
58515 diff -urNp linux-2.6/drivers/xen/pciback/conf_space_capability.c new/drivers/xen/pciback/conf_space_capability.c
58516 --- linux-2.6/drivers/xen/pciback/conf_space_capability.c       1970-01-01 01:00:00.000000000 +0100
58517 +++ new/drivers/xen/pciback/conf_space_capability.c     2006-05-09 12:34:38.000000000 +0200
58518 @@ -0,0 +1,71 @@
58519 +/*
58520 + * PCI Backend - Handles the virtual fields found on the capability lists
58521 + *               in the configuration space.
58522 + *
58523 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
58524 + */
58525 +
58526 +#include <linux/kernel.h>
58527 +#include <linux/pci.h>
58528 +#include "pciback.h"
58529 +#include "conf_space.h"
58530 +#include "conf_space_capability.h"
58531 +
58532 +static LIST_HEAD(capabilities);
58533 +
58534 +static struct config_field caplist_header[] = {
58535 +       {
58536 +        .offset    = PCI_CAP_LIST_ID,
58537 +        .size      = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
58538 +        .u.w.read  = pciback_read_config_word,
58539 +        .u.w.write = NULL,
58540 +       },
58541 +       {
58542 +        .size = 0,
58543 +       },
58544 +};
58545 +
58546 +static inline void register_capability(struct pciback_config_capability *cap)
58547 +{
58548 +       list_add_tail(&cap->cap_list, &capabilities);
58549 +}
58550 +
58551 +int pciback_config_capability_add_fields(struct pci_dev *dev)
58552 +{
58553 +       int err = 0;
58554 +       struct pciback_config_capability *cap;
58555 +       int cap_offset;
58556 +
58557 +       list_for_each_entry(cap, &capabilities, cap_list) {
58558 +               cap_offset = pci_find_capability(dev, cap->capability);
58559 +               if (cap_offset) {
58560 +                       dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
58561 +                               cap->capability, cap_offset);
58562 +
58563 +                       err = pciback_config_add_fields_offset(dev,
58564 +                                                              caplist_header,
58565 +                                                              cap_offset);
58566 +                       if (err)
58567 +                               goto out;
58568 +                       err = pciback_config_add_fields_offset(dev,
58569 +                                                              cap->fields,
58570 +                                                              cap_offset);
58571 +                       if (err)
58572 +                               goto out;
58573 +               }
58574 +       }
58575 +
58576 +      out:
58577 +       return err;
58578 +}
58579 +
58580 +extern struct pciback_config_capability pciback_config_capability_vpd;
58581 +extern struct pciback_config_capability pciback_config_capability_pm;
58582 +
58583 +int pciback_config_capability_init(void)
58584 +{
58585 +       register_capability(&pciback_config_capability_vpd);
58586 +       register_capability(&pciback_config_capability_pm);
58587 +
58588 +       return 0;
58589 +}
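
A new overlay plugs into the registration scheme above by providing a config_field array (terminated by a size == 0 sentinel) plus a pciback_config_capability entry naming the capability ID to look for.  The sketch below is purely hypothetical: the patch itself only registers the VPD and PM overlays, and both the MSI choice and the read-only policy shown here are assumptions for illustration.

/* Hypothetical MSI capability overlay (not part of this patch). */
#include <linux/pci.h>
#include "conf_space.h"
#include "conf_space_capability.h"

static struct config_field caplist_msi[] = {
        {
         /* Expose the MSI control word read-only: with no .u.w.write hook,
          * guest writes fall through to pciback_config_write()'s default
          * (rejected unless the "permissive" module parameter is set). */
         .offset    = PCI_MSI_FLAGS,
         .size      = 2,
         .u.w.read  = pciback_read_config_word,
         .u.w.write = NULL,
        },
        {
         .size = 0,     /* sentinel: terminates the field array */
        },
};

struct pciback_config_capability pciback_config_capability_msi = {
        .capability = PCI_CAP_ID_MSI,
        .fields = caplist_msi,
};

/* It would then be hooked up from pciback_config_capability_init() with
 *      register_capability(&pciback_config_capability_msi);            */
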
58590 diff -urNp linux-2.6/drivers/xen/pciback/conf_space_capability.h new/drivers/xen/pciback/conf_space_capability.h
58591 --- linux-2.6/drivers/xen/pciback/conf_space_capability.h       1970-01-01 01:00:00.000000000 +0100
58592 +++ new/drivers/xen/pciback/conf_space_capability.h     2006-05-09 12:34:38.000000000 +0200
58593 @@ -0,0 +1,23 @@
58594 +/*
58595 + * PCI Backend - Data structures for special overlays for structures on
58596 + *               the capability list.
58597 + *
58598 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
58599 + */
58600 +
58601 +#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
58602 +#define __PCIBACK_CONFIG_CAPABILITY_H__
58603 +
58604 +#include <linux/pci.h>
58605 +#include <linux/list.h>
58606 +
58607 +struct pciback_config_capability {
58608 +       struct list_head cap_list;
58609 +
58610 +       int capability;
58611 +
58612 +       /* If the device has the capability found above, add these fields */
58613 +       struct config_field *fields;
58614 +};
58615 +
58616 +#endif
58617 diff -urNp linux-2.6/drivers/xen/pciback/conf_space_capability_pm.c new/drivers/xen/pciback/conf_space_capability_pm.c
58618 --- linux-2.6/drivers/xen/pciback/conf_space_capability_pm.c    1970-01-01 01:00:00.000000000 +0100
58619 +++ new/drivers/xen/pciback/conf_space_capability_pm.c  2006-05-09 12:34:38.000000000 +0200
58620 @@ -0,0 +1,113 @@
58621 +/*
58622 + * PCI Backend - Configuration space overlay for power management
58623 + *
58624 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
58625 + */
58626 +
58627 +#include <linux/pci.h>
58628 +#include "conf_space.h"
58629 +#include "conf_space_capability.h"
58630 +
58631 +static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
58632 +                       void *data)
58633 +{
58634 +       int err;
58635 +       u16 real_value;
58636 +
58637 +       err = pci_read_config_word(dev, offset, &real_value);
58638 +       if (err)
58639 +               goto out;
58640 +
58641 +       *value = real_value & ~PCI_PM_CAP_PME_MASK;
58642 +
58643 +      out:
58644 +       return err;
58645 +}
58646 +
58647 +/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
58648 + * Can't allow driver domain to enable PMEs - they're shared */
58649 +#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
58650 +
58651 +static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
58652 +                        void *data)
58653 +{
58654 +       int err;
58655 +       u16 cur_value;
58656 +       pci_power_t new_state;
58657 +
58658 +       /* Handle setting power state separately */
58659 +       new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
58660 +
58661 +       err = pci_read_config_word(dev, offset, &cur_value);
58662 +       if (err)
58663 +               goto out;
58664 +
58665 +       new_value &= PM_OK_BITS;
58666 +       if ((cur_value & PM_OK_BITS) != new_value) {
58667 +               new_value = (cur_value & ~PM_OK_BITS) | new_value;
58668 +               err = pci_write_config_word(dev, offset, new_value);
58669 +               if (err)
58670 +                       goto out;
58671 +       }
58672 +
58673 +       /* Let pci core handle the power management change */
58674 +       dev_dbg(&dev->dev, "set power state to %x\n", new_state);
58675 +       err = pci_set_power_state(dev, new_state);
58676 +       if (err)
58677 +               err = PCIBIOS_SET_FAILED;
58678 +
58679 +      out:
58680 +       return err;
58681 +}
58682 +
58683 +/* Ensure PMEs are disabled */
58684 +static void *pm_ctrl_init(struct pci_dev *dev, int offset)
58685 +{
58686 +       int err;
58687 +       u16 value;
58688 +
58689 +       err = pci_read_config_word(dev, offset, &value);
58690 +       if (err)
58691 +               goto out;
58692 +
58693 +       if (value & PCI_PM_CTRL_PME_ENABLE) {
58694 +               value &= ~PCI_PM_CTRL_PME_ENABLE;
58695 +               err = pci_write_config_word(dev, offset, value);
58696 +       }
58697 +
58698 +      out:
58699 +       return ERR_PTR(err);
58700 +}
58701 +
58702 +static struct config_field caplist_pm[] = {
58703 +       {
58704 +               .offset     = PCI_PM_PMC,
58705 +               .size       = 2,
58706 +               .u.w.read   = pm_caps_read,
58707 +       },
58708 +       {
58709 +               .offset     = PCI_PM_CTRL,
58710 +               .size       = 2,
58711 +               .init       = pm_ctrl_init,
58712 +               .u.w.read   = pciback_read_config_word,
58713 +               .u.w.write  = pm_ctrl_write,
58714 +       },
58715 +       {
58716 +               .offset     = PCI_PM_PPB_EXTENSIONS,
58717 +               .size       = 1,
58718 +               .u.b.read   = pciback_read_config_byte,
58719 +       },
58720 +       {
58721 +               .offset     = PCI_PM_DATA_REGISTER,
58722 +               .size       = 1,
58723 +               .u.b.read   = pciback_read_config_byte,
58724 +       },
58725 +       {
58726 +               .size = 0,
58727 +       },
58728 +};
58729 +
58730 +struct pciback_config_capability pciback_config_capability_pm = {
58731 +       .capability = PCI_CAP_ID_PM,
58732 +       .fields = caplist_pm,
58733 +};
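
The effect of PM_OK_BITS in pm_ctrl_write() above is easiest to see in isolation: only the PME-status (write-one-to-clear) and data-select bits from the guest reach the register, PME enable is silently dropped, and the power-state field is handled separately through pci_set_power_state().  A standalone sketch of that filtering, using the standard PCI power-management bit values:

#include <stdio.h>
#include <stdint.h>

#define PCI_PM_CTRL_PME_STATUS          0x8000
#define PCI_PM_CTRL_DATA_SEL_MASK       0x1e00
#define PCI_PM_CTRL_PME_ENABLE          0x0100
#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_DATA_SEL_MASK)

static uint16_t filter_pm_ctrl_write(uint16_t cur_value, uint16_t new_value)
{
        new_value &= PM_OK_BITS;
        return (uint16_t)((cur_value & ~PM_OK_BITS) | new_value);
}

int main(void)
{
        /* The guest tries to set PME_ENABLE and clear PME_STATUS at once. */
        uint16_t cur = 0x0003;          /* device currently in D3hot */
        uint16_t req = PCI_PM_CTRL_PME_ENABLE | PCI_PM_CTRL_PME_STATUS;
        uint16_t out = filter_pm_ctrl_write(cur, req);

        printf("0x%04x requested -> 0x%04x written\n", req, out);
        /* PME_ENABLE is dropped; only the status write-1-to-clear survives. */
        return 0;
}
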
58734 diff -urNp linux-2.6/drivers/xen/pciback/conf_space_capability_vpd.c new/drivers/xen/pciback/conf_space_capability_vpd.c
58735 --- linux-2.6/drivers/xen/pciback/conf_space_capability_vpd.c   1970-01-01 01:00:00.000000000 +0100
58736 +++ new/drivers/xen/pciback/conf_space_capability_vpd.c 2006-05-09 12:34:38.000000000 +0200
58737 @@ -0,0 +1,42 @@
58738 +/*
58739 + * PCI Backend - Configuration space overlay for Vital Product Data
58740 + *
58741 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
58742 + */
58743 +
58744 +#include <linux/pci.h>
58745 +#include "conf_space.h"
58746 +#include "conf_space_capability.h"
58747 +
58748 +static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
58749 +                            void *data)
58750 +{
58751 +       /* Disallow writes to the vital product data */
58752 +       if (value & PCI_VPD_ADDR_F)
58753 +               return PCIBIOS_SET_FAILED;
58754 +       else
58755 +               return pci_write_config_word(dev, offset, value);
58756 +}
58757 +
58758 +static struct config_field caplist_vpd[] = {
58759 +       {
58760 +        .offset    = PCI_VPD_ADDR,
58761 +        .size      = 2,
58762 +        .u.w.read  = pciback_read_config_word,
58763 +        .u.w.write = vpd_address_write,
58764 +        },
58765 +       {
58766 +        .offset     = PCI_VPD_DATA,
58767 +        .size       = 4,
58768 +        .u.dw.read  = pciback_read_config_dword,
58769 +        .u.dw.write = NULL,
58770 +        },
58771 +       {
58772 +        .size = 0,
58773 +        },
58774 +};
58775 +
58776 +struct pciback_config_capability pciback_config_capability_vpd = {
58777 +       .capability = PCI_CAP_ID_VPD,
58778 +       .fields = caplist_vpd,
58779 +};
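
The VPD overlay above still permits the normal read handshake through the address/data register pair; only write requests (address writes with PCI_VPD_ADDR_F set) are refused with PCIBIOS_SET_FAILED.  The guest-side sketch below shows that read sequence; the helper name, timeout and error handling are illustrative assumptions, not code taken from this patch.

#include <linux/pci.h>
#include <linux/delay.h>
#include <linux/errno.h>

static int vpd_read_dword(struct pci_dev *dev, int cap, u16 addr, u32 *val)
{
        u16 status;
        int timeout = 50;

        /* F bit clear = read request; pciback's vpd_address_write() lets
         * this reach the hardware. */
        pci_write_config_word(dev, cap + PCI_VPD_ADDR, addr & ~PCI_VPD_ADDR_F);

        /* The device sets F once the data register holds the result. */
        do {
                pci_read_config_word(dev, cap + PCI_VPD_ADDR, &status);
                if (status & PCI_VPD_ADDR_F)
                        return pci_read_config_dword(dev, cap + PCI_VPD_DATA,
                                                     val);
                udelay(10);
        } while (--timeout);

        return -ETIMEDOUT;
}
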
58780 diff -urNp linux-2.6/drivers/xen/pciback/conf_space.h new/drivers/xen/pciback/conf_space.h
58781 --- linux-2.6/drivers/xen/pciback/conf_space.h  1970-01-01 01:00:00.000000000 +0100
58782 +++ new/drivers/xen/pciback/conf_space.h        2006-05-09 12:34:38.000000000 +0200
58783 @@ -0,0 +1,123 @@
58784 +/*
58785 + * PCI Backend - Common data structures for overriding the configuration space
58786 + *
58787 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
58788 + */
58789 +
58790 +#ifndef __XEN_PCIBACK_CONF_SPACE_H__
58791 +#define __XEN_PCIBACK_CONF_SPACE_H__
58792 +
58793 +#include <linux/list.h>
58794 +#include <linux/err.h>
58795 +
58796 +/* conf_field_init can return an errno in a ptr with ERR_PTR() */
58797 +typedef void *(*conf_field_init) (struct pci_dev * dev, int offset);
58798 +typedef void (*conf_field_reset) (struct pci_dev * dev, int offset, void *data);
58799 +typedef void (*conf_field_free) (struct pci_dev * dev, int offset, void *data);
58800 +
58801 +typedef int (*conf_dword_write) (struct pci_dev * dev, int offset, u32 value,
58802 +                                void *data);
58803 +typedef int (*conf_word_write) (struct pci_dev * dev, int offset, u16 value,
58804 +                               void *data);
58805 +typedef int (*conf_byte_write) (struct pci_dev * dev, int offset, u8 value,
58806 +                               void *data);
58807 +typedef int (*conf_dword_read) (struct pci_dev * dev, int offset, u32 * value,
58808 +                               void *data);
58809 +typedef int (*conf_word_read) (struct pci_dev * dev, int offset, u16 * value,
58810 +                              void *data);
58811 +typedef int (*conf_byte_read) (struct pci_dev * dev, int offset, u8 * value,
58812 +                              void *data);
58813 +
58814 +/* These are the fields within the configuration space which we
58815 + * are interested in intercepting reads/writes to and changing their
58816 + * values.
58817 + */
58818 +struct config_field {
58819 +       unsigned int     offset;
58820 +       unsigned int     size;
58821 +       conf_field_init  init;
58822 +       conf_field_reset reset;
58823 +       conf_field_free  release;
58824 +       union {
58825 +               struct {
58826 +                       conf_dword_write write;
58827 +                       conf_dword_read read;
58828 +               } dw;
58829 +               struct {
58830 +                       conf_word_write write;
58831 +                       conf_word_read read;
58832 +               } w;
58833 +               struct {
58834 +                       conf_byte_write write;
58835 +                       conf_byte_read read;
58836 +               } b;
58837 +       } u;
58838 +};
58839 +
58840 +struct config_field_entry {
58841 +       struct list_head list;
58842 +       struct config_field *field;
58843 +       unsigned int base_offset;
58844 +       void *data;
58845 +};
58846 +
58847 +#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
58848 +
58849 +/* Add fields to a device - the add_fields macro expects to get a pointer to
58850 + * the first entry in an array (of which the ending is marked by size==0)
58851 + */
58852 +int pciback_config_add_field_offset(struct pci_dev *dev,
58853 +                                   struct config_field *field,
58854 +                                   unsigned int offset);
58855 +
58856 +static inline int pciback_config_add_field(struct pci_dev *dev,
58857 +                                          struct config_field *field)
58858 +{
58859 +       return pciback_config_add_field_offset(dev, field, 0);
58860 +}
58861 +
58862 +static inline int pciback_config_add_fields(struct pci_dev *dev,
58863 +                                           struct config_field *field)
58864 +{
58865 +       int i, err = 0;
58866 +       for (i = 0; field[i].size != 0; i++) {
58867 +               err = pciback_config_add_field(dev, &field[i]);
58868 +               if (err)
58869 +                       break;
58870 +       }
58871 +       return err;
58872 +}
58873 +
58874 +static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
58875 +                                                  struct config_field *field,
58876 +                                                  unsigned int offset)
58877 +{
58878 +       int i, err = 0;
58879 +       for (i = 0; field[i].size != 0; i++) {
58880 +               err = pciback_config_add_field_offset(dev, &field[i], offset);
58881 +               if (err)
58882 +                       break;
58883 +       }
58884 +       return err;
58885 +}
58886 +
58887 +/* Read/Write the real configuration space */
58888 +int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 * value,
58889 +                            void *data);
58890 +int pciback_read_config_word(struct pci_dev *dev, int offset, u16 * value,
58891 +                            void *data);
58892 +int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 * value,
58893 +                             void *data);
58894 +int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
58895 +                             void *data);
58896 +int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
58897 +                             void *data);
58898 +int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
58899 +                              void *data);
58900 +
58901 +int pciback_config_capability_init(void);
58902 +
58903 +int pciback_config_header_add_fields(struct pci_dev *dev);
58904 +int pciback_config_capability_add_fields(struct pci_dev *dev);
58905 +
58906 +#endif                         /* __XEN_PCIBACK_CONF_SPACE_H__ */
58907 diff -urNp linux-2.6/drivers/xen/pciback/conf_space_header.c new/drivers/xen/pciback/conf_space_header.c
58908 --- linux-2.6/drivers/xen/pciback/conf_space_header.c   1970-01-01 01:00:00.000000000 +0100
58909 +++ new/drivers/xen/pciback/conf_space_header.c 2006-05-09 12:34:38.000000000 +0200
58910 @@ -0,0 +1,299 @@
58911 +/*
58912 + * PCI Backend - Handles the virtual fields in the configuration space headers.
58913 + *
58914 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
58915 + */
58916 +
58917 +#include <linux/kernel.h>
58918 +#include <linux/pci.h>
58919 +#include "pciback.h"
58920 +#include "conf_space.h"
58921 +
58922 +struct pci_bar_info {
58923 +       u32 val;
58924 +       u32 len_val;
58925 +       int which;
58926 +};
58927 +
58928 +#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
58929 +#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
58930 +
58931 +static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
58932 +{
58933 +       if (!dev->is_enabled && is_enable_cmd(value)) {
58934 +               if (unlikely(verbose_request))
58935 +                       printk(KERN_DEBUG "pciback: %s: enable\n",
58936 +                              pci_name(dev));
58937 +               pci_enable_device(dev);
58938 +       } else if (dev->is_enabled && !is_enable_cmd(value)) {
58939 +               if (unlikely(verbose_request))
58940 +                       printk(KERN_DEBUG "pciback: %s: disable\n",
58941 +                              pci_name(dev));
58942 +               pci_disable_device(dev);
58943 +       }
58944 +
58945 +       if (!dev->is_busmaster && is_master_cmd(value)) {
58946 +               if (unlikely(verbose_request))
58947 +                       printk(KERN_DEBUG "pciback: %s: set bus master\n",
58948 +                              pci_name(dev));
58949 +               pci_set_master(dev);
58950 +       }
58951 +
58952 +       if (value & PCI_COMMAND_INVALIDATE) {
58953 +               if (unlikely(verbose_request))
58954 +                       printk(KERN_DEBUG
58955 +                              "pciback: %s: enable memory-write-invalidate\n",
58956 +                              pci_name(dev));
58957 +               pci_set_mwi(dev);
58958 +       }
58959 +
58960 +       return pci_write_config_word(dev, offset, value);
58961 +}
58962 +
58963 +static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
58964 +{
58965 +       struct pci_bar_info *bar = data;
58966 +
58967 +       if (unlikely(!bar)) {
58968 +               printk(KERN_WARNING "pciback: driver data not found for %s\n",
58969 +                      pci_name(dev));
58970 +               return XEN_PCI_ERR_op_failed;
58971 +       }
58972 +
58973 +       /* A write to obtain the length must happen as a 32-bit write.
58974 +        * This does not (yet) support writing individual bytes
58975 +        */
58976 +       if (value == ~PCI_ROM_ADDRESS_ENABLE)
58977 +               bar->which = 1;
58978 +       else
58979 +               bar->which = 0;
58980 +
58981 +       /* Do we need to support enabling/disabling the rom address here? */
58982 +
58983 +       return 0;
58984 +}
58985 +
58986 +/* For the BARs, only allow writes which write ~0 or
58987 + * the correct resource information
58988 + * (Needed for when the driver probes the resource usage)
58989 + */
58990 +static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
58991 +{
58992 +       struct pci_bar_info *bar = data;
58993 +
58994 +       if (unlikely(!bar)) {
58995 +               printk(KERN_WARNING "pciback: driver data not found for %s\n",
58996 +                      pci_name(dev));
58997 +               return XEN_PCI_ERR_op_failed;
58998 +       }
58999 +
59000 +       /* A write to obtain the length must happen as a 32-bit write.
59001 +        * This does not (yet) support writing individual bytes
59002 +        */
59003 +       if (value == ~0)
59004 +               bar->which = 1;
59005 +       else
59006 +               bar->which = 0;
59007 +
59008 +       return 0;
59009 +}
59010 +
59011 +static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
59012 +{
59013 +       struct pci_bar_info *bar = data;
59014 +
59015 +       if (unlikely(!bar)) {
59016 +               printk(KERN_WARNING "pciback: driver data not found for %s\n",
59017 +                      pci_name(dev));
59018 +               return XEN_PCI_ERR_op_failed;
59019 +       }
59020 +
59021 +       *value = bar->which ? bar->len_val : bar->val;
59022 +
59023 +       return 0;
59024 +}
59025 +
59026 +static inline void read_dev_bar(struct pci_dev *dev,
59027 +                               struct pci_bar_info *bar_info, int offset,
59028 +                               u32 len_mask)
59029 +{
59030 +       pci_read_config_dword(dev, offset, &bar_info->val);
59031 +       pci_write_config_dword(dev, offset, len_mask);
59032 +       pci_read_config_dword(dev, offset, &bar_info->len_val);
59033 +       pci_write_config_dword(dev, offset, bar_info->val);
59034 +}
59035 +
59036 +static void *bar_init(struct pci_dev *dev, int offset)
59037 +{
59038 +       struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
59039 +
59040 +       if (!bar)
59041 +               return ERR_PTR(-ENOMEM);
59042 +
59043 +       read_dev_bar(dev, bar, offset, ~0);
59044 +       bar->which = 0;
59045 +
59046 +       return bar;
59047 +}
59048 +
59049 +static void *rom_init(struct pci_dev *dev, int offset)
59050 +{
59051 +       struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
59052 +
59053 +       if (!bar)
59054 +               return ERR_PTR(-ENOMEM);
59055 +
59056 +       read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
59057 +       bar->which = 0;
59058 +
59059 +       return bar;
59060 +}
59061 +
59062 +static void bar_reset(struct pci_dev *dev, int offset, void *data)
59063 +{
59064 +       struct pci_bar_info *bar = data;
59065 +
59066 +       bar->which = 0;
59067 +}
59068 +
59069 +static void bar_release(struct pci_dev *dev, int offset, void *data)
59070 +{
59071 +       kfree(data);
59072 +}
59073 +
59074 +static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
59075 +                         void *data)
59076 +{
59077 +       *value = (u8) dev->irq;
59078 +
59079 +       return 0;
59080 +}
59081 +
59082 +static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
59083 +{
59084 +       u8 cur_value;
59085 +       int err;
59086 +
59087 +       err = pci_read_config_byte(dev, offset, &cur_value);
59088 +       if (err)
59089 +               goto out;
59090 +
59091 +       if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
59092 +           || value == PCI_BIST_START)
59093 +               err = pci_write_config_byte(dev, offset, value);
59094 +
59095 +      out:
59096 +       return err;
59097 +}
59098 +
59099 +static struct config_field header_common[] = {
59100 +       {
59101 +        .offset    = PCI_COMMAND,
59102 +        .size      = 2,
59103 +        .u.w.read  = pciback_read_config_word,
59104 +        .u.w.write = command_write,
59105 +       },
59106 +       {
59107 +        .offset    = PCI_INTERRUPT_LINE,
59108 +        .size      = 1,
59109 +        .u.b.read  = interrupt_read,
59110 +       },
59111 +       {
59112 +        .offset    = PCI_INTERRUPT_PIN,
59113 +        .size      = 1,
59114 +        .u.b.read  = pciback_read_config_byte,
59115 +       },
59116 +       {
59117 +        /* Any side effects of letting driver domain control cache line? */
59118 +        .offset    = PCI_CACHE_LINE_SIZE,
59119 +        .size      = 1,
59120 +        .u.b.read  = pciback_read_config_byte,
59121 +        .u.b.write = pciback_write_config_byte,
59122 +       },
59123 +       {
59124 +        .offset    = PCI_LATENCY_TIMER,
59125 +        .size      = 1,
59126 +        .u.b.read  = pciback_read_config_byte,
59127 +       },
59128 +       {
59129 +        .offset    = PCI_BIST,
59130 +        .size      = 1,
59131 +        .u.b.read  = pciback_read_config_byte,
59132 +        .u.b.write = bist_write,
59133 +       },
59134 +       {
59135 +        .size = 0,
59136 +       },
59137 +};
59138 +
59139 +#define CFG_FIELD_BAR(reg_offset)                      \
59140 +       {                                               \
59141 +        .offset     = reg_offset,                      \
59142 +        .size       = 4,                               \
59143 +        .init       = bar_init,                        \
59144 +        .reset      = bar_reset,                       \
59145 +        .release    = bar_release,                     \
59146 +        .u.dw.read  = bar_read,                        \
59147 +        .u.dw.write = bar_write,                       \
59148 +        }
59149 +
59150 +#define CFG_FIELD_ROM(reg_offset)                      \
59151 +       {                                               \
59152 +        .offset     = reg_offset,                      \
59153 +        .size       = 4,                               \
59154 +        .init       = rom_init,                        \
59155 +        .reset      = bar_reset,                       \
59156 +        .release    = bar_release,                     \
59157 +        .u.dw.read  = bar_read,                        \
59158 +        .u.dw.write = rom_write,                       \
59159 +        }
59160 +
59161 +static struct config_field header_0[] = {
59162 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
59163 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
59164 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
59165 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
59166 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
59167 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
59168 +       CFG_FIELD_ROM(PCI_ROM_ADDRESS),
59169 +       {
59170 +        .size = 0,
59171 +       },
59172 +};
59173 +
59174 +static struct config_field header_1[] = {
59175 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
59176 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
59177 +       CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
59178 +       {
59179 +        .size = 0,
59180 +       },
59181 +};
59182 +
59183 +int pciback_config_header_add_fields(struct pci_dev *dev)
59184 +{
59185 +       int err;
59186 +
59187 +       err = pciback_config_add_fields(dev, header_common);
59188 +       if (err)
59189 +               goto out;
59190 +
59191 +       switch (dev->hdr_type) {
59192 +       case PCI_HEADER_TYPE_NORMAL:
59193 +               err = pciback_config_add_fields(dev, header_0);
59194 +               break;
59195 +
59196 +       case PCI_HEADER_TYPE_BRIDGE:
59197 +               err = pciback_config_add_fields(dev, header_1);
59198 +               break;
59199 +
59200 +       default:
59201 +               err = -EINVAL;
59202 +               printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
59203 +                      pci_name(dev), dev->hdr_type);
59204 +               break;
59205 +       }
59206 +
59207 +      out:
59208 +       return err;
59209 +}
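
The header_common, header_0 and header_1 tables above drive pciback's generic virtual config-space layer: each config_field names a register offset, an access size and size-specific read/write hooks, and an entry with .size = 0 terminates the table. A rough sketch of the lookup such a table implies (the struct and helper below are illustrative stand-ins, not the real conf_space.c types):

/* Illustrative only: locate the table entry that covers 'offset' so the
 * caller can dispatch to its size-specific handler. */
struct my_field {
	unsigned int offset;
	unsigned int size;	/* 1, 2 or 4; 0 ends the table */
};

static const struct my_field *my_find_field(const struct my_field *table,
					    unsigned int offset)
{
	const struct my_field *field;

	for (field = table; field->size != 0; field++) {
		if (offset >= field->offset &&
		    offset < field->offset + field->size)
			return field;
	}
	return NULL;	/* offset is not virtualized by this table */
}

An access that misses every entry is then left to the generic conf_space code to pass through to the real device or refuse.
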
59210 diff -urNp linux-2.6/drivers/xen/pciback/Makefile new/drivers/xen/pciback/Makefile
59211 --- linux-2.6/drivers/xen/pciback/Makefile      1970-01-01 01:00:00.000000000 +0100
59212 +++ new/drivers/xen/pciback/Makefile    2006-05-09 12:34:38.000000000 +0200
59213 @@ -0,0 +1,13 @@
59214 +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback.o
59215 +
59216 +pciback-y := pci_stub.o pciback_ops.o xenbus.o
59217 +pciback-y += conf_space.o conf_space_header.o \
59218 +            conf_space_capability.o \
59219 +            conf_space_capability_vpd.o \
59220 +            conf_space_capability_pm.o
59221 +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
59222 +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
59223 +
59224 +ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
59225 +EXTRA_CFLAGS += -DDEBUG
59226 +endif
59227 diff -urNp linux-2.6/drivers/xen/pciback/passthrough.c new/drivers/xen/pciback/passthrough.c
59228 --- linux-2.6/drivers/xen/pciback/passthrough.c 1970-01-01 01:00:00.000000000 +0100
59229 +++ new/drivers/xen/pciback/passthrough.c       2006-05-09 12:34:38.000000000 +0200
59230 @@ -0,0 +1,157 @@
59231 +/*
59232 + * PCI Backend - Provides restricted access to the real PCI bus topology
59233 + *               to the frontend
59234 + *
59235 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
59236 + */
59237 +
59238 +#include <linux/list.h>
59239 +#include <linux/pci.h>
59240 +#include <linux/spinlock.h>
59241 +#include "pciback.h"
59242 +
59243 +struct passthrough_dev_data {
59244 +       /* Access to dev_list must be protected by lock */
59245 +       struct list_head dev_list;
59246 +       spinlock_t lock;
59247 +};
59248 +
59249 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
59250 +                                   unsigned int domain, unsigned int bus,
59251 +                                   unsigned int devfn)
59252 +{
59253 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
59254 +       struct pci_dev_entry *dev_entry;
59255 +       struct pci_dev *dev = NULL;
59256 +       unsigned long flags;
59257 +
59258 +       spin_lock_irqsave(&dev_data->lock, flags);
59259 +
59260 +       list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
59261 +               if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
59262 +                   && bus == (unsigned int)dev_entry->dev->bus->number
59263 +                   && devfn == dev_entry->dev->devfn) {
59264 +                       dev = dev_entry->dev;
59265 +                       break;
59266 +               }
59267 +       }
59268 +
59269 +       spin_unlock_irqrestore(&dev_data->lock, flags);
59270 +
59271 +       return dev;
59272 +}
59273 +
59274 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
59275 +{
59276 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
59277 +       struct pci_dev_entry *dev_entry;
59278 +       unsigned long flags;
59279 +
59280 +       dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
59281 +       if (!dev_entry)
59282 +               return -ENOMEM;
59283 +       dev_entry->dev = dev;
59284 +
59285 +       spin_lock_irqsave(&dev_data->lock, flags);
59286 +       list_add_tail(&dev_entry->list, &dev_data->dev_list);
59287 +       spin_unlock_irqrestore(&dev_data->lock, flags);
59288 +
59289 +       return 0;
59290 +}
59291 +
59292 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
59293 +{
59294 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
59295 +       struct pci_dev_entry *dev_entry, *t;
59296 +       struct pci_dev *found_dev = NULL;
59297 +       unsigned long flags;
59298 +
59299 +       spin_lock_irqsave(&dev_data->lock, flags);
59300 +
59301 +       list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
59302 +               if (dev_entry->dev == dev) {
59303 +                       list_del(&dev_entry->list);
59304 +                       found_dev = dev_entry->dev;
59305 +                       kfree(dev_entry);
59306 +               }
59307 +       }
59308 +
59309 +       spin_unlock_irqrestore(&dev_data->lock, flags);
59310 +
59311 +       if (found_dev)
59312 +               pcistub_put_pci_dev(found_dev);
59313 +}
59314 +
59315 +int pciback_init_devices(struct pciback_device *pdev)
59316 +{
59317 +       struct passthrough_dev_data *dev_data;
59318 +
59319 +       dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
59320 +       if (!dev_data)
59321 +               return -ENOMEM;
59322 +
59323 +       spin_lock_init(&dev_data->lock);
59324 +
59325 +       INIT_LIST_HEAD(&dev_data->dev_list);
59326 +
59327 +       pdev->pci_dev_data = dev_data;
59328 +
59329 +       return 0;
59330 +}
59331 +
59332 +int pciback_publish_pci_roots(struct pciback_device *pdev,
59333 +                             publish_pci_root_cb publish_root_cb)
59334 +{
59335 +       int err = 0;
59336 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
59337 +       struct pci_dev_entry *dev_entry, *e;
59338 +       struct pci_dev *dev;
59339 +       int found;
59340 +       unsigned int domain, bus;
59341 +
59342 +       spin_lock(&dev_data->lock);
59343 +
59344 +       list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
59345 +               /* Only publish this device as a root if none of its
59346 +                * parent bridges are exported
59347 +                */
59348 +               found = 0;
59349 +               dev = dev_entry->dev->bus->self;
59350 +               for (; !found && dev != NULL; dev = dev->bus->self) {
59351 +                       list_for_each_entry(e, &dev_data->dev_list, list) {
59352 +                               if (dev == e->dev) {
59353 +                                       found = 1;
59354 +                                       break;
59355 +                               }
59356 +                       }
59357 +               }
59358 +
59359 +               domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
59360 +               bus = (unsigned int)dev_entry->dev->bus->number;
59361 +
59362 +               if (!found) {
59363 +                       err = publish_root_cb(pdev, domain, bus);
59364 +                       if (err)
59365 +                               break;
59366 +               }
59367 +       }
59368 +
59369 +       spin_unlock(&dev_data->lock);
59370 +
59371 +       return err;
59372 +}
59373 +
59374 +void pciback_release_devices(struct pciback_device *pdev)
59375 +{
59376 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
59377 +       struct pci_dev_entry *dev_entry, *t;
59378 +
59379 +       list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
59380 +               list_del(&dev_entry->list);
59381 +               pcistub_put_pci_dev(dev_entry->dev);
59382 +               kfree(dev_entry);
59383 +       }
59384 +
59385 +       kfree(dev_data);
59386 +       pdev->pci_dev_data = NULL;
59387 +}
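
In the passthrough backend above, pciback_publish_pci_roots() reports a device's (domain, bus) pair to the frontend only when none of the device's parent bridges is itself on the exported list, so each exported subtree is announced exactly once through its topmost exported device. A standalone sketch of that ancestry test, reusing the pci_dev_entry list shape (the helper name is made up for illustration):

/* Sketch: return 1 if any parent bridge of 'dev' is already on 'exported'. */
static int my_ancestor_exported(struct pci_dev *dev, struct list_head *exported)
{
	struct pci_dev_entry *e;
	struct pci_dev *parent;

	for (parent = dev->bus->self; parent != NULL; parent = parent->bus->self) {
		list_for_each_entry(e, exported, list)
			if (e->dev == parent)
				return 1;	/* covered by an exported bridge */
	}
	return 0;
}
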
59388 diff -urNp linux-2.6/drivers/xen/pciback/pciback.h new/drivers/xen/pciback/pciback.h
59389 --- linux-2.6/drivers/xen/pciback/pciback.h     1970-01-01 01:00:00.000000000 +0100
59390 +++ new/drivers/xen/pciback/pciback.h   2006-05-09 12:34:38.000000000 +0200
59391 @@ -0,0 +1,91 @@
59392 +/*
59393 + * PCI Backend Common Data Structures & Function Declarations
59394 + *
59395 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
59396 + */
59397 +#ifndef __XEN_PCIBACK_H__
59398 +#define __XEN_PCIBACK_H__
59399 +
59400 +#include <linux/pci.h>
59401 +#include <linux/interrupt.h>
59402 +#include <xen/xenbus.h>
59403 +#include <linux/list.h>
59404 +#include <linux/spinlock.h>
59405 +#include <linux/workqueue.h>
59406 +#include <asm/atomic.h>
59407 +#include <xen/interface/io/pciif.h>
59408 +
59409 +struct pci_dev_entry {
59410 +       struct list_head list;
59411 +       struct pci_dev *dev;
59412 +};
59413 +
59414 +#define _PDEVF_op_active       (0)
59415 +#define PDEVF_op_active        (1<<(_PDEVF_op_active))
59416 +
59417 +struct pciback_device {
59418 +       void *pci_dev_data;
59419 +       spinlock_t dev_lock;
59420 +
59421 +       struct xenbus_device *xdev;
59422 +
59423 +       struct xenbus_watch be_watch;
59424 +       u8 be_watching;
59425 +
59426 +       int evtchn_irq;
59427 +
59428 +       struct vm_struct *sh_area;
59429 +       struct xen_pci_sharedinfo *sh_info;
59430 +
59431 +       unsigned long flags;
59432 +
59433 +       struct work_struct op_work;
59434 +};
59435 +
59436 +struct pciback_dev_data {
59437 +       struct list_head config_fields;
59438 +       int warned_on_write;
59439 +};
59440 +
59441 +/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
59442 +struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
59443 +                                           int domain, int bus,
59444 +                                           int slot, int func);
59445 +struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
59446 +                                   struct pci_dev *dev);
59447 +void pcistub_put_pci_dev(struct pci_dev *dev);
59448 +
59449 +/* Ensure a device is turned off or reset */
59450 +void pciback_reset_device(struct pci_dev *pdev);
59451 +
59452 +/* Access a virtual configuration space for a PCI device */
59453 +int pciback_config_init(void);
59454 +int pciback_config_init_dev(struct pci_dev *dev);
59455 +void pciback_config_reset_dev(struct pci_dev *dev);
59456 +void pciback_config_free_dev(struct pci_dev *dev);
59457 +int pciback_config_read(struct pci_dev *dev, int offset, int size,
59458 +                       u32 * ret_val);
59459 +int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
59460 +
59461 +/* Handle requests for specific devices from the frontend */
59462 +typedef int (*publish_pci_root_cb) (struct pciback_device * pdev,
59463 +                                   unsigned int domain, unsigned int bus);
59464 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
59465 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
59466 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
59467 +                                   unsigned int domain, unsigned int bus,
59468 +                                   unsigned int devfn);
59469 +int pciback_init_devices(struct pciback_device *pdev);
59470 +int pciback_publish_pci_roots(struct pciback_device *pdev,
59471 +                             publish_pci_root_cb cb);
59472 +void pciback_release_devices(struct pciback_device *pdev);
59473 +
59474 +/* Handles events from front-end */
59475 +irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs);
59476 +void pciback_do_op(void *data);
59477 +
59478 +int pciback_xenbus_register(void);
59479 +void pciback_xenbus_unregister(void);
59480 +
59481 +extern int verbose_request;
59482 +#endif
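
pciback.h above is the contract between the two interchangeable device-tracking backends (passthrough.c and vpci.c, selected via the Makefile) and the rest of the driver: both provide the same pciback_add_pci_dev(), pciback_get_pci_dev(), pciback_publish_pci_roots() and pciback_release_devices() entry points. The publish_pci_root_cb typedef is the hook xenbus.c passes in so a backend can report its root buses; a hypothetical callback that merely logs what it is offered could look like this (the real callback writes the roots into xenstore):

/* Sketch of a publish_pci_root_cb; illustrative only. */
static int my_log_pci_root(struct pciback_device *pdev,
			   unsigned int domain, unsigned int bus)
{
	printk(KERN_INFO "pciback: would publish root %04x:%02x\n", domain, bus);
	return 0;	/* a non-zero return aborts the publish loop */
}

It would be passed in as pciback_publish_pci_roots(pdev, my_log_pci_root).
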
59483 diff -urNp linux-2.6/drivers/xen/pciback/pciback_ops.c new/drivers/xen/pciback/pciback_ops.c
59484 --- linux-2.6/drivers/xen/pciback/pciback_ops.c 1970-01-01 01:00:00.000000000 +0100
59485 +++ new/drivers/xen/pciback/pciback_ops.c       2006-05-09 12:34:38.000000000 +0200
59486 @@ -0,0 +1,95 @@
59487 +/*
59488 + * PCI Backend Operations - respond to PCI requests from Frontend
59489 + *
59490 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
59491 + */
59492 +#include <linux/module.h>
59493 +#include <asm/bitops.h>
59494 +#include <xen/evtchn.h>
59495 +#include "pciback.h"
59496 +
59497 +int verbose_request = 0;
59498 +module_param(verbose_request, int, 0644);
59499 +
59500 +/* Ensure a device is "turned off" and ready to be exported.
59501 + * (Also see pciback_config_reset_dev to ensure virtual configuration space is
59502 + * ready to be re-exported)
59503 + */
59504 +void pciback_reset_device(struct pci_dev *dev)
59505 +{
59506 +       u16 cmd;
59507 +
59508 +       /* Disable devices (but not bridges) */
59509 +       if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
59510 +               pci_disable_device(dev);
59511 +
59512 +               pci_write_config_word(dev, PCI_COMMAND, 0);
59513 +
59514 +               dev->is_enabled = 0;
59515 +               dev->is_busmaster = 0;
59516 +       } else {
59517 +               pci_read_config_word(dev, PCI_COMMAND, &cmd);
59518 +               if (cmd & (PCI_COMMAND_INVALIDATE)) {
59519 +                       cmd &= ~(PCI_COMMAND_INVALIDATE);
59520 +                       pci_write_config_word(dev, PCI_COMMAND, cmd);
59521 +
59522 +                       dev->is_busmaster = 0;
59523 +               }
59524 +       }
59525 +}
59526 +
59527 +static inline void test_and_schedule_op(struct pciback_device *pdev)
59528 +{
59529 +       /* Check that frontend is requesting an operation and that we are not
59530 +        * already processing a request */
59531 +       if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
59532 +           && !test_and_set_bit(_PDEVF_op_active, &pdev->flags))
59533 +               schedule_work(&pdev->op_work);
59534 +}
59535 +
59536 +/* Configuration space reads/writes must not be performed in atomic
59537 + * context because some of the pci_* functions can sleep (mostly due to
59538 + * ACPI's use of semaphores). This function is intended to be called from a
59539 + * work queue in process context, taking a struct pciback_device as its parameter */
59540 +void pciback_do_op(void *data)
59541 +{
59542 +       struct pciback_device *pdev = data;
59543 +       struct pci_dev *dev;
59544 +       struct xen_pci_op *op = &pdev->sh_info->op;
59545 +
59546 +       dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
59547 +
59548 +       if (dev == NULL)
59549 +               op->err = XEN_PCI_ERR_dev_not_found;
59550 +       else if (op->cmd == XEN_PCI_OP_conf_read)
59551 +               op->err = pciback_config_read(dev, op->offset, op->size,
59552 +                                             &op->value);
59553 +       else if (op->cmd == XEN_PCI_OP_conf_write)
59554 +               op->err = pciback_config_write(dev, op->offset, op->size,
59555 +                                              op->value);
59556 +       else
59557 +               op->err = XEN_PCI_ERR_not_implemented;
59558 +
59559 +       /* Tell the driver domain that we're done. */ 
59560 +       wmb();
59561 +       clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
59562 +       notify_remote_via_irq(pdev->evtchn_irq);
59563 +
59564 +       /* Mark that we're done. */
59565 +       smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
59566 +       clear_bit(_PDEVF_op_active, &pdev->flags);
59567 +       smp_mb__after_clear_bit(); /* /before/ final check for work */
59568 +
59569 +       /* Check to see if the driver domain tried to start another request in
59570 +        * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. */
59571 +       test_and_schedule_op(pdev);
59572 +}
59573 +
59574 +irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs)
59575 +{
59576 +       struct pciback_device *pdev = dev_id;
59577 +
59578 +       test_and_schedule_op(pdev);
59579 +
59580 +       return IRQ_HANDLED;
59581 +}
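
The subtle part of pciback_ops.c is the two-flag handshake: _XEN_PCIF_active lives in the shared page and marks the frontend's request, while _PDEVF_op_active is backend-private and guarantees at most one work item is in flight. pciback_do_op() clears the shared flag first (after a write barrier, so the result is visible), then the private flag, then calls test_and_schedule_op() once more so a request posted between the two clears is not lost. The scheduling test itself reduces to this pattern (a restatement of the code above, not new behaviour):

/* Schedule the work item only if the frontend has posted a request and
 * no pciback_do_op() run is already pending for this device. */
if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags) &&
    !test_and_set_bit(_PDEVF_op_active, &pdev->flags))
	schedule_work(&pdev->op_work);
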
59582 diff -urNp linux-2.6/drivers/xen/pciback/pci_stub.c new/drivers/xen/pciback/pci_stub.c
59583 --- linux-2.6/drivers/xen/pciback/pci_stub.c    1970-01-01 01:00:00.000000000 +0100
59584 +++ new/drivers/xen/pciback/pci_stub.c  2006-05-09 12:34:38.000000000 +0200
59585 @@ -0,0 +1,690 @@
59586 +/*
59587 + * PCI Stub Driver - Grabs devices in backend to be exported later
59588 + *
59589 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
59590 + */
59591 +#include <linux/module.h>
59592 +#include <linux/init.h>
59593 +#include <linux/list.h>
59594 +#include <linux/spinlock.h>
59595 +#include <linux/kref.h>
59596 +#include <asm/atomic.h>
59597 +#include "pciback.h"
59598 +
59599 +static char *pci_devs_to_hide = NULL;
59600 +module_param_named(hide, pci_devs_to_hide, charp, 0444);
59601 +
59602 +struct pcistub_device_id {
59603 +       struct list_head slot_list;
59604 +       int domain;
59605 +       unsigned char bus;
59606 +       unsigned int devfn;
59607 +};
59608 +static LIST_HEAD(pcistub_device_ids);
59609 +static DEFINE_SPINLOCK(device_ids_lock);
59610 +
59611 +struct pcistub_device {
59612 +       struct kref kref;
59613 +       struct list_head dev_list;
59614 +       spinlock_t lock;
59615 +
59616 +       struct pci_dev *dev;
59617 +       struct pciback_device *pdev;    /* non-NULL if struct pci_dev is in use */
59618 +};
59619 +/* Access to the pcistub_devices & seized_devices lists and the
59620 + * initialize_devices flag must be protected by pcistub_devices_lock
59621 + */
59622 +static DEFINE_SPINLOCK(pcistub_devices_lock);
59623 +static LIST_HEAD(pcistub_devices);
59624 +
59625 +/* wait for device_initcall before initializing our devices
59626 + * (see pcistub_init_devices_late)
59627 + */
59628 +static int initialize_devices = 0;
59629 +static LIST_HEAD(seized_devices);
59630 +
59631 +static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
59632 +{
59633 +       struct pcistub_device *psdev;
59634 +
59635 +       dev_dbg(&dev->dev, "pcistub_device_alloc\n");
59636 +
59637 +       psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
59638 +       if (!psdev)
59639 +               return NULL;
59640 +
59641 +       psdev->dev = pci_dev_get(dev);
59642 +       if (!psdev->dev) {
59643 +               kfree(psdev);
59644 +               return NULL;
59645 +       }
59646 +
59647 +       kref_init(&psdev->kref);
59648 +       spin_lock_init(&psdev->lock);
59649 +
59650 +       return psdev;
59651 +}
59652 +
59653 +/* Don't call this directly as it's called by pcistub_device_put */
59654 +static void pcistub_device_release(struct kref *kref)
59655 +{
59656 +       struct pcistub_device *psdev;
59657 +
59658 +       psdev = container_of(kref, struct pcistub_device, kref);
59659 +
59660 +       dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
59661 +
59662 +       /* Clean-up the device */
59663 +       pciback_reset_device(psdev->dev);
59664 +       pciback_config_free_dev(psdev->dev);
59665 +       kfree(pci_get_drvdata(psdev->dev));
59666 +       pci_set_drvdata(psdev->dev, NULL);
59667 +
59668 +       pci_dev_put(psdev->dev);
59669 +
59670 +       kfree(psdev);
59671 +}
59672 +
59673 +static inline void pcistub_device_get(struct pcistub_device *psdev)
59674 +{
59675 +       kref_get(&psdev->kref);
59676 +}
59677 +
59678 +static inline void pcistub_device_put(struct pcistub_device *psdev)
59679 +{
59680 +       kref_put(&psdev->kref, pcistub_device_release);
59681 +}
59682 +
59683 +static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
59684 +                                                 struct pcistub_device *psdev)
59685 +{
59686 +       struct pci_dev *pci_dev = NULL;
59687 +       unsigned long flags;
59688 +
59689 +       pcistub_device_get(psdev);
59690 +
59691 +       spin_lock_irqsave(&psdev->lock, flags);
59692 +       if (!psdev->pdev) {
59693 +               psdev->pdev = pdev;
59694 +               pci_dev = psdev->dev;
59695 +       }
59696 +       spin_unlock_irqrestore(&psdev->lock, flags);
59697 +
59698 +       if (!pci_dev)
59699 +               pcistub_device_put(psdev);
59700 +
59701 +       return pci_dev;
59702 +}
59703 +
59704 +struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
59705 +                                           int domain, int bus,
59706 +                                           int slot, int func)
59707 +{
59708 +       struct pcistub_device *psdev;
59709 +       struct pci_dev *found_dev = NULL;
59710 +       unsigned long flags;
59711 +
59712 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
59713 +
59714 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
59715 +               if (psdev->dev != NULL
59716 +                   && domain == pci_domain_nr(psdev->dev->bus)
59717 +                   && bus == psdev->dev->bus->number
59718 +                   && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
59719 +                       found_dev = pcistub_device_get_pci_dev(pdev, psdev);
59720 +                       break;
59721 +               }
59722 +       }
59723 +
59724 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
59725 +       return found_dev;
59726 +}
59727 +
59728 +struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
59729 +                                   struct pci_dev *dev)
59730 +{
59731 +       struct pcistub_device *psdev;
59732 +       struct pci_dev *found_dev = NULL;
59733 +       unsigned long flags;
59734 +
59735 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
59736 +
59737 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
59738 +               if (psdev->dev == dev) {
59739 +                       found_dev = pcistub_device_get_pci_dev(pdev, psdev);
59740 +                       break;
59741 +               }
59742 +       }
59743 +
59744 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
59745 +       return found_dev;
59746 +}
59747 +
59748 +void pcistub_put_pci_dev(struct pci_dev *dev)
59749 +{
59750 +       struct pcistub_device *psdev, *found_psdev = NULL;
59751 +       unsigned long flags;
59752 +
59753 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
59754 +
59755 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
59756 +               if (psdev->dev == dev) {
59757 +                       found_psdev = psdev;
59758 +                       break;
59759 +               }
59760 +       }
59761 +
59762 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
59763 +
59764 +       /* Clean up our device
59765 +        * (so it's ready for the next domain)
59766 +        */
59767 +       pciback_reset_device(found_psdev->dev);
59768 +       pciback_config_reset_dev(found_psdev->dev);
59769 +
59770 +       spin_lock_irqsave(&found_psdev->lock, flags);
59771 +       found_psdev->pdev = NULL;
59772 +       spin_unlock_irqrestore(&found_psdev->lock, flags);
59773 +
59774 +       pcistub_device_put(found_psdev);
59775 +}
59776 +
59777 +static int __devinit pcistub_match_one(struct pci_dev *dev,
59778 +                                      struct pcistub_device_id *pdev_id)
59779 +{
59780 +       /* Match the specified device by domain, bus, slot, func and also if
59781 +        * any of the device's parent bridges match.
59782 +        */
59783 +       for (; dev != NULL; dev = dev->bus->self) {
59784 +               if (pci_domain_nr(dev->bus) == pdev_id->domain
59785 +                   && dev->bus->number == pdev_id->bus
59786 +                   && dev->devfn == pdev_id->devfn)
59787 +                       return 1;
59788 +       }
59789 +
59790 +       return 0;
59791 +}
59792 +
59793 +static int __devinit pcistub_match(struct pci_dev *dev)
59794 +{
59795 +       struct pcistub_device_id *pdev_id;
59796 +       unsigned long flags;
59797 +       int found = 0;
59798 +
59799 +       spin_lock_irqsave(&device_ids_lock, flags);
59800 +       list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
59801 +               if (pcistub_match_one(dev, pdev_id)) {
59802 +                       found = 1;
59803 +                       break;
59804 +               }
59805 +       }
59806 +       spin_unlock_irqrestore(&device_ids_lock, flags);
59807 +
59808 +       return found;
59809 +}
59810 +
59811 +static int __devinit pcistub_init_device(struct pci_dev *dev)
59812 +{
59813 +       struct pciback_dev_data *dev_data;
59814 +       int err = 0;
59815 +
59816 +       dev_dbg(&dev->dev, "initializing...\n");
59817 +
59818 +       /* The PCI backend is not intended to be a module (or to work with
59819 +        * removable PCI devices) yet. If it were, pciback_config_free()
59820 +        * would need to be called somewhere to free the memory allocated
59821 +        * here, followed by kfree(pci_get_drvdata(psdev->dev)).
59822 +        */
59823 +       dev_data = kzalloc(sizeof(*dev_data), GFP_ATOMIC);
59824 +       if (!dev_data) {
59825 +               err = -ENOMEM;
59826 +               goto out;
59827 +       }
59828 +       pci_set_drvdata(dev, dev_data);
59829 +
59830 +       dev_dbg(&dev->dev, "initializing config\n");
59831 +       err = pciback_config_init_dev(dev);
59832 +       if (err)
59833 +               goto out;
59834 +
59835 +       /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
59836 +        * must do this here because pcibios_enable_device may specify
59837 +        * the pci device's true irq (and possibly its other resources)
59838 +        * if they differ from what's in the configuration space.
59839 +        * This makes the assumption that the device's resources won't
59840 +        * change after this point (otherwise this code may break!)
59841 +        */
59842 +       dev_dbg(&dev->dev, "enabling device\n");
59843 +       err = pci_enable_device(dev);
59844 +       if (err)
59845 +               goto config_release;
59846 +
59847 +       /* Now disable the device (this also ensures some private device
59848 +        * data is set up before we export)
59849 +        */
59850 +       dev_dbg(&dev->dev, "reset device\n");
59851 +       pciback_reset_device(dev);
59852 +
59853 +       return 0;
59854 +
59855 +      config_release:
59856 +       pciback_config_free_dev(dev);
59857 +
59858 +      out:
59859 +       pci_set_drvdata(dev, NULL);
59860 +       kfree(dev_data);
59861 +       return err;
59862 +}
59863 +
59864 +/*
59865 + * Because some initialization still happens on
59866 + * devices during fs_initcall, we need to defer
59867 + * full initialization of our devices until
59868 + * device_initcall.
59869 + */
59870 +static int __init pcistub_init_devices_late(void)
59871 +{
59872 +       struct pcistub_device *psdev;
59873 +       unsigned long flags;
59874 +       int err = 0;
59875 +
59876 +       pr_debug("pciback: pcistub_init_devices_late\n");
59877 +
59878 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
59879 +
59880 +       while (!list_empty(&seized_devices)) {
59881 +               psdev = container_of(seized_devices.next,
59882 +                                    struct pcistub_device, dev_list);
59883 +               list_del(&psdev->dev_list);
59884 +
59885 +               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
59886 +
59887 +               err = pcistub_init_device(psdev->dev);
59888 +               if (err) {
59889 +                       dev_err(&psdev->dev->dev,
59890 +                               "error %d initializing device\n", err);
59891 +                       kfree(psdev);
59892 +                       psdev = NULL;
59893 +               }
59894 +
59895 +               spin_lock_irqsave(&pcistub_devices_lock, flags);
59896 +
59897 +               if (psdev)
59898 +                       list_add_tail(&psdev->dev_list, &pcistub_devices);
59899 +       }
59900 +
59901 +       initialize_devices = 1;
59902 +
59903 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
59904 +
59905 +       return 0;
59906 +}
59907 +
59908 +static int __devinit pcistub_seize(struct pci_dev *dev)
59909 +{
59910 +       struct pcistub_device *psdev;
59911 +       unsigned long flags;
59912 +       int err = 0;
59913 +
59914 +       psdev = pcistub_device_alloc(dev);
59915 +       if (!psdev)
59916 +               return -ENOMEM;
59917 +
59918 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
59919 +
59920 +       if (initialize_devices) {
59921 +               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
59922 +
59923 +               /* don't want irqs disabled when calling pcistub_init_device */
59924 +               err = pcistub_init_device(psdev->dev);
59925 +
59926 +               spin_lock_irqsave(&pcistub_devices_lock, flags);
59927 +
59928 +               if (!err)
59929 +                       list_add(&psdev->dev_list, &pcistub_devices);
59930 +       } else {
59931 +               dev_dbg(&dev->dev, "deferring initialization\n");
59932 +               list_add(&psdev->dev_list, &seized_devices);
59933 +       }
59934 +
59935 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
59936 +
59937 +       if (err)
59938 +               pcistub_device_put(psdev);
59939 +
59940 +       return err;
59941 +}
59942 +
59943 +static int __devinit pcistub_probe(struct pci_dev *dev,
59944 +                                  const struct pci_device_id *id)
59945 +{
59946 +       int err = 0;
59947 +
59948 +       dev_dbg(&dev->dev, "probing...\n");
59949 +
59950 +       if (pcistub_match(dev)) {
59951 +
59952 +               if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
59953 +                   && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
59954 +                       dev_err(&dev->dev, "can't export pci devices that "
59955 +                               "don't have a normal (0) or bridge (1) "
59956 +                               "header type!\n");
59957 +                       err = -ENODEV;
59958 +                       goto out;
59959 +               }
59960 +
59961 +               dev_info(&dev->dev, "seizing device\n");
59962 +               err = pcistub_seize(dev);
59963 +       } else
59964 +               /* Didn't find the device */
59965 +               err = -ENODEV;
59966 +
59967 +      out:
59968 +       return err;
59969 +}
59970 +
59971 +static void pcistub_remove(struct pci_dev *dev)
59972 +{
59973 +       struct pcistub_device *psdev, *found_psdev = NULL;
59974 +       unsigned long flags;
59975 +
59976 +       dev_dbg(&dev->dev, "removing\n");
59977 +
59978 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
59979 +
59980 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
59981 +               if (psdev->dev == dev) {
59982 +                       found_psdev = psdev;
59983 +                       break;
59984 +               }
59985 +       }
59986 +
59987 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
59988 +
59989 +       if (found_psdev) {
59990 +               dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
59991 +                       found_psdev->pdev);
59992 +
59993 +               if (found_psdev->pdev) {
59994 +                       printk(KERN_WARNING "pciback: ****** removing device "
59995 +                              "%s while still in-use! ******\n",
59996 +                              pci_name(found_psdev->dev));
59997 +                       printk(KERN_WARNING "pciback: ****** driver domain may "
59998 +                              "still access this device's i/o resources!\n");
59999 +                       printk(KERN_WARNING "pciback: ****** shut down driver "
60000 +                              "domain before binding device\n");
60001 +                       printk(KERN_WARNING "pciback: ****** to other drivers "
60002 +                              "or domains\n");
60003 +
60004 +                       pciback_release_pci_dev(found_psdev->pdev,
60005 +                                               found_psdev->dev);
60006 +               }
60007 +
60008 +               spin_lock_irqsave(&pcistub_devices_lock, flags);
60009 +               list_del(&found_psdev->dev_list);
60010 +               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
60011 +
60012 +               /* the final put for releasing from the list */
60013 +               pcistub_device_put(found_psdev);
60014 +       }
60015 +}
60016 +
60017 +static struct pci_device_id pcistub_ids[] = {
60018 +       {
60019 +        .vendor = PCI_ANY_ID,
60020 +        .device = PCI_ANY_ID,
60021 +        .subvendor = PCI_ANY_ID,
60022 +        .subdevice = PCI_ANY_ID,
60023 +        },
60024 +       {0,},
60025 +};
60026 +
60027 +/*
60028 + * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
60029 + * for a normal device. I don't want it to be loaded automatically.
60030 + */
60031 +
60032 +static struct pci_driver pciback_pci_driver = {
60033 +       .name = "pciback",
60034 +       .id_table = pcistub_ids,
60035 +       .probe = pcistub_probe,
60036 +       .remove = pcistub_remove,
60037 +};
60038 +
60039 +static inline int str_to_slot(const char *buf, int *domain, int *bus,
60040 +                             int *slot, int *func)
60041 +{
60042 +       int err;
60043 +
60044 +       err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
60045 +       if (err == 4)
60046 +               return 0;
60047 +       else if (err < 0)
60048 +               return -EINVAL;
60049 +
60050 +       /* try again without domain */
60051 +       *domain = 0;
60052 +       err = sscanf(buf, " %x:%x.%x", bus, slot, func);
60053 +       if (err == 3)
60054 +               return 0;
60055 +
60056 +       return -EINVAL;
60057 +}
60058 +
60059 +static int pcistub_device_id_add(int domain, int bus, int slot, int func)
60060 +{
60061 +       struct pcistub_device_id *pci_dev_id;
60062 +       unsigned long flags;
60063 +
60064 +       pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
60065 +       if (!pci_dev_id)
60066 +               return -ENOMEM;
60067 +
60068 +       pci_dev_id->domain = domain;
60069 +       pci_dev_id->bus = bus;
60070 +       pci_dev_id->devfn = PCI_DEVFN(slot, func);
60071 +
60072 +       pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
60073 +                domain, bus, slot, func);
60074 +
60075 +       spin_lock_irqsave(&device_ids_lock, flags);
60076 +       list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
60077 +       spin_unlock_irqrestore(&device_ids_lock, flags);
60078 +
60079 +       return 0;
60080 +}
60081 +
60082 +static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
60083 +{
60084 +       struct pcistub_device_id *pci_dev_id, *t;
60085 +       int devfn = PCI_DEVFN(slot, func);
60086 +       int err = -ENOENT;
60087 +       unsigned long flags;
60088 +
60089 +       spin_lock_irqsave(&device_ids_lock, flags);
60090 +       list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, slot_list) {
60091 +
60092 +               if (pci_dev_id->domain == domain
60093 +                   && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
60094 +                       /* Don't break; here because it's possible the same
60095 +                        * slot could be in the list more than once
60096 +                        */
60097 +                       list_del(&pci_dev_id->slot_list);
60098 +                       kfree(pci_dev_id);
60099 +
60100 +                       err = 0;
60101 +
60102 +                       pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
60103 +                                "seize list\n", domain, bus, slot, func);
60104 +               }
60105 +       }
60106 +       spin_unlock_irqrestore(&device_ids_lock, flags);
60107 +
60108 +       return err;
60109 +}
60110 +
60111 +static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
60112 +                               size_t count)
60113 +{
60114 +       int domain, bus, slot, func;
60115 +       int err;
60116 +
60117 +       err = str_to_slot(buf, &domain, &bus, &slot, &func);
60118 +       if (err)
60119 +               goto out;
60120 +
60121 +       err = pcistub_device_id_add(domain, bus, slot, func);
60122 +
60123 +      out:
60124 +       if (!err)
60125 +               err = count;
60126 +       return err;
60127 +}
60128 +
60129 +DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
60130 +
60131 +static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
60132 +                                  size_t count)
60133 +{
60134 +       int domain, bus, slot, func;
60135 +       int err;
60136 +
60137 +       err = str_to_slot(buf, &domain, &bus, &slot, &func);
60138 +       if (err)
60139 +               goto out;
60140 +
60141 +       err = pcistub_device_id_remove(domain, bus, slot, func);
60142 +
60143 +      out:
60144 +       if (!err)
60145 +               err = count;
60146 +       return err;
60147 +}
60148 +
60149 +DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
60150 +
60151 +static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
60152 +{
60153 +       struct pcistub_device_id *pci_dev_id;
60154 +       size_t count = 0;
60155 +       unsigned long flags;
60156 +
60157 +       spin_lock_irqsave(&device_ids_lock, flags);
60158 +       list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
60159 +               if (count >= PAGE_SIZE)
60160 +                       break;
60161 +
60162 +               count += scnprintf(buf + count, PAGE_SIZE - count,
60163 +                                  "%04x:%02x:%02x.%01x\n",
60164 +                                  pci_dev_id->domain, pci_dev_id->bus,
60165 +                                  PCI_SLOT(pci_dev_id->devfn),
60166 +                                  PCI_FUNC(pci_dev_id->devfn));
60167 +       }
60168 +       spin_unlock_irqrestore(&device_ids_lock, flags);
60169 +
60170 +       return count;
60171 +}
60172 +
60173 +DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
60174 +
60175 +static int __init pcistub_init(void)
60176 +{
60177 +       int pos = 0;
60178 +       int err = 0;
60179 +       int domain, bus, slot, func;
60180 +       int parsed;
60181 +
60182 +       if (pci_devs_to_hide && *pci_devs_to_hide) {
60183 +               do {
60184 +                       parsed = 0;
60185 +
60186 +                       err = sscanf(pci_devs_to_hide + pos,
60187 +                                    " (%x:%x:%x.%x) %n",
60188 +                                    &domain, &bus, &slot, &func, &parsed);
60189 +                       if (err != 4) {
60190 +                               domain = 0;
60191 +                               err = sscanf(pci_devs_to_hide + pos,
60192 +                                            " (%x:%x.%x) %n",
60193 +                                            &bus, &slot, &func, &parsed);
60194 +                               if (err != 3)
60195 +                                       goto parse_error;
60196 +                       }
60197 +
60198 +                       err = pcistub_device_id_add(domain, bus, slot, func);
60199 +                       if (err)
60200 +                               goto out;
60201 +
60202 +                       /* if parsed<=0, we've reached the end of the string */
60203 +                       pos += parsed;
60204 +               } while (parsed > 0 && pci_devs_to_hide[pos]);
60205 +       }
60206 +
60207 +       /* If we're the first PCI Device Driver to register, we're the
60208 +        * first one to get offered PCI devices as they become
60209 +        * available (and thus we can be the first to grab them)
60210 +        */
60211 +       err = pci_register_driver(&pciback_pci_driver);
60212 +       if (err < 0)
60213 +               goto out;
60214 +
60215 +       driver_create_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
60216 +       driver_create_file(&pciback_pci_driver.driver,
60217 +                          &driver_attr_remove_slot);
60218 +       driver_create_file(&pciback_pci_driver.driver, &driver_attr_slots);
60219 +
60220 +      out:
60221 +       return err;
60222 +
60223 +      parse_error:
60224 +       printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
60225 +              pci_devs_to_hide + pos);
60226 +       return -EINVAL;
60227 +}
60228 +
60229 +#ifndef MODULE
60230 +/*
60231 + * fs_initcall happens before device_initcall,
60232 + * so pciback *should* get called first (because we
60233 + * want to grab any device before other drivers
60234 + * get a chance, by being the first pci device
60235 + * driver to register)
60236 + */
60237 +fs_initcall(pcistub_init);
60238 +#endif
60239 +
60240 +static int __init pciback_init(void)
60241 +{
60242 +       int err;
60243 +
60244 +       err = pciback_config_init();
60245 +       if (err)
60246 +               return err;
60247 +
60248 +#ifdef MODULE
60249 +       err = pcistub_init();
60250 +       if (err < 0)
60251 +               return err;
60252 +#endif
60253 +
60254 +       pcistub_init_devices_late();
60255 +       pciback_xenbus_register();
60256 +
60257 +       return 0;
60258 +}
60259 +
60260 +static void __exit pciback_cleanup(void)
60261 +{
60262 +       pciback_xenbus_unregister();
60263 +
60264 +       driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
60265 +       driver_remove_file(&pciback_pci_driver.driver,
60266 +                          &driver_attr_remove_slot);
60267 +       driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
60268 +
60269 +       pci_unregister_driver(&pciback_pci_driver);
60270 +}
60271 +
60272 +module_init(pciback_init);
60273 +module_exit(pciback_cleanup);
60274 +
60275 +MODULE_LICENSE("Dual BSD/GPL");
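
pci_stub.c accepts device addresses in two textual forms, with or without a PCI domain, as implemented by str_to_slot() above. The same strings are what the new_slot and remove_slot driver attributes parse, and the hide module parameter takes the same values wrapped in parentheses per the sscanf patterns in pcistub_init(), e.g. hide=(0000:01:00.0)(02:03.1). A small fragment showing the two accepted forms (assumes the surrounding kernel context of this file):

	int domain, bus, slot, func;

	/* Full form, with PCI domain. */
	str_to_slot("0000:01:00.0", &domain, &bus, &slot, &func);

	/* Short form; str_to_slot() defaults the domain to 0. */
	str_to_slot("01:00.0", &domain, &bus, &slot, &func);
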
60276 diff -urNp linux-2.6/drivers/xen/pciback/vpci.c new/drivers/xen/pciback/vpci.c
60277 --- linux-2.6/drivers/xen/pciback/vpci.c        1970-01-01 01:00:00.000000000 +0100
60278 +++ new/drivers/xen/pciback/vpci.c      2006-05-09 12:34:38.000000000 +0200
60279 @@ -0,0 +1,204 @@
60280 +/*
60281 + * PCI Backend - Provides a Virtual PCI bus (with real devices)
60282 + *               to the frontend
60283 + *
60284 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
60285 + */
60286 +
60287 +#include <linux/list.h>
60288 +#include <linux/slab.h>
60289 +#include <linux/pci.h>
60290 +#include <linux/spinlock.h>
60291 +#include "pciback.h"
60292 +
60293 +#define PCI_SLOT_MAX 32
60294 +
60295 +struct vpci_dev_data {
60296 +       /* Access to dev_list must be protected by lock */
60297 +       struct list_head dev_list[PCI_SLOT_MAX];
60298 +       spinlock_t lock;
60299 +};
60300 +
60301 +static inline struct list_head *list_first(struct list_head *head)
60302 +{
60303 +       return head->next;
60304 +}
60305 +
60306 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
60307 +                                   unsigned int domain, unsigned int bus,
60308 +                                   unsigned int devfn)
60309 +{
60310 +       struct pci_dev_entry *entry;
60311 +       struct pci_dev *dev = NULL;
60312 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
60313 +       unsigned long flags;
60314 +
60315 +       if (domain != 0 || bus != 0)
60316 +               return NULL;
60317 +
60318 +       if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
60319 +               spin_lock_irqsave(&vpci_dev->lock, flags);
60320 +
60321 +               list_for_each_entry(entry,
60322 +                                   &vpci_dev->dev_list[PCI_SLOT(devfn)],
60323 +                                   list) {
60324 +                       if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
60325 +                               dev = entry->dev;
60326 +                               break;
60327 +                       }
60328 +               }
60329 +
60330 +               spin_unlock_irqrestore(&vpci_dev->lock, flags);
60331 +       }
60332 +       return dev;
60333 +}
60334 +
60335 +static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
60336 +{
60337 +       if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
60338 +           && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
60339 +               return 1;
60340 +
60341 +       return 0;
60342 +}
60343 +
60344 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
60345 +{
60346 +       int err = 0, slot;
60347 +       struct pci_dev_entry *t, *dev_entry;
60348 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
60349 +       unsigned long flags;
60350 +
60351 +       if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
60352 +               err = -EFAULT;
60353 +               xenbus_dev_fatal(pdev->xdev, err,
60354 +                                "Can't export bridges on the virtual PCI bus");
60355 +               goto out;
60356 +       }
60357 +
60358 +       dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
60359 +       if (!dev_entry) {
60360 +               err = -ENOMEM;
60361 +               xenbus_dev_fatal(pdev->xdev, err,
60362 +                                "Error adding entry to virtual PCI bus");
60363 +               goto out;
60364 +       }
60365 +
60366 +       dev_entry->dev = dev;
60367 +
60368 +       spin_lock_irqsave(&vpci_dev->lock, flags);
60369 +
60370 +       /* Keep multi-function devices together on the virtual PCI bus */
60371 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
60372 +               if (!list_empty(&vpci_dev->dev_list[slot])) {
60373 +                       t = list_entry(list_first(&vpci_dev->dev_list[slot]),
60374 +                                      struct pci_dev_entry, list);
60375 +
60376 +                       if (match_slot(dev, t->dev)) {
60377 +                               pr_info("pciback: vpci: %s: "
60378 +                                       "assign to virtual slot %d func %d\n",
60379 +                                       pci_name(dev), slot,
60380 +                                       PCI_FUNC(dev->devfn));
60381 +                               list_add_tail(&dev_entry->list,
60382 +                                             &vpci_dev->dev_list[slot]);
60383 +                               goto unlock;
60384 +                       }
60385 +               }
60386 +       }
60387 +
60388 +       /* Assign to a new slot on the virtual PCI bus */
60389 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
60390 +               if (list_empty(&vpci_dev->dev_list[slot])) {
60391 +                       printk(KERN_INFO
60392 +                              "pciback: vpci: %s: assign to virtual slot %d\n",
60393 +                              pci_name(dev), slot);
60394 +                       list_add_tail(&dev_entry->list,
60395 +                                     &vpci_dev->dev_list[slot]);
60396 +                       goto unlock;
60397 +               }
60398 +       }
60399 +
60400 +       err = -ENOMEM;
60401 +       xenbus_dev_fatal(pdev->xdev, err,
60402 +                        "No more space on root virtual PCI bus");
60403 +
60404 +      unlock:
60405 +       spin_unlock_irqrestore(&vpci_dev->lock, flags);
60406 +      out:
60407 +       return err;
60408 +}
60409 +
60410 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
60411 +{
60412 +       int slot;
60413 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
60414 +       struct pci_dev *found_dev = NULL;
60415 +       unsigned long flags;
60416 +
60417 +       spin_lock_irqsave(&vpci_dev->lock, flags);
60418 +
60419 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
60420 +               struct pci_dev_entry *e, *tmp;
60421 +               list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
60422 +                                        list) {
60423 +                       if (e->dev == dev) {
60424 +                               list_del(&e->list);
60425 +                               found_dev = e->dev;
60426 +                               kfree(e);
60427 +                               goto out;
60428 +                       }
60429 +               }
60430 +       }
60431 +
60432 +      out:
60433 +       spin_unlock_irqrestore(&vpci_dev->lock, flags);
60434 +
60435 +       if (found_dev)
60436 +               pcistub_put_pci_dev(found_dev);
60437 +}
60438 +
60439 +int pciback_init_devices(struct pciback_device *pdev)
60440 +{
60441 +       int slot;
60442 +       struct vpci_dev_data *vpci_dev;
60443 +
60444 +       vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
60445 +       if (!vpci_dev)
60446 +               return -ENOMEM;
60447 +
60448 +       spin_lock_init(&vpci_dev->lock);
60449 +
60450 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
60451 +               INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
60452 +       }
60453 +
60454 +       pdev->pci_dev_data = vpci_dev;
60455 +
60456 +       return 0;
60457 +}
60458 +
60459 +int pciback_publish_pci_roots(struct pciback_device *pdev,
60460 +                             publish_pci_root_cb publish_cb)
60461 +{
60462 +       /* The Virtual PCI bus has only one root */
60463 +       return publish_cb(pdev, 0, 0);
60464 +}
60465 +
60466 +void pciback_release_devices(struct pciback_device *pdev)
60467 +{
60468 +       int slot;
60469 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
60470 +
60471 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
60472 +               struct pci_dev_entry *e, *tmp;
60473 +               list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
60474 +                                        list) {
60475 +                       list_del(&e->list);
60476 +                       pcistub_put_pci_dev(e->dev);
60477 +                       kfree(e);
60478 +               }
60479 +       }
60480 +
60481 +       kfree(vpci_dev);
60482 +       pdev->pci_dev_data = NULL;
60483 +}
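
vpci.c above flattens all exported devices onto a single virtual bus 0000:00. Functions of one physical device are kept in the same virtual slot (match_slot() compares domain, bus and slot), so a multi-function device keeps its function numbers; any other device takes the first free slot, and the bus holds at most PCI_SLOT_MAX slots. The placement decision, reduced to a sketch with a hypothetical occupant[] array standing in for the per-slot lists:

/* Sketch: choose a virtual slot for 'dev'; occupant[s] is any device
 * already placed in virtual slot 's' (NULL if the slot is empty). */
static int my_pick_vslot(struct pci_dev *dev,
			 struct pci_dev *occupant[PCI_SLOT_MAX])
{
	int slot;

	/* Prefer the slot already holding another function of the same
	 * physical device, so BB:DD.F keeps its function number F. */
	for (slot = 0; slot < PCI_SLOT_MAX; slot++)
		if (occupant[slot] && match_slot(dev, occupant[slot]))
			return slot;

	/* Otherwise take the first empty slot. */
	for (slot = 0; slot < PCI_SLOT_MAX; slot++)
		if (!occupant[slot])
			return slot;

	return -1;	/* virtual bus is full */
}
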
60484 diff -urNp linux-2.6/drivers/xen/pciback/xenbus.c new/drivers/xen/pciback/xenbus.c
60485 --- linux-2.6/drivers/xen/pciback/xenbus.c      1970-01-01 01:00:00.000000000 +0100
60486 +++ new/drivers/xen/pciback/xenbus.c    2006-06-28 14:32:14.000000000 +0200
60487 @@ -0,0 +1,454 @@
60488 +/*
60489 + * PCI Backend Xenbus Setup - handles setup with frontend and xend
60490 + *
60491 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
60492 + */
60493 +#include <linux/module.h>
60494 +#include <linux/init.h>
60495 +#include <linux/list.h>
60496 +#include <linux/vmalloc.h>
60497 +#include <xen/xenbus.h>
60498 +#include <xen/evtchn.h>
60499 +#include "pciback.h"
60500 +
60501 +#define INVALID_EVTCHN_IRQ  (-1)
60502 +
60503 +static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
60504 +{
60505 +       struct pciback_device *pdev;
60506 +
60507 +       pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
60508 +       if (pdev == NULL)
60509 +               goto out;
60510 +       dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
60511 +
60512 +       pdev->xdev = xdev;
60513 +       xdev->dev.driver_data = pdev;
60514 +
60515 +       spin_lock_init(&pdev->dev_lock);
60516 +
60517 +       pdev->sh_area = NULL;
60518 +       pdev->sh_info = NULL;
60519 +       pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
60520 +       pdev->be_watching = 0;
60521 +
60522 +       INIT_WORK(&pdev->op_work, pciback_do_op, pdev);
60523 +
60524 +       if (pciback_init_devices(pdev)) {
60525 +               kfree(pdev);
60526 +               pdev = NULL;
60527 +       }
60528 +      out:
60529 +       return pdev;
60530 +}
60531 +
60532 +static void free_pdev(struct pciback_device *pdev)
60533 +{
60534 +       if (pdev->be_watching)
60535 +               unregister_xenbus_watch(&pdev->be_watch);
60536 +
60537 +       /* Ensure the guest can't trigger our handler before removing devices */
60538 +       if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ)
60539 +               unbind_from_irqhandler(pdev->evtchn_irq, pdev);
60540 +
60541 +       /* If the driver domain started an op, make sure we complete it or
60542 +        * delete it before releasing the shared memory */
60543 +       cancel_delayed_work(&pdev->op_work);
60544 +       flush_scheduled_work();
60545 +
60546 +       if (pdev->sh_info)
60547 +               xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_area);
60548 +
60549 +       pciback_release_devices(pdev);
60550 +
60551 +       pdev->xdev->dev.driver_data = NULL;
60552 +       pdev->xdev = NULL;
60553 +
60554 +       kfree(pdev);
60555 +}
60556 +
60557 +static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
60558 +                            int remote_evtchn)
60559 +{
60560 +       int err = 0;
60561 +       int evtchn;
60562 +       struct vm_struct *area;
60563 +
60564 +       dev_dbg(&pdev->xdev->dev,
60565 +               "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
60566 +               gnt_ref, remote_evtchn);
60567 +
60568 +       area = xenbus_map_ring_valloc(pdev->xdev, gnt_ref);
60569 +       if (IS_ERR(area)) {
60570 +               err = PTR_ERR(area);
60571 +               goto out;
60572 +       }
60573 +       pdev->sh_area = area;
60574 +       pdev->sh_info = area->addr;
60575 +
60576 +       err = xenbus_bind_evtchn(pdev->xdev, remote_evtchn, &evtchn);
60577 +       if (err)
60578 +               goto out;
60579 +
60580 +       err = bind_evtchn_to_irqhandler(evtchn, pciback_handle_event,
60581 +                                       SA_SAMPLE_RANDOM, "pciback", pdev);
60582 +       if (err < 0) {
60583 +               xenbus_dev_fatal(pdev->xdev, err,
60584 +                                "Error binding event channel to IRQ");
60585 +               goto out;
60586 +       }
60587 +       pdev->evtchn_irq = err;
60588 +       err = 0;
60589 +
60590 +       dev_dbg(&pdev->xdev->dev, "Attached!\n");
60591 +      out:
60592 +       return err;
60593 +}
60594 +
60595 +static int pciback_attach(struct pciback_device *pdev)
60596 +{
60597 +       int err = 0;
60598 +       int gnt_ref, remote_evtchn;
60599 +       char *magic = NULL;
60600 +
60601 +       spin_lock(&pdev->dev_lock);
60602 +
60603 +       /* Make sure we only do this setup once */
60604 +       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
60605 +           XenbusStateInitialised)
60606 +               goto out;
60607 +
60608 +       /* Wait for frontend to state that it has published the configuration */
60609 +       if (xenbus_read_driver_state(pdev->xdev->otherend) !=
60610 +           XenbusStateInitialised)
60611 +               goto out;
60612 +
60613 +       dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
60614 +
60615 +       err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
60616 +                           "pci-op-ref", "%u", &gnt_ref,
60617 +                           "event-channel", "%u", &remote_evtchn,
60618 +                           "magic", NULL, &magic, NULL);
60619 +       if (err) {
60620 +               /* If configuration didn't get read correctly, wait longer */
60621 +               xenbus_dev_fatal(pdev->xdev, err,
60622 +                                "Error reading configuration from frontend");
60623 +               goto out;
60624 +       }
60625 +
60626 +       if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
60627 +               xenbus_dev_fatal(pdev->xdev, -EFAULT,
60628 +                                "version mismatch (%s/%s) with pcifront - "
60629 +                                "halting pciback",
60630 +                                magic, XEN_PCI_MAGIC);
60631 +               goto out;
60632 +       }
60633 +
60634 +       err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
60635 +       if (err)
60636 +               goto out;
60637 +
60638 +       dev_dbg(&pdev->xdev->dev, "Connecting...\n");
60639 +
60640 +       err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
60641 +       if (err)
60642 +               xenbus_dev_fatal(pdev->xdev, err,
60643 +                                "Error switching to connected state!");
60644 +
60645 +       dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
60646 +      out:
60647 +       spin_unlock(&pdev->dev_lock);
60648 +
60649 +       if (magic)
60650 +               kfree(magic);
60651 +
60652 +       return err;
60653 +}
60654 +
60655 +static void pciback_frontend_changed(struct xenbus_device *xdev,
60656 +                                    enum xenbus_state fe_state)
60657 +{
60658 +       struct pciback_device *pdev = xdev->dev.driver_data;
60659 +
60660 +       dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
60661 +
60662 +       switch (fe_state) {
60663 +       case XenbusStateInitialised:
60664 +               pciback_attach(pdev);
60665 +               break;
60666 +
60667 +       case XenbusStateClosing:
60668 +               xenbus_switch_state(xdev, XenbusStateClosing);
60669 +               break;
60670 +
60671 +       case XenbusStateClosed:
60672 +               dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
60673 +               device_unregister(&xdev->dev);
60674 +               break;
60675 +
60676 +       default:
60677 +               break;
60678 +       }
60679 +}
60680 +
60681 +static int pciback_publish_pci_root(struct pciback_device *pdev,
60682 +                                   unsigned int domain, unsigned int bus)
60683 +{
60684 +       unsigned int d, b;
60685 +       int i, root_num, len, err;
60686 +       char str[64];
60687 +
60688 +       dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
60689 +
60690 +       err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
60691 +                          "root_num", "%d", &root_num);
60692 +       if (err == 0 || err == -ENOENT)
60693 +               root_num = 0;
60694 +       else if (err < 0)
60695 +               goto out;
60696 +
60697 +       /* Verify that we haven't already published this pci root */
60698 +       for (i = 0; i < root_num; i++) {
60699 +               len = snprintf(str, sizeof(str), "root-%d", i);
60700 +               if (unlikely(len >= (sizeof(str) - 1))) {
60701 +                       err = -ENOMEM;
60702 +                       goto out;
60703 +               }
60704 +
60705 +               err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
60706 +                                  str, "%x:%x", &d, &b);
60707 +               if (err < 0)
60708 +                       goto out;
60709 +               if (err != 2) {
60710 +                       err = -EINVAL;
60711 +                       goto out;
60712 +               }
60713 +
60714 +               if (d == domain && b == bus) {
60715 +                       err = 0;
60716 +                       goto out;
60717 +               }
60718 +       }
60719 +
60720 +       len = snprintf(str, sizeof(str), "root-%d", root_num);
60721 +       if (unlikely(len >= (sizeof(str) - 1))) {
60722 +               err = -ENOMEM;
60723 +               goto out;
60724 +       }
60725 +
60726 +       dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
60727 +               root_num, domain, bus);
60728 +
60729 +       err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
60730 +                           "%04x:%02x", domain, bus);
60731 +       if (err)
60732 +               goto out;
60733 +
60734 +       err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
60735 +                           "root_num", "%d", (root_num + 1));
60736 +
60737 +      out:
60738 +       return err;
60739 +}
60740 +
60741 +static int pciback_export_device(struct pciback_device *pdev,
60742 +                                int domain, int bus, int slot, int func)
60743 +{
60744 +       struct pci_dev *dev;
60745 +       int err = 0;
60746 +
60747 +       dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
60748 +               domain, bus, slot, func);
60749 +
60750 +       dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
60751 +       if (!dev) {
60752 +               err = -EINVAL;
60753 +               xenbus_dev_fatal(pdev->xdev, err,
60754 +                                "Couldn't locate PCI device "
60755 +                                "(%04x:%02x:%02x.%01x)! "
60756 +                                "perhaps already in-use?",
60757 +                                domain, bus, slot, func);
60758 +               goto out;
60759 +       }
60760 +
60761 +       err = pciback_add_pci_dev(pdev, dev);
60762 +       if (err)
60763 +               goto out;
60764 +
60765 +       /* TODO: It'd be nice to export a bridge and have all of its children
60766 +        * get exported with it. This may be best done in xend (which will
60767 +        * have to calculate resource usage anyway) but we probably want to
60768 +        * put something in here to ensure that if a bridge gets given to a
60769 +        * driver domain, that all devices under that bridge are not given
60770 +        * to other driver domains (as he who controls the bridge can disable
60771 +        * it and stop the other devices from working).
60772 +        */
60773 +      out:
60774 +       return err;
60775 +}
60776 +
60777 +static int pciback_setup_backend(struct pciback_device *pdev)
60778 +{
60779 +       /* Get configuration from xend (if available now) */
60780 +       int domain, bus, slot, func;
60781 +       int err = 0;
60782 +       int i, num_devs;
60783 +       char dev_str[64];
60784 +
60785 +       spin_lock(&pdev->dev_lock);
60786 +
60787 +       /* It's possible we could get the call to setup twice, so make sure
60788 +        * we're not already connected.
60789 +        */
60790 +       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
60791 +           XenbusStateInitWait)
60792 +               goto out;
60793 +
60794 +       dev_dbg(&pdev->xdev->dev, "getting be setup\n");
60795 +
60796 +       err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
60797 +                          &num_devs);
60798 +       if (err != 1) {
60799 +               if (err >= 0)
60800 +                       err = -EINVAL;
60801 +               xenbus_dev_fatal(pdev->xdev, err,
60802 +                                "Error reading number of devices");
60803 +               goto out;
60804 +       }
60805 +
60806 +       for (i = 0; i < num_devs; i++) {
60807 +               int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
60808 +               if (unlikely(l >= (sizeof(dev_str) - 1))) {
60809 +                       err = -ENOMEM;
60810 +                       xenbus_dev_fatal(pdev->xdev, err,
60811 +                                        "String overflow while reading "
60812 +                                        "configuration");
60813 +                       goto out;
60814 +               }
60815 +
60816 +               err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
60817 +                                  "%x:%x:%x.%x", &domain, &bus, &slot, &func);
60818 +               if (err < 0) {
60819 +                       xenbus_dev_fatal(pdev->xdev, err,
60820 +                                        "Error reading device configuration");
60821 +                       goto out;
60822 +               }
60823 +               if (err != 4) {
60824 +                       err = -EINVAL;
60825 +                       xenbus_dev_fatal(pdev->xdev, err,
60826 +                                        "Error parsing pci device "
60827 +                                        "configuration");
60828 +                       goto out;
60829 +               }
60830 +
60831 +               err = pciback_export_device(pdev, domain, bus, slot, func);
60832 +               if (err)
60833 +                       goto out;
60834 +       }
60835 +
60836 +       err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
60837 +       if (err) {
60838 +               xenbus_dev_fatal(pdev->xdev, err,
60839 +                                "Error while publishing PCI root buses "
60840 +                                "for frontend");
60841 +               goto out;
60842 +       }
60843 +
60844 +       err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
60845 +       if (err)
60846 +               xenbus_dev_fatal(pdev->xdev, err,
60847 +                                "Error switching to initialised state!");
60848 +
60849 +      out:
60850 +       spin_unlock(&pdev->dev_lock);
60851 +
60852 +       if (!err)
60853 +               /* see if pcifront is already configured (if not, we'll wait) */
60854 +               pciback_attach(pdev);
60855 +
60856 +       return err;
60857 +}
60858 +
60859 +static void pciback_be_watch(struct xenbus_watch *watch,
60860 +                            const char **vec, unsigned int len)
60861 +{
60862 +       struct pciback_device *pdev =
60863 +           container_of(watch, struct pciback_device, be_watch);
60864 +
60865 +       switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
60866 +       case XenbusStateInitWait:
60867 +               pciback_setup_backend(pdev);
60868 +               break;
60869 +
60870 +       default:
60871 +               break;
60872 +       }
60873 +}
60874 +
60875 +static int pciback_xenbus_probe(struct xenbus_device *dev,
60876 +                               const struct xenbus_device_id *id)
60877 +{
60878 +       int err = 0;
60879 +       struct pciback_device *pdev = alloc_pdev(dev);
60880 +
60881 +       if (pdev == NULL) {
60882 +               err = -ENOMEM;
60883 +               xenbus_dev_fatal(dev, err,
60884 +                                "Error allocating pciback_device struct");
60885 +               goto out;
60886 +       }
60887 +
60888 +       /* wait for xend to configure us */
60889 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
60890 +       if (err)
60891 +               goto out;
60892 +
60893 +       /* watch the backend node for backend configuration information */
60894 +       err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
60895 +                               pciback_be_watch);
60896 +       if (err)
60897 +               goto out;
60898 +       pdev->be_watching = 1;
60899 +
60900 +       /* We need to force a call to our callback here in case
60901 +        * xend already configured us!
60902 +        */
60903 +       pciback_be_watch(&pdev->be_watch, NULL, 0);
60904 +
60905 +      out:
60906 +       return err;
60907 +}
60908 +
60909 +static int pciback_xenbus_remove(struct xenbus_device *dev)
60910 +{
60911 +       struct pciback_device *pdev = dev->dev.driver_data;
60912 +
60913 +       if (pdev != NULL)
60914 +               free_pdev(pdev);
60915 +
60916 +       return 0;
60917 +}
60918 +
60919 +static struct xenbus_device_id xenpci_ids[] = {
60920 +       {"pci"},
60921 +       {{0}},
60922 +};
60923 +
60924 +static struct xenbus_driver xenbus_pciback_driver = {
60925 +       .name                   = "pciback",
60926 +       .owner                  = THIS_MODULE,
60927 +       .ids                    = xenpci_ids,
60928 +       .probe                  = pciback_xenbus_probe,
60929 +       .remove                 = pciback_xenbus_remove,
60930 +       .otherend_changed       = pciback_frontend_changed,
60931 +};
60932 +
60933 +int __init pciback_xenbus_register(void)
60934 +{
60935 +       return xenbus_register_backend(&xenbus_pciback_driver);
60936 +}
60937 +
60938 +void __exit pciback_xenbus_unregister(void)
60939 +{
60940 +       xenbus_unregister_driver(&xenbus_pciback_driver);
60941 +}
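For orientation, a rough sketch of the xenbus handshake that the backend code above drives; the assumption that xend writes the num_devs/dev-N keys follows the comment in pciback_setup_backend(), everything else is read off the code, so treat this as a summary rather than a spec:

    probe                   -> pciback switches to InitWait and watches its own
                               xenstore node
    toolstack (xend) writes
    num_devs, dev-0..dev-N  -> the watch fires, pciback_setup_backend() exports
                               each listed device, publishes root_num / root-N
                               and switches to Initialised
    pcifront reaches
    Initialised             -> pciback_attach() maps the pci-op-ref grant, binds
                               the event channel and switches to Connected
    pcifront Closing/Closed -> pciback follows to Closing, then unregisters the
                               device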
60942 diff -urNp linux-2.6/drivers/xen/pcifront/Makefile new/drivers/xen/pcifront/Makefile
60943 --- linux-2.6/drivers/xen/pcifront/Makefile     1970-01-01 01:00:00.000000000 +0100
60944 +++ new/drivers/xen/pcifront/Makefile   2006-05-09 12:34:38.000000000 +0200
60945 @@ -0,0 +1,7 @@
60946 +obj-y += pcifront.o
60947 +
60948 +pcifront-y := pci_op.o xenbus.o pci.o
60949 +
60950 +ifeq ($(CONFIG_XEN_PCIDEV_FE_DEBUG),y)
60951 +EXTRA_CFLAGS += -DDEBUG
60952 +endif
60953 diff -urNp linux-2.6/drivers/xen/pcifront/pci.c new/drivers/xen/pcifront/pci.c
60954 --- linux-2.6/drivers/xen/pcifront/pci.c        1970-01-01 01:00:00.000000000 +0100
60955 +++ new/drivers/xen/pcifront/pci.c      2006-05-09 12:34:38.000000000 +0200
60956 @@ -0,0 +1,46 @@
60957 +/*
60958 + * PCI Frontend Operations - ensure only one PCI frontend runs at a time
60959 + *
60960 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
60961 + */
60962 +#include <linux/module.h>
60963 +#include <linux/init.h>
60964 +#include <linux/pci.h>
60965 +#include <linux/spinlock.h>
60966 +#include "pcifront.h"
60967 +
60968 +DEFINE_SPINLOCK(pcifront_dev_lock);
60969 +static struct pcifront_device *pcifront_dev = NULL;
60970 +
60971 +int pcifront_connect(struct pcifront_device *pdev)
60972 +{
60973 +       int err = 0;
60974 +
60975 +       spin_lock(&pcifront_dev_lock);
60976 +
60977 +       if (!pcifront_dev) {
60978 +               dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
60979 +               pcifront_dev = pdev;
60980 +       }
60981 +       else {
60982 +               dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
60983 +               err = -EEXIST;
60984 +       }
60985 +
60986 +       spin_unlock(&pcifront_dev_lock);
60987 +
60988 +       return err;
60989 +}
60990 +
60991 +void pcifront_disconnect(struct pcifront_device *pdev)
60992 +{
60993 +       spin_lock(&pcifront_dev_lock);
60994 +
60995 +       if (pdev == pcifront_dev) {
60996 +               dev_info(&pdev->xdev->dev,
60997 +                        "Disconnecting PCI Frontend Buses\n");
60998 +               pcifront_dev = NULL;
60999 +       }
61000 +
61001 +       spin_unlock(&pcifront_dev_lock);
61002 +}
61003 diff -urNp linux-2.6/drivers/xen/pcifront/pcifront.h new/drivers/xen/pcifront/pcifront.h
61004 --- linux-2.6/drivers/xen/pcifront/pcifront.h   1970-01-01 01:00:00.000000000 +0100
61005 +++ new/drivers/xen/pcifront/pcifront.h 2006-05-09 12:34:39.000000000 +0200
61006 @@ -0,0 +1,40 @@
61007 +/*
61008 + * PCI Frontend - Common data structures & function declarations
61009 + *
61010 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
61011 + */
61012 +#ifndef __XEN_PCIFRONT_H__
61013 +#define __XEN_PCIFRONT_H__
61014 +
61015 +#include <linux/spinlock.h>
61016 +#include <linux/pci.h>
61017 +#include <xen/xenbus.h>
61018 +#include <xen/interface/io/pciif.h>
61019 +#include <xen/pcifront.h>
61020 +
61021 +struct pci_bus_entry {
61022 +       struct list_head list;
61023 +       struct pci_bus *bus;
61024 +};
61025 +
61026 +struct pcifront_device {
61027 +       struct xenbus_device *xdev;
61028 +       struct list_head root_buses;
61029 +       spinlock_t dev_lock;
61030 +
61031 +       int evtchn;
61032 +       int gnt_ref;
61033 +
61034 +       /* Lock this when doing any operations in sh_info */
61035 +       spinlock_t sh_info_lock;
61036 +       struct xen_pci_sharedinfo *sh_info;
61037 +};
61038 +
61039 +int pcifront_connect(struct pcifront_device *pdev);
61040 +void pcifront_disconnect(struct pcifront_device *pdev);
61041 +
61042 +int pcifront_scan_root(struct pcifront_device *pdev,
61043 +                      unsigned int domain, unsigned int bus);
61044 +void pcifront_free_roots(struct pcifront_device *pdev);
61045 +
61046 +#endif /* __XEN_PCIFRONT_H__ */
61047 diff -urNp linux-2.6/drivers/xen/pcifront/pci_op.c new/drivers/xen/pcifront/pci_op.c
61048 --- linux-2.6/drivers/xen/pcifront/pci_op.c     1970-01-01 01:00:00.000000000 +0100
61049 +++ new/drivers/xen/pcifront/pci_op.c   2006-05-09 12:34:39.000000000 +0200
61050 @@ -0,0 +1,272 @@
61051 +/*
61052 + * PCI Frontend Operations - Communicates with backend
61053 + *
61054 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
61055 + */
61056 +#include <linux/module.h>
61057 +#include <linux/version.h>
61058 +#include <linux/init.h>
61059 +#include <linux/pci.h>
61060 +#include <linux/spinlock.h>
61061 +#include <linux/time.h>
61062 +#include <xen/evtchn.h>
61063 +#include "pcifront.h"
61064 +
61065 +static int verbose_request = 0;
61066 +module_param(verbose_request, int, 0644);
61067 +
61068 +static int errno_to_pcibios_err(int errno)
61069 +{
61070 +       switch (errno) {
61071 +       case XEN_PCI_ERR_success:
61072 +               return PCIBIOS_SUCCESSFUL;
61073 +
61074 +       case XEN_PCI_ERR_dev_not_found:
61075 +               return PCIBIOS_DEVICE_NOT_FOUND;
61076 +
61077 +       case XEN_PCI_ERR_invalid_offset:
61078 +       case XEN_PCI_ERR_op_failed:
61079 +               return PCIBIOS_BAD_REGISTER_NUMBER;
61080 +
61081 +       case XEN_PCI_ERR_not_implemented:
61082 +               return PCIBIOS_FUNC_NOT_SUPPORTED;
61083 +
61084 +       case XEN_PCI_ERR_access_denied:
61085 +               return PCIBIOS_SET_FAILED;
61086 +       }
61087 +       return errno;
61088 +}
61089 +
61090 +static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
61091 +{
61092 +       int err = 0;
61093 +       struct xen_pci_op *active_op = &pdev->sh_info->op;
61094 +       unsigned long irq_flags;
61095 +       evtchn_port_t port = pdev->evtchn;
61096 +       s64 ns, ns_timeout;
61097 +       struct timeval tv;
61098 +
61099 +       spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
61100 +
61101 +       memcpy(active_op, op, sizeof(struct xen_pci_op));
61102 +
61103 +       /* Go */
61104 +       wmb();
61105 +       set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
61106 +       notify_remote_via_evtchn(port);
61107 +
61108 +       /*
61109 +        * We set a poll timeout of 3 seconds but give up on return after
61110 +        * 2 seconds. It is better to time out too late rather than too early
61111 +        * (in the latter case we end up continually re-executing poll() with a
61112 +        * timeout in the past). 1s difference gives plenty of slack for error.
61113 +        */
61114 +       do_gettimeofday(&tv);
61115 +       ns_timeout = timeval_to_ns(&tv) + 2 * NSEC_PER_SEC;
61116 +
61117 +       clear_evtchn(port);
61118 +
61119 +       while (test_bit(_XEN_PCIF_active,
61120 +                       (unsigned long *)&pdev->sh_info->flags)) {
61121 +               if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ))
61122 +                       BUG();
61123 +               clear_evtchn(port);
61124 +               do_gettimeofday(&tv);
61125 +               ns = timeval_to_ns(&tv);
61126 +               if (ns > ns_timeout) {
61127 +                       dev_err(&pdev->xdev->dev,
61128 +                               "pciback not responding!!!\n");
61129 +                       clear_bit(_XEN_PCIF_active,
61130 +                                 (unsigned long *)&pdev->sh_info->flags);
61131 +                       err = XEN_PCI_ERR_dev_not_found;
61132 +                       goto out;
61133 +               }
61134 +       }
61135 +
61136 +       memcpy(op, active_op, sizeof(struct xen_pci_op));
61137 +
61138 +       err = op->err;
61139 +      out:
61140 +       spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
61141 +       return err;
61142 +}
61143 +
61144 +/* Access to this function is spinlocked in drivers/pci/access.c */
61145 +static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
61146 +                            int where, int size, u32 * val)
61147 +{
61148 +       int err = 0;
61149 +       struct xen_pci_op op = {
61150 +               .cmd    = XEN_PCI_OP_conf_read,
61151 +               .domain = pci_domain_nr(bus),
61152 +               .bus    = bus->number,
61153 +               .devfn  = devfn,
61154 +               .offset = where,
61155 +               .size   = size,
61156 +       };
61157 +       struct pcifront_sd *sd = bus->sysdata;
61158 +       struct pcifront_device *pdev = sd->pdev;
61159 +
61160 +       if (verbose_request)
61161 +               dev_info(&pdev->xdev->dev,
61162 +                        "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n",
61163 +                        pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
61164 +                        PCI_FUNC(devfn), where, size);
61165 +
61166 +       err = do_pci_op(pdev, &op);
61167 +
61168 +       if (likely(!err)) {
61169 +               if (verbose_request)
61170 +                       dev_info(&pdev->xdev->dev, "read got back value %x\n",
61171 +                                op.value);
61172 +
61173 +               *val = op.value;
61174 +       } else if (err == -ENODEV) {
61175 +               /* No device here, pretend that it just returned 0 */
61176 +               err = 0;
61177 +               *val = 0;
61178 +       }
61179 +
61180 +       return errno_to_pcibios_err(err);
61181 +}
61182 +
61183 +/* Access to this function is spinlocked in drivers/pci/access.c */
61184 +static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
61185 +                             int where, int size, u32 val)
61186 +{
61187 +       struct xen_pci_op op = {
61188 +               .cmd    = XEN_PCI_OP_conf_write,
61189 +               .domain = pci_domain_nr(bus),
61190 +               .bus    = bus->number,
61191 +               .devfn  = devfn,
61192 +               .offset = where,
61193 +               .size   = size,
61194 +               .value  = val,
61195 +       };
61196 +       struct pcifront_sd *sd = bus->sysdata;
61197 +       struct pcifront_device *pdev = sd->pdev;
61198 +
61199 +       if (verbose_request)
61200 +               dev_info(&pdev->xdev->dev,
61201 +                        "write dev=%04x:%02x:%02x.%01x - "
61202 +                        "offset %x size %d val %x\n",
61203 +                        pci_domain_nr(bus), bus->number,
61204 +                        PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
61205 +
61206 +       return errno_to_pcibios_err(do_pci_op(pdev, &op));
61207 +}
61208 +
61209 +struct pci_ops pcifront_bus_ops = {
61210 +       .read = pcifront_bus_read,
61211 +       .write = pcifront_bus_write,
61212 +};
61213 +
61214 +/* Claim resources for the PCI frontend as-is, backend won't allow changes */
61215 +static void pcifront_claim_resource(struct pci_dev *dev, void *data)
61216 +{
61217 +       struct pcifront_device *pdev = data;
61218 +       int i;
61219 +       struct resource *r;
61220 +
61221 +       for (i = 0; i < PCI_NUM_RESOURCES; i++) {
61222 +               r = &dev->resource[i];
61223 +
61224 +               if (!r->parent && r->start && r->flags) {
61225 +                       dev_dbg(&pdev->xdev->dev, "claiming resource %s/%d\n",
61226 +                               pci_name(dev), i);
61227 +                       pci_claim_resource(dev, i);
61228 +               }
61229 +       }
61230 +}
61231 +
61232 +int pcifront_scan_root(struct pcifront_device *pdev,
61233 +                      unsigned int domain, unsigned int bus)
61234 +{
61235 +       struct pci_bus *b;
61236 +       struct pcifront_sd *sd = NULL;
61237 +       struct pci_bus_entry *bus_entry = NULL;
61238 +       int err = 0;
61239 +
61240 +#ifndef CONFIG_PCI_DOMAINS
61241 +       if (domain != 0) {
61242 +               dev_err(&pdev->xdev->dev,
61243 +                       "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
61244 +               dev_err(&pdev->xdev->dev,
61245 +                       "Please compile with CONFIG_PCI_DOMAINS\n");
61246 +               err = -EINVAL;
61247 +               goto err_out;
61248 +       }
61249 +#endif
61250 +
61251 +       dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
61252 +                domain, bus);
61253 +
61254 +       bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
61255 +       sd = kmalloc(sizeof(*sd), GFP_KERNEL);
61256 +       if (!bus_entry || !sd) {
61257 +               err = -ENOMEM;
61258 +               goto err_out;
61259 +       }
61260 +       sd->domain = domain;
61261 +       sd->pdev = pdev;
61262 +
61263 +       b = pci_scan_bus_parented(&pdev->xdev->dev, bus, &pcifront_bus_ops, sd);
61264 +       if (!b) {
61265 +               dev_err(&pdev->xdev->dev, "Error creating PCI Frontend Bus!\n");
61266 +               err = -ENOMEM;
61267 +               goto err_out;
61268 +       }
61269 +       bus_entry->bus = b;
61270 +
61271 +       list_add(&bus_entry->list, &pdev->root_buses);
61272 +
61273 +       /* Claim resources before going "live" with our devices */
61274 +       pci_walk_bus(b, pcifront_claim_resource, pdev);
61275 +
61276 +       pci_bus_add_devices(b);
61277 +
61278 +       return 0;
61279 +
61280 +      err_out:
61281 +       kfree(bus_entry);
61282 +       kfree(sd);
61283 +
61284 +       return err;
61285 +}
61286 +
61287 +static void free_root_bus_devs(struct pci_bus *bus)
61288 +{
61289 +       struct pci_dev *dev;
61290 +
61291 +       spin_lock(&pci_bus_lock);
61292 +       while (!list_empty(&bus->devices)) {
61293 +               dev = container_of(bus->devices.next, struct pci_dev, bus_list);
61294 +               spin_unlock(&pci_bus_lock);
61295 +
61296 +               dev_dbg(&dev->dev, "removing device\n");
61297 +               pci_remove_bus_device(dev);
61298 +
61299 +               spin_lock(&pci_bus_lock);
61300 +       }
61301 +       spin_unlock(&pci_bus_lock);
61302 +}
61303 +
61304 +void pcifront_free_roots(struct pcifront_device *pdev)
61305 +{
61306 +       struct pci_bus_entry *bus_entry, *t;
61307 +
61308 +       dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
61309 +
61310 +       list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
61311 +               list_del(&bus_entry->list);
61312 +
61313 +               free_root_bus_devs(bus_entry->bus);
61314 +
61315 +               kfree(bus_entry->bus->sysdata);
61316 +
61317 +               device_unregister(bus_entry->bus->bridge);
61318 +               pci_remove_bus(bus_entry->bus);
61319 +
61320 +               kfree(bus_entry);
61321 +       }
61322 +}
61323 diff -urNp linux-2.6/drivers/xen/pcifront/xenbus.c new/drivers/xen/pcifront/xenbus.c
61324 --- linux-2.6/drivers/xen/pcifront/xenbus.c     1970-01-01 01:00:00.000000000 +0100
61325 +++ new/drivers/xen/pcifront/xenbus.c   2006-06-28 14:32:14.000000000 +0200
61326 @@ -0,0 +1,294 @@
61327 +/*
61328 + * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
61329 + *
61330 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
61331 + */
61332 +#include <linux/module.h>
61333 +#include <linux/init.h>
61334 +#include <linux/mm.h>
61335 +#include <xen/xenbus.h>
61336 +#include "pcifront.h"
61337 +
61338 +#define INVALID_GRANT_REF (0)
61339 +#define INVALID_EVTCHN    (-1)
61340 +
61341 +static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
61342 +{
61343 +       struct pcifront_device *pdev;
61344 +
61345 +       pdev = kmalloc(sizeof(struct pcifront_device), GFP_KERNEL);
61346 +       if (pdev == NULL)
61347 +               goto out;
61348 +
61349 +       pdev->sh_info =
61350 +           (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL);
61351 +       if (pdev->sh_info == NULL) {
61352 +               kfree(pdev);
61353 +               pdev = NULL;
61354 +               goto out;
61355 +       }
61356 +       pdev->sh_info->flags = 0;
61357 +
61358 +       xdev->dev.driver_data = pdev;
61359 +       pdev->xdev = xdev;
61360 +
61361 +       INIT_LIST_HEAD(&pdev->root_buses);
61362 +
61363 +       spin_lock_init(&pdev->dev_lock);
61364 +       spin_lock_init(&pdev->sh_info_lock);
61365 +
61366 +       pdev->evtchn = INVALID_EVTCHN;
61367 +       pdev->gnt_ref = INVALID_GRANT_REF;
61368 +
61369 +       dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
61370 +               pdev, pdev->sh_info);
61371 +      out:
61372 +       return pdev;
61373 +}
61374 +
61375 +static void free_pdev(struct pcifront_device *pdev)
61376 +{
61377 +       dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev);
61378 +
61379 +       pcifront_free_roots(pdev);
61380 +
61381 +       if (pdev->evtchn != INVALID_EVTCHN)
61382 +               xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
61383 +
61384 +       if (pdev->gnt_ref != INVALID_GRANT_REF)
61385 +               gnttab_end_foreign_access(pdev->gnt_ref, 0,
61386 +                                         (unsigned long)pdev->sh_info);
61387 +
61388 +       pdev->xdev->dev.driver_data = NULL;
61389 +
61390 +       kfree(pdev);
61391 +}
61392 +
61393 +static int pcifront_publish_info(struct pcifront_device *pdev)
61394 +{
61395 +       int err = 0;
61396 +       struct xenbus_transaction trans;
61397 +
61398 +       err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
61399 +       if (err < 0)
61400 +               goto out;
61401 +
61402 +       pdev->gnt_ref = err;
61403 +
61404 +       err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
61405 +       if (err)
61406 +               goto out;
61407 +
61408 +      do_publish:
61409 +       err = xenbus_transaction_start(&trans);
61410 +       if (err) {
61411 +               xenbus_dev_fatal(pdev->xdev, err,
61412 +                                "Error writing configuration for backend "
61413 +                                "(start transaction)");
61414 +               goto out;
61415 +       }
61416 +
61417 +       err = xenbus_printf(trans, pdev->xdev->nodename,
61418 +                           "pci-op-ref", "%u", pdev->gnt_ref);
61419 +       if (!err)
61420 +               err = xenbus_printf(trans, pdev->xdev->nodename,
61421 +                                   "event-channel", "%u", pdev->evtchn);
61422 +       if (!err)
61423 +               err = xenbus_printf(trans, pdev->xdev->nodename,
61424 +                                   "magic", XEN_PCI_MAGIC);
61425 +
61426 +       if (err) {
61427 +               xenbus_transaction_end(trans, 1);
61428 +               xenbus_dev_fatal(pdev->xdev, err,
61429 +                                "Error writing configuration for backend");
61430 +               goto out;
61431 +       } else {
61432 +               err = xenbus_transaction_end(trans, 0);
61433 +               if (err == -EAGAIN)
61434 +                       goto do_publish;
61435 +               else if (err) {
61436 +                       xenbus_dev_fatal(pdev->xdev, err,
61437 +                                        "Error completing transaction "
61438 +                                        "for backend");
61439 +                       goto out;
61440 +               }
61441 +       }
61442 +
61443 +       xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
61444 +
61445 +       dev_dbg(&pdev->xdev->dev, "publishing successful!\n");
61446 +
61447 +      out:
61448 +       return err;
61449 +}
61450 +
61451 +static int pcifront_try_connect(struct pcifront_device *pdev)
61452 +{
61453 +       int err = -EFAULT;
61454 +       int i, num_roots, len;
61455 +       char str[64];
61456 +       unsigned int domain, bus;
61457 +
61458 +       spin_lock(&pdev->dev_lock);
61459 +
61460 +       /* Only connect once */
61461 +       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
61462 +           XenbusStateInitialised)
61463 +               goto out;
61464 +
61465 +       err = pcifront_connect(pdev);
61466 +       if (err) {
61467 +               xenbus_dev_fatal(pdev->xdev, err,
61468 +                                "Error connecting PCI Frontend");
61469 +               goto out;
61470 +       }
61471 +
61472 +       err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
61473 +                          "root_num", "%d", &num_roots);
61474 +       if (err == -ENOENT) {
61475 +               xenbus_dev_error(pdev->xdev, err,
61476 +                                "No PCI Roots found, trying 0000:00");
61477 +               err = pcifront_scan_root(pdev, 0, 0);
61478 +               num_roots = 0;
61479 +       } else if (err != 1) {
61480 +               if (err == 0)
61481 +                       err = -EINVAL;
61482 +               xenbus_dev_fatal(pdev->xdev, err,
61483 +                                "Error reading number of PCI roots");
61484 +               goto out;
61485 +       }
61486 +
61487 +       for (i = 0; i < num_roots; i++) {
61488 +               len = snprintf(str, sizeof(str), "root-%d", i);
61489 +               if (unlikely(len >= (sizeof(str) - 1))) {
61490 +                       err = -ENOMEM;
61491 +                       goto out;
61492 +               }
61493 +
61494 +               err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
61495 +                                  "%x:%x", &domain, &bus);
61496 +               if (err != 2) {
61497 +                       if (err >= 0)
61498 +                               err = -EINVAL;
61499 +                       xenbus_dev_fatal(pdev->xdev, err,
61500 +                                        "Error reading PCI root %d", i);
61501 +                       goto out;
61502 +               }
61503 +
61504 +               err = pcifront_scan_root(pdev, domain, bus);
61505 +               if (err) {
61506 +                       xenbus_dev_fatal(pdev->xdev, err,
61507 +                                        "Error scanning PCI root %04x:%02x",
61508 +                                        domain, bus);
61509 +                       goto out;
61510 +               }
61511 +       }
61512 +
61513 +       err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
61514 +       if (err)
61515 +               goto out;
61516 +
61517 +      out:
61518 +       spin_unlock(&pdev->dev_lock);
61519 +       return err;
61520 +}
61521 +
61522 +static int pcifront_try_disconnect(struct pcifront_device *pdev)
61523 +{
61524 +       int err = 0;
61525 +       enum xenbus_state prev_state;
61526 +
61527 +       spin_lock(&pdev->dev_lock);
61528 +
61529 +       prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
61530 +
61531 +       if (prev_state < XenbusStateClosing)
61532 +               err = xenbus_switch_state(pdev->xdev, XenbusStateClosing);
61533 +
61534 +       if (!err && prev_state == XenbusStateConnected)
61535 +               pcifront_disconnect(pdev);
61536 +
61537 +       spin_unlock(&pdev->dev_lock);
61538 +
61539 +       return err;
61540 +}
61541 +
61542 +static void pcifront_backend_changed(struct xenbus_device *xdev,
61543 +                                    enum xenbus_state be_state)
61544 +{
61545 +       struct pcifront_device *pdev = xdev->dev.driver_data;
61546 +
61547 +       switch (be_state) {
61548 +       case XenbusStateClosing:
61549 +               dev_warn(&xdev->dev, "backend going away!\n");
61550 +               pcifront_try_disconnect(pdev);
61551 +               break;
61552 +
61553 +       case XenbusStateClosed:
61554 +               dev_warn(&xdev->dev, "backend went away!\n");
61555 +               pcifront_try_disconnect(pdev);
61556 +
61557 +               device_unregister(&pdev->xdev->dev);
61558 +               break;
61559 +
61560 +       case XenbusStateConnected:
61561 +               pcifront_try_connect(pdev);
61562 +               break;
61563 +
61564 +       default:
61565 +               break;
61566 +       }
61567 +}
61568 +
61569 +static int pcifront_xenbus_probe(struct xenbus_device *xdev,
61570 +                                const struct xenbus_device_id *id)
61571 +{
61572 +       int err = 0;
61573 +       struct pcifront_device *pdev = alloc_pdev(xdev);
61574 +
61575 +       if (pdev == NULL) {
61576 +               err = -ENOMEM;
61577 +               xenbus_dev_fatal(xdev, err,
61578 +                                "Error allocating pcifront_device struct");
61579 +               goto out;
61580 +       }
61581 +
61582 +       err = pcifront_publish_info(pdev);
61583 +
61584 +      out:
61585 +       return err;
61586 +}
61587 +
61588 +static int pcifront_xenbus_remove(struct xenbus_device *xdev)
61589 +{
61590 +       if (xdev->dev.driver_data)
61591 +               free_pdev(xdev->dev.driver_data);
61592 +
61593 +       return 0;
61594 +}
61595 +
61596 +static struct xenbus_device_id xenpci_ids[] = {
61597 +       {"pci"},
61598 +       {{0}},
61599 +};
61600 +
61601 +static struct xenbus_driver xenbus_pcifront_driver = {
61602 +       .name                   = "pcifront",
61603 +       .owner                  = THIS_MODULE,
61604 +       .ids                    = xenpci_ids,
61605 +       .probe                  = pcifront_xenbus_probe,
61606 +       .remove                 = pcifront_xenbus_remove,
61607 +       .otherend_changed       = pcifront_backend_changed,
61608 +};
61609 +
61610 +static int __init pcifront_init(void)
61611 +{
61612 +       int err = 0;
61613 +
61614 +       err = xenbus_register_frontend(&xenbus_pcifront_driver);
61615 +
61616 +       return err;
61617 +}
61618 +
61619 +/* Initialize after the Xen PCI Frontend Stub is initialized */
61620 +subsys_initcall(pcifront_init);
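Read together with the pciback xenbus code earlier in this patch, the two drivers meet over a handful of xenstore keys. A minimal sketch of that layout follows; the grouping into "frontend node"/"backend node" and the example values are assumptions based on the usual xenbus convention, only the key names and formats come from the code:

    frontend node (written by pcifront_publish_info):
        pci-op-ref    = grant reference of the shared xen_pci_sharedinfo page
        event-channel = unbound event-channel port
        magic         = XEN_PCI_MAGIC

    backend node (written by the toolstack and pciback):
        num_devs = "1"
        dev-0    = "0000:00:01.0"      (parsed as %x:%x:%x.%x, example value)
        root_num = "1"
        root-0   = "0000:00"           (written as %04x:%02x, example value)

pcifront_try_connect() then reads root_num/root-N from the backend's node and scans one PCI root bus per entry.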
61621 diff -urNp linux-2.6/drivers/xen/privcmd/Makefile new/drivers/xen/privcmd/Makefile
61622 --- linux-2.6/drivers/xen/privcmd/Makefile      1970-01-01 01:00:00.000000000 +0100
61623 +++ new/drivers/xen/privcmd/Makefile    2006-06-28 14:32:14.000000000 +0200
61624 @@ -0,0 +1,2 @@
61625 +
61626 +obj-$(CONFIG_XEN_PRIVCMD)      := privcmd.o
61627 diff -urNp linux-2.6/drivers/xen/privcmd/privcmd.c new/drivers/xen/privcmd/privcmd.c
61628 --- linux-2.6/drivers/xen/privcmd/privcmd.c     1970-01-01 01:00:00.000000000 +0100
61629 +++ new/drivers/xen/privcmd/privcmd.c   2006-06-28 14:32:14.000000000 +0200
61630 @@ -0,0 +1,285 @@
61631 +/******************************************************************************
61632 + * privcmd.c
61633 + * 
61634 + * Interface to privileged domain-0 commands.
61635 + * 
61636 + * Copyright (c) 2002-2004, K A Fraser, B Dragovic
61637 + */
61638 +
61639 +#include <linux/config.h>
61640 +#include <linux/kernel.h>
61641 +#include <linux/sched.h>
61642 +#include <linux/slab.h>
61643 +#include <linux/string.h>
61644 +#include <linux/errno.h>
61645 +#include <linux/mm.h>
61646 +#include <linux/mman.h>
61647 +#include <linux/swap.h>
61648 +#include <linux/smp_lock.h>
61649 +#include <linux/highmem.h>
61650 +#include <linux/pagemap.h>
61651 +#include <linux/seq_file.h>
61652 +#include <linux/kthread.h>
61653 +#include <asm/hypervisor.h>
61654 +
61655 +#include <asm/pgalloc.h>
61656 +#include <asm/pgtable.h>
61657 +#include <asm/uaccess.h>
61658 +#include <asm/tlb.h>
61659 +#include <asm/hypervisor.h>
61660 +#include <xen/public/privcmd.h>
61661 +#include <xen/interface/xen.h>
61662 +#include <xen/interface/dom0_ops.h>
61663 +#include <xen/xen_proc.h>
61664 +
61665 +static struct proc_dir_entry *privcmd_intf;
61666 +static struct proc_dir_entry *capabilities_intf;
61667 +
61668 +#define NR_HYPERCALLS 64
61669 +static DECLARE_BITMAP(hypercall_permission_map, NR_HYPERCALLS);
61670 +
61671 +static int privcmd_ioctl(struct inode *inode, struct file *file,
61672 +                        unsigned int cmd, unsigned long data)
61673 +{
61674 +       int ret = -ENOSYS;
61675 +       void __user *udata = (void __user *) data;
61676 +
61677 +       switch (cmd) {
61678 +       case IOCTL_PRIVCMD_HYPERCALL: {
61679 +               privcmd_hypercall_t hypercall;
61680 +  
61681 +               if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
61682 +                       return -EFAULT;
61683 +
61684 +               /* Check hypercall number for validity. */
61685 +               if (hypercall.op >= NR_HYPERCALLS)
61686 +                       return -EINVAL;
61687 +               if (!test_bit(hypercall.op, hypercall_permission_map))
61688 +                       return -EINVAL;
61689 +
61690 +#if defined(__i386__)
61691 +               __asm__ __volatile__ (
61692 +                       "pushl %%ebx; pushl %%ecx; pushl %%edx; "
61693 +                       "pushl %%esi; pushl %%edi; "
61694 +                       "movl  8(%%eax),%%ebx ;"
61695 +                       "movl 16(%%eax),%%ecx ;"
61696 +                       "movl 24(%%eax),%%edx ;"
61697 +                       "movl 32(%%eax),%%esi ;"
61698 +                       "movl 40(%%eax),%%edi ;"
61699 +                       "movl   (%%eax),%%eax ;"
61700 +                       "shll $5,%%eax ;"
61701 +                       "addl $hypercall_page,%%eax ;"
61702 +                       "call *%%eax ;"
61703 +                       "popl %%edi; popl %%esi; popl %%edx; "
61704 +                       "popl %%ecx; popl %%ebx"
61705 +                       : "=a" (ret) : "0" (&hypercall) : "memory" );
61706 +#elif defined (__x86_64__)
61707 +               {
61708 +                       long ign1, ign2, ign3;
61709 +                       __asm__ __volatile__ (
61710 +                               "movq %8,%%r10; movq %9,%%r8;"
61711 +                               "shlq $5,%%rax ;"
61712 +                               "addq $hypercall_page,%%rax ;"
61713 +                               "call *%%rax"
61714 +                               : "=a" (ret), "=D" (ign1),
61715 +                                 "=S" (ign2), "=d" (ign3)
61716 +                               : "0" ((unsigned long)hypercall.op), 
61717 +                               "1" ((unsigned long)hypercall.arg[0]), 
61718 +                               "2" ((unsigned long)hypercall.arg[1]),
61719 +                               "3" ((unsigned long)hypercall.arg[2]), 
61720 +                               "g" ((unsigned long)hypercall.arg[3]),
61721 +                               "g" ((unsigned long)hypercall.arg[4])
61722 +                               : "r8", "r10", "memory" );
61723 +               }
61724 +#elif defined (__ia64__)
61725 +               __asm__ __volatile__ (
61726 +                       ";; mov r14=%2; mov r15=%3; "
61727 +                       "mov r16=%4; mov r17=%5; mov r18=%6;"
61728 +                       "mov r2=%1; break 0x1000;; mov %0=r8 ;;"
61729 +                       : "=r" (ret)
61730 +                       : "r" (hypercall.op),
61731 +                       "r" (hypercall.arg[0]),
61732 +                       "r" (hypercall.arg[1]),
61733 +                       "r" (hypercall.arg[2]),
61734 +                       "r" (hypercall.arg[3]),
61735 +                       "r" (hypercall.arg[4])
61736 +                       : "r14","r15","r16","r17","r18","r2","r8","memory");
61737 +#endif
61738 +       }
61739 +       break;
61740 +
61741 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST)
61742 +       case IOCTL_PRIVCMD_MMAP: {
61743 +#define PRIVCMD_MMAP_SZ 32
61744 +               privcmd_mmap_t mmapcmd;
61745 +               privcmd_mmap_entry_t msg[PRIVCMD_MMAP_SZ];
61746 +               privcmd_mmap_entry_t __user *p;
61747 +               int i, rc;
61748 +
61749 +               if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
61750 +                       return -EFAULT;
61751 +
61752 +               p = mmapcmd.entry;
61753 +
61754 +               for (i = 0; i < mmapcmd.num;
61755 +                    i += PRIVCMD_MMAP_SZ, p += PRIVCMD_MMAP_SZ) {
61756 +                       int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)?
61757 +                               PRIVCMD_MMAP_SZ:(mmapcmd.num-i);
61758 +
61759 +                       if (copy_from_user(&msg, p,
61760 +                                          n*sizeof(privcmd_mmap_entry_t)))
61761 +                               return -EFAULT;
61762 +     
61763 +                       for (j = 0; j < n; j++) {
61764 +                               struct vm_area_struct *vma = 
61765 +                                       find_vma( current->mm, msg[j].va );
61766 +
61767 +                               if (!vma)
61768 +                                       return -EINVAL;
61769 +
61770 +                               if (msg[j].va > PAGE_OFFSET)
61771 +                                       return -EINVAL;
61772 +
61773 +                               if ((msg[j].va + (msg[j].npages << PAGE_SHIFT))
61774 +                                   > vma->vm_end )
61775 +                                       return -EINVAL;
61776 +
61777 +                               if ((rc = direct_remap_pfn_range(
61778 +                                       vma,
61779 +                                       msg[j].va&PAGE_MASK, 
61780 +                                       msg[j].mfn, 
61781 +                                       msg[j].npages<<PAGE_SHIFT, 
61782 +                                       vma->vm_page_prot,
61783 +                                       mmapcmd.dom)) < 0)
61784 +                                       return rc;
61785 +                       }
61786 +               }
61787 +               ret = 0;
61788 +       }
61789 +       break;
61790 +
61791 +       case IOCTL_PRIVCMD_MMAPBATCH: {
61792 +               privcmd_mmapbatch_t m;
61793 +               struct vm_area_struct *vma = NULL;
61794 +               xen_pfn_t __user *p;
61795 +               unsigned long addr, mfn; 
61796 +               int i;
61797 +
61798 +               if (copy_from_user(&m, udata, sizeof(m))) {
61799 +                       ret = -EFAULT;
61800 +                       goto batch_err;
61801 +               }
61802 +
61803 +               if (m.dom == DOMID_SELF) {
61804 +                       ret = -EINVAL;
61805 +                       goto batch_err;
61806 +               }
61807 +
61808 +               vma = find_vma(current->mm, m.addr);
61809 +               if (!vma) {
61810 +                       ret = -EINVAL;
61811 +                       goto batch_err;
61812 +               }
61813 +
61814 +               if (m.addr > PAGE_OFFSET) {
61815 +                       ret = -EFAULT;
61816 +                       goto batch_err;
61817 +               }
61818 +
61819 +               if ((m.addr + (m.num<<PAGE_SHIFT)) > vma->vm_end) {
61820 +                       ret = -EFAULT;
61821 +                       goto batch_err;
61822 +               }
61823 +
61824 +               p = m.arr;
61825 +               addr = m.addr;
61826 +               for (i = 0; i < m.num; i++, addr += PAGE_SIZE, p++) {
61827 +                       if (get_user(mfn, p))
61828 +                               return -EFAULT;
61829 +
61830 +                       ret = direct_remap_pfn_range(vma, addr & PAGE_MASK,
61831 +                                                    mfn, PAGE_SIZE,
61832 +                                                    vma->vm_page_prot, m.dom);
61833 +                       if (ret < 0)
61834 +                               put_user(0xF0000000 | mfn, p);
61835 +               }
61836 +
61837 +               ret = 0;
61838 +               break;
61839 +
61840 +       batch_err:
61841 +               printk("batch_err ret=%d vma=%p addr=%lx "
61842 +                      "num=%d arr=%p %lx-%lx\n", 
61843 +                      ret, vma, (unsigned long)m.addr, m.num, m.arr,
61844 +                      vma ? vma->vm_start : 0, vma ? vma->vm_end : 0);
61845 +               break;
61846 +       }
61847 +       break;
61848 +#endif
61849 +
61850 +       default:
61851 +               ret = -EINVAL;
61852 +               break;
61853 +       }
61854 +
61855 +       return ret;
61856 +}
61857 +
61858 +#ifndef HAVE_ARCH_PRIVCMD_MMAP
61859 +static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
61860 +{
61861 +       /* DONTCOPY is essential for Xen as copy_page_range is broken. */
61862 +       vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
61863 +
61864 +       return 0;
61865 +}
61866 +#endif
61867 +
61868 +static struct file_operations privcmd_file_ops = {
61869 +       .ioctl = privcmd_ioctl,
61870 +       .mmap  = privcmd_mmap,
61871 +};
61872 +
61873 +static int capabilities_read(char *page, char **start, off_t off,
61874 +                            int count, int *eof, void *data)
61875 +{
61876 +       int len = 0;
61877 +       *page = 0;
61878 +
61879 +       if (xen_start_info->flags & SIF_INITDOMAIN)
61880 +               len = sprintf( page, "control_d\n" );
61881 +
61882 +       *eof = 1;
61883 +       return len;
61884 +}
61885 +
61886 +static int __init privcmd_init(void)
61887 +{
61888 +       if (!is_running_on_xen())
61889 +               return -ENODEV;
61890 +
61891 +       /* Set of hypercalls that privileged applications may execute. */
61892 +       set_bit(__HYPERVISOR_acm_op,           hypercall_permission_map);
61893 +       set_bit(__HYPERVISOR_dom0_op,          hypercall_permission_map);
61894 +       set_bit(__HYPERVISOR_event_channel_op, hypercall_permission_map);
61895 +       set_bit(__HYPERVISOR_memory_op,        hypercall_permission_map);
61896 +       set_bit(__HYPERVISOR_mmu_update,       hypercall_permission_map);
61897 +       set_bit(__HYPERVISOR_mmuext_op,        hypercall_permission_map);
61898 +       set_bit(__HYPERVISOR_xen_version,      hypercall_permission_map);
61899 +       set_bit(__HYPERVISOR_sched_op,         hypercall_permission_map);
61900 +       set_bit(__HYPERVISOR_sched_op_compat,  hypercall_permission_map);
61901 +       set_bit(__HYPERVISOR_event_channel_op_compat,
61902 +               hypercall_permission_map);
61903 +
61904 +       privcmd_intf = create_xen_proc_entry("privcmd", 0400);
61905 +       if (privcmd_intf != NULL)
61906 +               privcmd_intf->proc_fops = &privcmd_file_ops;
61907 +
61908 +       capabilities_intf = create_xen_proc_entry("capabilities", 0400 );
61909 +       if (capabilities_intf != NULL)
61910 +               capabilities_intf->read_proc = capabilities_read;
61911 +
61912 +       return 0;
61913 +}
61914 +
61915 +__initcall(privcmd_init);
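As a usage illustration only (not part of the patch), here is a minimal dom0 user-space sketch of the hypercall ioctl implemented above. The userspace header paths and the assumption that create_xen_proc_entry() exposes the node as /proc/xen/privcmd are mine; note that only hypercalls whitelisted in hypercall_permission_map are accepted:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xen/public/privcmd.h>    /* privcmd_hypercall_t, IOCTL_PRIVCMD_HYPERCALL (assumed path) */
#include <xen/interface/xen.h>     /* __HYPERVISOR_xen_version (assumed path) */

int main(void)
{
        privcmd_hypercall_t call = {
                .op  = __HYPERVISOR_xen_version,
                .arg = { 0 /* XENVER_version */ },
        };
        long ret;
        int fd = open("/proc/xen/privcmd", O_RDWR);

        if (fd < 0)
                return 1;
        /* The ioctl returns the hypercall's return value (here the Xen
         * version number), or a negative value on failure. */
        ret = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
        printf("xen_version hypercall returned %ld\n", ret);
        close(fd);
        return 0;
}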
61916 diff -urNp linux-2.6/drivers/xen/tpmback/common.h new/drivers/xen/tpmback/common.h
61917 --- linux-2.6/drivers/xen/tpmback/common.h      1970-01-01 01:00:00.000000000 +0100
61918 +++ new/drivers/xen/tpmback/common.h    2006-07-07 15:10:03.000000000 +0200
61919 @@ -0,0 +1,85 @@
61920 +/******************************************************************************
61921 + * drivers/xen/tpmback/common.h
61922 + */
61923 +
61924 +#ifndef __TPMIF__BACKEND__COMMON_H__
61925 +#define __TPMIF__BACKEND__COMMON_H__
61926 +
61927 +#include <linux/config.h>
61928 +#include <linux/version.h>
61929 +#include <linux/module.h>
61930 +#include <linux/interrupt.h>
61931 +#include <linux/slab.h>
61932 +#include <xen/evtchn.h>
61933 +#include <xen/driver_util.h>
61934 +#include <xen/interface/grant_table.h>
61935 +#include <xen/interface/io/tpmif.h>
61936 +#include <asm/io.h>
61937 +#include <asm/pgalloc.h>
61938 +
61939 +#define DPRINTK(_f, _a...)                     \
61940 +       pr_debug("(file=%s, line=%d) " _f,      \
61941 +                __FILE__ , __LINE__ , ## _a )
61942 +
61943 +struct backend_info;
61944 +
61945 +typedef struct tpmif_st {
61946 +       struct list_head tpmif_list;
61947 +       /* Unique identifier for this interface. */
61948 +       domid_t domid;
61949 +       unsigned int handle;
61950 +
61951 +       /* Physical parameters of the comms window. */
61952 +       unsigned int evtchn;
61953 +       unsigned int irq;
61954 +
61955 +       /* The shared rings and indexes. */
61956 +       tpmif_tx_interface_t *tx;
61957 +       struct vm_struct *tx_area;
61958 +
61959 +       /* Miscellaneous private stuff. */
61960 +       enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
61961 +       int active;
61962 +
61963 +       struct tpmif_st *hash_next;
61964 +       struct list_head list;  /* scheduling list */
61965 +       atomic_t refcnt;
61966 +
61967 +       struct backend_info *bi;
61968 +       unsigned long mmap_vstart;
61969 +
61970 +       grant_handle_t shmem_handle;
61971 +       grant_ref_t shmem_ref;
61972 +       struct page *pagerange;
61973 +
61974 +       char devname[20];
61975 +} tpmif_t;
61976 +
61977 +void tpmif_disconnect_complete(tpmif_t * tpmif);
61978 +tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi);
61979 +void tpmif_interface_init(void);
61980 +void tpmif_interface_exit(void);
61981 +void tpmif_schedule_work(tpmif_t * tpmif);
61982 +void tpmif_deschedule_work(tpmif_t * tpmif);
61983 +void tpmif_xenbus_init(void);
61984 +void tpmif_xenbus_exit(void);
61985 +int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
61986 +irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
61987 +
61988 +long int tpmback_get_instance(struct backend_info *bi);
61989 +
61990 +int vtpm_release_packets(tpmif_t * tpmif, int send_msgs);
61991 +
61992 +
61993 +#define tpmif_get(_b) (atomic_inc(&(_b)->refcnt))
61994 +#define tpmif_put(_b)                                  \
61995 +       do {                                            \
61996 +               if (atomic_dec_and_test(&(_b)->refcnt)) \
61997 +                       tpmif_disconnect_complete(_b);  \
61998 +       } while (0)
61999 +
62000 +extern int num_frontends;
62001 +
62002 +#define MMAP_VADDR(t,_req) ((t)->mmap_vstart + ((_req) * PAGE_SIZE))
62003 +
62004 +#endif /* __TPMIF__BACKEND__COMMON_H__ */
62005 diff -urNp linux-2.6/drivers/xen/tpmback/interface.c new/drivers/xen/tpmback/interface.c
62006 --- linux-2.6/drivers/xen/tpmback/interface.c   1970-01-01 01:00:00.000000000 +0100
62007 +++ new/drivers/xen/tpmback/interface.c 2006-07-07 15:10:03.000000000 +0200
62008 @@ -0,0 +1,177 @@
62009 + /*****************************************************************************
62010 + * drivers/xen/tpmback/interface.c
62011 + *
62012 + * Virtual TPM interface management.
62013 + *
62014 + * Copyright (c) 2005, IBM Corporation
62015 + *
62016 + * Author: Stefan Berger, stefanb@us.ibm.com
62017 + *
62018 + * This code has been derived from drivers/xen/netback/interface.c
62019 + * Copyright (c) 2004, Keir Fraser
62020 + */
62021 +
62022 +#include "common.h"
62023 +#include <xen/balloon.h>
62024 +#include <xen/gnttab.h>
62025 +
62026 +static kmem_cache_t *tpmif_cachep;
62027 +int num_frontends = 0;
62028 +
62029 +LIST_HEAD(tpmif_list);
62030 +
62031 +static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi)
62032 +{
62033 +       tpmif_t *tpmif;
62034 +
62035 +       tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL);
62036 +       if (!tpmif)
62037 +               return ERR_PTR(-ENOMEM);
62038 +
62039 +       memset(tpmif, 0, sizeof (*tpmif));
62040 +       tpmif->domid = domid;
62041 +       tpmif->status = DISCONNECTED;
62042 +       tpmif->bi = bi;
62043 +       snprintf(tpmif->devname, sizeof(tpmif->devname), "tpmif%d", domid);
62044 +       atomic_set(&tpmif->refcnt, 1);
62045 +
62046 +       tpmif->pagerange = balloon_alloc_empty_page_range(TPMIF_TX_RING_SIZE);
62047 +       BUG_ON(tpmif->pagerange == NULL);
62048 +       tpmif->mmap_vstart = (unsigned long)pfn_to_kaddr(
62049 +                                           page_to_pfn(tpmif->pagerange));
62050 +
62051 +       list_add(&tpmif->tpmif_list, &tpmif_list);
62052 +       num_frontends++;
62053 +
62054 +       return tpmif;
62055 +}
62056 +
62057 +static void free_tpmif(tpmif_t * tpmif)
62058 +{
62059 +       num_frontends--;
62060 +       list_del(&tpmif->tpmif_list);
62061 +       balloon_dealloc_empty_page_range(tpmif->pagerange, TPMIF_TX_RING_SIZE);
62062 +       kmem_cache_free(tpmif_cachep, tpmif);
62063 +}
62064 +
62065 +tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi)
62066 +{
62067 +       tpmif_t *tpmif;
62068 +
62069 +       list_for_each_entry(tpmif, &tpmif_list, tpmif_list) {
62070 +               if (tpmif->bi == bi) {
62071 +                       if (tpmif->domid == domid) {
62072 +                               tpmif_get(tpmif);
62073 +                               return tpmif;
62074 +                       } else {
62075 +                               return ERR_PTR(-EEXIST);
62076 +                       }
62077 +               }
62078 +       }
62079 +
62080 +       return alloc_tpmif(domid, bi);
62081 +}
62082 +
62083 +static int map_frontend_page(tpmif_t *tpmif, unsigned long shared_page)
62084 +{
62085 +       int ret;
62086 +       struct gnttab_map_grant_ref op;
62087 +
62088 +       gnttab_set_map_op(&op, (unsigned long)tpmif->tx_area->addr,
62089 +                         GNTMAP_host_map, shared_page, tpmif->domid);
62090 +
62091 +       lock_vm_area(tpmif->tx_area);
62092 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
62093 +       unlock_vm_area(tpmif->tx_area);
62094 +       BUG_ON(ret);
62095 +
62096 +       if (op.status) {
62097 +               DPRINTK(" Grant table operation failure !\n");
62098 +               return op.status;
62099 +       }
62100 +
62101 +       tpmif->shmem_ref = shared_page;
62102 +       tpmif->shmem_handle = op.handle;
62103 +
62104 +       return 0;
62105 +}
62106 +
62107 +static void unmap_frontend_page(tpmif_t *tpmif)
62108 +{
62109 +       struct gnttab_unmap_grant_ref op;
62110 +       int ret;
62111 +
62112 +       gnttab_set_unmap_op(&op, (unsigned long)tpmif->tx_area->addr,
62113 +                           GNTMAP_host_map, tpmif->shmem_handle);
62114 +
62115 +       lock_vm_area(tpmif->tx_area);
62116 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
62117 +       unlock_vm_area(tpmif->tx_area);
62118 +       BUG_ON(ret);
62119 +}
62120 +
62121 +int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn)
62122 +{
62123 +       int err;
62124 +       struct evtchn_bind_interdomain bind_interdomain;
62125 +
62126 +       if (tpmif->irq) {
62127 +               return 0;
62128 +       }
62129 +
62130 +       if ((tpmif->tx_area = alloc_vm_area(PAGE_SIZE)) == NULL)
62131 +               return -ENOMEM;
62132 +
62133 +       err = map_frontend_page(tpmif, shared_page);
62134 +       if (err) {
62135 +               free_vm_area(tpmif->tx_area);
62136 +               return err;
62137 +       }
62138 +
62139 +
62140 +       bind_interdomain.remote_dom  = tpmif->domid;
62141 +       bind_interdomain.remote_port = evtchn;
62142 +
62143 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
62144 +                                         &bind_interdomain);
62145 +       if (err) {
62146 +               unmap_frontend_page(tpmif);
62147 +               free_vm_area(tpmif->tx_area);
62148 +               return err;
62149 +       }
62150 +
62151 +       tpmif->evtchn = bind_interdomain.local_port;
62152 +
62153 +       tpmif->tx = (tpmif_tx_interface_t *)tpmif->tx_area->addr;
62154 +
62155 +       tpmif->irq = bind_evtchn_to_irqhandler(
62156 +               tpmif->evtchn, tpmif_be_int, 0, tpmif->devname, tpmif);
62157 +       tpmif->shmem_ref = shared_page;
62158 +       tpmif->active = 1;
62159 +
62160 +       return 0;
62161 +}
62162 +
62163 +void tpmif_disconnect_complete(tpmif_t *tpmif)
62164 +{
62165 +       if (tpmif->irq)
62166 +               unbind_from_irqhandler(tpmif->irq, tpmif);
62167 +
62168 +       if (tpmif->tx) {
62169 +               unmap_frontend_page(tpmif);
62170 +               free_vm_area(tpmif->tx_area);
62171 +       }
62172 +
62173 +       free_tpmif(tpmif);
62174 +}
62175 +
62176 +void __init tpmif_interface_init(void)
62177 +{
62178 +       tpmif_cachep = kmem_cache_create("tpmif_cache", sizeof (tpmif_t),
62179 +                                        0, 0, NULL, NULL);
62180 +}
62181 +
62182 +void __exit tpmif_interface_exit(void)
62183 +{
62184 +       kmem_cache_destroy(tpmif_cachep);
62185 +}
62186 diff -urNp linux-2.6/drivers/xen/tpmback/Makefile new/drivers/xen/tpmback/Makefile
62187 --- linux-2.6/drivers/xen/tpmback/Makefile      1970-01-01 01:00:00.000000000 +0100
62188 +++ new/drivers/xen/tpmback/Makefile    2006-05-09 12:34:39.000000000 +0200
62189 @@ -0,0 +1,4 @@
62190 +
62191 +obj-$(CONFIG_XEN_TPMDEV_BACKEND)       += tpmbk.o
62192 +
62193 +tpmbk-y += tpmback.o interface.o xenbus.o
62194 diff -urNp linux-2.6/drivers/xen/tpmback/tpmback.c new/drivers/xen/tpmback/tpmback.c
62195 --- linux-2.6/drivers/xen/tpmback/tpmback.c     1970-01-01 01:00:00.000000000 +0100
62196 +++ new/drivers/xen/tpmback/tpmback.c   2006-07-07 15:10:03.000000000 +0200
62197 @@ -0,0 +1,983 @@
62198 +/******************************************************************************
62199 + * drivers/xen/tpmback/tpmback.c
62200 + *
62201 + * Copyright (c) 2005, IBM Corporation
62202 + *
62203 + * Author: Stefan Berger, stefanb@us.ibm.com
62204 + * Grant table support: Mahadevan Gomathisankaran
62205 + *
62206 + * This code has been derived from drivers/xen/netback/netback.c
62207 + * Copyright (c) 2002-2004, K A Fraser
62208 + *
62209 + */
62210 +
62211 +#include "common.h"
62212 +#include <xen/evtchn.h>
62213 +
62214 +#include <linux/types.h>
62215 +#include <linux/list.h>
62216 +#include <linux/miscdevice.h>
62217 +#include <linux/poll.h>
62218 +#include <asm/uaccess.h>
62219 +#include <xen/xenbus.h>
62220 +#include <xen/interface/grant_table.h>
62221 +#include <xen/gnttab.h>
62222 +
62223 +/* local data structures */
62224 +struct data_exchange {
62225 +       struct list_head pending_pak;
62226 +       struct list_head current_pak;
62227 +       unsigned int copied_so_far;
62228 +       u8 has_opener:1;
62229 +       u8 aborted:1;
62230 +       rwlock_t pak_lock;      // protects all of the previous fields
62231 +       wait_queue_head_t wait_queue;
62232 +};
62233 +
62234 +struct vtpm_resp_hdr {
62235 +       uint32_t instance_no;
62236 +       uint16_t tag_no;
62237 +       uint32_t len_no;
62238 +       uint32_t ordinal_no;
62239 +} __attribute__ ((packed));
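+
+/*
+ * vtpm_resp_hdr mirrors the first 14 bytes written by user space for a
+ * response: 4-byte instance number, 2-byte tag, 4-byte paramSize and
+ * 4-byte ordinal, all in network byte order (see vtpm_op_write() below).
+ */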
62240 +
62241 +struct packet {
62242 +       struct list_head next;
62243 +       unsigned int data_len;
62244 +       u8 *data_buffer;
62245 +       tpmif_t *tpmif;
62246 +       u32 tpm_instance;
62247 +       u8 req_tag;
62248 +       u32 last_read;
62249 +       u8 flags;
62250 +       struct timer_list processing_timer;
62251 +};
62252 +
62253 +enum {
62254 +       PACKET_FLAG_DISCARD_RESPONSE = 1,
62255 +       PACKET_FLAG_CHECK_RESPONSESTATUS = 2,
62256 +};
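+
+/*
+ * Packet flow: requests received from a frontend are queued on
+ * dataex.pending_pak until the user-space reader (typically the vTPM
+ * manager) picks them up via read(); fully read packets move to
+ * dataex.current_pak, where they wait for the matching response write().
+ * A per-packet timer sends a TPM_FAIL response if processing takes too
+ * long.
+ */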
62257 +
62258 +/* local variables */
62259 +static struct data_exchange dataex;
62260 +
62261 +/* local function prototypes */
62262 +static int _packet_write(struct packet *pak,
62263 +                        const char *data, size_t size, int userbuffer);
62264 +static void processing_timeout(unsigned long ptr);
62265 +static int packet_read_shmem(struct packet *pak,
62266 +                            tpmif_t * tpmif,
62267 +                            u32 offset,
62268 +                            char *buffer, int isuserbuffer, u32 left);
62269 +static int vtpm_queue_packet(struct packet *pak);
62270 +
62271 +/***************************************************************
62272 + Buffer copying for user and kernel space buffers.
62273 +***************************************************************/
62274 +static inline int copy_from_buffer(void *to,
62275 +                                  const void *from, unsigned long size,
62276 +                                  int isuserbuffer)
62277 +{
62278 +       if (isuserbuffer) {
62279 +               if (copy_from_user(to, (void __user *)from, size))
62280 +                       return -EFAULT;
62281 +       } else {
62282 +               memcpy(to, from, size);
62283 +       }
62284 +       return 0;
62285 +}
62286 +
62287 +static inline int copy_to_buffer(void *to,
62288 +                                const void *from, unsigned long size,
62289 +                                int isuserbuffer)
62290 +{
62291 +       if (isuserbuffer) {
62292 +               if (copy_to_user((void __user *)to, from, size))
62293 +                       return -EFAULT;
62294 +       } else {
62295 +               memcpy(to, from, size);
62296 +       }
62297 +       return 0;
62298 +}
62299 +
62300 +
62301 +static void dataex_init(struct data_exchange *dataex)
62302 +{
62303 +       INIT_LIST_HEAD(&dataex->pending_pak);
62304 +       INIT_LIST_HEAD(&dataex->current_pak);
62305 +       dataex->has_opener = 0;
62306 +       rwlock_init(&dataex->pak_lock);
62307 +       init_waitqueue_head(&dataex->wait_queue);
62308 +}
62309 +
62310 +/***************************************************************
62311 + Packet-related functions
62312 +***************************************************************/
62313 +
62314 +static struct packet *packet_find_instance(struct list_head *head,
62315 +                                          u32 tpm_instance)
62316 +{
62317 +       struct packet *pak;
62318 +       struct list_head *p;
62319 +
62320 +       /*
62321 +        * traverse the list of packets and return the first
62322 +        * one with the given instance number
62323 +        */
62324 +       list_for_each(p, head) {
62325 +               pak = list_entry(p, struct packet, next);
62326 +
62327 +               if (pak->tpm_instance == tpm_instance) {
62328 +                       return pak;
62329 +               }
62330 +       }
62331 +       return NULL;
62332 +}
62333 +
62334 +static struct packet *packet_find_packet(struct list_head *head, void *packet)
62335 +{
62336 +       struct packet *pak;
62337 +       struct list_head *p;
62338 +
62339 +       /*
62340 +        * traverse the list of packets and return the given
62341 +        * packet if it is still on the list
62342 +        */
62343 +       list_for_each(p, head) {
62344 +               pak = list_entry(p, struct packet, next);
62345 +
62346 +               if (pak == packet) {
62347 +                       return pak;
62348 +               }
62349 +       }
62350 +       return NULL;
62351 +}
62352 +
62353 +static struct packet *packet_alloc(tpmif_t * tpmif,
62354 +                                  u32 size, u8 req_tag, u8 flags)
62355 +{
62356 +       struct packet *pak = NULL;
62357 +       pak = kzalloc(sizeof (struct packet), GFP_ATOMIC);
62358 +       if (NULL != pak) {
62359 +               if (tpmif) {
62360 +                       pak->tpmif = tpmif;
62361 +                       pak->tpm_instance = tpmback_get_instance(tpmif->bi);
62362 +                       tpmif_get(tpmif);
62363 +               }
62364 +               pak->data_len = size;
62365 +               pak->req_tag = req_tag;
62366 +               pak->last_read = 0;
62367 +               pak->flags = flags;
62368 +
62369 +               /*
62370 +                * cannot do tpmif_get(tpmif); bad things happen
62371 +                * the reference taken via tpmif_get() above is
62372 +                * released again by packet_free()
62373 +               init_timer(&pak->processing_timer);
62374 +               pak->processing_timer.function = processing_timeout;
62375 +               pak->processing_timer.data = (unsigned long)pak;
62376 +       }
62377 +       return pak;
62378 +}
62379 +
62380 +static inline void packet_reset(struct packet *pak)
62381 +{
62382 +       pak->last_read = 0;
62383 +}
62384 +
62385 +static void packet_free(struct packet *pak)
62386 +{
62387 +       if (timer_pending(&pak->processing_timer)) {
62388 +               BUG();
62389 +       }
62390 +
62391 +       if (pak->tpmif)
62392 +               tpmif_put(pak->tpmif);
62393 +       kfree(pak->data_buffer);
62394 +       /*
62395 +        * the tpmif_put() above drops the reference taken in
62396 +        * packet_alloc()
62397 +        */
62398 +       kfree(pak);
62399 +}
62400 +
62401 +static int packet_set(struct packet *pak,
62402 +                     const unsigned char *buffer, u32 size)
62403 +{
62404 +       int rc = 0;
62405 +       unsigned char *buf = kmalloc(size, GFP_KERNEL);
62406 +
62407 +       if (buf) {
62408 +               pak->data_buffer = buf;
62409 +               memcpy(buf, buffer, size);
62410 +               pak->data_len = size;
62411 +       } else {
62412 +               rc = -ENOMEM;
62413 +       }
62414 +       return rc;
62415 +}
62416 +
62417 +/*
62418 + * Write data to the shared memory and send it to the FE.
62419 + */
62420 +static int packet_write(struct packet *pak,
62421 +                       const char *data, size_t size, int isuserbuffer)
62422 +{
62423 +       int rc = 0;
62424 +
62425 +       if ((pak->flags & PACKET_FLAG_CHECK_RESPONSESTATUS)) {
62426 +#ifdef CONFIG_XEN_TPMDEV_CLOSE_IF_VTPM_FAILS
62427 +               u32 res;
62428 +
62429 +               if (copy_from_buffer(&res,
62430 +                                    &data[2 + 4], sizeof (res),
62431 +                                    isuserbuffer)) {
62432 +                       return -EFAULT;
62433 +               }
62434 +
62435 +               if (res != 0) {
62436 +                       /*
62437 +                        * Close down this device. The FE should be
62438 +                        * notified about the closure.
62439 +                        */
62440 +                       if (!pak->tpmif) {
62441 +                               return -EFAULT;
62442 +                       }
62443 +                       pak->tpmif->status = DISCONNECTING;
62444 +               }
62445 +#endif
62446 +       }
62447 +
62448 +       if (0 != (pak->flags & PACKET_FLAG_DISCARD_RESPONSE)) {
62449 +               /* Don't send a response to this packet. Just acknowledge it. */
62450 +               rc = size;
62451 +       } else {
62452 +               rc = _packet_write(pak, data, size, isuserbuffer);
62453 +       }
62454 +
62455 +       return rc;
62456 +}
62457 +
62458 +int _packet_write(struct packet *pak,
62459 +                 const char *data, size_t size, int isuserbuffer)
62460 +{
62461 +       /*
62462 +        * Write into the shared memory pages directly
62463 +        * and send it to the front end.
62464 +        */
62465 +       tpmif_t *tpmif = pak->tpmif;
62466 +       grant_handle_t handle;
62467 +       int rc = 0;
62468 +       unsigned int i = 0;
62469 +       unsigned int offset = 0;
62470 +
62471 +       if (tpmif == NULL) {
62472 +               return -EFAULT;
62473 +       }
62474 +
62475 +       if (tpmif->status == DISCONNECTED) {
62476 +               return size;
62477 +       }
62478 +
62479 +       while (offset < size && i < TPMIF_TX_RING_SIZE) {
62480 +               unsigned int tocopy;
62481 +               struct gnttab_map_grant_ref map_op;
62482 +               struct gnttab_unmap_grant_ref unmap_op;
62483 +               tpmif_tx_request_t *tx;
62484 +
62485 +               tx = &tpmif->tx->ring[i].req;
62486 +
62487 +               if (0 == tx->addr) {
62488 +                       DPRINTK("ERROR: Buffer for outgoing packet NULL?! i=%d\n", i);
62489 +                       return 0;
62490 +               }
62491 +
62492 +               gnttab_set_map_op(&map_op, MMAP_VADDR(tpmif, i),
62493 +                                 GNTMAP_host_map, tx->ref, tpmif->domid);
62494 +
62495 +               if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
62496 +                                                      &map_op, 1))) {
62497 +                       BUG();
62498 +               }
62499 +
62500 +               handle = map_op.handle;
62501 +
62502 +               if (map_op.status) {
62503 +                       DPRINTK(" Grant table operation failure !\n");
62504 +                       return 0;
62505 +               }
62506 +
62507 +               tocopy = min_t(size_t, size - offset, PAGE_SIZE);
62508 +
62509 +               if (copy_from_buffer((void *)(MMAP_VADDR(tpmif, i) |
62510 +                                             (tx->addr & ~PAGE_MASK)),
62511 +                                    &data[offset], tocopy, isuserbuffer)) {
62512 +                       tpmif_put(tpmif);
62513 +                       return -EFAULT;
62514 +               }
62515 +               tx->size = tocopy;
62516 +
62517 +               gnttab_set_unmap_op(&unmap_op, MMAP_VADDR(tpmif, i),
62518 +                                   GNTMAP_host_map, handle);
62519 +
62520 +               if (unlikely
62521 +                   (HYPERVISOR_grant_table_op
62522 +                    (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
62523 +                       BUG();
62524 +               }
62525 +
62526 +               offset += tocopy;
62527 +               i++;
62528 +       }
62529 +
62530 +       rc = offset;
62531 +       DPRINTK("Notifying frontend via irq %d\n", tpmif->irq);
62532 +       notify_remote_via_irq(tpmif->irq);
62533 +
62534 +       return rc;
62535 +}
62536 +
62537 +/*
62538 + * Read data from the shared memory and copy it directly into the
62539 + * provided buffer. Advance the last_read indicator which tells
62540 + * how many bytes have already been read.
62541 + */
62542 +static int packet_read(struct packet *pak, size_t numbytes,
62543 +                      char *buffer, size_t buffersize, int isuserbuffer)
62544 +{
62545 +       tpmif_t *tpmif = pak->tpmif;
62546 +
62547 +       /*
62548 +        * Read 'numbytes' of data from the buffer. The first 4
62549 +        * bytes are the instance number in network byte order,
62550 +        * after that come the data from the shared memory buffer.
62551 +        */
62552 +       u32 to_copy;
62553 +       u32 offset = 0;
62554 +       u32 room_left = buffersize;
62555 +
62556 +       if (pak->last_read < 4) {
62557 +               /*
62558 +                * copy the instance number into the buffer
62559 +                */
62560 +               u32 instance_no = htonl(pak->tpm_instance);
62561 +               u32 last_read = pak->last_read;
62562 +
62563 +               to_copy = min_t(size_t, 4 - last_read, numbytes);
62564 +
62565 +               if (copy_to_buffer(&buffer[0],
62566 +                                  &(((u8 *) & instance_no)[last_read]),
62567 +                                  to_copy, isuserbuffer)) {
62568 +                       return -EFAULT;
62569 +               }
62570 +
62571 +               pak->last_read += to_copy;
62572 +               offset += to_copy;
62573 +               room_left -= to_copy;
62574 +       }
62575 +
62576 +       /*
62577 +        * If the packet has a data buffer appended, read from it...
62578 +        */
62579 +
62580 +       if (room_left > 0) {
62581 +               if (pak->data_buffer) {
62582 +                       u32 to_copy = min_t(u32, pak->data_len - offset, room_left);
62583 +                       u32 last_read = pak->last_read - 4;
62584 +
62585 +                       if (copy_to_buffer(&buffer[offset],
62586 +                                          &pak->data_buffer[last_read],
62587 +                                          to_copy, isuserbuffer)) {
62588 +                               return -EFAULT;
62589 +                       }
62590 +                       pak->last_read += to_copy;
62591 +                       offset += to_copy;
62592 +               } else {
62593 +                       offset = packet_read_shmem(pak,
62594 +                                                  tpmif,
62595 +                                                  offset,
62596 +                                                  buffer,
62597 +                                                  isuserbuffer, room_left);
62598 +               }
62599 +       }
62600 +       return offset;
62601 +}
62602 +
62603 +static int packet_read_shmem(struct packet *pak,
62604 +                            tpmif_t * tpmif,
62605 +                            u32 offset, char *buffer, int isuserbuffer,
62606 +                            u32 room_left)
62607 +{
62608 +       u32 last_read = pak->last_read - 4;
62609 +       u32 i = (last_read / PAGE_SIZE);
62610 +       u32 pg_offset = last_read & (PAGE_SIZE - 1);
62611 +       u32 to_copy;
62612 +       grant_handle_t handle;
62613 +
62614 +       tpmif_tx_request_t *tx;
62615 +
62616 +       tx = &tpmif->tx->ring[0].req;
62617 +       /*
62618 +        * Start copying data at the page with index 'i'
62619 +        * and within that page at offset 'pg_offset'.
62620 +        * Copy a maximum of 'room_left' bytes.
62621 +        */
62622 +       to_copy = min_t(u32, PAGE_SIZE - pg_offset, room_left);
62623 +       while (to_copy > 0) {
62624 +               void *src;
62625 +               struct gnttab_map_grant_ref map_op;
62626 +               struct gnttab_unmap_grant_ref unmap_op;
62627 +
62628 +               tx = &tpmif->tx->ring[i].req;
62629 +
62630 +               gnttab_set_map_op(&map_op, MMAP_VADDR(tpmif, i),
62631 +                                 GNTMAP_host_map, tx->ref, tpmif->domid);
62632 +
62633 +               if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
62634 +                                                      &map_op, 1))) {
62635 +                       BUG();
62636 +               }
62637 +
62638 +               if (map_op.status) {
62639 +                       DPRINTK(" Grant table operation failure !\n");
62640 +                       return -EFAULT;
62641 +               }
62642 +
62643 +               handle = map_op.handle;
62644 +
62645 +               if (to_copy > tx->size) {
62646 +                       /*
62647 +                        * User requests more than what's available
62648 +                        */
62649 +                       to_copy = min_t(u32, tx->size, to_copy);
62650 +               }
62651 +
62652 +               DPRINTK("Copying from mapped memory at %08lx\n",
62653 +                       (unsigned long)(MMAP_VADDR(tpmif, i) |
62654 +                                       (tx->addr & ~PAGE_MASK)));
62655 +
62656 +               src = (void *)(MMAP_VADDR(tpmif, i) |
62657 +                              ((tx->addr & ~PAGE_MASK) + pg_offset));
62658 +               if (copy_to_buffer(&buffer[offset],
62659 +                                  src, to_copy, isuserbuffer)) {
62660 +                       return -EFAULT;
62661 +               }
62662 +
62663 +               DPRINTK("Data from TPM-FE of domain %d are %d %d %d %d\n",
62664 +                       tpmif->domid, buffer[offset], buffer[offset + 1],
62665 +                       buffer[offset + 2], buffer[offset + 3]);
62666 +
62667 +               gnttab_set_unmap_op(&unmap_op, MMAP_VADDR(tpmif, i),
62668 +                                   GNTMAP_host_map, handle);
62669 +
62670 +               if (unlikely
62671 +                   (HYPERVISOR_grant_table_op
62672 +                    (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
62673 +                       BUG();
62674 +               }
62675 +
62676 +               offset += to_copy;
62677 +               pg_offset = 0;
62678 +               last_read += to_copy;
62679 +               room_left -= to_copy;
62680 +
62681 +               to_copy = min_t(u32, PAGE_SIZE, room_left);
62682 +               i++;
62683 +       }                       /* while (to_copy > 0) */
62684 +       /*
62685 +        * Adjust the last_read pointer
62686 +        */
62687 +       pak->last_read = last_read + 4;
62688 +       return offset;
62689 +}
62690 +
62691 +/* ============================================================
62692 + * The file layer for reading data from this device
62693 + * ============================================================
62694 + */
62695 +static int vtpm_op_open(struct inode *inode, struct file *f)
62696 +{
62697 +       int rc = 0;
62698 +       unsigned long flags;
62699 +
62700 +       write_lock_irqsave(&dataex.pak_lock, flags);
62701 +       if (dataex.has_opener == 0) {
62702 +               dataex.has_opener = 1;
62703 +       } else {
62704 +               rc = -EPERM;
62705 +       }
62706 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
62707 +       return rc;
62708 +}
62709 +
62710 +static ssize_t vtpm_op_read(struct file *file,
62711 +                           char __user * data, size_t size, loff_t * offset)
62712 +{
62713 +       int ret_size = -ENODATA;
62714 +       struct packet *pak = NULL;
62715 +       unsigned long flags;
62716 +
62717 +       write_lock_irqsave(&dataex.pak_lock, flags);
62718 +       if (dataex.aborted) {
62719 +               dataex.aborted = 0;
62720 +               dataex.copied_so_far = 0;
62721 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
62722 +               return -EIO;
62723 +       }
62724 +
62725 +       if (list_empty(&dataex.pending_pak)) {
62726 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
62727 +               wait_event_interruptible(dataex.wait_queue,
62728 +                                        !list_empty(&dataex.pending_pak));
62729 +               write_lock_irqsave(&dataex.pak_lock, flags);
62730 +               dataex.copied_so_far = 0;
62731 +       }
62732 +
62733 +       if (!list_empty(&dataex.pending_pak)) {
62734 +               unsigned int left;
62735 +
62736 +               pak = list_entry(dataex.pending_pak.next, struct packet, next);
62737 +               left = pak->data_len - dataex.copied_so_far;
62738 +               list_del(&pak->next);
62739 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
62740 +
62741 +               DPRINTK("size given by app: %d, available: %d\n", size, left);
62742 +
62743 +               ret_size = min_t(size_t, size, left);
62744 +
62745 +               ret_size = packet_read(pak, ret_size, data, size, 1);
62746 +
62747 +               write_lock_irqsave(&dataex.pak_lock, flags);
62748 +
62749 +               if (ret_size < 0) {
62750 +                       del_singleshot_timer_sync(&pak->processing_timer);
62751 +                       packet_free(pak);
62752 +                       dataex.copied_so_far = 0;
62753 +               } else {
62754 +                       DPRINTK("Copied %d bytes to user buffer\n", ret_size);
62755 +
62756 +                       dataex.copied_so_far += ret_size;
62757 +                       if (dataex.copied_so_far >= pak->data_len + 4) {
62758 +                               DPRINTK("All data from this packet given to app.\n");
62759 +                               /* All data given to app */
62760 +
62761 +                               del_singleshot_timer_sync(&pak->
62762 +                                                         processing_timer);
62763 +                               list_add_tail(&pak->next, &dataex.current_pak);
62764 +                               /*
62765 +                                * The more frontends that are handled at the same time,
62766 +                                * the more time we give the TPM to process the request.
62767 +                                */
62768 +                               mod_timer(&pak->processing_timer,
62769 +                                         jiffies + (num_frontends * 60 * HZ));
62770 +                               dataex.copied_so_far = 0;
62771 +                       } else {
62772 +                               list_add(&pak->next, &dataex.pending_pak);
62773 +                       }
62774 +               }
62775 +       }
62776 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
62777 +
62778 +       DPRINTK("Returning result from read to app: %d\n", ret_size);
62779 +
62780 +       return ret_size;
62781 +}
62782 +
62783 +/*
62784 + * Write operation - only works after a previous read operation!
62785 + */
62786 +static ssize_t vtpm_op_write(struct file *file,
62787 +                            const char __user * data, size_t size,
62788 +                            loff_t * offset)
62789 +{
62790 +       struct packet *pak;
62791 +       int rc = 0;
62792 +       unsigned int off = 4;
62793 +       unsigned long flags;
62794 +       struct vtpm_resp_hdr vrh;
62795 +
62796 +       /*
62797 +        * Minimum required packet size is:
62798 +        * 4 bytes for instance number
62799 +        * 2 bytes for tag
62800 +        * 4 bytes for paramSize
62801 +        * 4 bytes for the ordinal
62802 +        * sum: 14 bytes
62803 +        */
62804 +       if (size < sizeof (vrh))
62805 +               return -EFAULT;
62806 +
62807 +       if (copy_from_user(&vrh, data, sizeof (vrh)))
62808 +               return -EFAULT;
62809 +
62810 +       /* malformed packet? */
62811 +       if ((off + ntohl(vrh.len_no)) != size)
62812 +               return -EFAULT;
62813 +
62814 +       write_lock_irqsave(&dataex.pak_lock, flags);
62815 +       pak = packet_find_instance(&dataex.current_pak,
62816 +                                  ntohl(vrh.instance_no));
62817 +
62818 +       if (pak == NULL) {
62819 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
62820 +               DPRINTK(KERN_ALERT "No associated packet! (inst=%d)\n",
62821 +                       ntohl(vrh.instance_no));
62822 +               return -EFAULT;
62823 +       }
62824 +
62825 +       del_singleshot_timer_sync(&pak->processing_timer);
62826 +       list_del(&pak->next);
62827 +
62828 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
62829 +
62830 +       /*
62831 +        * The first 'off' bytes must be the instance number - skip them.
62832 +        */
62833 +       size -= off;
62834 +
62835 +       rc = packet_write(pak, &data[off], size, 1);
62836 +
62837 +       if (rc > 0) {
62838 +               /* I neglected the first 4 bytes */
62839 +               rc += off;
62840 +       }
62841 +       packet_free(pak);
62842 +       return rc;
62843 +}
62844 +
62845 +static int vtpm_op_release(struct inode *inode, struct file *file)
62846 +{
62847 +       unsigned long flags;
62848 +
62849 +       vtpm_release_packets(NULL, 1);
62850 +       write_lock_irqsave(&dataex.pak_lock, flags);
62851 +       dataex.has_opener = 0;
62852 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
62853 +       return 0;
62854 +}
62855 +
62856 +static unsigned int vtpm_op_poll(struct file *file,
62857 +                                struct poll_table_struct *pts)
62858 +{
62859 +       unsigned int flags = POLLOUT | POLLWRNORM;
62860 +
62861 +       poll_wait(file, &dataex.wait_queue, pts);
62862 +       if (!list_empty(&dataex.pending_pak)) {
62863 +               flags |= POLLIN | POLLRDNORM;
62864 +       }
62865 +       return flags;
62866 +}
62867 +
62868 +static struct file_operations vtpm_ops = {
62869 +       .owner = THIS_MODULE,
62870 +       .llseek = no_llseek,
62871 +       .open = vtpm_op_open,
62872 +       .read = vtpm_op_read,
62873 +       .write = vtpm_op_write,
62874 +       .release = vtpm_op_release,
62875 +       .poll = vtpm_op_poll,
62876 +};
62877 +
62878 +static struct miscdevice vtpms_miscdevice = {
62879 +       .minor = 225,
62880 +       .name = "vtpm",
62881 +       .fops = &vtpm_ops,
62882 +};
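+
+/*
+ * Expected user-space interaction (an assumption, not enforced by this
+ * patch): a single reader opens the vtpm misc device, read()s a request
+ * as a 4-byte big-endian instance number followed by the TPM command,
+ * and later write()s back the same instance number followed by the TPM
+ * response for that instance.
+ */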
62883 +
62884 +/***************************************************************
62885 + Utility functions
62886 +***************************************************************/
62887 +
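+/*
+ * Build a minimal 10-byte TPM_FAIL response: 2-byte tag, 4-byte
+ * paramSize (0x0a) and 4-byte returnCode (TPM_FAIL).  The response tag
+ * is the request tag + 3; e.g. a 0x00c1 request tag (assumed to be
+ * TPM_TAG_RQU_COMMAND) yields a 0x00c4 response tag.
+ */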
62888 +static int tpm_send_fail_message(struct packet *pak, u8 req_tag)
62889 +{
62890 +       int rc;
62891 +       static const unsigned char tpm_error_message_fail[] = {
62892 +               0x00, 0x00,
62893 +               0x00, 0x00, 0x00, 0x0a,
62894 +               0x00, 0x00, 0x00, 0x09  /* TPM_FAIL */
62895 +       };
62896 +       unsigned char buffer[sizeof (tpm_error_message_fail)];
62897 +
62898 +       memcpy(buffer, tpm_error_message_fail,
62899 +              sizeof (tpm_error_message_fail));
62900 +       /*
62901 +        * Insert the right response tag depending on the given tag
62902 +        * All response tags are '+3' to the request tag.
62903 +        */
62904 +       buffer[1] = req_tag + 3;
62905 +
62906 +       /*
62907 +        * Write the data to shared memory and notify the front-end
62908 +        */
62909 +       rc = packet_write(pak, buffer, sizeof (buffer), 0);
62910 +
62911 +       return rc;
62912 +}
62913 +
62914 +static int _vtpm_release_packets(struct list_head *head,
62915 +                                tpmif_t * tpmif, int send_msgs)
62916 +{
62917 +       int aborted = 0;
62918 +       int c = 0;
62919 +       struct packet *pak;
62920 +       struct list_head *pos, *tmp;
62921 +
62922 +       list_for_each_safe(pos, tmp, head) {
62923 +               pak = list_entry(pos, struct packet, next);
62924 +               c += 1;
62925 +
62926 +               if (tpmif == NULL || pak->tpmif == tpmif) {
62927 +                       int can_send = 0;
62928 +
62929 +                       del_singleshot_timer_sync(&pak->processing_timer);
62930 +                       list_del(&pak->next);
62931 +
62932 +                       if (pak->tpmif && pak->tpmif->status == CONNECTED) {
62933 +                               can_send = 1;
62934 +                       }
62935 +
62936 +                       if (send_msgs && can_send) {
62937 +                               tpm_send_fail_message(pak, pak->req_tag);
62938 +                       }
62939 +                       packet_free(pak);
62940 +                       if (c == 1)
62941 +                               aborted = 1;
62942 +               }
62943 +       }
62944 +       return aborted;
62945 +}
62946 +
62947 +int vtpm_release_packets(tpmif_t * tpmif, int send_msgs)
62948 +{
62949 +       unsigned long flags;
62950 +
62951 +       write_lock_irqsave(&dataex.pak_lock, flags);
62952 +
62953 +       dataex.aborted = _vtpm_release_packets(&dataex.pending_pak,
62954 +                                              tpmif,
62955 +                                              send_msgs);
62956 +       _vtpm_release_packets(&dataex.current_pak, tpmif, send_msgs);
62957 +
62958 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
62959 +       return 0;
62960 +}
62961 +
62962 +static int vtpm_queue_packet(struct packet *pak)
62963 +{
62964 +       int rc = 0;
62965 +
62966 +       if (dataex.has_opener) {
62967 +               unsigned long flags;
62968 +
62969 +               write_lock_irqsave(&dataex.pak_lock, flags);
62970 +               list_add_tail(&pak->next, &dataex.pending_pak);
62971 +               /* give the TPM some time to pick up the request */
62972 +               mod_timer(&pak->processing_timer, jiffies + (30 * HZ));
62973 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
62974 +
62975 +               wake_up_interruptible(&dataex.wait_queue);
62976 +       } else {
62977 +               rc = -EFAULT;
62978 +       }
62979 +       return rc;
62980 +}
62981 +
62982 +static int vtpm_receive(tpmif_t * tpmif, u32 size)
62983 +{
62984 +       int rc = 0;
62985 +       unsigned char buffer[10];
62986 +       __be32 *native_size;
62987 +       struct packet *pak = packet_alloc(tpmif, size, 0, 0);
62988 +
62989 +       if (!pak)
62990 +               return -ENOMEM;
62991 +       /*
62992 +        * Read 10 bytes from the received buffer to test its
62993 +        * content for validity.
62994 +        */
62995 +       if (sizeof (buffer) != packet_read(pak,
62996 +                                          sizeof (buffer), buffer,
62997 +                                          sizeof (buffer), 0)) {
62998 +               goto failexit;
62999 +       }
63000 +       /*
63001 +        * Reset the packet read pointer so we can read all its
63002 +        * contents again.
63003 +        */
63004 +       packet_reset(pak);
63005 +
63006 +       native_size = (__force __be32 *) (&buffer[4 + 2]);
63007 +       /*
63008 +        * Verify that the size of the packet is correct
63009 +        * as indicated and that there's actually someone reading packets.
63010 +        * The minimum size of the packet is '10' for tag, size indicator
63011 +        * and ordinal.
63012 +        */
63013 +       if (size < 10 ||
63014 +           be32_to_cpu(*native_size) != size ||
63015 +           0 == dataex.has_opener || tpmif->status != CONNECTED) {
63016 +               rc = -EINVAL;
63017 +               goto failexit;
63018 +       } else {
63019 +               rc = vtpm_queue_packet(pak);
63020 +               if (rc < 0)
63021 +                       goto failexit;
63022 +       }
63023 +       return 0;
63024 +
63025 +      failexit:
63026 +       if (pak) {
63027 +               tpm_send_fail_message(pak, buffer[4 + 1]);
63028 +               packet_free(pak);
63029 +       }
63030 +       return rc;
63031 +}
63032 +
63033 +/*
63034 + * Timeout function that gets invoked when a packet has not been processed
63035 + * during the timeout period.
63036 + * The packet must be on a list when this function is invoked. This
63037 + * also means that once it is taken off a list, the timer must be
63038 + * destroyed as well.
63039 + */
63040 +static void processing_timeout(unsigned long ptr)
63041 +{
63042 +       struct packet *pak = (struct packet *)ptr;
63043 +       unsigned long flags;
63044 +
63045 +       write_lock_irqsave(&dataex.pak_lock, flags);
63046 +       /*
63047 +        * Check whether the packet is still on one of
63048 +        * the lists before acting on it.
63049 +        */
63050 +       if (pak == packet_find_packet(&dataex.pending_pak, pak) ||
63051 +           pak == packet_find_packet(&dataex.current_pak, pak)) {
63052 +               list_del(&pak->next);
63053 +               if ((pak->flags & PACKET_FLAG_DISCARD_RESPONSE) == 0) {
63054 +                       tpm_send_fail_message(pak, pak->req_tag);
63055 +               }
63056 +               packet_free(pak);
63057 +       }
63058 +
63059 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
63060 +}
63061 +
63062 +static void tpm_tx_action(unsigned long unused);
63063 +static DECLARE_TASKLET(tpm_tx_tasklet, tpm_tx_action, 0);
63064 +
63065 +static struct list_head tpm_schedule_list;
63066 +static spinlock_t tpm_schedule_list_lock;
63067 +
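+/*
+ * Interfaces with pending requests are kept on tpm_schedule_list: the
+ * interrupt handler queues the tpmif and kicks tpm_tx_tasklet, whose
+ * tpm_tx_action() drains the list and passes each request up via
+ * vtpm_receive().
+ */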
63068 +static inline void maybe_schedule_tx_action(void)
63069 +{
63070 +       smp_mb();
63071 +       tasklet_schedule(&tpm_tx_tasklet);
63072 +}
63073 +
63074 +static inline int __on_tpm_schedule_list(tpmif_t * tpmif)
63075 +{
63076 +       return tpmif->list.next != NULL;
63077 +}
63078 +
63079 +static void remove_from_tpm_schedule_list(tpmif_t * tpmif)
63080 +{
63081 +       spin_lock_irq(&tpm_schedule_list_lock);
63082 +       if (likely(__on_tpm_schedule_list(tpmif))) {
63083 +               list_del(&tpmif->list);
63084 +               tpmif->list.next = NULL;
63085 +               tpmif_put(tpmif);
63086 +       }
63087 +       spin_unlock_irq(&tpm_schedule_list_lock);
63088 +}
63089 +
63090 +static void add_to_tpm_schedule_list_tail(tpmif_t * tpmif)
63091 +{
63092 +       if (__on_tpm_schedule_list(tpmif))
63093 +               return;
63094 +
63095 +       spin_lock_irq(&tpm_schedule_list_lock);
63096 +       if (!__on_tpm_schedule_list(tpmif) && tpmif->active) {
63097 +               list_add_tail(&tpmif->list, &tpm_schedule_list);
63098 +               tpmif_get(tpmif);
63099 +       }
63100 +       spin_unlock_irq(&tpm_schedule_list_lock);
63101 +}
63102 +
63103 +void tpmif_schedule_work(tpmif_t * tpmif)
63104 +{
63105 +       add_to_tpm_schedule_list_tail(tpmif);
63106 +       maybe_schedule_tx_action();
63107 +}
63108 +
63109 +void tpmif_deschedule_work(tpmif_t * tpmif)
63110 +{
63111 +       remove_from_tpm_schedule_list(tpmif);
63112 +}
63113 +
63114 +static void tpm_tx_action(unsigned long unused)
63115 +{
63116 +       struct list_head *ent;
63117 +       tpmif_t *tpmif;
63118 +       tpmif_tx_request_t *tx;
63119 +
63120 +       DPRINTK("%s: Getting data from front-end(s)!\n", __FUNCTION__);
63121 +
63122 +       while (!list_empty(&tpm_schedule_list)) {
63123 +               /* Get a tpmif from the list with work to do. */
63124 +               ent = tpm_schedule_list.next;
63125 +               tpmif = list_entry(ent, tpmif_t, list);
63126 +               tpmif_get(tpmif);
63127 +               remove_from_tpm_schedule_list(tpmif);
63128 +
63129 +               tx = &tpmif->tx->ring[0].req;
63130 +
63131 +               /* pass it up */
63132 +               vtpm_receive(tpmif, tx->size);
63133 +
63134 +               tpmif_put(tpmif);
63135 +       }
63136 +}
63137 +
63138 +irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs)
63139 +{
63140 +       tpmif_t *tpmif = (tpmif_t *) dev_id;
63141 +
63142 +       add_to_tpm_schedule_list_tail(tpmif);
63143 +       maybe_schedule_tx_action();
63144 +       return IRQ_HANDLED;
63145 +}
63146 +
63147 +static int __init tpmback_init(void)
63148 +{
63149 +       int rc;
63150 +
63151 +       if ((rc = misc_register(&vtpms_miscdevice)) != 0) {
63152 +               printk(KERN_ALERT
63153 +                      "Could not register misc device for TPM BE.\n");
63154 +               return rc;
63155 +       }
63156 +
63157 +       dataex_init(&dataex);
63158 +
63159 +       spin_lock_init(&tpm_schedule_list_lock);
63160 +       INIT_LIST_HEAD(&tpm_schedule_list);
63161 +
63162 +       tpmif_interface_init();
63163 +       tpmif_xenbus_init();
63164 +
63165 +       printk(KERN_ALERT "Successfully initialized TPM backend driver.\n");
63166 +
63167 +       return 0;
63168 +}
63169 +
63170 +module_init(tpmback_init);
63171 +
63172 +void __exit tpmback_exit(void)
63173 +{
63174 +       vtpm_release_packets(NULL, 0);
63175 +       tpmif_xenbus_exit();
63176 +       tpmif_interface_exit();
63177 +       misc_deregister(&vtpms_miscdevice);
63178 +}
63179 +
63180 +MODULE_LICENSE("Dual BSD/GPL");
63181 diff -urNp linux-2.6/drivers/xen/tpmback/xenbus.c new/drivers/xen/tpmback/xenbus.c
63182 --- linux-2.6/drivers/xen/tpmback/xenbus.c      1970-01-01 01:00:00.000000000 +0100
63183 +++ new/drivers/xen/tpmback/xenbus.c    2006-07-07 15:10:03.000000000 +0200
63184 @@ -0,0 +1,291 @@
63185 +/*  Xenbus code for tpmif backend
63186 +    Copyright (C) 2005 IBM Corporation
63187 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
63188 +
63189 +    This program is free software; you can redistribute it and/or modify
63190 +    it under the terms of the GNU General Public License as published by
63191 +    the Free Software Foundation; either version 2 of the License, or
63192 +    (at your option) any later version.
63193 +
63194 +    This program is distributed in the hope that it will be useful,
63195 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
63196 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
63197 +    GNU General Public License for more details.
63198 +
63199 +    You should have received a copy of the GNU General Public License
63200 +    along with this program; if not, write to the Free Software
63201 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
63202 +*/
63203 +#include <stdarg.h>
63204 +#include <linux/module.h>
63205 +#include <xen/xenbus.h>
63206 +#include "common.h"
63207 +
63208 +struct backend_info
63209 +{
63210 +       struct xenbus_device *dev;
63211 +
63212 +       /* our communications channel */
63213 +       tpmif_t *tpmif;
63214 +
63215 +       long int frontend_id;
63216 +       long int instance; // instance of TPM
63217 +       u8 is_instance_set;// whether instance number has been set
63218 +
63219 +       /* watch front end for changes */
63220 +       struct xenbus_watch backend_watch;
63221 +       enum xenbus_state frontend_state;
63222 +};
63223 +
63224 +static void maybe_connect(struct backend_info *be);
63225 +static void connect(struct backend_info *be);
63226 +static int connect_ring(struct backend_info *be);
63227 +static void backend_changed(struct xenbus_watch *watch,
63228 +                           const char **vec, unsigned int len);
63229 +static void frontend_changed(struct xenbus_device *dev,
63230 +                            enum xenbus_state frontend_state);
63231 +
63232 +long int tpmback_get_instance(struct backend_info *bi)
63233 +{
63234 +       long int res = -1;
63235 +       if (bi && bi->is_instance_set)
63236 +               res = bi->instance;
63237 +       return res;
63238 +}
63239 +
63240 +static int tpmback_remove(struct xenbus_device *dev)
63241 +{
63242 +       struct backend_info *be = dev->dev.driver_data;
63243 +
63244 +       if (!be) return 0;
63245 +
63246 +       if (be->backend_watch.node) {
63247 +               unregister_xenbus_watch(&be->backend_watch);
63248 +               kfree(be->backend_watch.node);
63249 +               be->backend_watch.node = NULL;
63250 +       }
63251 +       if (be->tpmif) {
63252 +               be->tpmif->bi = NULL;
63253 +               vtpm_release_packets(be->tpmif, 0);
63254 +               tpmif_put(be->tpmif);
63255 +               be->tpmif = NULL;
63256 +       }
63257 +       kfree(be);
63258 +       dev->dev.driver_data = NULL;
63259 +       return 0;
63260 +}
63261 +
63262 +static int tpmback_probe(struct xenbus_device *dev,
63263 +                        const struct xenbus_device_id *id)
63264 +{
63265 +       int err;
63266 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
63267 +                                         GFP_KERNEL);
63268 +
63269 +       if (!be) {
63270 +               xenbus_dev_fatal(dev, -ENOMEM,
63271 +                                "allocating backend structure");
63272 +               return -ENOMEM;
63273 +       }
63274 +
63275 +       be->is_instance_set = 0;
63276 +       be->dev = dev;
63277 +       dev->dev.driver_data = be;
63278 +
63279 +       err = xenbus_watch_path2(dev, dev->nodename,
63280 +                                "instance", &be->backend_watch,
63281 +                                backend_changed);
63282 +       if (err) {
63283 +               goto fail;
63284 +       }
63285 +
63286 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
63287 +       if (err) {
63288 +               goto fail;
63289 +       }
63290 +       return 0;
63291 +fail:
63292 +       tpmback_remove(dev);
63293 +       return err;
63294 +}
63295 +
63296 +
63297 +static void backend_changed(struct xenbus_watch *watch,
63298 +                           const char **vec, unsigned int len)
63299 +{
63300 +       int err;
63301 +       long instance;
63302 +       struct backend_info *be
63303 +               = container_of(watch, struct backend_info, backend_watch);
63304 +       struct xenbus_device *dev = be->dev;
63305 +
63306 +       err = xenbus_scanf(XBT_NIL, dev->nodename,
63307 +                          "instance","%li", &instance);
63308 +       if (XENBUS_EXIST_ERR(err)) {
63309 +               return;
63310 +       }
63311 +
63312 +       if (err != 1) {
63313 +               xenbus_dev_fatal(dev, err, "reading instance");
63314 +               return;
63315 +       }
63316 +
63317 +       if (be->is_instance_set == 0) {
63318 +               be->instance = instance;
63319 +               be->is_instance_set = 1;
63320 +       }
63321 +}
63322 +
63323 +
63324 +static void frontend_changed(struct xenbus_device *dev,
63325 +                            enum xenbus_state frontend_state)
63326 +{
63327 +       struct backend_info *be = dev->dev.driver_data;
63328 +       int err;
63329 +
63330 +       be->frontend_state = frontend_state;
63331 +
63332 +       switch (frontend_state) {
63333 +       case XenbusStateInitialising:
63334 +       case XenbusStateInitialised:
63335 +               break;
63336 +
63337 +       case XenbusStateConnected:
63338 +               err = connect_ring(be);
63339 +               if (err) {
63340 +                       return;
63341 +               }
63342 +               maybe_connect(be);
63343 +               break;
63344 +
63345 +       case XenbusStateClosing:
63346 +               be->instance = -1;
63347 +               break;
63348 +
63349 +       case XenbusStateClosed:
63350 +               device_unregister(&be->dev->dev);
63351 +               tpmback_remove(dev);
63352 +               break;
63353 +
63354 +       case XenbusStateUnknown:
63355 +       case XenbusStateInitWait:
63356 +       default:
63357 +               xenbus_dev_fatal(dev, -EINVAL,
63358 +                                "saw state %d at frontend",
63359 +                                frontend_state);
63360 +               break;
63361 +       }
63362 +}
63363 +
63364 +
63365 +
63366 +static void maybe_connect(struct backend_info *be)
63367 +{
63368 +       if (be->tpmif == NULL || be->tpmif->status == CONNECTED)
63369 +               return;
63370 +
63371 +       connect(be);
63372 +}
63373 +
63374 +
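+/*
+ * Write 'ready' = 1 under a xenbus transaction and switch the backend to
+ * XenbusStateConnected.  The transaction is restarted when
+ * xenbus_transaction_end() returns -EAGAIN, i.e. when it raced with a
+ * concurrent xenstore update.
+ */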
63375 +static void connect(struct backend_info *be)
63376 +{
63377 +       struct xenbus_transaction xbt;
63378 +       int err;
63379 +       struct xenbus_device *dev = be->dev;
63380 +       unsigned long ready = 1;
63381 +
63382 +again:
63383 +       err = xenbus_transaction_start(&xbt);
63384 +       if (err) {
63385 +               xenbus_dev_fatal(be->dev, err, "starting transaction");
63386 +               return;
63387 +       }
63388 +
63389 +       err = xenbus_printf(xbt, be->dev->nodename,
63390 +                           "ready", "%lu", ready);
63391 +       if (err) {
63392 +               xenbus_dev_fatal(be->dev, err, "writing 'ready'");
63393 +               goto abort;
63394 +       }
63395 +
63396 +       err = xenbus_transaction_end(xbt, 0);
63397 +       if (err == -EAGAIN)
63398 +               goto again;
63399 +       if (err)
63400 +               xenbus_dev_fatal(be->dev, err, "end of transaction");
63401 +
63402 +       err = xenbus_switch_state(dev, XenbusStateConnected);
63403 +       if (!err)
63404 +               be->tpmif->status = CONNECTED;
63405 +       return;
63406 +abort:
63407 +       xenbus_transaction_end(xbt, 1);
63408 +}
63409 +
63410 +
63411 +static int connect_ring(struct backend_info *be)
63412 +{
63413 +       struct xenbus_device *dev = be->dev;
63414 +       unsigned long ring_ref;
63415 +       unsigned int evtchn;
63416 +       int err;
63417 +
63418 +       err = xenbus_gather(XBT_NIL, dev->otherend,
63419 +                           "ring-ref", "%lu", &ring_ref,
63420 +                           "event-channel", "%u", &evtchn, NULL);
63421 +       if (err) {
63422 +               xenbus_dev_error(dev, err,
63423 +                                "reading %s/ring-ref and event-channel",
63424 +                                dev->otherend);
63425 +               return err;
63426 +       }
63427 +
63428 +       if (!be->tpmif) {
63429 +               be->tpmif = tpmif_find(dev->otherend_id, be);
63430 +               if (IS_ERR(be->tpmif)) {
63431 +                       err = PTR_ERR(be->tpmif);
63432 +                       be->tpmif = NULL;
63433 +                       xenbus_dev_fatal(dev,err,"creating vtpm interface");
63434 +                       return err;
63435 +               }
63436 +       }
63437 +
63438 +       if (be->tpmif != NULL) {
63439 +               err = tpmif_map(be->tpmif, ring_ref, evtchn);
63440 +               if (err) {
63441 +                       xenbus_dev_error(dev, err,
63442 +                                        "mapping shared-frame %lu port %u",
63443 +                                        ring_ref, evtchn);
63444 +                       return err;
63445 +               }
63446 +       }
63447 +       return 0;
63448 +}
63449 +
63450 +
63451 +static struct xenbus_device_id tpmback_ids[] = {
63452 +       { "vtpm" },
63453 +       { "" }
63454 +};
63455 +
63456 +
63457 +static struct xenbus_driver tpmback = {
63458 +       .name = "vtpm",
63459 +       .owner = THIS_MODULE,
63460 +       .ids = tpmback_ids,
63461 +       .probe = tpmback_probe,
63462 +       .remove = tpmback_remove,
63463 +       .otherend_changed = frontend_changed,
63464 +};
63465 +
63466 +
63467 +void tpmif_xenbus_init(void)
63468 +{
63469 +       xenbus_register_backend(&tpmback);
63470 +}
63471 +
63472 +void tpmif_xenbus_exit(void)
63473 +{
63474 +       xenbus_unregister_driver(&tpmback);
63475 +}
63476 diff -urNp linux-2.6/drivers/xen/util.c new/drivers/xen/util.c
63477 --- linux-2.6/drivers/xen/util.c        1970-01-01 01:00:00.000000000 +0100
63478 +++ new/drivers/xen/util.c      2006-05-23 18:42:17.000000000 +0200
63479 @@ -0,0 +1,70 @@
63480 +#include <linux/config.h>
63481 +#include <linux/mm.h>
63482 +#include <linux/module.h>
63483 +#include <linux/slab.h>
63484 +#include <linux/vmalloc.h>
63485 +#include <asm/uaccess.h>
63486 +#include <xen/driver_util.h>
63487 +
63488 +static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
63489 +{
63490 +       /* apply_to_page_range() does all the hard work. */
63491 +       return 0;
63492 +}
63493 +
63494 +struct vm_struct *alloc_vm_area(unsigned long size)
63495 +{
63496 +       struct vm_struct *area;
63497 +
63498 +       area = get_vm_area(size, VM_IOREMAP);
63499 +       if (area == NULL)
63500 +               return NULL;
63501 +
63502 +       /*
63503 +        * This ensures that page tables are constructed for this region
63504 +        * of kernel virtual address space and mapped into init_mm.
63505 +        */
63506 +       if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
63507 +                               area->size, f, NULL)) {
63508 +               free_vm_area(area);
63509 +               return NULL;
63510 +       }
63511 +
63512 +       return area;
63513 +}
63514 +EXPORT_SYMBOL_GPL(alloc_vm_area);
63515 +
63516 +void free_vm_area(struct vm_struct *area)
63517 +{
63518 +       struct vm_struct *ret;
63519 +       ret = remove_vm_area(area->addr);
63520 +       BUG_ON(ret != area);
63521 +       kfree(area);
63522 +}
63523 +EXPORT_SYMBOL_GPL(free_vm_area);
63524 +
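+/*
+ * Typical caller pattern for these helpers (see e.g. map_frontend_page()
+ * in the tpmback driver above):
+ *
+ *     area = alloc_vm_area(PAGE_SIZE);
+ *     lock_vm_area(area);
+ *     rc = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
+ *     unlock_vm_area(area);
+ *     ...
+ *     free_vm_area(area);
+ */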
63525 +void lock_vm_area(struct vm_struct *area)
63526 +{
63527 +       unsigned long i;
63528 +       char c;
63529 +
63530 +       /*
63531 +        * Prevent context switch to a lazy mm that doesn't have this area
63532 +        * mapped into its page tables.
63533 +        */
63534 +       preempt_disable();
63535 +
63536 +       /*
63537 +        * Ensure that the page tables are mapped into the current mm. The
63538 +        * page-fault path will copy the page directory pointers from init_mm.
63539 +        */
63540 +       for (i = 0; i < area->size; i += PAGE_SIZE)
63541 +               (void)__get_user(c, (char __user *)area->addr + i);
63542 +}
63543 +EXPORT_SYMBOL_GPL(lock_vm_area);
63544 +
63545 +void unlock_vm_area(struct vm_struct *area)
63546 +{
63547 +       preempt_enable();
63548 +}
63549 +EXPORT_SYMBOL_GPL(unlock_vm_area);
63550 diff -urNp linux-2.6/drivers/xen/xenbus/Makefile new/drivers/xen/xenbus/Makefile
63551 --- linux-2.6/drivers/xen/xenbus/Makefile       1970-01-01 01:00:00.000000000 +0100
63552 +++ new/drivers/xen/xenbus/Makefile     2006-06-28 14:32:14.000000000 +0200
63553 @@ -0,0 +1,12 @@
63554 +obj-y  += xenbus.o
63555 +obj-$(CONFIG_XEN_BACKEND) += xenbus_be.o
63556 +
63557 +xenbus_be-objs =
63558 +xenbus_be-objs += xenbus_backend_client.o
63559 +
63560 +xenbus-objs =
63561 +xenbus-objs += xenbus_client.o
63562 +xenbus-objs += xenbus_comms.o
63563 +xenbus-objs += xenbus_xs.o
63564 +xenbus-objs += xenbus_probe.o
63565 +obj-$(CONFIG_XEN_XENBUS_DEV) += xenbus_dev.o
63566 diff -urNp linux-2.6/drivers/xen/xenbus/xenbus_backend_client.c new/drivers/xen/xenbus/xenbus_backend_client.c
63567 --- linux-2.6/drivers/xen/xenbus/xenbus_backend_client.c        1970-01-01 01:00:00.000000000 +0100
63568 +++ new/drivers/xen/xenbus/xenbus_backend_client.c      2006-05-23 18:42:17.000000000 +0200
63569 @@ -0,0 +1,135 @@
63570 +/******************************************************************************
63571 + * Backend-client-facing interface for the Xenbus driver.  In other words, the
63572 + * interface between the Xenbus and the device-specific code in the backend
63573 + * driver.
63574 + *
63575 + * Copyright (C) 2005-2006 XenSource Ltd
63576 + * 
63577 + * This program is free software; you can redistribute it and/or
63578 + * modify it under the terms of the GNU General Public License version 2
63579 + * as published by the Free Software Foundation; or, when distributed
63580 + * separately from the Linux kernel or incorporated into other
63581 + * software packages, subject to the following license:
63582 + * 
63583 + * Permission is hereby granted, free of charge, to any person obtaining a copy
63584 + * of this source file (the "Software"), to deal in the Software without
63585 + * restriction, including without limitation the rights to use, copy, modify,
63586 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
63587 + * and to permit persons to whom the Software is furnished to do so, subject to
63588 + * the following conditions:
63589 + * 
63590 + * The above copyright notice and this permission notice shall be included in
63591 + * all copies or substantial portions of the Software.
63592 + * 
63593 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
63594 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
63595 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
63596 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
63597 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
63598 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
63599 + * IN THE SOFTWARE.
63600 + */
63601 +
63602 +#include <linux/err.h>
63603 +#include <xen/gnttab.h>
63604 +#include <xen/xenbus.h>
63605 +#include <xen/driver_util.h>
63606 +
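+/*
+ * xenbus_map_ring_valloc() maps the frontend's grant into a freshly
+ * allocated VM area and stashes the grant handle in area->phys_addr
+ * (otherwise unused); xenbus_unmap_ring_vfree() below retrieves the
+ * handle from there when tearing the mapping down.
+ */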
63607 +/* Based on Rusty Russell's skeleton driver's map_page */
63608 +struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref)
63609 +{
63610 +       struct gnttab_map_grant_ref op;
63611 +       struct vm_struct *area;
63612 +
63613 +       area = alloc_vm_area(PAGE_SIZE);
63614 +       if (!area)
63615 +               return ERR_PTR(-ENOMEM);
63616 +
63617 +       gnttab_set_map_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
63618 +                         gnt_ref, dev->otherend_id);
63619 +       
63620 +       lock_vm_area(area);
63621 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
63622 +       unlock_vm_area(area);
63623 +
63624 +       if (op.status != GNTST_okay) {
63625 +               free_vm_area(area);
63626 +               xenbus_dev_fatal(dev, op.status,
63627 +                                "mapping in shared page %d from domain %d",
63628 +                                gnt_ref, dev->otherend_id);
63629 +               BUG_ON(!IS_ERR(ERR_PTR(op.status)));
63630 +               return ERR_PTR(op.status);
63631 +       }
63632 +
63633 +       /* Stuff the handle in an unused field */
63634 +       area->phys_addr = (unsigned long)op.handle;
63635 +
63636 +       return area;
63637 +}
63638 +EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
63639 +
63640 +
63641 +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
63642 +                  grant_handle_t *handle, void *vaddr)
63643 +{
63644 +       struct gnttab_map_grant_ref op;
63645 +       
63646 +       gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map,
63647 +                         gnt_ref, dev->otherend_id);
63648 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
63649 +
63650 +       if (op.status != GNTST_okay) {
63651 +               xenbus_dev_fatal(dev, op.status,
63652 +                                "mapping in shared page %d from domain %d",
63653 +                                gnt_ref, dev->otherend_id);
63654 +       } else
63655 +               *handle = op.handle;
63656 +
63657 +       return op.status;
63658 +}
63659 +EXPORT_SYMBOL_GPL(xenbus_map_ring);
63660 +
63661 +
63662 +/* Based on Rusty Russell's skeleton driver's unmap_page */
63663 +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *area)
63664 +{
63665 +       struct gnttab_unmap_grant_ref op;
63666 +
63667 +       gnttab_set_unmap_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
63668 +                           (grant_handle_t)area->phys_addr);
63669 +
63670 +       lock_vm_area(area);
63671 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
63672 +       unlock_vm_area(area);
63673 +
63674 +       if (op.status == GNTST_okay)
63675 +               free_vm_area(area);
63676 +       else
63677 +               xenbus_dev_error(dev, op.status,
63678 +                                "unmapping page at handle %d error %d",
63679 +                                (int16_t)area->phys_addr, op.status);
63680 +
63681 +       return op.status;
63682 +}
63683 +EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
63684 +
63685 +
63686 +int xenbus_unmap_ring(struct xenbus_device *dev,
63687 +                    grant_handle_t handle, void *vaddr)
63688 +{
63689 +       struct gnttab_unmap_grant_ref op;
63690 +
63691 +       gnttab_set_unmap_op(&op, (unsigned long)vaddr, GNTMAP_host_map,
63692 +                           handle);
63693 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
63694 +
63695 +       if (op.status != GNTST_okay)
63696 +               xenbus_dev_error(dev, op.status,
63697 +                                "unmapping page at handle %d error %d",
63698 +                                handle, op.status);
63699 +
63700 +       return op.status;
63701 +}
63702 +EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
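+
+/*
+ * Illustrative sketch (hypothetical backend caller, not a helper defined
+ * here): the map/unmap helpers above are typically paired around the
+ * lifetime of a ring page shared by the frontend, e.g.
+ *
+ *     struct vm_struct *area = xenbus_map_ring_valloc(dev, ring_ref);
+ *     if (IS_ERR(area))
+ *             return PTR_ERR(area);
+ *     ...use area->addr as the shared ring...
+ *     xenbus_unmap_ring_vfree(dev, area);
+ *
+ * where ring_ref is the grant reference the frontend advertised in xenstore.
+ */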
63703 +
63704 +MODULE_LICENSE("Dual BSD/GPL");
63705 diff -urNp linux-2.6/drivers/xen/xenbus/xenbus_client.c new/drivers/xen/xenbus/xenbus_client.c
63706 --- linux-2.6/drivers/xen/xenbus/xenbus_client.c        1970-01-01 01:00:00.000000000 +0100
63707 +++ new/drivers/xen/xenbus/xenbus_client.c      2006-06-28 14:32:14.000000000 +0200
63708 @@ -0,0 +1,281 @@
63709 +/******************************************************************************
63710 + * Client-facing interface for the Xenbus driver.  In other words, the
63711 + * interface between the Xenbus and the device-specific code, be it the
63712 + * frontend or the backend of that driver.
63713 + *
63714 + * Copyright (C) 2005 XenSource Ltd
63715 + * 
63716 + * This program is free software; you can redistribute it and/or
63717 + * modify it under the terms of the GNU General Public License version 2
63718 + * as published by the Free Software Foundation; or, when distributed
63719 + * separately from the Linux kernel or incorporated into other
63720 + * software packages, subject to the following license:
63721 + * 
63722 + * Permission is hereby granted, free of charge, to any person obtaining a copy
63723 + * of this source file (the "Software"), to deal in the Software without
63724 + * restriction, including without limitation the rights to use, copy, modify,
63725 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
63726 + * and to permit persons to whom the Software is furnished to do so, subject to
63727 + * the following conditions:
63728 + * 
63729 + * The above copyright notice and this permission notice shall be included in
63730 + * all copies or substantial portions of the Software.
63731 + * 
63732 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
63733 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
63734 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
63735 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
63736 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
63737 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
63738 + * IN THE SOFTWARE.
63739 + */
63740 +
63741 +#include <xen/evtchn.h>
63742 +#include <xen/gnttab.h>
63743 +#include <xen/xenbus.h>
63744 +#include <xen/driver_util.h>
63745 +
63746 +/* xenbus_probe.c */
63747 +extern char *kasprintf(const char *fmt, ...);
63748 +
63749 +#define DPRINTK(fmt, args...) \
63750 +    pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
63751 +
63752 +int xenbus_watch_path(struct xenbus_device *dev, const char *path,
63753 +                     struct xenbus_watch *watch,
63754 +                     void (*callback)(struct xenbus_watch *,
63755 +                                      const char **, unsigned int))
63756 +{
63757 +       int err;
63758 +
63759 +       watch->node = path;
63760 +       watch->callback = callback;
63761 +
63762 +       err = register_xenbus_watch(watch);
63763 +
63764 +       if (err) {
63765 +               watch->node = NULL;
63766 +               watch->callback = NULL;
63767 +               xenbus_dev_fatal(dev, err, "adding watch on %s", path);
63768 +       }
63769 +
63770 +       return err;
63771 +}
63772 +EXPORT_SYMBOL_GPL(xenbus_watch_path);
63773 +
63774 +
63775 +int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
63776 +                      const char *path2, struct xenbus_watch *watch,
63777 +                      void (*callback)(struct xenbus_watch *,
63778 +                                       const char **, unsigned int))
63779 +{
63780 +       int err;
63781 +       char *state = kasprintf("%s/%s", path, path2);
63782 +       if (!state) {
63783 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
63784 +               return -ENOMEM;
63785 +       }
63786 +       err = xenbus_watch_path(dev, state, watch, callback);
63787 +
63788 +       if (err)
63789 +               kfree(state);
63790 +       return err;
63791 +}
63792 +EXPORT_SYMBOL_GPL(xenbus_watch_path2);
63793 +
63794 +
63795 +int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
63796 +{
63797 +       /* We check whether the state is currently set to the given value, and
63798 +          if not, then the state is set.  We don't want to unconditionally
63799 +          write the given state, because we don't want to fire watches
63800 +          unnecessarily.  Furthermore, if the node has gone, we don't write
63801 +          to it, as the device will be tearing down, and we don't want to
63802 +          resurrect that directory.
63803 +
63804 +          Note that, because of this cached value of our state, this function
63805 +          will not work inside a Xenstore transaction (something earlier
63806 +          versions tried to do), because dev->state would not be reset if
63807 +          the transaction were aborted.
63808 +
63809 +        */
63810 +
63811 +       int current_state;
63812 +       int err;
63813 +
63814 +       if (state == dev->state)
63815 +               return 0;
63816 +
63817 +       err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
63818 +                          &current_state);
63819 +       if (err != 1)
63820 +               return 0;
63821 +
63822 +       err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
63823 +       if (err) {
63824 +               if (state != XenbusStateClosing) /* Avoid looping */
63825 +                       xenbus_dev_fatal(dev, err, "writing new state");
63826 +               return err;
63827 +       }
63828 +
63829 +       dev->state = state;
63830 +
63831 +       return 0;
63832 +}
63833 +EXPORT_SYMBOL_GPL(xenbus_switch_state);
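+
+/*
+ * Informational note (typical usage, not mandated by this file): a frontend
+ * normally walks XenbusStateInitialising -> Initialised -> Connected ->
+ * Closing -> Closed, and a backend Initialising -> InitWait -> Connected ->
+ * Closing -> Closed, calling e.g.
+ *
+ *     xenbus_switch_state(dev, XenbusStateConnected);
+ *
+ * at each step; the peer observes the change through its watch on the
+ * "state" node.
+ */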
63834 +
63835 +
63836 +/**
63837 + * Return the path to the error node for the given device, or NULL on failure.
63838 + * If the value returned is non-NULL, it is the caller's responsibility to kfree it.
63839 + */
63840 +static char *error_path(struct xenbus_device *dev)
63841 +{
63842 +       return kasprintf("error/%s", dev->nodename);
63843 +}
63844 +
63845 +
63846 +void _dev_error(struct xenbus_device *dev, int err, const char *fmt,
63847 +               va_list ap)
63848 +{
63849 +       int ret;
63850 +       unsigned int len;
63851 +       char *printf_buffer = NULL, *path_buffer = NULL;
63852 +
63853 +#define PRINTF_BUFFER_SIZE 4096
63854 +       printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
63855 +       if (printf_buffer == NULL)
63856 +               goto fail;
63857 +
63858 +       len = sprintf(printf_buffer, "%i ", -err);
63859 +       ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
63860 +
63861 +       BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
63862 +
63863 +       dev_err(&dev->dev, "%s\n", printf_buffer);
63864 +
63865 +       path_buffer = error_path(dev);
63866 +
63867 +       if (path_buffer == NULL) {
63868 +               printk("xenbus: failed to write error node for %s (%s)\n",
63869 +                      dev->nodename, printf_buffer);
63870 +               goto fail;
63871 +       }
63872 +
63873 +       if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
63874 +               printk("xenbus: failed to write error node for %s (%s)\n",
63875 +                      dev->nodename, printf_buffer);
63876 +               goto fail;
63877 +       }
63878 +
63879 +fail:
63880 +       if (printf_buffer)
63881 +               kfree(printf_buffer);
63882 +       if (path_buffer)
63883 +               kfree(path_buffer);
63884 +}
63885 +
63886 +
63887 +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
63888 +                     ...)
63889 +{
63890 +       va_list ap;
63891 +
63892 +       va_start(ap, fmt);
63893 +       _dev_error(dev, err, fmt, ap);
63894 +       va_end(ap);
63895 +}
63896 +EXPORT_SYMBOL_GPL(xenbus_dev_error);
63897 +
63898 +
63899 +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
63900 +                     ...)
63901 +{
63902 +       va_list ap;
63903 +
63904 +       va_start(ap, fmt);
63905 +       _dev_error(dev, err, fmt, ap);
63906 +       va_end(ap);
63907 +
63908 +       xenbus_switch_state(dev, XenbusStateClosing);
63909 +}
63910 +EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
63911 +
63912 +
63913 +int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
63914 +{
63915 +       int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
63916 +       if (err < 0)
63917 +               xenbus_dev_fatal(dev, err, "granting access to ring page");
63918 +       return err;
63919 +}
63920 +EXPORT_SYMBOL_GPL(xenbus_grant_ring);
63921 +
63922 +
63923 +int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
63924 +{
63925 +       struct evtchn_alloc_unbound alloc_unbound;
63926 +       int err;
63927 +
63928 +       alloc_unbound.dom        = DOMID_SELF;
63929 +       alloc_unbound.remote_dom = dev->otherend_id;
63930 +
63931 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
63932 +                                         &alloc_unbound);
63933 +       if (err)
63934 +               xenbus_dev_fatal(dev, err, "allocating event channel");
63935 +       else
63936 +               *port = alloc_unbound.port;
63937 +
63938 +       return err;
63939 +}
63940 +EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
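+
+/*
+ * Illustrative sketch (hypothetical frontend connect path, assuming a
+ * virt_to_mfn() helper for the shared ring page): grant the ring and
+ * allocate an unbound event channel, then advertise both to the backend:
+ *
+ *     int ring_ref = xenbus_grant_ring(dev, virt_to_mfn(ring));
+ *     if (ring_ref < 0)
+ *             return ring_ref;
+ *     err = xenbus_alloc_evtchn(dev, &evtchn);
+ *     if (err)
+ *             return err;
+ *     xenbus_printf(XBT_NIL, dev->nodename, "ring-ref", "%u", ring_ref);
+ *     xenbus_printf(XBT_NIL, dev->nodename, "event-channel", "%u", evtchn);
+ */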
63941 +
63942 +
63943 +int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
63944 +{
63945 +       struct evtchn_bind_interdomain bind_interdomain;
63946 +       int err;
63947 +
63948 +       bind_interdomain.remote_dom  = dev->otherend_id;
63949 +       bind_interdomain.remote_port = remote_port;
63950 +
63951 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
63952 +                                         &bind_interdomain);
63953 +       if (err)
63954 +               xenbus_dev_fatal(dev, err,
63955 +                                "binding to event channel %d from domain %d",
63956 +                                remote_port, dev->otherend_id);
63957 +       else
63958 +               *port = bind_interdomain.local_port;
63959 +
63960 +       return err;
63961 +}
63962 +EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
63963 +
63964 +
63965 +int xenbus_free_evtchn(struct xenbus_device *dev, int port)
63966 +{
63967 +       struct evtchn_close close;
63968 +       int err;
63969 +
63970 +       close.port = port;
63971 +
63972 +       err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
63973 +       if (err)
63974 +               xenbus_dev_error(dev, err, "freeing event channel %d", port);
63975 +
63976 +       return err;
63977 +}
63978 +
63979 +
63980 +enum xenbus_state xenbus_read_driver_state(const char *path)
63981 +{
63982 +       enum xenbus_state result;
63983 +       int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
63984 +       if (err)
63985 +               result = XenbusStateClosed;
63986 +
63987 +       return result;
63988 +}
63989 +EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
63990 diff -urNp linux-2.6/drivers/xen/xenbus/xenbus_comms.c new/drivers/xen/xenbus/xenbus_comms.c
63991 --- linux-2.6/drivers/xen/xenbus/xenbus_comms.c 1970-01-01 01:00:00.000000000 +0100
63992 +++ new/drivers/xen/xenbus/xenbus_comms.c       2006-05-23 18:42:17.000000000 +0200
63993 @@ -0,0 +1,208 @@
63994 +/******************************************************************************
63995 + * xenbus_comms.c
63996 + *
63997 + * Low level code to talk to Xen Store: ring buffer and event channel.
63998 + *
63999 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
64000 + * 
64001 + * This program is free software; you can redistribute it and/or
64002 + * modify it under the terms of the GNU General Public License version 2
64003 + * as published by the Free Software Foundation; or, when distributed
64004 + * separately from the Linux kernel or incorporated into other
64005 + * software packages, subject to the following license:
64006 + * 
64007 + * Permission is hereby granted, free of charge, to any person obtaining a copy
64008 + * of this source file (the "Software"), to deal in the Software without
64009 + * restriction, including without limitation the rights to use, copy, modify,
64010 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
64011 + * and to permit persons to whom the Software is furnished to do so, subject to
64012 + * the following conditions:
64013 + * 
64014 + * The above copyright notice and this permission notice shall be included in
64015 + * all copies or substantial portions of the Software.
64016 + * 
64017 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64018 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
64019 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64020 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
64021 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
64022 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
64023 + * IN THE SOFTWARE.
64024 + */
64025 +
64026 +#include <asm/hypervisor.h>
64027 +#include <xen/evtchn.h>
64028 +#include <linux/wait.h>
64029 +#include <linux/interrupt.h>
64030 +#include <linux/sched.h>
64031 +#include <linux/err.h>
64032 +#include <xen/xenbus.h>
64033 +#include "xenbus_comms.h"
64034 +
64035 +static int xenbus_irq;
64036 +
64037 +extern void xenbus_probe(void *);
64038 +extern int xenstored_ready;
64039 +static DECLARE_WORK(probe_work, xenbus_probe, NULL);
64040 +
64041 +DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
64042 +
64043 +static inline struct xenstore_domain_interface *xenstore_domain_interface(void)
64044 +{
64045 +       return mfn_to_virt(xen_start_info->store_mfn);
64046 +}
64047 +
64048 +static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs)
64049 +{
64050 +       if (unlikely(xenstored_ready == 0)) {
64051 +               xenstored_ready = 1;
64052 +               schedule_work(&probe_work);
64053 +       }
64054 +
64055 +       wake_up(&xb_waitq);
64056 +       return IRQ_HANDLED;
64057 +}
64058 +
64059 +static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
64060 +{
64061 +       return ((prod - cons) <= XENSTORE_RING_SIZE);
64062 +}
64063 +
64064 +static void *get_output_chunk(XENSTORE_RING_IDX cons,
64065 +                             XENSTORE_RING_IDX prod,
64066 +                             char *buf, uint32_t *len)
64067 +{
64068 +       *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
64069 +       if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
64070 +               *len = XENSTORE_RING_SIZE - (prod - cons);
64071 +       return buf + MASK_XENSTORE_IDX(prod);
64072 +}
64073 +
64074 +static const void *get_input_chunk(XENSTORE_RING_IDX cons,
64075 +                                  XENSTORE_RING_IDX prod,
64076 +                                  const char *buf, uint32_t *len)
64077 +{
64078 +       *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
64079 +       if ((prod - cons) < *len)
64080 +               *len = prod - cons;
64081 +       return buf + MASK_XENSTORE_IDX(cons);
64082 +}
64083 +
64084 +int xb_write(const void *data, unsigned len)
64085 +{
64086 +       struct xenstore_domain_interface *intf = xenstore_domain_interface();
64087 +       XENSTORE_RING_IDX cons, prod;
64088 +       int rc;
64089 +
64090 +       while (len != 0) {
64091 +               void *dst;
64092 +               unsigned int avail;
64093 +
64094 +               rc = wait_event_interruptible(
64095 +                       xb_waitq,
64096 +                       (intf->req_prod - intf->req_cons) !=
64097 +                       XENSTORE_RING_SIZE);
64098 +               if (rc < 0)
64099 +                       return rc;
64100 +
64101 +               /* Read indexes, then verify. */
64102 +               cons = intf->req_cons;
64103 +               prod = intf->req_prod;
64104 +               mb();
64105 +               if (!check_indexes(cons, prod)) {
64106 +                       intf->req_cons = intf->req_prod = 0;
64107 +                       return -EIO;
64108 +               }
64109 +
64110 +               dst = get_output_chunk(cons, prod, intf->req, &avail);
64111 +               if (avail == 0)
64112 +                       continue;
64113 +               if (avail > len)
64114 +                       avail = len;
64115 +
64116 +               memcpy(dst, data, avail);
64117 +               data += avail;
64118 +               len -= avail;
64119 +
64120 +               /* Other side must not see new header until data is there. */
64121 +               wmb();
64122 +               intf->req_prod += avail;
64123 +
64124 +               /* This implies mb() before other side sees interrupt. */
64125 +               notify_remote_via_evtchn(xen_start_info->store_evtchn);
64126 +       }
64127 +
64128 +       return 0;
64129 +}
64130 +
64131 +int xb_read(void *data, unsigned len)
64132 +{
64133 +       struct xenstore_domain_interface *intf = xenstore_domain_interface();
64134 +       XENSTORE_RING_IDX cons, prod;
64135 +       int rc;
64136 +
64137 +       while (len != 0) {
64138 +               unsigned int avail;
64139 +               const char *src;
64140 +
64141 +               rc = wait_event_interruptible(
64142 +                       xb_waitq,
64143 +                       intf->rsp_cons != intf->rsp_prod);
64144 +               if (rc < 0)
64145 +                       return rc;
64146 +
64147 +               /* Read indexes, then verify. */
64148 +               cons = intf->rsp_cons;
64149 +               prod = intf->rsp_prod;
64150 +               mb();
64151 +               if (!check_indexes(cons, prod)) {
64152 +                       intf->rsp_cons = intf->rsp_prod = 0;
64153 +                       return -EIO;
64154 +               }
64155 +
64156 +               src = get_input_chunk(cons, prod, intf->rsp, &avail);
64157 +               if (avail == 0)
64158 +                       continue;
64159 +               if (avail > len)
64160 +                       avail = len;
64161 +
64162 +               /* We must read header before we read data. */
64163 +               rmb();
64164 +
64165 +               memcpy(data, src, avail);
64166 +               data += avail;
64167 +               len -= avail;
64168 +
64169 +               /* Other side must not see free space until we've copied out */
64170 +               mb();
64171 +               intf->rsp_cons += avail;
64172 +
64173 +               pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
64174 +
64175 +               /* Implies mb(): they will see new header. */
64176 +               notify_remote_via_evtchn(xen_start_info->store_evtchn);
64177 +       }
64178 +
64179 +       return 0;
64180 +}
64181 +
64182 +/* Set up interrupt handler off store event channel. */
64183 +int xb_init_comms(void)
64184 +{
64185 +       int err;
64186 +
64187 +       if (xenbus_irq)
64188 +               unbind_from_irqhandler(xenbus_irq, &xb_waitq);
64189 +
64190 +       err = bind_evtchn_to_irqhandler(
64191 +               xen_start_info->store_evtchn, wake_waiting,
64192 +               0, "xenbus", &xb_waitq);
64193 +       if (err <= 0) {
64194 +               printk(KERN_ERR "XENBUS request irq failed %i\n", err);
64195 +               return err;
64196 +       }
64197 +
64198 +       xenbus_irq = err;
64199 +
64200 +       return 0;
64201 +}
64202 diff -urNp linux-2.6/drivers/xen/xenbus/xenbus_comms.h new/drivers/xen/xenbus/xenbus_comms.h
64203 --- linux-2.6/drivers/xen/xenbus/xenbus_comms.h 1970-01-01 01:00:00.000000000 +0100
64204 +++ new/drivers/xen/xenbus/xenbus_comms.h       2006-05-23 18:42:17.000000000 +0200
64205 @@ -0,0 +1,43 @@
64206 +/*
64207 + * Private include for xenbus communications.
64208 + * 
64209 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
64210 + *
64211 + * This program is free software; you can redistribute it and/or
64212 + * modify it under the terms of the GNU General Public License version 2
64213 + * as published by the Free Software Foundation; or, when distributed
64214 + * separately from the Linux kernel or incorporated into other
64215 + * software packages, subject to the following license:
64216 + * 
64217 + * Permission is hereby granted, free of charge, to any person obtaining a copy
64218 + * of this source file (the "Software"), to deal in the Software without
64219 + * restriction, including without limitation the rights to use, copy, modify,
64220 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
64221 + * and to permit persons to whom the Software is furnished to do so, subject to
64222 + * the following conditions:
64223 + * 
64224 + * The above copyright notice and this permission notice shall be included in
64225 + * all copies or substantial portions of the Software.
64226 + * 
64227 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64228 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
64229 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64230 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
64231 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
64232 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
64233 + * IN THE SOFTWARE.
64234 + */
64235 +
64236 +#ifndef _XENBUS_COMMS_H
64237 +#define _XENBUS_COMMS_H
64238 +
64239 +int xs_init(void);
64240 +int xb_init_comms(void);
64241 +
64242 +/* Low level routines. */
64243 +int xb_write(const void *data, unsigned len);
64244 +int xb_read(void *data, unsigned len);
64245 +int xs_input_avail(void);
64246 +extern wait_queue_head_t xb_waitq;
64247 +
64248 +#endif /* _XENBUS_COMMS_H */
64249 diff -urNp linux-2.6/drivers/xen/xenbus/xenbus_dev.c new/drivers/xen/xenbus/xenbus_dev.c
64250 --- linux-2.6/drivers/xen/xenbus/xenbus_dev.c   1970-01-01 01:00:00.000000000 +0100
64251 +++ new/drivers/xen/xenbus/xenbus_dev.c 2006-06-28 14:32:14.000000000 +0200
64252 @@ -0,0 +1,245 @@
64253 +/*
64254 + * xenbus_dev.c
64255 + * 
64256 + * Driver giving user-space access to the kernel's xenbus connection
64257 + * to xenstore.
64258 + * 
64259 + * Copyright (c) 2005, Christian Limpach
64260 + * Copyright (c) 2005, Rusty Russell, IBM Corporation
64261 + * 
64262 + * This program is free software; you can redistribute it and/or
64263 + * modify it under the terms of the GNU General Public License version 2
64264 + * as published by the Free Software Foundation; or, when distributed
64265 + * separately from the Linux kernel or incorporated into other
64266 + * software packages, subject to the following license:
64267 + * 
64268 + * Permission is hereby granted, free of charge, to any person obtaining a copy
64269 + * of this source file (the "Software"), to deal in the Software without
64270 + * restriction, including without limitation the rights to use, copy, modify,
64271 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
64272 + * and to permit persons to whom the Software is furnished to do so, subject to
64273 + * the following conditions:
64274 + * 
64275 + * The above copyright notice and this permission notice shall be included in
64276 + * all copies or substantial portions of the Software.
64277 + * 
64278 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64279 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
64280 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64281 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
64282 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
64283 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
64284 + * IN THE SOFTWARE.
64285 + */
64286 +
64287 +#include <linux/config.h>
64288 +#include <linux/kernel.h>
64289 +#include <linux/errno.h>
64290 +#include <linux/uio.h>
64291 +#include <linux/notifier.h>
64292 +#include <linux/wait.h>
64293 +#include <linux/fs.h>
64294 +#include <linux/poll.h>
64295 +
64296 +#include "xenbus_comms.h"
64297 +
64298 +#include <asm/uaccess.h>
64299 +#include <asm/hypervisor.h>
64300 +#include <xen/xenbus.h>
64301 +#include <xen/xen_proc.h>
64302 +#include <asm/hypervisor.h>
64303 +
64304 +struct xenbus_dev_transaction {
64305 +       struct list_head list;
64306 +       struct xenbus_transaction handle;
64307 +};
64308 +
64309 +struct xenbus_dev_data {
64310 +       /* In-progress transaction. */
64311 +       struct list_head transactions;
64312 +
64313 +       /* Partial request. */
64314 +       unsigned int len;
64315 +       union {
64316 +               struct xsd_sockmsg msg;
64317 +               char buffer[PAGE_SIZE];
64318 +       } u;
64319 +
64320 +       /* Response queue. */
64321 +#define MASK_READ_IDX(idx) ((idx)&(PAGE_SIZE-1))
64322 +       char read_buffer[PAGE_SIZE];
64323 +       unsigned int read_cons, read_prod;
64324 +       wait_queue_head_t read_waitq;
64325 +};
64326 +
64327 +static struct proc_dir_entry *xenbus_dev_intf;
64328 +
64329 +static ssize_t xenbus_dev_read(struct file *filp,
64330 +                              char __user *ubuf,
64331 +                              size_t len, loff_t *ppos)
64332 +{
64333 +       struct xenbus_dev_data *u = filp->private_data;
64334 +       int i;
64335 +
64336 +       if (wait_event_interruptible(u->read_waitq,
64337 +                                    u->read_prod != u->read_cons))
64338 +               return -EINTR;
64339 +
64340 +       for (i = 0; i < len; i++) {
64341 +               if (u->read_cons == u->read_prod)
64342 +                       break;
64343 +               put_user(u->read_buffer[MASK_READ_IDX(u->read_cons)], ubuf+i);
64344 +               u->read_cons++;
64345 +       }
64346 +
64347 +       return i;
64348 +}
64349 +
64350 +static void queue_reply(struct xenbus_dev_data *u,
64351 +                       char *data, unsigned int len)
64352 +{
64353 +       int i;
64354 +
64355 +       for (i = 0; i < len; i++, u->read_prod++)
64356 +               u->read_buffer[MASK_READ_IDX(u->read_prod)] = data[i];
64357 +
64358 +       BUG_ON((u->read_prod - u->read_cons) > sizeof(u->read_buffer));
64359 +
64360 +       wake_up(&u->read_waitq);
64361 +}
64362 +
64363 +static ssize_t xenbus_dev_write(struct file *filp,
64364 +                               const char __user *ubuf,
64365 +                               size_t len, loff_t *ppos)
64366 +{
64367 +       struct xenbus_dev_data *u = filp->private_data;
64368 +       struct xenbus_dev_transaction *trans = NULL;
64369 +       uint32_t msg_type;
64370 +       void *reply;
64371 +
64372 +       if ((len + u->len) > sizeof(u->u.buffer))
64373 +               return -EINVAL;
64374 +
64375 +       if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0)
64376 +               return -EFAULT;
64377 +
64378 +       u->len += len;
64379 +       if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
64380 +               return len;
64381 +
64382 +       msg_type = u->u.msg.type;
64383 +
64384 +       switch (msg_type) {
64385 +       case XS_TRANSACTION_START:
64386 +       case XS_TRANSACTION_END:
64387 +       case XS_DIRECTORY:
64388 +       case XS_READ:
64389 +       case XS_GET_PERMS:
64390 +       case XS_RELEASE:
64391 +       case XS_GET_DOMAIN_PATH:
64392 +       case XS_WRITE:
64393 +       case XS_MKDIR:
64394 +       case XS_RM:
64395 +       case XS_SET_PERMS:
64396 +               if (msg_type == XS_TRANSACTION_START) {
64397 +                       trans = kmalloc(sizeof(*trans), GFP_KERNEL);
64398 +                       if (!trans)
64399 +                               return -ENOMEM;
64400 +               }
64401 +
64402 +               reply = xenbus_dev_request_and_reply(&u->u.msg);
64403 +               if (IS_ERR(reply)) {
64404 +                       kfree(trans);
64405 +                       return PTR_ERR(reply);
64406 +               }
64407 +
64408 +               if (msg_type == XS_TRANSACTION_START) {
64409 +                       trans->handle.id = simple_strtoul(reply, NULL, 0);
64410 +                       list_add(&trans->list, &u->transactions);
64411 +               } else if (msg_type == XS_TRANSACTION_END) {
64412 +                       list_for_each_entry(trans, &u->transactions, list)
64413 +                               if (trans->handle.id == u->u.msg.tx_id)
64414 +                                       break;
64415 +                       BUG_ON(&trans->list == &u->transactions);
64416 +                       list_del(&trans->list);
64417 +                       kfree(trans);
64418 +               }
64419 +               queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg));
64420 +               queue_reply(u, (char *)reply, u->u.msg.len);
64421 +               kfree(reply);
64422 +               break;
64423 +
64424 +       default:
64425 +               return -EINVAL;
64426 +       }
64427 +
64428 +       u->len = 0;
64429 +       return len;
64430 +}
64431 +
64432 +static int xenbus_dev_open(struct inode *inode, struct file *filp)
64433 +{
64434 +       struct xenbus_dev_data *u;
64435 +
64436 +       if (xen_start_info->store_evtchn == 0)
64437 +               return -ENOENT;
64438 +
64439 +       nonseekable_open(inode, filp);
64440 +
64441 +       u = kzalloc(sizeof(*u), GFP_KERNEL);
64442 +       if (u == NULL)
64443 +               return -ENOMEM;
64444 +
64445 +       INIT_LIST_HEAD(&u->transactions);
64446 +       init_waitqueue_head(&u->read_waitq);
64447 +
64448 +       filp->private_data = u;
64449 +
64450 +       return 0;
64451 +}
64452 +
64453 +static int xenbus_dev_release(struct inode *inode, struct file *filp)
64454 +{
64455 +       struct xenbus_dev_data *u = filp->private_data;
64456 +       struct xenbus_dev_transaction *trans, *tmp;
64457 +
64458 +       list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
64459 +               xenbus_transaction_end(trans->handle, 1);
64460 +               list_del(&trans->list);
64461 +               kfree(trans);
64462 +       }
64463 +
64464 +       kfree(u);
64465 +
64466 +       return 0;
64467 +}
64468 +
64469 +static unsigned int xenbus_dev_poll(struct file *file, poll_table *wait)
64470 +{
64471 +       struct xenbus_dev_data *u = file->private_data;
64472 +
64473 +       poll_wait(file, &u->read_waitq, wait);
64474 +       if (u->read_cons != u->read_prod)
64475 +               return POLLIN | POLLRDNORM;
64476 +       return 0;
64477 +}
64478 +
64479 +static struct file_operations xenbus_dev_file_ops = {
64480 +       .read = xenbus_dev_read,
64481 +       .write = xenbus_dev_write,
64482 +       .open = xenbus_dev_open,
64483 +       .release = xenbus_dev_release,
64484 +       .poll = xenbus_dev_poll,
64485 +};
64486 +
64487 +static int __init
64488 +xenbus_dev_init(void)
64489 +{
64490 +       xenbus_dev_intf = create_xen_proc_entry("xenbus", 0400);
64491 +       if (xenbus_dev_intf)
64492 +               xenbus_dev_intf->proc_fops = &xenbus_dev_file_ops;
64493 +
64494 +       return 0;
64495 +}
64496 +
64497 +__initcall(xenbus_dev_init);
64498 diff -urNp linux-2.6/drivers/xen/xenbus/xenbus_probe.c new/drivers/xen/xenbus/xenbus_probe.c
64499 --- linux-2.6/drivers/xen/xenbus/xenbus_probe.c 1970-01-01 01:00:00.000000000 +0100
64500 +++ new/drivers/xen/xenbus/xenbus_probe.c       2006-07-07 16:05:52.000000000 +0200
64501 @@ -0,0 +1,1107 @@
64502 +/******************************************************************************
64503 + * Talks to Xen Store to figure out what devices we have.
64504 + *
64505 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
64506 + * Copyright (C) 2005 Mike Wray, Hewlett-Packard
64507 + * Copyright (C) 2005, 2006 XenSource Ltd
64508 + * 
64509 + * This program is free software; you can redistribute it and/or
64510 + * modify it under the terms of the GNU General Public License version 2
64511 + * as published by the Free Software Foundation; or, when distributed
64512 + * separately from the Linux kernel or incorporated into other
64513 + * software packages, subject to the following license:
64514 + * 
64515 + * Permission is hereby granted, free of charge, to any person obtaining a copy
64516 + * of this source file (the "Software"), to deal in the Software without
64517 + * restriction, including without limitation the rights to use, copy, modify,
64518 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
64519 + * and to permit persons to whom the Software is furnished to do so, subject to
64520 + * the following conditions:
64521 + * 
64522 + * The above copyright notice and this permission notice shall be included in
64523 + * all copies or substantial portions of the Software.
64524 + * 
64525 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64526 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
64527 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64528 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
64529 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
64530 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
64531 + * IN THE SOFTWARE.
64532 + */
64533 +
64534 +#define DPRINTK(fmt, args...)                          \
64535 +       pr_debug("xenbus_probe (%s:%d) " fmt ".\n",     \
64536 +                __FUNCTION__, __LINE__, ##args)
64537 +
64538 +#include <linux/kernel.h>
64539 +#include <linux/err.h>
64540 +#include <linux/string.h>
64541 +#include <linux/ctype.h>
64542 +#include <linux/fcntl.h>
64543 +#include <linux/mm.h>
64544 +#include <linux/notifier.h>
64545 +#include <linux/kthread.h>
64546 +
64547 +#include <asm/io.h>
64548 +#include <asm/page.h>
64549 +#include <asm/pgtable.h>
64550 +#include <asm/hypervisor.h>
64551 +#include <xen/xenbus.h>
64552 +#include <xen/xen_proc.h>
64553 +#include <xen/evtchn.h>
64554 +#include <xen/features.h>
64555 +
64556 +#include "xenbus_comms.h"
64557 +
64558 +extern struct mutex xenwatch_mutex;
64559 +
64560 +static BLOCKING_NOTIFIER_HEAD(xenstore_notifier_list);
64561 +
64562 +/* If something in array of ids matches this device, return it. */
64563 +static const struct xenbus_device_id *
64564 +match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
64565 +{
64566 +       for (; *arr->devicetype != '\0'; arr++) {
64567 +               if (!strcmp(arr->devicetype, dev->devicetype))
64568 +                       return arr;
64569 +       }
64570 +       return NULL;
64571 +}
64572 +
64573 +static int xenbus_match(struct device *_dev, struct device_driver *_drv)
64574 +{
64575 +       struct xenbus_driver *drv = to_xenbus_driver(_drv);
64576 +
64577 +       if (!drv->ids)
64578 +               return 0;
64579 +
64580 +       return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
64581 +}
64582 +
64583 +struct xen_bus_type
64584 +{
64585 +       char *root;
64586 +       unsigned int levels;
64587 +       int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
64588 +       int (*probe)(const char *type, const char *dir);
64589 +       struct bus_type bus;
64590 +       struct device dev;
64591 +};
64592 +
64593 +
64594 +/* device/<type>/<id> => <type>-<id> */
64595 +static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
64596 +{
64597 +       nodename = strchr(nodename, '/');
64598 +       if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) {
64599 +               printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
64600 +               return -EINVAL;
64601 +       }
64602 +
64603 +       strlcpy(bus_id, nodename + 1, BUS_ID_SIZE);
64604 +       if (!strchr(bus_id, '/')) {
64605 +               printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
64606 +               return -EINVAL;
64607 +       }
64608 +       *strchr(bus_id, '/') = '-';
64609 +       return 0;
64610 +}
64611 +
64612 +
64613 +static void free_otherend_details(struct xenbus_device *dev)
64614 +{
64615 +       kfree(dev->otherend);
64616 +       dev->otherend = NULL;
64617 +}
64618 +
64619 +
64620 +static void free_otherend_watch(struct xenbus_device *dev)
64621 +{
64622 +       if (dev->otherend_watch.node) {
64623 +               unregister_xenbus_watch(&dev->otherend_watch);
64624 +               kfree(dev->otherend_watch.node);
64625 +               dev->otherend_watch.node = NULL;
64626 +       }
64627 +}
64628 +
64629 +
64630 +static int read_otherend_details(struct xenbus_device *xendev,
64631 +                                char *id_node, char *path_node)
64632 +{
64633 +       int err = xenbus_gather(XBT_NIL, xendev->nodename,
64634 +                               id_node, "%i", &xendev->otherend_id,
64635 +                               path_node, NULL, &xendev->otherend,
64636 +                               NULL);
64637 +       if (err) {
64638 +               xenbus_dev_fatal(xendev, err,
64639 +                                "reading other end details from %s",
64640 +                                xendev->nodename);
64641 +               return err;
64642 +       }
64643 +       if (strlen(xendev->otherend) == 0 ||
64644 +           !xenbus_exists(XBT_NIL, xendev->otherend, "")) {
64645 +               xenbus_dev_fatal(xendev, -ENOENT, "missing other end from %s",
64646 +                                xendev->nodename);
64647 +               free_otherend_details(xendev);
64648 +               return -ENOENT;
64649 +       }
64650 +
64651 +       return 0;
64652 +}
64653 +
64654 +
64655 +static int read_backend_details(struct xenbus_device *xendev)
64656 +{
64657 +       return read_otherend_details(xendev, "backend-id", "backend");
64658 +}
64659 +
64660 +
64661 +static int read_frontend_details(struct xenbus_device *xendev)
64662 +{
64663 +       return read_otherend_details(xendev, "frontend-id", "frontend");
64664 +}
64665 +
64666 +
64667 +/* Bus type for frontend drivers. */
64668 +static int xenbus_probe_frontend(const char *type, const char *name);
64669 +static struct xen_bus_type xenbus_frontend = {
64670 +       .root = "device",
64671 +       .levels = 2,            /* device/type/<id> */
64672 +       .get_bus_id = frontend_bus_id,
64673 +       .probe = xenbus_probe_frontend,
64674 +       .bus = {
64675 +               .name  = "xen",
64676 +               .match = xenbus_match,
64677 +       },
64678 +       .dev = {
64679 +               .bus_id = "xen",
64680 +       },
64681 +};
64682 +
64683 +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
64684 +static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
64685 +{
64686 +       int domid, err;
64687 +       const char *devid, *type, *frontend;
64688 +       unsigned int typelen;
64689 +
64690 +       type = strchr(nodename, '/');
64691 +       if (!type)
64692 +               return -EINVAL;
64693 +       type++;
64694 +       typelen = strcspn(type, "/");
64695 +       if (!typelen || type[typelen] != '/')
64696 +               return -EINVAL;
64697 +
64698 +       devid = strrchr(nodename, '/') + 1;
64699 +
64700 +       err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
64701 +                           "frontend", NULL, &frontend,
64702 +                           NULL);
64703 +       if (err)
64704 +               return err;
64705 +       if (strlen(frontend) == 0)
64706 +               err = -ERANGE;
64707 +       if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
64708 +               err = -ENOENT;
64709 +
64710 +       kfree(frontend);
64711 +
64712 +       if (err)
64713 +               return err;
64714 +
64715 +       if (snprintf(bus_id, BUS_ID_SIZE,
64716 +                    "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE)
64717 +               return -ENOSPC;
64718 +       return 0;
64719 +}
64720 +
64721 +static int xenbus_uevent_backend(struct device *dev, char **envp,
64722 +                                int num_envp, char *buffer, int buffer_size);
64723 +static int xenbus_probe_backend(const char *type, const char *domid);
64724 +static struct xen_bus_type xenbus_backend = {
64725 +       .root = "backend",
64726 +       .levels = 3,            /* backend/type/<frontend>/<id> */
64727 +       .get_bus_id = backend_bus_id,
64728 +       .probe = xenbus_probe_backend,
64729 +       .bus = {
64730 +               .name  = "xen-backend",
64731 +               .match = xenbus_match,
64732 +               .uevent = xenbus_uevent_backend,
64733 +       },
64734 +       .dev = {
64735 +               .bus_id = "xen-backend",
64736 +       },
64737 +};
64738 +
64739 +static int xenbus_uevent_backend(struct device *dev, char **envp,
64740 +                                int num_envp, char *buffer, int buffer_size)
64741 +{
64742 +       struct xenbus_device *xdev;
64743 +       struct xenbus_driver *drv;
64744 +       int i = 0;
64745 +       int length = 0;
64746 +
64747 +       DPRINTK("");
64748 +
64749 +       if (dev == NULL)
64750 +               return -ENODEV;
64751 +
64752 +       xdev = to_xenbus_device(dev);
64753 +       if (xdev == NULL)
64754 +               return -ENODEV;
64755 +
64756 +       /* stuff we want to pass to /sbin/hotplug */
64757 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
64758 +                      "XENBUS_TYPE=%s", xdev->devicetype);
64759 +
64760 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
64761 +                      "XENBUS_PATH=%s", xdev->nodename);
64762 +
64763 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
64764 +                      "XENBUS_BASE_PATH=%s", xenbus_backend.root);
64765 +
64766 +       /* terminate, set to next free slot, shrink available space */
64767 +       envp[i] = NULL;
64768 +       envp = &envp[i];
64769 +       num_envp -= i;
64770 +       buffer = &buffer[length];
64771 +       buffer_size -= length;
64772 +
64773 +       if (dev->driver) {
64774 +               drv = to_xenbus_driver(dev->driver);
64775 +               if (drv && drv->uevent)
64776 +                       return drv->uevent(xdev, envp, num_envp, buffer,
64777 +                                          buffer_size);
64778 +       }
64779 +
64780 +       return 0;
64781 +}
64782 +
64783 +static void otherend_changed(struct xenbus_watch *watch,
64784 +                            const char **vec, unsigned int len)
64785 +{
64786 +       struct xenbus_device *dev =
64787 +               container_of(watch, struct xenbus_device, otherend_watch);
64788 +       struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
64789 +       enum xenbus_state state;
64790 +
64791 +       /* Protect us against watches firing on old details when the otherend
64792 +          details change, say immediately after a resume. */
64793 +       if (!dev->otherend ||
64794 +           strncmp(dev->otherend, vec[XS_WATCH_PATH],
64795 +                   strlen(dev->otherend))) {
64796 +               DPRINTK("Ignoring watch at %s", vec[XS_WATCH_PATH]);
64797 +               return;
64798 +       }
64799 +
64800 +       state = xenbus_read_driver_state(dev->otherend);
64801 +
64802 +       DPRINTK("state is %d, %s, %s",
64803 +               state, dev->otherend_watch.node, vec[XS_WATCH_PATH]);
64804 +       if (drv->otherend_changed)
64805 +               drv->otherend_changed(dev, state);
64806 +}
64807 +
64808 +
64809 +static int talk_to_otherend(struct xenbus_device *dev)
64810 +{
64811 +       struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
64812 +
64813 +       free_otherend_watch(dev);
64814 +       free_otherend_details(dev);
64815 +
64816 +       return drv->read_otherend_details(dev);
64817 +}
64818 +
64819 +
64820 +static int watch_otherend(struct xenbus_device *dev)
64821 +{
64822 +       return xenbus_watch_path2(dev, dev->otherend, "state",
64823 +                                 &dev->otherend_watch, otherend_changed);
64824 +}
64825 +
64826 +
64827 +static int xenbus_dev_probe(struct device *_dev)
64828 +{
64829 +       struct xenbus_device *dev = to_xenbus_device(_dev);
64830 +       struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
64831 +       const struct xenbus_device_id *id;
64832 +       int err;
64833 +
64834 +       DPRINTK("");
64835 +
64836 +       if (!drv->probe) {
64837 +               err = -ENODEV;
64838 +               goto fail;
64839 +       }
64840 +
64841 +       id = match_device(drv->ids, dev);
64842 +       if (!id) {
64843 +               err = -ENODEV;
64844 +               goto fail;
64845 +       }
64846 +
64847 +       err = talk_to_otherend(dev);
64848 +       if (err) {
64849 +               printk(KERN_WARNING
64850 +                      "xenbus_probe: talk_to_otherend on %s failed.\n",
64851 +                      dev->nodename);
64852 +               return err;
64853 +       }
64854 +
64855 +       err = drv->probe(dev, id);
64856 +       if (err)
64857 +               goto fail;
64858 +
64859 +       err = watch_otherend(dev);
64860 +       if (err) {
64861 +               printk(KERN_WARNING
64862 +                      "xenbus_probe: watch_otherend on %s failed.\n",
64863 +                      dev->nodename);
64864 +               return err;
64865 +       }
64866 +
64867 +       return 0;
64868 +fail:
64869 +       xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
64870 +       xenbus_switch_state(dev, XenbusStateClosed);
64871 +       return -ENODEV;
64872 +}
64873 +
64874 +static int xenbus_dev_remove(struct device *_dev)
64875 +{
64876 +       struct xenbus_device *dev = to_xenbus_device(_dev);
64877 +       struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
64878 +
64879 +       DPRINTK("");
64880 +
64881 +       free_otherend_watch(dev);
64882 +       free_otherend_details(dev);
64883 +
64884 +       if (drv->remove)
64885 +               drv->remove(dev);
64886 +
64887 +       xenbus_switch_state(dev, XenbusStateClosed);
64888 +       return 0;
64889 +}
64890 +
64891 +static int xenbus_register_driver_common(struct xenbus_driver *drv,
64892 +                                        struct xen_bus_type *bus)
64893 +{
64894 +       int ret;
64895 +
64896 +       drv->driver.name = drv->name;
64897 +       drv->driver.bus = &bus->bus;
64898 +       drv->driver.owner = drv->owner;
64899 +       drv->driver.probe = xenbus_dev_probe;
64900 +       drv->driver.remove = xenbus_dev_remove;
64901 +
64902 +       mutex_lock(&xenwatch_mutex);
64903 +       ret = driver_register(&drv->driver);
64904 +       mutex_unlock(&xenwatch_mutex);
64905 +       return ret;
64906 +}
64907 +
64908 +int xenbus_register_frontend(struct xenbus_driver *drv)
64909 +{
64910 +       drv->read_otherend_details = read_backend_details;
64911 +
64912 +       return xenbus_register_driver_common(drv, &xenbus_frontend);
64913 +}
64914 +EXPORT_SYMBOL_GPL(xenbus_register_frontend);
64915 +
64916 +int xenbus_register_backend(struct xenbus_driver *drv)
64917 +{
64918 +       drv->read_otherend_details = read_frontend_details;
64919 +
64920 +       return xenbus_register_driver_common(drv, &xenbus_backend);
64921 +}
64922 +EXPORT_SYMBOL_GPL(xenbus_register_backend);
64923 +
64924 +void xenbus_unregister_driver(struct xenbus_driver *drv)
64925 +{
64926 +       driver_unregister(&drv->driver);
64927 +}
64928 +EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
64929 +
64930 +struct xb_find_info
64931 +{
64932 +       struct xenbus_device *dev;
64933 +       const char *nodename;
64934 +};
64935 +
64936 +static int cmp_dev(struct device *dev, void *data)
64937 +{
64938 +       struct xenbus_device *xendev = to_xenbus_device(dev);
64939 +       struct xb_find_info *info = data;
64940 +
64941 +       if (!strcmp(xendev->nodename, info->nodename)) {
64942 +               info->dev = xendev;
64943 +               get_device(dev);
64944 +               return 1;
64945 +       }
64946 +       return 0;
64947 +}
64948 +
64949 +struct xenbus_device *xenbus_device_find(const char *nodename,
64950 +                                        struct bus_type *bus)
64951 +{
64952 +       struct xb_find_info info = { .dev = NULL, .nodename = nodename };
64953 +
64954 +       bus_for_each_dev(bus, NULL, &info, cmp_dev);
64955 +       return info.dev;
64956 +}
64957 +
64958 +static int cleanup_dev(struct device *dev, void *data)
64959 +{
64960 +       struct xenbus_device *xendev = to_xenbus_device(dev);
64961 +       struct xb_find_info *info = data;
64962 +       int len = strlen(info->nodename);
64963 +
64964 +       DPRINTK("%s", info->nodename);
64965 +
64966 +       /* Match the info->nodename path, or any subdirectory of that path. */
64967 +       if (strncmp(xendev->nodename, info->nodename, len))
64968 +               return 0;
64969 +
64970 +       /* If the node name is longer, ensure it really is a subdirectory. */
64971 +       if ((strlen(xendev->nodename) > len) && (xendev->nodename[len] != '/'))
64972 +               return 0;
64973 +
64974 +       info->dev = xendev;
64975 +       get_device(dev);
64976 +       return 1;
64977 +}
64978 +
64979 +static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
64980 +{
64981 +       struct xb_find_info info = { .nodename = path };
64982 +
64983 +       do {
64984 +               info.dev = NULL;
64985 +               bus_for_each_dev(bus, NULL, &info, cleanup_dev);
64986 +               if (info.dev) {
64987 +                       device_unregister(&info.dev->dev);
64988 +                       put_device(&info.dev->dev);
64989 +               }
64990 +       } while (info.dev);
64991 +}
64992 +
64993 +static void xenbus_dev_release(struct device *dev)
64994 +{
64995 +       if (dev)
64996 +               kfree(to_xenbus_device(dev));
64997 +}
64998 +
64999 +/* Simplified asprintf. */
65000 +char *kasprintf(const char *fmt, ...)
65001 +{
65002 +       va_list ap;
65003 +       unsigned int len;
65004 +       char *p, dummy[1];
65005 +
65006 +       va_start(ap, fmt);
65007 +       /* FIXME: vsnprintf has a bug, NULL should work */
65008 +       len = vsnprintf(dummy, 0, fmt, ap);
65009 +       va_end(ap);
65010 +
65011 +       p = kmalloc(len + 1, GFP_KERNEL);
65012 +       if (!p)
65013 +               return NULL;
65014 +       va_start(ap, fmt);
65015 +       vsprintf(p, fmt, ap);
65016 +       va_end(ap);
65017 +       return p;
65018 +}
65019 +
65020 +static ssize_t xendev_show_nodename(struct device *dev,
65021 +                                   struct device_attribute *attr, char *buf)
65022 +{
65023 +       return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
65024 +}
65025 +DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
65026 +
65027 +static ssize_t xendev_show_devtype(struct device *dev,
65028 +                                  struct device_attribute *attr, char *buf)
65029 +{
65030 +       return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
65031 +}
65032 +DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
65033 +
65034 +
65035 +static int xenbus_probe_node(struct xen_bus_type *bus,
65036 +                            const char *type,
65037 +                            const char *nodename)
65038 +{
65039 +       int err;
65040 +       struct xenbus_device *xendev;
65041 +       size_t stringlen;
65042 +       char *tmpstring;
65043 +
65044 +       enum xenbus_state state = xenbus_read_driver_state(nodename);
65045 +
65046 +       if (state != XenbusStateInitialising) {
65047 +               /* Device is not new, so ignore it.  This can happen if a
65048 +                  device is going away after switching to Closed.  */
65049 +               return 0;
65050 +       }
65051 +
65052 +       stringlen = strlen(nodename) + 1 + strlen(type) + 1;
65053 +       xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
65054 +       if (!xendev)
65055 +               return -ENOMEM;
65056 +
65057 +       /* Copy the strings into the extra space. */
65058 +
65059 +       tmpstring = (char *)(xendev + 1);
65060 +       strcpy(tmpstring, nodename);
65061 +       xendev->nodename = tmpstring;
65062 +
65063 +       tmpstring += strlen(tmpstring) + 1;
65064 +       strcpy(tmpstring, type);
65065 +       xendev->devicetype = tmpstring;
65066 +
65067 +       xendev->dev.parent = &bus->dev;
65068 +       xendev->dev.bus = &bus->bus;
65069 +       xendev->dev.release = xenbus_dev_release;
65070 +
65071 +       err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
65072 +       if (err)
65073 +               goto fail;
65074 +
65075 +       /* Register with generic device framework. */
65076 +       err = device_register(&xendev->dev);
65077 +       if (err)
65078 +               goto fail;
65079 +
65080 +       device_create_file(&xendev->dev, &dev_attr_nodename);
65081 +       device_create_file(&xendev->dev, &dev_attr_devtype);
65082 +
65083 +       return 0;
65084 +fail:
65085 +       kfree(xendev);
65086 +       return err;
65087 +}
65088 +
65089 +/* device/<typename>/<name> */
65090 +static int xenbus_probe_frontend(const char *type, const char *name)
65091 +{
65092 +       char *nodename;
65093 +       int err;
65094 +
65095 +       nodename = kasprintf("%s/%s/%s", xenbus_frontend.root, type, name);
65096 +       if (!nodename)
65097 +               return -ENOMEM;
65098 +
65099 +       DPRINTK("%s", nodename);
65100 +
65101 +       err = xenbus_probe_node(&xenbus_frontend, type, nodename);
65102 +       kfree(nodename);
65103 +       return err;
65104 +}
65105 +
65106 +/* backend/<typename>/<frontend-uuid>/<name> */
65107 +static int xenbus_probe_backend_unit(const char *dir,
65108 +                                    const char *type,
65109 +                                    const char *name)
65110 +{
65111 +       char *nodename;
65112 +       int err;
65113 +
65114 +       nodename = kasprintf("%s/%s", dir, name);
65115 +       if (!nodename)
65116 +               return -ENOMEM;
65117 +
65118 +       DPRINTK("%s\n", nodename);
65119 +
65120 +       err = xenbus_probe_node(&xenbus_backend, type, nodename);
65121 +       kfree(nodename);
65122 +       return err;
65123 +}
65124 +
65125 +/* backend/<typename>/<frontend-domid> */
65126 +static int xenbus_probe_backend(const char *type, const char *domid)
65127 +{
65128 +       char *nodename;
65129 +       int err = 0;
65130 +       char **dir;
65131 +       unsigned int i, dir_n = 0;
65132 +
65133 +       DPRINTK("");
65134 +
65135 +       nodename = kasprintf("%s/%s/%s", xenbus_backend.root, type, domid);
65136 +       if (!nodename)
65137 +               return -ENOMEM;
65138 +
65139 +       dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n);
65140 +       if (IS_ERR(dir)) {
65141 +               kfree(nodename);
65142 +               return PTR_ERR(dir);
65143 +       }
65144 +
65145 +       for (i = 0; i < dir_n; i++) {
65146 +               err = xenbus_probe_backend_unit(nodename, type, dir[i]);
65147 +               if (err)
65148 +                       break;
65149 +       }
65150 +       kfree(dir);
65151 +       kfree(nodename);
65152 +       return err;
65153 +}
65154 +
65155 +static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
65156 +{
65157 +       int err = 0;
65158 +       char **dir;
65159 +       unsigned int dir_n = 0;
65160 +       int i;
65161 +
65162 +       dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n);
65163 +       if (IS_ERR(dir))
65164 +               return PTR_ERR(dir);
65165 +
65166 +       for (i = 0; i < dir_n; i++) {
65167 +               err = bus->probe(type, dir[i]);
65168 +               if (err)
65169 +                       break;
65170 +       }
65171 +       kfree(dir);
65172 +       return err;
65173 +}
65174 +
65175 +static int xenbus_probe_devices(struct xen_bus_type *bus)
65176 +{
65177 +       int err = 0;
65178 +       char **dir;
65179 +       unsigned int i, dir_n;
65180 +
65181 +       dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
65182 +       if (IS_ERR(dir))
65183 +               return PTR_ERR(dir);
65184 +
65185 +       for (i = 0; i < dir_n; i++) {
65186 +               err = xenbus_probe_device_type(bus, dir[i]);
65187 +               if (err)
65188 +                       break;
65189 +       }
65190 +       kfree(dir);
65191 +       return err;
65192 +}
65193 +
65194 +static unsigned int char_count(const char *str, char c)
65195 +{
65196 +       unsigned int i, ret = 0;
65197 +
65198 +       for (i = 0; str[i]; i++)
65199 +               if (str[i] == c)
65200 +                       ret++;
65201 +       return ret;
65202 +}
65203 +
65204 +static int strsep_len(const char *str, char c, unsigned int len)
65205 +{
65206 +       unsigned int i;
65207 +
65208 +       for (i = 0; str[i]; i++)
65209 +               if (str[i] == c) {
65210 +                       if (len == 0)
65211 +                               return i;
65212 +                       len--;
65213 +               }
65214 +       return (len == 0) ? i : -ERANGE;
65215 +}
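/*
 * Illustrative note (not part of the patch): a worked example of the helper
 * above.  Assuming the frontend bus is declared with levels == 2 (the
 * device/<type>/<id> layout), a watch event on node "device/vif/0/state"
 * gives strsep_len(node, '/', 2) == 12, i.e. the length of "device/vif/0",
 * so dev_changed() below rebuilds the device root "device/vif/0" from any
 * path underneath it.  The example path is hypothetical.
 */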
65216 +
65217 +static void dev_changed(const char *node, struct xen_bus_type *bus)
65218 +{
65219 +       int exists, rootlen;
65220 +       struct xenbus_device *dev;
65221 +       char type[BUS_ID_SIZE];
65222 +       const char *p, *root;
65223 +
65224 +       if (char_count(node, '/') < 2)
65225 +               return;
65226 +
65227 +       exists = xenbus_exists(XBT_NIL, node, "");
65228 +       if (!exists) {
65229 +               xenbus_cleanup_devices(node, &bus->bus);
65230 +               return;
65231 +       }
65232 +
65233 +       /* backend/<type>/... or device/<type>/... */
65234 +       p = strchr(node, '/') + 1;
65235 +       snprintf(type, BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p);
65236 +       type[BUS_ID_SIZE-1] = '\0';
65237 +
65238 +       rootlen = strsep_len(node, '/', bus->levels);
65239 +       if (rootlen < 0)
65240 +               return;
65241 +       root = kasprintf("%.*s", rootlen, node);
65242 +       if (!root)
65243 +               return;
65244 +
65245 +       dev = xenbus_device_find(root, &bus->bus);
65246 +       if (!dev)
65247 +               xenbus_probe_node(bus, type, root);
65248 +       else
65249 +               put_device(&dev->dev);
65250 +
65251 +       kfree(root);
65252 +}
65253 +
65254 +static void frontend_changed(struct xenbus_watch *watch,
65255 +                            const char **vec, unsigned int len)
65256 +{
65257 +       DPRINTK("");
65258 +
65259 +       dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
65260 +}
65261 +
65262 +static void backend_changed(struct xenbus_watch *watch,
65263 +                           const char **vec, unsigned int len)
65264 +{
65265 +       DPRINTK("");
65266 +
65267 +       dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
65268 +}
65269 +
65270 +/* We watch for devices appearing and vanishing. */
65271 +static struct xenbus_watch fe_watch = {
65272 +       .node = "device",
65273 +       .callback = frontend_changed,
65274 +};
65275 +
65276 +static struct xenbus_watch be_watch = {
65277 +       .node = "backend",
65278 +       .callback = backend_changed,
65279 +};
65280 +
65281 +static int suspend_dev(struct device *dev, void *data)
65282 +{
65283 +       int err = 0;
65284 +       struct xenbus_driver *drv;
65285 +       struct xenbus_device *xdev;
65286 +
65287 +       DPRINTK("");
65288 +
65289 +       if (dev->driver == NULL)
65290 +               return 0;
65291 +       drv = to_xenbus_driver(dev->driver);
65292 +       xdev = container_of(dev, struct xenbus_device, dev);
65293 +       if (drv->suspend)
65294 +               err = drv->suspend(xdev);
65295 +       if (err)
65296 +               printk(KERN_WARNING
65297 +                      "xenbus: suspend %s failed: %i\n", dev->bus_id, err);
65298 +       return 0;
65299 +}
65300 +
65301 +static int resume_dev(struct device *dev, void *data)
65302 +{
65303 +       int err;
65304 +       struct xenbus_driver *drv;
65305 +       struct xenbus_device *xdev;
65306 +
65307 +       DPRINTK("");
65308 +
65309 +       if (dev->driver == NULL)
65310 +               return 0;
65311 +
65312 +       drv = to_xenbus_driver(dev->driver);
65313 +       xdev = container_of(dev, struct xenbus_device, dev);
65314 +
65315 +       err = talk_to_otherend(xdev);
65316 +       if (err) {
65317 +               printk(KERN_WARNING
65318 +                      "xenbus: resume (talk_to_otherend) %s failed: %i\n",
65319 +                      dev->bus_id, err);
65320 +               return err;
65321 +       }
65322 +
65323 +       xdev->state = XenbusStateInitialising;
65324 +
65325 +       if (drv->resume) {
65326 +               err = drv->resume(xdev);
65327 +               if (err) {
65328 +                       printk(KERN_WARNING
65329 +                              "xenbus: resume %s failed: %i\n",
65330 +                              dev->bus_id, err);
65331 +                       return err;
65332 +               }
65333 +       }
65334 +
65335 +       err = watch_otherend(xdev);
65336 +       if (err) {
65337 +               printk(KERN_WARNING
65338 +                      "xenbus_probe: resume (watch_otherend) %s failed: "
65339 +                      "%d.\n", dev->bus_id, err);
65340 +               return err;
65341 +       }
65342 +
65343 +       return 0;
65344 +}
65345 +
65346 +void xenbus_suspend(void)
65347 +{
65348 +       DPRINTK("");
65349 +
65350 +       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
65351 +       bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, suspend_dev);
65352 +       xs_suspend();
65353 +}
65354 +EXPORT_SYMBOL_GPL(xenbus_suspend);
65355 +
65356 +void xenbus_resume(void)
65357 +{
65358 +       xb_init_comms();
65359 +       xs_resume();
65360 +       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
65361 +       bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, resume_dev);
65362 +}
65363 +EXPORT_SYMBOL_GPL(xenbus_resume);
65364 +
65365 +
65366 +/* A flag to determine if xenstored is 'ready' (i.e. has started) */
65367 +int xenstored_ready = 0;
65368 +
65369 +
65370 +int register_xenstore_notifier(struct notifier_block *nb)
65371 +{
65372 +       int ret = 0;
65373 +
65374 +       if (xenstored_ready > 0)
65375 +               ret = nb->notifier_call(nb, 0, NULL);
65376 +       else
65377 +               blocking_notifier_chain_register(&xenstore_notifier_list, nb);
65378 +
65379 +       return ret;
65380 +}
65381 +EXPORT_SYMBOL_GPL(register_xenstore_notifier);
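/*
 * Illustrative sketch (not part of the patch, wrapped in #if 0 so it is
 * never compiled): a caller that must wait for xenstore to come up can hook
 * the notifier chain above.  The callback is invoked immediately when
 * xenstored_ready is already set, otherwise once xenbus_probe() fires the
 * chain.  Names prefixed my_ are hypothetical; NOTIFY_DONE comes from
 * <linux/notifier.h>.
 */
#if 0
static int my_xenstore_ready(struct notifier_block *nb,
                             unsigned long event, void *data)
{
        /* Safe to talk to xenstore from here on. */
        return NOTIFY_DONE;
}

static struct notifier_block my_xenstore_nb = {
        .notifier_call = my_xenstore_ready,
};

static int __init my_driver_init(void)
{
        return register_xenstore_notifier(&my_xenstore_nb);
}
#endif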
65382 +
65383 +void unregister_xenstore_notifier(struct notifier_block *nb)
65384 +{
65385 +       blocking_notifier_chain_unregister(&xenstore_notifier_list, nb);
65386 +}
65387 +EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
65388 +
65389 +
65390 +void xenbus_probe(void *unused)
65391 +{
65392 +       BUG_ON(xenstored_ready <= 0);
65393 +
65394 +       /* Enumerate devices in xenstore. */
65395 +       xenbus_probe_devices(&xenbus_frontend);
65396 +       xenbus_probe_devices(&xenbus_backend);
65397 +
65398 +       /* Watch for changes. */
65399 +       register_xenbus_watch(&fe_watch);
65400 +       register_xenbus_watch(&be_watch);
65401 +
65402 +       /* Notify others that xenstore is up */
65403 +       blocking_notifier_call_chain(&xenstore_notifier_list, 0, NULL);
65404 +}
65405 +
65406 +
65407 +#ifdef CONFIG_PROC_FS
65408 +static struct file_operations xsd_kva_fops;
65409 +static struct proc_dir_entry *xsd_kva_intf;
65410 +static struct proc_dir_entry *xsd_port_intf;
65411 +
65412 +static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
65413 +{
65414 +       size_t size = vma->vm_end - vma->vm_start;
65415 +
65416 +       if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
65417 +               return -EINVAL;
65418 +
65419 +       if (remap_pfn_range(vma, vma->vm_start,
65420 +                           mfn_to_pfn(xen_start_info->store_mfn),
65421 +                           size, vma->vm_page_prot))
65422 +               return -EAGAIN;
65423 +
65424 +       return 0;
65425 +}
65426 +
65427 +static int xsd_kva_read(char *page, char **start, off_t off,
65428 +                       int count, int *eof, void *data)
65429 +{
65430 +       int len;
65431 +
65432 +       len  = sprintf(page, "0x%p", mfn_to_virt(xen_start_info->store_mfn));
65433 +       *eof = 1;
65434 +       return len;
65435 +}
65436 +
65437 +static int xsd_port_read(char *page, char **start, off_t off,
65438 +                        int count, int *eof, void *data)
65439 +{
65440 +       int len;
65441 +
65442 +       len  = sprintf(page, "%d", xen_start_info->store_evtchn);
65443 +       *eof = 1;
65444 +       return len;
65445 +}
65446 +#endif
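/*
 * Illustrative sketch (not part of the patch, never compiled): the entries
 * published above are consumed by the userspace xenstored in dom0, which
 * reads the event-channel port from /proc/xen/xsd_port and mmap()s one page
 * of the store ring via /proc/xen/xsd_kva.  The paths assume the /proc/xen
 * root used by create_xen_proc_entry(); the userspace headers (fcntl.h,
 * sys/mman.h, unistd.h) are implied.
 */
#if 0   /* userspace illustration, not kernel code */
static void *map_xenstore_ring(void)
{
        int fd = open("/proc/xen/xsd_kva", O_RDWR);

        if (fd < 0)
                return NULL;
        /* xsd_kva_mmap() above only allows one page at offset 0. */
        return mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
                    MAP_SHARED, fd, 0);
}
#endif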
65447 +
65448 +
65449 +static int __init xenbus_probe_init(void)
65450 +{
65451 +       int err = 0, dom0;
65452 +       unsigned long page = 0;
65453 +
65454 +       DPRINTK("");
65455 +
65456 +       if (!is_running_on_xen())
65457 +               return -ENODEV;
65458 +
65459 +       /* Register ourselves with the kernel bus subsystem */
65460 +       bus_register(&xenbus_frontend.bus);
65461 +       bus_register(&xenbus_backend.bus);
65462 +
65463 +       /*
65464 +        * Domain0 doesn't have a store_evtchn or store_mfn yet.
65465 +        */
65466 +       dom0 = (xen_start_info->store_evtchn == 0);
65467 +
65468 +       if (dom0) {
65469 +               struct evtchn_alloc_unbound alloc_unbound;
65470 +
65471 +               /* Allocate page. */
65472 +               page = get_zeroed_page(GFP_KERNEL);
65473 +               if (!page)
65474 +                       return -ENOMEM;
65475 +
65476 +               xen_start_info->store_mfn =
65477 +                       pfn_to_mfn(virt_to_phys((void *)page) >>
65478 +                                  PAGE_SHIFT);
65479 +
65480 +               /* Next allocate a local port which xenstored can bind to */
65481 +               alloc_unbound.dom        = DOMID_SELF;
65482 +               alloc_unbound.remote_dom = 0;
65483 +
65484 +               err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
65485 +                                                 &alloc_unbound);
65486 +               if (err == -ENOSYS)
65487 +                       goto err;
65488 +               BUG_ON(err);
65489 +               xen_start_info->store_evtchn = alloc_unbound.port;
65490 +
65491 +#ifdef CONFIG_PROC_FS
65492 +               /* And finally publish the above info in /proc/xen */
65493 +               xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600);
65494 +               if (xsd_kva_intf) {
65495 +                       memcpy(&xsd_kva_fops, xsd_kva_intf->proc_fops,
65496 +                              sizeof(xsd_kva_fops));
65497 +                       xsd_kva_fops.mmap = xsd_kva_mmap;
65498 +                       xsd_kva_intf->proc_fops = &xsd_kva_fops;
65499 +                       xsd_kva_intf->read_proc = xsd_kva_read;
65500 +               }
65501 +               xsd_port_intf = create_xen_proc_entry("xsd_port", 0400);
65502 +               if (xsd_port_intf)
65503 +                       xsd_port_intf->read_proc = xsd_port_read;
65504 +#endif
65505 +       } else
65506 +               xenstored_ready = 1;
65507 +
65508 +       /* Initialize the interface to xenstore. */
65509 +       err = xs_init();
65510 +       if (err) {
65511 +               printk(KERN_WARNING
65512 +                      "XENBUS: Error initializing xenstore comms: %i\n", err);
65513 +               goto err;
65514 +       }
65515 +
65516 +       /* Register ourselves with the kernel device subsystem */
65517 +       device_register(&xenbus_frontend.dev);
65518 +       device_register(&xenbus_backend.dev);
65519 +
65520 +       if (!dom0)
65521 +               xenbus_probe(NULL);
65522 +
65523 +       return 0;
65524 +
65525 + err:
65526 +       if (page)
65527 +               free_page(page);
65528 +
65529 +       /*
65530 +        * Do not unregister the xenbus front/backend buses here. The buses
65531 +        * must exist because front/backend drivers will use them when they are
65532 +        * registered.
65533 +        */
65534 +
65535 +       return err;
65536 +}
65537 +
65538 +postcore_initcall(xenbus_probe_init);
65539 +
65540 +
65541 +static int is_disconnected_device(struct device *dev, void *data)
65542 +{
65543 +       struct xenbus_device *xendev = to_xenbus_device(dev);
65544 +
65545 +       /*
65546 +        * A device with no driver will never connect. We care only about
65547 +        * devices which should currently be in the process of connecting.
65548 +        */
65549 +       if (!dev->driver)
65550 +               return 0;
65551 +
65552 +       return (xendev->state != XenbusStateConnected);
65553 +}
65554 +
65555 +static int exists_disconnected_device(void)
65556 +{
65557 +       return bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL,
65558 +                               is_disconnected_device);
65559 +}
65560 +
65561 +static int print_device_status(struct device *dev, void *data)
65562 +{
65563 +       struct xenbus_device *xendev = to_xenbus_device(dev);
65564 +
65565 +       if (!dev->driver) {
65566 +               /* Information only: is this too noisy? */
65567 +               printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
65568 +                      xendev->nodename);
65569 +       } else if (xendev->state != XenbusStateConnected) {
65570 +               printk(KERN_WARNING "XENBUS: Timeout connecting "
65571 +                      "to device: %s (state %d)\n",
65572 +                      xendev->nodename, xendev->state);
65573 +       }
65574 +
65575 +       return 0;
65576 +}
65577 +
65578 +/*
65579 + * Wait up to 10 seconds for all currently configured devices to connect.  We
65580 + * need to do this to guarantee that the filesystems and/or network devices
65581 + * needed for boot are available before we allow the boot to proceed.
65582 + *
65583 + * This needs to be on a late_initcall, to happen after the frontend device
65584 + * drivers have been initialised, but before the root fs is mounted.
65585 + *
65586 + * A possible improvement here would be to have the tools add a per-device
65587 + * flag to the store entry, indicating whether it is needed at boot time.
65588 + * This would allow people who knew what they were doing to accelerate their
65589 + * boot slightly, but of course needs tools or manual intervention to set up
65590 + * those flags correctly.
65591 + */
65592 +static int __init wait_for_devices(void)
65593 +{
65594 +       unsigned long timeout = jiffies + 10*HZ;
65595 +
65596 +       if (!is_running_on_xen())
65597 +               return -ENODEV;
65598 +
65599 +       while (time_before(jiffies, timeout) && exists_disconnected_device())
65600 +               schedule_timeout_interruptible(HZ/10);
65601 +
65602 +       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL,
65603 +                        print_device_status);
65604 +
65605 +       return 0;
65606 +}
65607 +
65608 +late_initcall(wait_for_devices);
65609 diff -urNp linux-2.6/drivers/xen/xenbus/xenbus_xs.c new/drivers/xen/xenbus/xenbus_xs.c
65610 --- linux-2.6/drivers/xen/xenbus/xenbus_xs.c    1970-01-01 01:00:00.000000000 +0100
65611 +++ new/drivers/xen/xenbus/xenbus_xs.c  2006-06-28 14:32:14.000000000 +0200
65612 @@ -0,0 +1,846 @@
65613 +/******************************************************************************
65614 + * xenbus_xs.c
65615 + *
65616 + * This is the kernel equivalent of the "xs" library.  We don't need everything
65617 + * and we use xenbus_comms for communication.
65618 + *
65619 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
65620 + * 
65621 + * This program is free software; you can redistribute it and/or
65622 + * modify it under the terms of the GNU General Public License version 2
65623 + * as published by the Free Software Foundation; or, when distributed
65624 + * separately from the Linux kernel or incorporated into other
65625 + * software packages, subject to the following license:
65626 + * 
65627 + * Permission is hereby granted, free of charge, to any person obtaining a copy
65628 + * of this source file (the "Software"), to deal in the Software without
65629 + * restriction, including without limitation the rights to use, copy, modify,
65630 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
65631 + * and to permit persons to whom the Software is furnished to do so, subject to
65632 + * the following conditions:
65633 + * 
65634 + * The above copyright notice and this permission notice shall be included in
65635 + * all copies or substantial portions of the Software.
65636 + * 
65637 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
65638 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
65639 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
65640 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
65641 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
65642 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
65643 + * IN THE SOFTWARE.
65644 + */
65645 +
65646 +#include <linux/unistd.h>
65647 +#include <linux/errno.h>
65648 +#include <linux/types.h>
65649 +#include <linux/uio.h>
65650 +#include <linux/kernel.h>
65651 +#include <linux/string.h>
65652 +#include <linux/err.h>
65653 +#include <linux/slab.h>
65654 +#include <linux/fcntl.h>
65655 +#include <linux/kthread.h>
65656 +#include <linux/rwsem.h>
65657 +#include <xen/xenbus.h>
65658 +#include "xenbus_comms.h"
65659 +
65660 +/* xenbus_probe.c */
65661 +extern char *kasprintf(const char *fmt, ...);
65662 +
65663 +struct xs_stored_msg {
65664 +       struct list_head list;
65665 +
65666 +       struct xsd_sockmsg hdr;
65667 +
65668 +       union {
65669 +               /* Queued replies. */
65670 +               struct {
65671 +                       char *body;
65672 +               } reply;
65673 +
65674 +               /* Queued watch events. */
65675 +               struct {
65676 +                       struct xenbus_watch *handle;
65677 +                       char **vec;
65678 +                       unsigned int vec_size;
65679 +               } watch;
65680 +       } u;
65681 +};
65682 +
65683 +struct xs_handle {
65684 +       /* A list of replies. Currently only one will ever be outstanding. */
65685 +       struct list_head reply_list;
65686 +       spinlock_t reply_lock;
65687 +       wait_queue_head_t reply_waitq;
65688 +
65689 +       /* One request at a time. */
65690 +       struct mutex request_mutex;
65691 +
65692 +       /* Protect transactions against save/restore. */
65693 +       struct rw_semaphore suspend_mutex;
65694 +};
65695 +
65696 +static struct xs_handle xs_state;
65697 +
65698 +/* List of registered watches, and a lock to protect it. */
65699 +static LIST_HEAD(watches);
65700 +static DEFINE_SPINLOCK(watches_lock);
65701 +
65702 +/* List of pending watch callback events, and a lock to protect it. */
65703 +static LIST_HEAD(watch_events);
65704 +static DEFINE_SPINLOCK(watch_events_lock);
65705 +
65706 +/*
65707 + * Details of the xenwatch callback kernel thread. The thread waits on the
65708 + * watch_events_waitq for work to do (queued on watch_events list). When it
65709 + * wakes up it acquires the xenwatch_mutex before reading the list and
65710 + * carrying out work.
65711 + */
65712 +static pid_t xenwatch_pid;
65713 +/* static */ DEFINE_MUTEX(xenwatch_mutex);
65714 +static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
65715 +
65716 +static int get_error(const char *errorstring)
65717 +{
65718 +       unsigned int i;
65719 +
65720 +       for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {
65721 +               if (i == ARRAY_SIZE(xsd_errors) - 1) {
65722 +                       printk(KERN_WARNING
65723 +                              "XENBUS xen store gave: unknown error %s",
65724 +                              errorstring);
65725 +                       return EINVAL;
65726 +               }
65727 +       }
65728 +       return xsd_errors[i].errnum;
65729 +}
65730 +
65731 +static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
65732 +{
65733 +       struct xs_stored_msg *msg;
65734 +       char *body;
65735 +
65736 +       spin_lock(&xs_state.reply_lock);
65737 +
65738 +       while (list_empty(&xs_state.reply_list)) {
65739 +               spin_unlock(&xs_state.reply_lock);
65740 +               /* XXX FIXME: Avoid synchronous wait for response here. */
65741 +               wait_event(xs_state.reply_waitq,
65742 +                          !list_empty(&xs_state.reply_list));
65743 +               spin_lock(&xs_state.reply_lock);
65744 +       }
65745 +
65746 +       msg = list_entry(xs_state.reply_list.next,
65747 +                        struct xs_stored_msg, list);
65748 +       list_del(&msg->list);
65749 +
65750 +       spin_unlock(&xs_state.reply_lock);
65751 +
65752 +       *type = msg->hdr.type;
65753 +       if (len)
65754 +               *len = msg->hdr.len;
65755 +       body = msg->u.reply.body;
65756 +
65757 +       kfree(msg);
65758 +
65759 +       return body;
65760 +}
65761 +
65762 +/* Emergency write. */
65763 +void xenbus_debug_write(const char *str, unsigned int count)
65764 +{
65765 +       struct xsd_sockmsg msg = { 0 };
65766 +
65767 +       msg.type = XS_DEBUG;
65768 +       msg.len = sizeof("print") + count + 1;
65769 +
65770 +       mutex_lock(&xs_state.request_mutex);
65771 +       xb_write(&msg, sizeof(msg));
65772 +       xb_write("print", sizeof("print"));
65773 +       xb_write(str, count);
65774 +       xb_write("", 1);
65775 +       mutex_unlock(&xs_state.request_mutex);
65776 +}
65777 +
65778 +void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
65779 +{
65780 +       void *ret;
65781 +       struct xsd_sockmsg req_msg = *msg;
65782 +       int err;
65783 +
65784 +       if (req_msg.type == XS_TRANSACTION_START)
65785 +               down_read(&xs_state.suspend_mutex);
65786 +
65787 +       mutex_lock(&xs_state.request_mutex);
65788 +
65789 +       err = xb_write(msg, sizeof(*msg) + msg->len);
65790 +       if (err) {
65791 +               msg->type = XS_ERROR;
65792 +               ret = ERR_PTR(err);
65793 +       } else
65794 +               ret = read_reply(&msg->type, &msg->len);
65795 +
65796 +       mutex_unlock(&xs_state.request_mutex);
65797 +
65798 +       if ((req_msg.type == XS_TRANSACTION_END) ||
65799 +           ((req_msg.type == XS_TRANSACTION_START) &&
65800 +            (msg->type == XS_ERROR)))
65801 +               up_read(&xs_state.suspend_mutex);
65802 +
65803 +       return ret;
65804 +}
65805 +
65806 +/* Send message to xs, get kmalloc'ed reply.  ERR_PTR() on error. */
65807 +static void *xs_talkv(struct xenbus_transaction t,
65808 +                     enum xsd_sockmsg_type type,
65809 +                     const struct kvec *iovec,
65810 +                     unsigned int num_vecs,
65811 +                     unsigned int *len)
65812 +{
65813 +       struct xsd_sockmsg msg;
65814 +       void *ret = NULL;
65815 +       unsigned int i;
65816 +       int err;
65817 +
65818 +       msg.tx_id = t.id;
65819 +       msg.req_id = 0;
65820 +       msg.type = type;
65821 +       msg.len = 0;
65822 +       for (i = 0; i < num_vecs; i++)
65823 +               msg.len += iovec[i].iov_len;
65824 +
65825 +       mutex_lock(&xs_state.request_mutex);
65826 +
65827 +       err = xb_write(&msg, sizeof(msg));
65828 +       if (err) {
65829 +               mutex_unlock(&xs_state.request_mutex);
65830 +               return ERR_PTR(err);
65831 +       }
65832 +
65833 +       for (i = 0; i < num_vecs; i++) {
65834 +               err = xb_write(iovec[i].iov_base, iovec[i].iov_len);
65835 +               if (err) {
65836 +                       mutex_unlock(&xs_state.request_mutex);
65837 +                       return ERR_PTR(err);
65838 +               }
65839 +       }
65840 +
65841 +       ret = read_reply(&msg.type, len);
65842 +
65843 +       mutex_unlock(&xs_state.request_mutex);
65844 +
65845 +       if (IS_ERR(ret))
65846 +               return ret;
65847 +
65848 +       if (msg.type == XS_ERROR) {
65849 +               err = get_error(ret);
65850 +               kfree(ret);
65851 +               return ERR_PTR(-err);
65852 +       }
65853 +
65854 +       if (msg.type != type) {
65855 +               if (printk_ratelimit())
65856 +                       printk(KERN_WARNING
65857 +                              "XENBUS unexpected type [%d], expected [%d]\n",
65858 +                              msg.type, type);
65859 +               kfree(ret);
65860 +               return ERR_PTR(-EINVAL);
65861 +       }
65862 +       return ret;
65863 +}
65864 +
65865 +/* Simplified version of xs_talkv: single message. */
65866 +static void *xs_single(struct xenbus_transaction t,
65867 +                      enum xsd_sockmsg_type type,
65868 +                      const char *string,
65869 +                      unsigned int *len)
65870 +{
65871 +       struct kvec iovec;
65872 +
65873 +       iovec.iov_base = (void *)string;
65874 +       iovec.iov_len = strlen(string) + 1;
65875 +       return xs_talkv(t, type, &iovec, 1, len);
65876 +}
65877 +
65878 +/* Many commands only need an ack, don't care what it says. */
65879 +static int xs_error(char *reply)
65880 +{
65881 +       if (IS_ERR(reply))
65882 +               return PTR_ERR(reply);
65883 +       kfree(reply);
65884 +       return 0;
65885 +}
65886 +
65887 +static unsigned int count_strings(const char *strings, unsigned int len)
65888 +{
65889 +       unsigned int num;
65890 +       const char *p;
65891 +
65892 +       for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1)
65893 +               num++;
65894 +
65895 +       return num;
65896 +}
65897 +
65898 +/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
65899 +static char *join(const char *dir, const char *name)
65900 +{
65901 +       char *buffer;
65902 +
65903 +       if (strlen(name) == 0)
65904 +               buffer = kasprintf("%s", dir);
65905 +       else
65906 +               buffer = kasprintf("%s/%s", dir, name);
65907 +       return (!buffer) ? ERR_PTR(-ENOMEM) : buffer;
65908 +}
65909 +
65910 +static char **split(char *strings, unsigned int len, unsigned int *num)
65911 +{
65912 +       char *p, **ret;
65913 +
65914 +       /* Count the strings. */
65915 +       *num = count_strings(strings, len);
65916 +
65917 +       /* Transfer to one big alloc for easy freeing. */
65918 +       ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL);
65919 +       if (!ret) {
65920 +               kfree(strings);
65921 +               return ERR_PTR(-ENOMEM);
65922 +       }
65923 +       memcpy(&ret[*num], strings, len);
65924 +       kfree(strings);
65925 +
65926 +       strings = (char *)&ret[*num];
65927 +       for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
65928 +               ret[(*num)++] = p;
65929 +
65930 +       return ret;
65931 +}
65932 +
65933 +char **xenbus_directory(struct xenbus_transaction t,
65934 +                       const char *dir, const char *node, unsigned int *num)
65935 +{
65936 +       char *strings, *path;
65937 +       unsigned int len;
65938 +
65939 +       path = join(dir, node);
65940 +       if (IS_ERR(path))
65941 +               return (char **)path;
65942 +
65943 +       strings = xs_single(t, XS_DIRECTORY, path, &len);
65944 +       kfree(path);
65945 +       if (IS_ERR(strings))
65946 +               return (char **)strings;
65947 +
65948 +       return split(strings, len, num);
65949 +}
65950 +EXPORT_SYMBOL_GPL(xenbus_directory);
65951 +
65952 +/* Check if a path exists. Return 1 if it does. */
65953 +int xenbus_exists(struct xenbus_transaction t,
65954 +                 const char *dir, const char *node)
65955 +{
65956 +       char **d;
65957 +       unsigned int dir_n;
65958 +
65959 +       d = xenbus_directory(t, dir, node, &dir_n);
65960 +       if (IS_ERR(d))
65961 +               return 0;
65962 +       kfree(d);
65963 +       return 1;
65964 +}
65965 +EXPORT_SYMBOL_GPL(xenbus_exists);
65966 +
65967 +/* Get the value of a single file.
65968 + * Returns a kmalloced value: call kfree() on it after use.
65969 + * len indicates length in bytes.
65970 + */
65971 +void *xenbus_read(struct xenbus_transaction t,
65972 +                 const char *dir, const char *node, unsigned int *len)
65973 +{
65974 +       char *path;
65975 +       void *ret;
65976 +
65977 +       path = join(dir, node);
65978 +       if (IS_ERR(path))
65979 +               return (void *)path;
65980 +
65981 +       ret = xs_single(t, XS_READ, path, len);
65982 +       kfree(path);
65983 +       return ret;
65984 +}
65985 +EXPORT_SYMBOL_GPL(xenbus_read);
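/*
 * Illustrative sketch (not part of the patch, never compiled): reading a
 * single node with the function above and releasing the buffer with kfree().
 * The helper name and the use of dev->otherend (the peer's store path, as
 * declared in xen/xenbus.h) are assumptions for the example.
 */
#if 0
static void my_dump_otherend_state(struct xenbus_device *dev)
{
        char *state = xenbus_read(XBT_NIL, dev->otherend, "state", NULL);

        if (!IS_ERR(state)) {
                /* ... inspect the value ... */
                kfree(state);
        }
}
#endif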
65986 +
65987 +/* Write the value of a single file.
65988 + * Returns -err on failure.
65989 + */
65990 +int xenbus_write(struct xenbus_transaction t,
65991 +                const char *dir, const char *node, const char *string)
65992 +{
65993 +       const char *path;
65994 +       struct kvec iovec[2];
65995 +       int ret;
65996 +
65997 +       path = join(dir, node);
65998 +       if (IS_ERR(path))
65999 +               return PTR_ERR(path);
66000 +
66001 +       iovec[0].iov_base = (void *)path;
66002 +       iovec[0].iov_len = strlen(path) + 1;
66003 +       iovec[1].iov_base = (void *)string;
66004 +       iovec[1].iov_len = strlen(string);
66005 +
66006 +       ret = xs_error(xs_talkv(t, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL));
66007 +       kfree(path);
66008 +       return ret;
66009 +}
66010 +EXPORT_SYMBOL_GPL(xenbus_write);
66011 +
66012 +/* Create a new directory. */
66013 +int xenbus_mkdir(struct xenbus_transaction t,
66014 +                const char *dir, const char *node)
66015 +{
66016 +       char *path;
66017 +       int ret;
66018 +
66019 +       path = join(dir, node);
66020 +       if (IS_ERR(path))
66021 +               return PTR_ERR(path);
66022 +
66023 +       ret = xs_error(xs_single(t, XS_MKDIR, path, NULL));
66024 +       kfree(path);
66025 +       return ret;
66026 +}
66027 +EXPORT_SYMBOL_GPL(xenbus_mkdir);
66028 +
66029 +/* Destroy a file or directory (directories must be empty). */
66030 +int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
66031 +{
66032 +       char *path;
66033 +       int ret;
66034 +
66035 +       path = join(dir, node);
66036 +       if (IS_ERR(path))
66037 +               return PTR_ERR(path);
66038 +
66039 +       ret = xs_error(xs_single(t, XS_RM, path, NULL));
66040 +       kfree(path);
66041 +       return ret;
66042 +}
66043 +EXPORT_SYMBOL_GPL(xenbus_rm);
66044 +
66045 +/* Start a transaction: changes by others will not be seen during this
66046 + * transaction, and changes will not be visible to others until end.
66047 + */
66048 +int xenbus_transaction_start(struct xenbus_transaction *t)
66049 +{
66050 +       char *id_str;
66051 +
66052 +       down_read(&xs_state.suspend_mutex);
66053 +
66054 +       id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
66055 +       if (IS_ERR(id_str)) {
66056 +               up_read(&xs_state.suspend_mutex);
66057 +               return PTR_ERR(id_str);
66058 +       }
66059 +
66060 +       t->id = simple_strtoul(id_str, NULL, 0);
66061 +       kfree(id_str);
66062 +       return 0;
66063 +}
66064 +EXPORT_SYMBOL_GPL(xenbus_transaction_start);
66065 +
66066 +/* End a transaction.
66067 + * If abort is non-zero, the transaction is discarded instead of committed.
66068 + */
66069 +int xenbus_transaction_end(struct xenbus_transaction t, int abort)
66070 +{
66071 +       char abortstr[2];
66072 +       int err;
66073 +
66074 +       if (abort)
66075 +               strcpy(abortstr, "F");
66076 +       else
66077 +               strcpy(abortstr, "T");
66078 +
66079 +       err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
66080 +
66081 +       up_read(&xs_state.suspend_mutex);
66082 +
66083 +       return err;
66084 +}
66085 +EXPORT_SYMBOL_GPL(xenbus_transaction_end);
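/*
 * Illustrative sketch (not part of the patch, never compiled): the usual
 * caller pattern for the two functions above.  Writes inside the transaction
 * become visible atomically at end; a conflicting commit returns -EAGAIN and
 * is simply retried.  The helper name, the ring_ref value and the "ring-ref"
 * node are hypothetical; xenbus_printf() is defined later in this file.
 */
#if 0
static int my_publish_ring(struct xenbus_device *dev, unsigned int ring_ref)
{
        struct xenbus_transaction xbt;
        int err;

again:
        err = xenbus_transaction_start(&xbt);
        if (err)
                return err;
        err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u", ring_ref);
        if (err) {
                xenbus_transaction_end(xbt, 1);         /* discard */
                return err;
        }
        err = xenbus_transaction_end(xbt, 0);           /* commit */
        if (err == -EAGAIN)
                goto again;
        return err;
}
#endif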
66086 +
66087 +/* Single read and scanf: returns -errno or num scanned. */
66088 +int xenbus_scanf(struct xenbus_transaction t,
66089 +                const char *dir, const char *node, const char *fmt, ...)
66090 +{
66091 +       va_list ap;
66092 +       int ret;
66093 +       char *val;
66094 +
66095 +       val = xenbus_read(t, dir, node, NULL);
66096 +       if (IS_ERR(val))
66097 +               return PTR_ERR(val);
66098 +
66099 +       va_start(ap, fmt);
66100 +       ret = vsscanf(val, fmt, ap);
66101 +       va_end(ap);
66102 +       kfree(val);
66103 +       /* Distinctive errno. */
66104 +       if (ret == 0)
66105 +               return -ERANGE;
66106 +       return ret;
66107 +}
66108 +EXPORT_SYMBOL_GPL(xenbus_scanf);
66109 +
66110 +/* Single printf and write: returns -errno or 0. */
66111 +int xenbus_printf(struct xenbus_transaction t,
66112 +                 const char *dir, const char *node, const char *fmt, ...)
66113 +{
66114 +       va_list ap;
66115 +       int ret;
66116 +#define PRINTF_BUFFER_SIZE 4096
66117 +       char *printf_buffer;
66118 +
66119 +       printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
66120 +       if (printf_buffer == NULL)
66121 +               return -ENOMEM;
66122 +
66123 +       va_start(ap, fmt);
66124 +       ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
66125 +       va_end(ap);
66126 +
66127 +       BUG_ON(ret > PRINTF_BUFFER_SIZE-1);
66128 +       ret = xenbus_write(t, dir, node, printf_buffer);
66129 +
66130 +       kfree(printf_buffer);
66131 +
66132 +       return ret;
66133 +}
66134 +EXPORT_SYMBOL_GPL(xenbus_printf);
66135 +
66136 +/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
66137 +int xenbus_gather(struct xenbus_transaction t, const char *dir, ...)
66138 +{
66139 +       va_list ap;
66140 +       const char *name;
66141 +       int ret = 0;
66142 +
66143 +       va_start(ap, dir);
66144 +       while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
66145 +               const char *fmt = va_arg(ap, char *);
66146 +               void *result = va_arg(ap, void *);
66147 +               char *p;
66148 +
66149 +               p = xenbus_read(t, dir, name, NULL);
66150 +               if (IS_ERR(p)) {
66151 +                       ret = PTR_ERR(p);
66152 +                       break;
66153 +               }
66154 +               if (fmt) {
66155 +                       if (sscanf(p, fmt, result) == 0)
66156 +                               ret = -EINVAL;
66157 +                       kfree(p);
66158 +               } else
66159 +                       *(char **)result = p;
66160 +       }
66161 +       va_end(ap);
66162 +       return ret;
66163 +}
66164 +EXPORT_SYMBOL_GPL(xenbus_gather);
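/*
 * Illustrative sketch (not part of the patch, never compiled): xenbus_gather()
 * reads several sibling nodes in one call; a NULL format stores the raw
 * kmalloc'ed string instead of scanning it.  The helper name, the node names
 * and the use of dev->otherend are assumptions for the example.
 */
#if 0
static int my_read_otherend(struct xenbus_device *dev,
                            unsigned long *ring_ref, unsigned int *evtchn)
{
        return xenbus_gather(XBT_NIL, dev->otherend,
                             "ring-ref", "%lu", ring_ref,
                             "event-channel", "%u", evtchn,
                             NULL);
}
#endif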
66165 +
66166 +static int xs_watch(const char *path, const char *token)
66167 +{
66168 +       struct kvec iov[2];
66169 +
66170 +       iov[0].iov_base = (void *)path;
66171 +       iov[0].iov_len = strlen(path) + 1;
66172 +       iov[1].iov_base = (void *)token;
66173 +       iov[1].iov_len = strlen(token) + 1;
66174 +
66175 +       return xs_error(xs_talkv(XBT_NIL, XS_WATCH, iov,
66176 +                                ARRAY_SIZE(iov), NULL));
66177 +}
66178 +
66179 +static int xs_unwatch(const char *path, const char *token)
66180 +{
66181 +       struct kvec iov[2];
66182 +
66183 +       iov[0].iov_base = (char *)path;
66184 +       iov[0].iov_len = strlen(path) + 1;
66185 +       iov[1].iov_base = (char *)token;
66186 +       iov[1].iov_len = strlen(token) + 1;
66187 +
66188 +       return xs_error(xs_talkv(XBT_NIL, XS_UNWATCH, iov,
66189 +                                ARRAY_SIZE(iov), NULL));
66190 +}
66191 +
66192 +static struct xenbus_watch *find_watch(const char *token)
66193 +{
66194 +       struct xenbus_watch *i, *cmp;
66195 +
66196 +       cmp = (void *)simple_strtoul(token, NULL, 16);
66197 +
66198 +       list_for_each_entry(i, &watches, list)
66199 +               if (i == cmp)
66200 +                       return i;
66201 +
66202 +       return NULL;
66203 +}
66204 +
66205 +/* Register callback to watch this node. */
66206 +int register_xenbus_watch(struct xenbus_watch *watch)
66207 +{
66208 +       /* Pointer in ascii is the token. */
66209 +       char token[sizeof(watch) * 2 + 1];
66210 +       int err;
66211 +
66212 +       sprintf(token, "%lX", (long)watch);
66213 +
66214 +       down_read(&xs_state.suspend_mutex);
66215 +
66216 +       spin_lock(&watches_lock);
66217 +       BUG_ON(find_watch(token));
66218 +       list_add(&watch->list, &watches);
66219 +       spin_unlock(&watches_lock);
66220 +
66221 +       err = xs_watch(watch->node, token);
66222 +
66223 +       /* Ignore errors due to multiple registration. */
66224 +       if ((err != 0) && (err != -EEXIST)) {
66225 +               spin_lock(&watches_lock);
66226 +               list_del(&watch->list);
66227 +               spin_unlock(&watches_lock);
66228 +       }
66229 +
66230 +       up_read(&xs_state.suspend_mutex);
66231 +
66232 +       return err;
66233 +}
66234 +EXPORT_SYMBOL_GPL(register_xenbus_watch);
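/*
 * Illustrative sketch (not part of the patch, never compiled): a driver-side
 * watch, set up the same way as the fe_watch/be_watch structures in
 * xenbus_probe.c.  The callback runs from the xenwatch thread (or its own
 * kthread when XBWF_new_thread is set in watch->flags).  Names prefixed my_
 * and the watched path are hypothetical.
 */
#if 0
static void my_otherend_changed(struct xenbus_watch *watch,
                                const char **vec, unsigned int len)
{
        /* vec[XS_WATCH_PATH] is the node that fired. */
}

static struct xenbus_watch my_watch = {
        .node     = "backend/vif/0/0/state",
        .callback = my_otherend_changed,
};

static int my_setup_watch(void)
{
        return register_xenbus_watch(&my_watch);
}
#endif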
66235 +
66236 +void unregister_xenbus_watch(struct xenbus_watch *watch)
66237 +{
66238 +       struct xs_stored_msg *msg, *tmp;
66239 +       char token[sizeof(watch) * 2 + 1];
66240 +       int err;
66241 +
66242 +       sprintf(token, "%lX", (long)watch);
66243 +
66244 +       down_read(&xs_state.suspend_mutex);
66245 +
66246 +       spin_lock(&watches_lock);
66247 +       BUG_ON(!find_watch(token));
66248 +       list_del(&watch->list);
66249 +       spin_unlock(&watches_lock);
66250 +
66251 +       err = xs_unwatch(watch->node, token);
66252 +       if (err)
66253 +               printk(KERN_WARNING
66254 +                      "XENBUS Failed to release watch %s: %i\n",
66255 +                      watch->node, err);
66256 +
66257 +       up_read(&xs_state.suspend_mutex);
66258 +
66259 +       /* Cancel pending watch events. */
66260 +       spin_lock(&watch_events_lock);
66261 +       list_for_each_entry_safe(msg, tmp, &watch_events, list) {
66262 +               if (msg->u.watch.handle != watch)
66263 +                       continue;
66264 +               list_del(&msg->list);
66265 +               kfree(msg->u.watch.vec);
66266 +               kfree(msg);
66267 +       }
66268 +       spin_unlock(&watch_events_lock);
66269 +
66270 +       /* Flush any currently-executing callback, unless we are it. :-) */
66271 +       if (current->pid != xenwatch_pid) {
66272 +               mutex_lock(&xenwatch_mutex);
66273 +               mutex_unlock(&xenwatch_mutex);
66274 +       }
66275 +}
66276 +EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
66277 +
66278 +void xs_suspend(void)
66279 +{
66280 +       down_write(&xs_state.suspend_mutex);
66281 +       mutex_lock(&xs_state.request_mutex);
66282 +}
66283 +
66284 +void xs_resume(void)
66285 +{
66286 +       struct xenbus_watch *watch;
66287 +       char token[sizeof(watch) * 2 + 1];
66288 +
66289 +       mutex_unlock(&xs_state.request_mutex);
66290 +
66291 +       /* No need for watches_lock: the suspend_mutex is sufficient. */
66292 +       list_for_each_entry(watch, &watches, list) {
66293 +               sprintf(token, "%lX", (long)watch);
66294 +               xs_watch(watch->node, token);
66295 +       }
66296 +
66297 +       up_write(&xs_state.suspend_mutex);
66298 +}
66299 +
66300 +static int xenwatch_handle_callback(void *data)
66301 +{
66302 +       struct xs_stored_msg *msg = data;
66303 +
66304 +       msg->u.watch.handle->callback(msg->u.watch.handle,
66305 +                                     (const char **)msg->u.watch.vec,
66306 +                                     msg->u.watch.vec_size);
66307 +
66308 +       kfree(msg->u.watch.vec);
66309 +       kfree(msg);
66310 +
66311 +       /* Kill this kthread if we were spawned just for this callback. */
66312 +       if (current->pid != xenwatch_pid)
66313 +               do_exit(0);
66314 +
66315 +       return 0;
66316 +}
66317 +
66318 +static int xenwatch_thread(void *unused)
66319 +{
66320 +       struct list_head *ent;
66321 +       struct xs_stored_msg *msg;
66322 +
66323 +       for (;;) {
66324 +               wait_event_interruptible(watch_events_waitq,
66325 +                                        !list_empty(&watch_events));
66326 +
66327 +               if (kthread_should_stop())
66328 +                       break;
66329 +
66330 +               mutex_lock(&xenwatch_mutex);
66331 +
66332 +               spin_lock(&watch_events_lock);
66333 +               ent = watch_events.next;
66334 +               if (ent != &watch_events)
66335 +                       list_del(ent);
66336 +               spin_unlock(&watch_events_lock);
66337 +
66338 +               if (ent != &watch_events) {
66339 +                       msg = list_entry(ent, struct xs_stored_msg, list);
66340 +                       if (msg->u.watch.handle->flags & XBWF_new_thread)
66341 +                               kthread_run(xenwatch_handle_callback,
66342 +                                           msg, "xenwatch_cb");
66343 +                       else
66344 +                               xenwatch_handle_callback(msg);
66345 +               }
66346 +
66347 +               mutex_unlock(&xenwatch_mutex);
66348 +       }
66349 +
66350 +       return 0;
66351 +}
66352 +
66353 +static int process_msg(void)
66354 +{
66355 +       struct xs_stored_msg *msg;
66356 +       char *body;
66357 +       int err;
66358 +
66359 +       msg = kmalloc(sizeof(*msg), GFP_KERNEL);
66360 +       if (msg == NULL)
66361 +               return -ENOMEM;
66362 +
66363 +       err = xb_read(&msg->hdr, sizeof(msg->hdr));
66364 +       if (err) {
66365 +               kfree(msg);
66366 +               return err;
66367 +       }
66368 +
66369 +       body = kmalloc(msg->hdr.len + 1, GFP_KERNEL);
66370 +       if (body == NULL) {
66371 +               kfree(msg);
66372 +               return -ENOMEM;
66373 +       }
66374 +
66375 +       err = xb_read(body, msg->hdr.len);
66376 +       if (err) {
66377 +               kfree(body);
66378 +               kfree(msg);
66379 +               return err;
66380 +       }
66381 +       body[msg->hdr.len] = '\0';
66382 +
66383 +       if (msg->hdr.type == XS_WATCH_EVENT) {
66384 +               msg->u.watch.vec = split(body, msg->hdr.len,
66385 +                                        &msg->u.watch.vec_size);
66386 +               if (IS_ERR(msg->u.watch.vec)) {
66387 +                       kfree(msg);
66388 +                       return PTR_ERR(msg->u.watch.vec);
66389 +               }
66390 +
66391 +               spin_lock(&watches_lock);
66392 +               msg->u.watch.handle = find_watch(
66393 +                       msg->u.watch.vec[XS_WATCH_TOKEN]);
66394 +               if (msg->u.watch.handle != NULL) {
66395 +                       spin_lock(&watch_events_lock);
66396 +                       list_add_tail(&msg->list, &watch_events);
66397 +                       wake_up(&watch_events_waitq);
66398 +                       spin_unlock(&watch_events_lock);
66399 +               } else {
66400 +                       kfree(msg->u.watch.vec);
66401 +                       kfree(msg);
66402 +               }
66403 +               spin_unlock(&watches_lock);
66404 +       } else {
66405 +               msg->u.reply.body = body;
66406 +               spin_lock(&xs_state.reply_lock);
66407 +               list_add_tail(&msg->list, &xs_state.reply_list);
66408 +               spin_unlock(&xs_state.reply_lock);
66409 +               wake_up(&xs_state.reply_waitq);
66410 +       }
66411 +
66412 +       return 0;
66413 +}
66414 +
66415 +static int xenbus_thread(void *unused)
66416 +{
66417 +       int err;
66418 +
66419 +       for (;;) {
66420 +               err = process_msg();
66421 +               if (err)
66422 +                       printk(KERN_WARNING "XENBUS error %d while reading "
66423 +                              "message\n", err);
66424 +               if (kthread_should_stop())
66425 +                       break;
66426 +       }
66427 +
66428 +       return 0;
66429 +}
66430 +
66431 +int xs_init(void)
66432 +{
66433 +       int err;
66434 +       struct task_struct *task;
66435 +
66436 +       INIT_LIST_HEAD(&xs_state.reply_list);
66437 +       spin_lock_init(&xs_state.reply_lock);
66438 +       init_waitqueue_head(&xs_state.reply_waitq);
66439 +
66440 +       mutex_init(&xs_state.request_mutex);
66441 +       init_rwsem(&xs_state.suspend_mutex);
66442 +
66443 +       /* Initialize the shared memory rings to talk to xenstored */
66444 +       err = xb_init_comms();
66445 +       if (err)
66446 +               return err;
66447 +
66448 +       task = kthread_run(xenwatch_thread, NULL, "xenwatch");
66449 +       if (IS_ERR(task))
66450 +               return PTR_ERR(task);
66451 +       xenwatch_pid = task->pid;
66452 +
66453 +       task = kthread_run(xenbus_thread, NULL, "xenbus");
66454 +       if (IS_ERR(task))
66455 +               return PTR_ERR(task);
66456 +
66457 +       return 0;
66458 +}
66459 diff -urNp linux-2.6/fs/Kconfig new/fs/Kconfig
66460 --- linux-2.6/fs/Kconfig        2006-07-03 14:15:01.000000000 +0200
66461 +++ new/fs/Kconfig      2006-05-09 12:34:40.000000000 +0200
66462 @@ -842,6 +842,7 @@ config TMPFS
66463  config HUGETLBFS
66464         bool "HugeTLB file system support"
66465         depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
66466 +       depends !XEN
66467         help
66468           hugetlbfs is a filesystem backing for HugeTLB pages, based on
66469           ramfs. For architectures that support it, say Y here and read
66470 diff -urNp linux-2.6/include/asm-i386/a.out.h new/include/asm-i386/a.out.h
66471 --- linux-2.6/include/asm-i386/a.out.h  2006-07-03 14:15:09.000000000 +0200
66472 +++ new/include/asm-i386/a.out.h        2006-05-09 12:35:16.000000000 +0200
66473 @@ -19,7 +19,7 @@ struct exec
66474  
66475  #ifdef __KERNEL__
66476  
66477 -#define STACK_TOP      TASK_SIZE
66478 +#define STACK_TOP      (TASK_SIZE - 3*PAGE_SIZE)
66479  
66480  #endif
66481  
66482 diff -urNp linux-2.6/include/asm-i386/apic.h new/include/asm-i386/apic.h
66483 --- linux-2.6/include/asm-i386/apic.h   2006-07-03 14:15:09.000000000 +0200
66484 +++ new/include/asm-i386/apic.h 2006-05-09 12:35:16.000000000 +0200
66485 @@ -132,10 +132,12 @@ extern unsigned int nmi_watchdog;
66486  
66487  extern int disable_timer_pin_1;
66488  
66489 +#ifndef CONFIG_XEN
66490  void smp_send_timer_broadcast_ipi(struct pt_regs *regs);
66491  void switch_APIC_timer_to_ipi(void *cpumask);
66492  void switch_ipi_to_APIC_timer(void *cpumask);
66493  #define ARCH_APICTIMER_STOPS_ON_C3     1
66494 +#endif
66495  
66496  extern int timer_over_8254;
66497  
66498 diff -urNp linux-2.6/include/asm-i386/elf.h new/include/asm-i386/elf.h
66499 --- linux-2.6/include/asm-i386/elf.h    2006-07-03 14:15:09.000000000 +0200
66500 +++ new/include/asm-i386/elf.h  2006-05-09 12:35:16.000000000 +0200
66501 @@ -129,11 +129,16 @@ extern int dump_task_extended_fpu (struc
66502  #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
66503  #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
66504  
66505 -#define VSYSCALL_BASE  (__fix_to_virt(FIX_VSYSCALL))
66506 +#define VSYSCALL_BASE  (PAGE_OFFSET - 2*PAGE_SIZE)
66507  #define VSYSCALL_EHDR  ((const struct elfhdr *) VSYSCALL_BASE)
66508  #define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
66509  extern void __kernel_vsyscall;
66510  
66511 +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
66512 +struct linux_binprm;
66513 +extern int arch_setup_additional_pages(struct linux_binprm *bprm,
66514 +                                       int executable_stack);
66515 +
66516  #define ARCH_DLINFO                                            \
66517  do {                                                           \
66518                 NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY);        \
66519 diff -urNp linux-2.6/include/asm-i386/fixmap.h new/include/asm-i386/fixmap.h
66520 --- linux-2.6/include/asm-i386/fixmap.h 2006-07-03 14:15:09.000000000 +0200
66521 +++ new/include/asm-i386/fixmap.h       2006-05-09 12:35:16.000000000 +0200
66522 @@ -20,7 +20,7 @@
66523   * Leave one empty page between vmalloc'ed areas and
66524   * the start of the fixmap.
66525   */
66526 -#define __FIXADDR_TOP  0xfffff000
66527 +extern unsigned long __FIXADDR_TOP;
66528  
66529  #ifndef __ASSEMBLY__
66530  #include <linux/kernel.h>
66531 @@ -52,7 +52,6 @@
66532   */
66533  enum fixed_addresses {
66534         FIX_HOLE,
66535 -       FIX_VSYSCALL,
66536  #ifdef CONFIG_X86_LOCAL_APIC
66537         FIX_APIC_BASE,  /* local (CPU) APIC) -- required for SMP or not */
66538  #endif
66539 @@ -95,6 +94,8 @@ enum fixed_addresses {
66540  extern void __set_fixmap (enum fixed_addresses idx,
66541                                         unsigned long phys, pgprot_t flags);
66542  
66543 +extern void set_fixaddr_top(unsigned long top);
66544 +
66545  #define set_fixmap(idx, phys) \
66546                 __set_fixmap(idx, phys, PAGE_KERNEL)
66547  /*
66548 @@ -116,14 +117,6 @@ extern void __set_fixmap (enum fixed_add
66549  #define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
66550  #define __virt_to_fix(x)       ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
66551  
66552 -/*
66553 - * This is the range that is readable by user mode, and things
66554 - * acting like user mode such as get_user_pages.
66555 - */
66556 -#define FIXADDR_USER_START     (__fix_to_virt(FIX_VSYSCALL))
66557 -#define FIXADDR_USER_END       (FIXADDR_USER_START + PAGE_SIZE)
66558 -
66559 -
66560  extern void __this_fixmap_does_not_exist(void);
66561  
66562  /*
66563 diff -urNp linux-2.6/include/asm-i386/mach-default/mach_traps.h new/include/asm-i386/mach-default/mach_traps.h
66564 --- linux-2.6/include/asm-i386/mach-default/mach_traps.h        2006-07-03 14:15:09.000000000 +0200
66565 +++ new/include/asm-i386/mach-default/mach_traps.h      2006-05-09 12:35:16.000000000 +0200
66566 @@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig
66567         outb(reason, 0x61);
66568  }
66569  
66570 +static inline void clear_io_check_error(unsigned char reason)
66571 +{
66572 +       unsigned long i;
66573 +
66574 +       reason = (reason & 0xf) | 8;
66575 +       outb(reason, 0x61);
66576 +       i = 2000;
66577 +       while (--i) udelay(1000);
66578 +       reason &= ~8;
66579 +       outb(reason, 0x61);
66580 +}
66581 +
66582  static inline unsigned char get_nmi_reason(void)
66583  {
66584         return inb(0x61);
66585 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/agp.h new/include/asm-i386/mach-xen/asm/agp.h
66586 --- linux-2.6/include/asm-i386/mach-xen/asm/agp.h       1970-01-01 01:00:00.000000000 +0100
66587 +++ new/include/asm-i386/mach-xen/asm/agp.h     2006-05-09 12:35:17.000000000 +0200
66588 @@ -0,0 +1,37 @@
66589 +#ifndef AGP_H
66590 +#define AGP_H 1
66591 +
66592 +#include <asm/pgtable.h>
66593 +#include <asm/cacheflush.h>
66594 +#include <asm/system.h>
66595 +
66596 +/* 
66597 + * Functions to keep the agpgart mappings coherent with the MMU.
66598 + * The GART gives the CPU a physical alias of pages in memory. The alias region is
66599 + * mapped uncacheable. Make sure there are no conflicting mappings
66600 + * with different cachability attributes for the same page. This avoids
66601 + * with different cacheability attributes for the same page. This avoids
66602 + */
66603 +
66604 +int map_page_into_agp(struct page *page);
66605 +int unmap_page_from_agp(struct page *page);
66606 +#define flush_agp_mappings() global_flush_tlb()
66607 +
66608 +/* Could use CLFLUSH here if the cpu supports it. But then it would
66609 +   need to be called for each cacheline of the whole page so it may not be 
66610 +   worth it. Would need a page for it. */
66611 +#define flush_agp_cache() wbinvd()
66612 +
66613 +/* Convert a physical address to an address suitable for the GART. */
66614 +#define phys_to_gart(x) phys_to_machine(x)
66615 +#define gart_to_phys(x) machine_to_phys(x)
66616 +
66617 +/* GATT allocation. Returns/accepts GATT kernel virtual address. */
66618 +#define alloc_gatt_pages(order)        ({                                          \
66619 +       char *_t; dma_addr_t _d;                                            \
66620 +       _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL);    \
66621 +       _t; })
66622 +#define free_gatt_pages(table, order)  \
66623 +       dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
66624 +
66625 +#endif
66626 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/desc.h new/include/asm-i386/mach-xen/asm/desc.h
66627 --- linux-2.6/include/asm-i386/mach-xen/asm/desc.h      1970-01-01 01:00:00.000000000 +0100
66628 +++ new/include/asm-i386/mach-xen/asm/desc.h    2006-05-09 12:35:17.000000000 +0200
66629 @@ -0,0 +1,164 @@
66630 +#ifndef __ARCH_DESC_H
66631 +#define __ARCH_DESC_H
66632 +
66633 +#include <asm/ldt.h>
66634 +#include <asm/segment.h>
66635 +
66636 +#define CPU_16BIT_STACK_SIZE 1024
66637 +
66638 +#ifndef __ASSEMBLY__
66639 +
66640 +#include <linux/preempt.h>
66641 +#include <linux/smp.h>
66642 +
66643 +#include <asm/mmu.h>
66644 +
66645 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
66646 +
66647 +DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
66648 +
66649 +struct Xgt_desc_struct {
66650 +       unsigned short size;
66651 +       unsigned long address __attribute__((packed));
66652 +       unsigned short pad;
66653 +} __attribute__ ((packed));
66654 +
66655 +extern struct Xgt_desc_struct idt_descr;
66656 +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
66657 +
66658 +
66659 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
66660 +{
66661 +       return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
66662 +}
66663 +
66664 +#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
66665 +#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
66666 +
66667 +#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
66668 +#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
66669 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
66670 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
66671 +
66672 +#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
66673 +#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
66674 +#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
66675 +#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
66676 +
66677 +/*
66678 + * This is the ldt that every process will get unless we need
66679 + * something other than this.
66680 + */
66681 +extern struct desc_struct default_ldt[];
66682 +extern void set_intr_gate(unsigned int irq, void * addr);
66683 +
66684 +#define _set_tssldt_desc(n,addr,limit,type) \
66685 +__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
66686 +       "movw %w1,2(%2)\n\t" \
66687 +       "rorl $16,%1\n\t" \
66688 +       "movb %b1,4(%2)\n\t" \
66689 +       "movb %4,5(%2)\n\t" \
66690 +       "movb $0,6(%2)\n\t" \
66691 +       "movb %h1,7(%2)\n\t" \
66692 +       "rorl $16,%1" \
66693 +       : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
66694 +
66695 +#ifndef CONFIG_X86_NO_TSS
66696 +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
66697 +{
66698 +       _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
66699 +               offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
66700 +}
66701 +
66702 +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
66703 +#endif
66704 +
66705 +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
66706 +{
66707 +       _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
66708 +}
66709 +
66710 +#define LDT_entry_a(info) \
66711 +       ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
66712 +
66713 +#define LDT_entry_b(info) \
66714 +       (((info)->base_addr & 0xff000000) | \
66715 +       (((info)->base_addr & 0x00ff0000) >> 16) | \
66716 +       ((info)->limit & 0xf0000) | \
66717 +       (((info)->read_exec_only ^ 1) << 9) | \
66718 +       ((info)->contents << 10) | \
66719 +       (((info)->seg_not_present ^ 1) << 15) | \
66720 +       ((info)->seg_32bit << 22) | \
66721 +       ((info)->limit_in_pages << 23) | \
66722 +       ((info)->useable << 20) | \
66723 +       0x7000)
66724 +
66725 +#define LDT_empty(info) (\
66726 +       (info)->base_addr       == 0    && \
66727 +       (info)->limit           == 0    && \
66728 +       (info)->contents        == 0    && \
66729 +       (info)->read_exec_only  == 1    && \
66730 +       (info)->seg_32bit       == 0    && \
66731 +       (info)->limit_in_pages  == 0    && \
66732 +       (info)->seg_not_present == 1    && \
66733 +       (info)->useable         == 0    )
66734 +
66735 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
66736 +
66737 +#if TLS_SIZE != 24
66738 +# error update this code.
66739 +#endif
66740 +
66741 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
66742 +{
66743 +#define C(i) HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), *(u64 *)&t->tls_array[i])
66744 +       C(0); C(1); C(2);
66745 +#undef C
66746 +}
66747 +
66748 +static inline void clear_LDT(void)
66749 +{
66750 +       int cpu = get_cpu();
66751 +
66752 +       /*
66753 +        * NB. We load the default_ldt for lcall7/27 handling on demand, as
66754 +        * it slows down context switching. No one uses it anyway.
66755 +        */
66756 +       cpu = cpu;              /* XXX avoid compiler warning */
66757 +       xen_set_ldt(0UL, 0);
66758 +       put_cpu();
66759 +}
66760 +
66761 +/*
66762 + * load one particular LDT into the current CPU
66763 + */
66764 +static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
66765 +{
66766 +       void *segments = pc->ldt;
66767 +       int count = pc->size;
66768 +
66769 +       if (likely(!count))
66770 +               segments = NULL;
66771 +
66772 +       xen_set_ldt((unsigned long)segments, count);
66773 +}
66774 +
66775 +static inline void load_LDT(mm_context_t *pc)
66776 +{
66777 +       int cpu = get_cpu();
66778 +       load_LDT_nolock(pc, cpu);
66779 +       put_cpu();
66780 +}
66781 +
66782 +static inline unsigned long get_desc_base(unsigned long *desc)
66783 +{
66784 +       unsigned long base;
66785 +       base = ((desc[0] >> 16)  & 0x0000ffff) |
66786 +               ((desc[1] << 16) & 0x00ff0000) |
66787 +               (desc[1] & 0xff000000);
66788 +       return base;
66789 +}
66790 +
66791 +#endif /* !__ASSEMBLY__ */
66792 +
66793 +#endif
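LDT_entry_a()/LDT_entry_b() pack a struct user_desc (from asm/ldt.h) into the low and high 32-bit words of a segment descriptor, and write_ldt_entry() then hands the pair to Xen instead of writing the GDT/LDT directly. A rough illustration of that flow (hypothetical caller, not part of the patch):

static int example_install_ldt_entry(void *ldt, int slot,
				     const struct user_desc *info)
{
	__u32 a, b;

	if (LDT_empty(info)) {
		/* The "clear this slot" convention: an all-zero descriptor. */
		a = 0;
		b = 0;
	} else {
		a = LDT_entry_a(info);	/* base[15:0] and limit[15:0] */
		b = LDT_entry_b(info);	/* base[31:16], flags, limit[19:16] */
	}
	return write_ldt_entry(ldt, slot, a, b);
}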
66794 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/dma-mapping.h new/include/asm-i386/mach-xen/asm/dma-mapping.h
66795 --- linux-2.6/include/asm-i386/mach-xen/asm/dma-mapping.h       1970-01-01 01:00:00.000000000 +0100
66796 +++ new/include/asm-i386/mach-xen/asm/dma-mapping.h     2006-06-07 13:15:16.000000000 +0200
66797 @@ -0,0 +1,154 @@
66798 +#ifndef _ASM_I386_DMA_MAPPING_H
66799 +#define _ASM_I386_DMA_MAPPING_H
66800 +
66801 +/*
66802 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
66803 + * documentation.
66804 + */
66805 +
66806 +#include <linux/config.h>
66807 +#include <linux/mm.h>
66808 +#include <asm/cache.h>
66809 +#include <asm/io.h>
66810 +#include <asm/scatterlist.h>
66811 +#include <asm/swiotlb.h>
66812 +
66813 +static inline int
66814 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
66815 +{
66816 +       dma_addr_t mask = 0xffffffff;
66817 +       /* If the device has a mask, use it, otherwise default to 32 bits */
66818 +       if (hwdev && hwdev->dma_mask)
66819 +               mask = *hwdev->dma_mask;
66820 +       return (addr & ~mask) != 0;
66821 +}
66822 +
66823 +static inline int
66824 +range_straddles_page_boundary(void *p, size_t size)
66825 +{
66826 +       extern unsigned long *contiguous_bitmap;
66827 +       return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE) &&
66828 +               !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap));
66829 +}
66830 +
66831 +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
66832 +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
66833 +
66834 +void *dma_alloc_coherent(struct device *dev, size_t size,
66835 +                          dma_addr_t *dma_handle, gfp_t flag);
66836 +
66837 +void dma_free_coherent(struct device *dev, size_t size,
66838 +                        void *vaddr, dma_addr_t dma_handle);
66839 +
66840 +extern dma_addr_t
66841 +dma_map_single(struct device *dev, void *ptr, size_t size,
66842 +              enum dma_data_direction direction);
66843 +
66844 +extern void
66845 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
66846 +                enum dma_data_direction direction);
66847 +
66848 +extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
66849 +                     int nents, enum dma_data_direction direction);
66850 +extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
66851 +                        int nents, enum dma_data_direction direction);
66852 +
66853 +extern dma_addr_t
66854 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
66855 +            size_t size, enum dma_data_direction direction);
66856 +
66857 +extern void
66858 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
66859 +              enum dma_data_direction direction);
66860 +
66861 +extern void
66862 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
66863 +                       enum dma_data_direction direction);
66864 +
66865 +extern void
66866 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
66867 +                           enum dma_data_direction direction);
66868 +
66869 +static inline void
66870 +dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
66871 +                             unsigned long offset, size_t size,
66872 +                             enum dma_data_direction direction)
66873 +{
66874 +       dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
66875 +}
66876 +
66877 +static inline void
66878 +dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
66879 +                                unsigned long offset, size_t size,
66880 +                                enum dma_data_direction direction)
66881 +{
66882 +       dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
66883 +}
66884 +
66885 +static inline void
66886 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
66887 +                   enum dma_data_direction direction)
66888 +{
66889 +       if (swiotlb)
66890 +               swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
66891 +       flush_write_buffers();
66892 +}
66893 +
66894 +static inline void
66895 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
66896 +                   enum dma_data_direction direction)
66897 +{
66898 +       if (swiotlb)
66899 +               swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
66900 +       flush_write_buffers();
66901 +}
66902 +
66903 +extern int
66904 +dma_mapping_error(dma_addr_t dma_addr);
66905 +
66906 +extern int
66907 +dma_supported(struct device *dev, u64 mask);
66908 +
66909 +static inline int
66910 +dma_set_mask(struct device *dev, u64 mask)
66911 +{
66912 +       if(!dev->dma_mask || !dma_supported(dev, mask))
66913 +               return -EIO;
66914 +
66915 +       *dev->dma_mask = mask;
66916 +
66917 +       return 0;
66918 +}
66919 +
66920 +#ifdef __i386__
66921 +static inline int
66922 +dma_get_cache_alignment(void)
66923 +{
66924 +       /* no easy way to get cache size on all x86, so return the
66925 +        * maximum possible, to be safe */
66926 +       return (1 << INTERNODE_CACHE_SHIFT);
66927 +}
66928 +#endif
66929 +
66930 +#define dma_is_consistent(d)   (1)
66931 +
66932 +static inline void
66933 +dma_cache_sync(void *vaddr, size_t size,
66934 +              enum dma_data_direction direction)
66935 +{
66936 +       flush_write_buffers();
66937 +}
66938 +
66939 +#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
66940 +extern int
66941 +dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
66942 +                           dma_addr_t device_addr, size_t size, int flags);
66943 +
66944 +extern void
66945 +dma_release_declared_memory(struct device *dev);
66946 +
66947 +extern void *
66948 +dma_mark_declared_memory_occupied(struct device *dev,
66949 +                                 dma_addr_t device_addr, size_t size);
66950 +
66951 +#endif
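The interface above is the standard i386 DMA API; what changes under Xen is the implementation (machine addresses, plus swiotlb bouncing for buffers that straddle non-machine-contiguous pages, cf. range_straddles_page_boundary()). A typical driver-side sequence looks roughly like this (hypothetical names, not part of the patch):

static int example_map_buffer(struct device *dev, void *buf, size_t len)
{
	dma_addr_t handle;

	if (dma_set_mask(dev, 0xffffffffULL))	/* device addresses 32 bits */
		return -EIO;

	handle = dma_map_single(dev, buf, len, DMA_FROM_DEVICE);
	if (dma_mapping_error(handle))
		return -EIO;

	/* ... start the device-to-memory transfer using 'handle' ... */

	dma_sync_single_for_cpu(dev, handle, len, DMA_FROM_DEVICE);
	dma_unmap_single(dev, handle, len, DMA_FROM_DEVICE);
	return 0;
}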
66952 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/fixmap.h new/include/asm-i386/mach-xen/asm/fixmap.h
66953 --- linux-2.6/include/asm-i386/mach-xen/asm/fixmap.h    1970-01-01 01:00:00.000000000 +0100
66954 +++ new/include/asm-i386/mach-xen/asm/fixmap.h  2006-05-09 12:35:17.000000000 +0200
66955 @@ -0,0 +1,156 @@
66956 +/*
66957 + * fixmap.h: compile-time virtual memory allocation
66958 + *
66959 + * This file is subject to the terms and conditions of the GNU General Public
66960 + * License.  See the file "COPYING" in the main directory of this archive
66961 + * for more details.
66962 + *
66963 + * Copyright (C) 1998 Ingo Molnar
66964 + *
66965 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
66966 + */
66967 +
66968 +#ifndef _ASM_FIXMAP_H
66969 +#define _ASM_FIXMAP_H
66970 +
66971 +#include <linux/config.h>
66972 +
66973 +/* used by vmalloc.c, vsyscall.lds.S.
66974 + *
66975 + * Leave one empty page between vmalloc'ed areas and
66976 + * the start of the fixmap.
66977 + */
66978 +extern unsigned long __FIXADDR_TOP;
66979 +
66980 +#ifndef __ASSEMBLY__
66981 +#include <linux/kernel.h>
66982 +#include <asm/acpi.h>
66983 +#include <asm/apicdef.h>
66984 +#include <asm/page.h>
66985 +#include <xen/gnttab.h>
66986 +#ifdef CONFIG_HIGHMEM
66987 +#include <linux/threads.h>
66988 +#include <asm/kmap_types.h>
66989 +#endif
66990 +
66991 +/*
66992 + * Here we define all the compile-time 'special' virtual
66993 + * addresses. The point is to have a constant address at
66994 + * compile time, but to set the physical address only
66995 + * in the boot process. We allocate these special addresses
66996 + * from the end of virtual memory (0xfffff000) backwards.
66997 + * Also this lets us do fail-safe vmalloc(), we
66998 + * can guarantee that these special addresses and
66999 + * vmalloc()-ed addresses never overlap.
67000 + *
67001 + * these 'compile-time allocated' memory buffers are
67002 + * fixed-size 4k pages. (or larger if used with an increment
67003 + * higher than 1) use fixmap_set(idx,phys) to associate
67004 + * physical memory with fixmap indices.
67005 + *
67006 + * TLB entries of such buffers will not be flushed across
67007 + * task switches.
67008 + */
67009 +enum fixed_addresses {
67010 +       FIX_HOLE,
67011 +#ifdef CONFIG_X86_LOCAL_APIC
67012 +       FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
67013 +#endif
67014 +#ifdef CONFIG_X86_IO_APIC
67015 +       FIX_IO_APIC_BASE_0,
67016 +       FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
67017 +#endif
67018 +#ifdef CONFIG_X86_VISWS_APIC
67019 +       FIX_CO_CPU,     /* Cobalt timer */
67020 +       FIX_CO_APIC,    /* Cobalt APIC Redirection Table */ 
67021 +       FIX_LI_PCIA,    /* Lithium PCI Bridge A */
67022 +       FIX_LI_PCIB,    /* Lithium PCI Bridge B */
67023 +#endif
67024 +#ifdef CONFIG_X86_F00F_BUG
67025 +       FIX_F00F_IDT,   /* Virtual mapping for IDT */
67026 +#endif
67027 +#ifdef CONFIG_X86_CYCLONE_TIMER
67028 +       FIX_CYCLONE_TIMER, /*cyclone timer register*/
67029 +#endif 
67030 +#ifdef CONFIG_HIGHMEM
67031 +       FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
67032 +       FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
67033 +#endif
67034 +#ifdef CONFIG_ACPI
67035 +       FIX_ACPI_BEGIN,
67036 +       FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
67037 +#endif
67038 +#ifdef CONFIG_PCI_MMCONFIG
67039 +       FIX_PCIE_MCFG,
67040 +#endif
67041 +       FIX_SHARED_INFO,
67042 +#define NR_FIX_ISAMAPS 256
67043 +       FIX_ISAMAP_END,
67044 +       FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
67045 +       __end_of_permanent_fixed_addresses,
67046 +       /* temporary boot-time mappings, used before ioremap() is functional */
67047 +#define NR_FIX_BTMAPS  16
67048 +       FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
67049 +       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
67050 +       FIX_WP_TEST,
67051 +       __end_of_fixed_addresses
67052 +};
67053 +
67054 +extern void __set_fixmap(enum fixed_addresses idx,
67055 +                                       maddr_t phys, pgprot_t flags);
67056 +
67057 +extern void set_fixaddr_top(unsigned long top);
67058 +
67059 +#define set_fixmap(idx, phys) \
67060 +               __set_fixmap(idx, phys, PAGE_KERNEL)
67061 +/*
67062 + * Some hardware wants to get fixmapped without caching.
67063 + */
67064 +#define set_fixmap_nocache(idx, phys) \
67065 +               __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
67066 +
67067 +#define clear_fixmap(idx) \
67068 +               __set_fixmap(idx, 0, __pgprot(0))
67069 +
67070 +#define FIXADDR_TOP    ((unsigned long)__FIXADDR_TOP)
67071 +
67072 +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
67073 +#define __FIXADDR_BOOT_SIZE    (__end_of_fixed_addresses << PAGE_SHIFT)
67074 +#define FIXADDR_START          (FIXADDR_TOP - __FIXADDR_SIZE)
67075 +#define FIXADDR_BOOT_START     (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
67076 +
67077 +#define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
67078 +#define __virt_to_fix(x)       ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
67079 +
67080 +extern void __this_fixmap_does_not_exist(void);
67081 +
67082 +/*
67083 + * 'index to address' translation. If anyone tries to use the idx
67084 + * directly without translation, we catch the bug with a NULL-dereference
67085 + * kernel oops. Illegal ranges of incoming indices are caught too.
67086 + */
67087 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
67088 +{
67089 +       /*
67090 +        * this branch gets completely eliminated after inlining,
67091 +        * except when someone tries to use fixaddr indices in an
67092 +        * illegal way. (such as mixing up address types or using
67093 +        * out-of-range indices).
67094 +        *
67095 +        * If it doesn't get removed, the linker will complain
67096 + *      loudly with a reasonably clear error message.
67097 +        */
67098 +       if (idx >= __end_of_fixed_addresses)
67099 +               __this_fixmap_does_not_exist();
67100 +
67101 +        return __fix_to_virt(idx);
67102 +}
67103 +
67104 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
67105 +{
67106 +       BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
67107 +       return __virt_to_fix(vaddr);
67108 +}
67109 +
67110 +#endif /* !__ASSEMBLY__ */
67111 +#endif
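Because fix_to_virt() is __always_inline and its argument is a compile-time constant in normal use, the bounds check folds away and callers are left with a constant virtual address. This is how a frame such as the Xen shared-info page ends up at a fixed mapping; roughly (illustrative sketch, not part of the patch):

static shared_info_t *example_map_shared_info(maddr_t shared_info_ma)
{
	/* Install the machine frame at its reserved fixmap slot ... */
	set_fixmap(FIX_SHARED_INFO, shared_info_ma);
	/* ... and hand back the compile-time-constant virtual address. */
	return (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
}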
67112 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/floppy.h new/include/asm-i386/mach-xen/asm/floppy.h
67113 --- linux-2.6/include/asm-i386/mach-xen/asm/floppy.h    1970-01-01 01:00:00.000000000 +0100
67114 +++ new/include/asm-i386/mach-xen/asm/floppy.h  2006-05-09 12:35:17.000000000 +0200
67115 @@ -0,0 +1,147 @@
67116 +/*
67117 + * Architecture specific parts of the Floppy driver
67118 + *
67119 + * This file is subject to the terms and conditions of the GNU General Public
67120 + * License.  See the file "COPYING" in the main directory of this archive
67121 + * for more details.
67122 + *
67123 + * Copyright (C) 1995
67124 + *
67125 + * Modifications for Xen are Copyright (c) 2004, Keir Fraser.
67126 + */
67127 +#ifndef __ASM_XEN_I386_FLOPPY_H
67128 +#define __ASM_XEN_I386_FLOPPY_H
67129 +
67130 +#include <linux/vmalloc.h>
67131 +
67132 +/* XEN: Hit DMA paths on the head. This trick from asm-m68k/floppy.h. */
67133 +#include <asm/dma.h>
67134 +#undef MAX_DMA_ADDRESS
67135 +#define MAX_DMA_ADDRESS 0
67136 +#define CROSS_64KB(a,s) (0)
67137 +
67138 +#define fd_inb(port)                   inb_p(port)
67139 +#define fd_outb(value,port)            outb_p(value,port)
67140 +
67141 +#define fd_request_dma()        (0)
67142 +#define fd_free_dma()           ((void)0)
67143 +#define fd_enable_irq()         enable_irq(FLOPPY_IRQ)
67144 +#define fd_disable_irq()        disable_irq(FLOPPY_IRQ)
67145 +#define fd_free_irq()          free_irq(FLOPPY_IRQ, NULL)
67146 +#define fd_get_dma_residue()    (virtual_dma_count + virtual_dma_residue)
67147 +#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io)
67148 +/*
67149 + * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from
67150 + * softirq context via motor_off_callback. A generic bug we happen to trigger.
67151 + */
67152 +#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL, get_order(size))
67153 +#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
67154 +
67155 +static int virtual_dma_count;
67156 +static int virtual_dma_residue;
67157 +static char *virtual_dma_addr;
67158 +static int virtual_dma_mode;
67159 +static int doing_pdma;
67160 +
67161 +static irqreturn_t floppy_hardint(int irq, void *dev_id, struct pt_regs * regs)
67162 +{
67163 +       register unsigned char st;
67164 +       register int lcount;
67165 +       register char *lptr;
67166 +
67167 +       if (!doing_pdma)
67168 +               return floppy_interrupt(irq, dev_id, regs);
67169 +
67170 +       st = 1;
67171 +       for(lcount=virtual_dma_count, lptr=virtual_dma_addr; 
67172 +           lcount; lcount--, lptr++) {
67173 +               st=inb(virtual_dma_port+4) & 0xa0 ;
67174 +               if(st != 0xa0) 
67175 +                       break;
67176 +               if(virtual_dma_mode)
67177 +                       outb_p(*lptr, virtual_dma_port+5);
67178 +               else
67179 +                       *lptr = inb_p(virtual_dma_port+5);
67180 +       }
67181 +       virtual_dma_count = lcount;
67182 +       virtual_dma_addr = lptr;
67183 +       st = inb(virtual_dma_port+4);
67184 +
67185 +       if(st == 0x20)
67186 +               return IRQ_HANDLED;
67187 +       if(!(st & 0x20)) {
67188 +               virtual_dma_residue += virtual_dma_count;
67189 +               virtual_dma_count=0;
67190 +               doing_pdma = 0;
67191 +               floppy_interrupt(irq, dev_id, regs);
67192 +               return IRQ_HANDLED;
67193 +       }
67194 +       return IRQ_HANDLED;
67195 +}
67196 +
67197 +static void fd_disable_dma(void)
67198 +{
67199 +       doing_pdma = 0;
67200 +       virtual_dma_residue += virtual_dma_count;
67201 +       virtual_dma_count=0;
67202 +}
67203 +
67204 +static int fd_request_irq(void)
67205 +{
67206 +       return request_irq(FLOPPY_IRQ, floppy_hardint,SA_INTERRUPT,
67207 +                                          "floppy", NULL);
67208 +}
67209 +
67210 +static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
67211 +{
67212 +       doing_pdma = 1;
67213 +       virtual_dma_port = io;
67214 +       virtual_dma_mode = (mode  == DMA_MODE_WRITE);
67215 +       virtual_dma_addr = addr;
67216 +       virtual_dma_count = size;
67217 +       virtual_dma_residue = 0;
67218 +       return 0;
67219 +}
67220 +
67221 +/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */
67222 +#define FDC1 xen_floppy_init()
67223 +static int FDC2 = -1;
67224 +
67225 +static int xen_floppy_init(void)
67226 +{
67227 +       use_virtual_dma = 1;
67228 +       can_use_virtual_dma = 1;
67229 +       return 0x3f0;
67230 +}
67231 +
67232 +/*
67233 + * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
67234 + * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
67235 + * coincides with another rtc CMOS user.               Paul G.
67236 + */
67237 +#define FLOPPY0_TYPE   ({                              \
67238 +       unsigned long flags;                            \
67239 +       unsigned char val;                              \
67240 +       spin_lock_irqsave(&rtc_lock, flags);            \
67241 +       val = (CMOS_READ(0x10) >> 4) & 15;              \
67242 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
67243 +       val;                                            \
67244 +})
67245 +
67246 +#define FLOPPY1_TYPE   ({                              \
67247 +       unsigned long flags;                            \
67248 +       unsigned char val;                              \
67249 +       spin_lock_irqsave(&rtc_lock, flags);            \
67250 +       val = CMOS_READ(0x10) & 15;                     \
67251 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
67252 +       val;                                            \
67253 +})
67254 +
67255 +#define N_FDC 2
67256 +#define N_DRIVE 8
67257 +
67258 +#define FLOPPY_MOTOR_MASK 0xf0
67259 +
67260 +#define EXTRA_FLOPPY_PARAMS
67261 +
67262 +#endif /* __ASM_XEN_I386_FLOPPY_H */
67263 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/highmem.h new/include/asm-i386/mach-xen/asm/highmem.h
67264 --- linux-2.6/include/asm-i386/mach-xen/asm/highmem.h   1970-01-01 01:00:00.000000000 +0100
67265 +++ new/include/asm-i386/mach-xen/asm/highmem.h 2006-05-09 12:35:17.000000000 +0200
67266 @@ -0,0 +1,81 @@
67267 +/*
67268 + * highmem.h: virtual kernel memory mappings for high memory
67269 + *
67270 + * Used in CONFIG_HIGHMEM systems for memory pages which
67271 + * are not addressable by direct kernel virtual addresses.
67272 + *
67273 + * Copyright (C) 1999 Gerhard Wichert, Siemens AG
67274 + *                   Gerhard.Wichert@pdb.siemens.de
67275 + *
67276 + *
67277 + * Redesigned the x86 32-bit VM architecture to deal with 
67278 + * up to 16 Terabyte physical memory. With current x86 CPUs
67279 + * we now support up to 64 Gigabytes physical RAM.
67280 + *
67281 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
67282 + */
67283 +
67284 +#ifndef _ASM_HIGHMEM_H
67285 +#define _ASM_HIGHMEM_H
67286 +
67287 +#ifdef __KERNEL__
67288 +
67289 +#include <linux/config.h>
67290 +#include <linux/interrupt.h>
67291 +#include <linux/threads.h>
67292 +#include <asm/kmap_types.h>
67293 +#include <asm/tlbflush.h>
67294 +
67295 +/* declarations for highmem.c */
67296 +extern unsigned long highstart_pfn, highend_pfn;
67297 +
67298 +extern pte_t *kmap_pte;
67299 +extern pgprot_t kmap_prot;
67300 +extern pte_t *pkmap_page_table;
67301 +
67302 +/*
67303 + * Right now we initialize only a single pte table. It can be extended
67304 + * easily, subsequent pte tables have to be allocated in one physical
67305 + * chunk of RAM.
67306 + */
67307 +#ifdef CONFIG_X86_PAE
67308 +#define LAST_PKMAP 512
67309 +#else
67310 +#define LAST_PKMAP 1024
67311 +#endif
67312 +/*
67313 + * Ordering is:
67314 + *
67315 + * FIXADDR_TOP
67316 + *                     fixed_addresses
67317 + * FIXADDR_START
67318 + *                     temp fixed addresses
67319 + * FIXADDR_BOOT_START
67320 + *                     Persistent kmap area
67321 + * PKMAP_BASE
67322 + * VMALLOC_END
67323 + *                     Vmalloc area
67324 + * VMALLOC_START
67325 + * high_memory
67326 + */
67327 +#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
67328 +#define LAST_PKMAP_MASK (LAST_PKMAP-1)
67329 +#define PKMAP_NR(virt)  ((virt-PKMAP_BASE) >> PAGE_SHIFT)
67330 +#define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
67331 +
67332 +extern void * FASTCALL(kmap_high(struct page *page));
67333 +extern void FASTCALL(kunmap_high(struct page *page));
67334 +
67335 +void *kmap(struct page *page);
67336 +void kunmap(struct page *page);
67337 +void *kmap_atomic(struct page *page, enum km_type type);
67338 +void *kmap_atomic_pte(struct page *page, enum km_type type);
67339 +void kunmap_atomic(void *kvaddr, enum km_type type);
67340 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
67341 +struct page *kmap_atomic_to_page(void *ptr);
67342 +
67343 +#define flush_cache_kmaps()    do { } while (0)
67344 +
67345 +#endif /* __KERNEL__ */
67346 +
67347 +#endif /* _ASM_HIGHMEM_H */
67348 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/hw_irq.h new/include/asm-i386/mach-xen/asm/hw_irq.h
67349 --- linux-2.6/include/asm-i386/mach-xen/asm/hw_irq.h    1970-01-01 01:00:00.000000000 +0100
67350 +++ new/include/asm-i386/mach-xen/asm/hw_irq.h  2006-06-28 14:32:14.000000000 +0200
67351 @@ -0,0 +1,77 @@
67352 +#ifndef _ASM_HW_IRQ_H
67353 +#define _ASM_HW_IRQ_H
67354 +
67355 +/*
67356 + *     linux/include/asm/hw_irq.h
67357 + *
67358 + *     (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
67359 + *
67360 + *     moved some of the old arch/i386/kernel/irq.h to here. VY
67361 + *
67362 + *     IRQ/IPI changes taken from work by Thomas Radke
67363 + *     <tomsoft@informatik.tu-chemnitz.de>
67364 + */
67365 +
67366 +#include <linux/config.h>
67367 +#include <linux/profile.h>
67368 +#include <asm/atomic.h>
67369 +#include <asm/irq.h>
67370 +#include <asm/sections.h>
67371 +
67372 +struct hw_interrupt_type;
67373 +
67374 +/*
67375 + * Various low-level irq details needed by irq.c, process.c,
67376 + * time.c, io_apic.c and smp.c
67377 + *
67378 + * Interrupt entry/exit code at both C and assembly level
67379 + */
67380 +
67381 +extern u8 irq_vector[NR_IRQ_VECTORS];
67382 +#define IO_APIC_VECTOR(irq)    (irq_vector[irq])
67383 +#define AUTO_ASSIGN            -1
67384 +
67385 +extern void (*interrupt[NR_IRQS])(void);
67386 +
67387 +#ifdef CONFIG_SMP
67388 +fastcall void reschedule_interrupt(void);
67389 +fastcall void invalidate_interrupt(void);
67390 +fastcall void call_function_interrupt(void);
67391 +#endif
67392 +
67393 +#ifdef CONFIG_X86_LOCAL_APIC
67394 +fastcall void apic_timer_interrupt(void);
67395 +fastcall void error_interrupt(void);
67396 +fastcall void spurious_interrupt(void);
67397 +fastcall void thermal_interrupt(struct pt_regs *);
67398 +#define platform_legacy_irq(irq)       ((irq) < 16)
67399 +#endif
67400 +
67401 +void disable_8259A_irq(unsigned int irq);
67402 +void enable_8259A_irq(unsigned int irq);
67403 +int i8259A_irq_pending(unsigned int irq);
67404 +void make_8259A_irq(unsigned int irq);
67405 +void init_8259A(int aeoi);
67406 +void FASTCALL(send_IPI_self(int vector));
67407 +void init_VISWS_APIC_irqs(void);
67408 +void setup_IO_APIC(void);
67409 +void disable_IO_APIC(void);
67410 +void print_IO_APIC(void);
67411 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
67412 +void send_IPI(int dest, int vector);
67413 +void setup_ioapic_dest(void);
67414 +
67415 +extern unsigned long io_apic_irqs;
67416 +
67417 +extern atomic_t irq_err_count;
67418 +extern atomic_t irq_mis_count;
67419 +
67420 +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
67421 +
67422 +extern void resend_irq_on_evtchn(struct hw_interrupt_type *h, unsigned int i);
67423 +static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
67424 +{
67425 +       resend_irq_on_evtchn(h, i);
67426 +}
67427 +
67428 +#endif /* _ASM_HW_IRQ_H */
67429 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/hypercall.h new/include/asm-i386/mach-xen/asm/hypercall.h
67430 --- linux-2.6/include/asm-i386/mach-xen/asm/hypercall.h 1970-01-01 01:00:00.000000000 +0100
67431 +++ new/include/asm-i386/mach-xen/asm/hypercall.h       2006-06-28 14:32:14.000000000 +0200
67432 @@ -0,0 +1,372 @@
67433 +/******************************************************************************
67434 + * hypercall.h
67435 + * 
67436 + * Linux-specific hypervisor handling.
67437 + * 
67438 + * Copyright (c) 2002-2004, K A Fraser
67439 + * 
67440 + * This program is free software; you can redistribute it and/or
67441 + * modify it under the terms of the GNU General Public License version 2
67442 + * as published by the Free Software Foundation; or, when distributed
67443 + * separately from the Linux kernel or incorporated into other
67444 + * software packages, subject to the following license:
67445 + * 
67446 + * Permission is hereby granted, free of charge, to any person obtaining a copy
67447 + * of this source file (the "Software"), to deal in the Software without
67448 + * restriction, including without limitation the rights to use, copy, modify,
67449 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
67450 + * and to permit persons to whom the Software is furnished to do so, subject to
67451 + * the following conditions:
67452 + * 
67453 + * The above copyright notice and this permission notice shall be included in
67454 + * all copies or substantial portions of the Software.
67455 + * 
67456 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
67457 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
67458 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
67459 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
67460 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
67461 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
67462 + * IN THE SOFTWARE.
67463 + */
67464 +
67465 +#ifndef __HYPERCALL_H__
67466 +#define __HYPERCALL_H__
67467 +
67468 +#include <linux/string.h> /* memcpy() */
67469 +
67470 +#ifndef __HYPERVISOR_H__
67471 +# error "please don't include this file directly"
67472 +#endif
67473 +
67474 +#define __STR(x) #x
67475 +#define STR(x) __STR(x)
67476 +
67477 +#define _hypercall0(type, name)                        \
67478 +({                                             \
67479 +       long __res;                             \
67480 +       asm volatile (                          \
67481 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
67482 +               : "=a" (__res)                  \
67483 +               :                               \
67484 +               : "memory" );                   \
67485 +       (type)__res;                            \
67486 +})
67487 +
67488 +#define _hypercall1(type, name, a1)                            \
67489 +({                                                             \
67490 +       long __res, __ign1;                                     \
67491 +       asm volatile (                                          \
67492 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
67493 +               : "=a" (__res), "=b" (__ign1)                   \
67494 +               : "1" ((long)(a1))                              \
67495 +               : "memory" );                                   \
67496 +       (type)__res;                                            \
67497 +})
67498 +
67499 +#define _hypercall2(type, name, a1, a2)                                \
67500 +({                                                             \
67501 +       long __res, __ign1, __ign2;                             \
67502 +       asm volatile (                                          \
67503 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
67504 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2)    \
67505 +               : "1" ((long)(a1)), "2" ((long)(a2))            \
67506 +               : "memory" );                                   \
67507 +       (type)__res;                                            \
67508 +})
67509 +
67510 +#define _hypercall3(type, name, a1, a2, a3)                    \
67511 +({                                                             \
67512 +       long __res, __ign1, __ign2, __ign3;                     \
67513 +       asm volatile (                                          \
67514 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
67515 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),   \
67516 +               "=d" (__ign3)                                   \
67517 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
67518 +               "3" ((long)(a3))                                \
67519 +               : "memory" );                                   \
67520 +       (type)__res;                                            \
67521 +})
67522 +
67523 +#define _hypercall4(type, name, a1, a2, a3, a4)                        \
67524 +({                                                             \
67525 +       long __res, __ign1, __ign2, __ign3, __ign4;             \
67526 +       asm volatile (                                          \
67527 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
67528 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),   \
67529 +               "=d" (__ign3), "=S" (__ign4)                    \
67530 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
67531 +               "3" ((long)(a3)), "4" ((long)(a4))              \
67532 +               : "memory" );                                   \
67533 +       (type)__res;                                            \
67534 +})
67535 +
67536 +#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
67537 +({                                                             \
67538 +       long __res, __ign1, __ign2, __ign3, __ign4, __ign5;     \
67539 +       asm volatile (                                          \
67540 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
67541 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),   \
67542 +               "=d" (__ign3), "=S" (__ign4), "=D" (__ign5)     \
67543 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
67544 +               "3" ((long)(a3)), "4" ((long)(a4)),             \
67545 +               "5" ((long)(a5))                                \
67546 +               : "memory" );                                   \
67547 +       (type)__res;                                            \
67548 +})
67549 +
67550 +static inline int
67551 +HYPERVISOR_set_trap_table(
67552 +       trap_info_t *table)
67553 +{
67554 +       return _hypercall1(int, set_trap_table, table);
67555 +}
67556 +
67557 +static inline int
67558 +HYPERVISOR_mmu_update(
67559 +       mmu_update_t *req, int count, int *success_count, domid_t domid)
67560 +{
67561 +       return _hypercall4(int, mmu_update, req, count, success_count, domid);
67562 +}
67563 +
67564 +static inline int
67565 +HYPERVISOR_mmuext_op(
67566 +       struct mmuext_op *op, int count, int *success_count, domid_t domid)
67567 +{
67568 +       return _hypercall4(int, mmuext_op, op, count, success_count, domid);
67569 +}
67570 +
67571 +static inline int
67572 +HYPERVISOR_set_gdt(
67573 +       unsigned long *frame_list, int entries)
67574 +{
67575 +       return _hypercall2(int, set_gdt, frame_list, entries);
67576 +}
67577 +
67578 +static inline int
67579 +HYPERVISOR_stack_switch(
67580 +       unsigned long ss, unsigned long esp)
67581 +{
67582 +       return _hypercall2(int, stack_switch, ss, esp);
67583 +}
67584 +
67585 +static inline int
67586 +HYPERVISOR_set_callbacks(
67587 +       unsigned long event_selector, unsigned long event_address,
67588 +       unsigned long failsafe_selector, unsigned long failsafe_address)
67589 +{
67590 +       return _hypercall4(int, set_callbacks,
67591 +                          event_selector, event_address,
67592 +                          failsafe_selector, failsafe_address);
67593 +}
67594 +
67595 +static inline int
67596 +HYPERVISOR_fpu_taskswitch(
67597 +       int set)
67598 +{
67599 +       return _hypercall1(int, fpu_taskswitch, set);
67600 +}
67601 +
67602 +static inline int
67603 +HYPERVISOR_sched_op_compat(
67604 +       int cmd, unsigned long arg)
67605 +{
67606 +       return _hypercall2(int, sched_op_compat, cmd, arg);
67607 +}
67608 +
67609 +static inline int
67610 +HYPERVISOR_sched_op(
67611 +       int cmd, void *arg)
67612 +{
67613 +       return _hypercall2(int, sched_op, cmd, arg);
67614 +}
67615 +
67616 +static inline long
67617 +HYPERVISOR_set_timer_op(
67618 +       u64 timeout)
67619 +{
67620 +       unsigned long timeout_hi = (unsigned long)(timeout>>32);
67621 +       unsigned long timeout_lo = (unsigned long)timeout;
67622 +       return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
67623 +}
67624 +
67625 +static inline int
67626 +HYPERVISOR_dom0_op(
67627 +       dom0_op_t *dom0_op)
67628 +{
67629 +       dom0_op->interface_version = DOM0_INTERFACE_VERSION;
67630 +       return _hypercall1(int, dom0_op, dom0_op);
67631 +}
67632 +
67633 +static inline int
67634 +HYPERVISOR_set_debugreg(
67635 +       int reg, unsigned long value)
67636 +{
67637 +       return _hypercall2(int, set_debugreg, reg, value);
67638 +}
67639 +
67640 +static inline unsigned long
67641 +HYPERVISOR_get_debugreg(
67642 +       int reg)
67643 +{
67644 +       return _hypercall1(unsigned long, get_debugreg, reg);
67645 +}
67646 +
67647 +static inline int
67648 +HYPERVISOR_update_descriptor(
67649 +       u64 ma, u64 desc)
67650 +{
67651 +       return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
67652 +}
67653 +
67654 +static inline int
67655 +HYPERVISOR_memory_op(
67656 +       unsigned int cmd, void *arg)
67657 +{
67658 +       return _hypercall2(int, memory_op, cmd, arg);
67659 +}
67660 +
67661 +static inline int
67662 +HYPERVISOR_multicall(
67663 +       void *call_list, int nr_calls)
67664 +{
67665 +       return _hypercall2(int, multicall, call_list, nr_calls);
67666 +}
67667 +
67668 +static inline int
67669 +HYPERVISOR_update_va_mapping(
67670 +       unsigned long va, pte_t new_val, unsigned long flags)
67671 +{
67672 +       unsigned long pte_hi = 0;
67673 +#ifdef CONFIG_X86_PAE
67674 +       pte_hi = new_val.pte_high;
67675 +#endif
67676 +       return _hypercall4(int, update_va_mapping, va,
67677 +                          new_val.pte_low, pte_hi, flags);
67678 +}
67679 +
67680 +static inline int
67681 +HYPERVISOR_event_channel_op(
67682 +       int cmd, void *arg)
67683 +{
67684 +       int rc = _hypercall2(int, event_channel_op, cmd, arg);
67685 +       if (unlikely(rc == -ENOSYS)) {
67686 +               struct evtchn_op op;
67687 +               op.cmd = cmd;
67688 +               memcpy(&op.u, arg, sizeof(op.u));
67689 +               rc = _hypercall1(int, event_channel_op_compat, &op);
67690 +               memcpy(arg, &op.u, sizeof(op.u));
67691 +       }
67692 +       return rc;
67693 +}
67694 +
67695 +static inline int
67696 +HYPERVISOR_acm_op(
67697 +       int cmd, void *arg)
67698 +{
67699 +       return _hypercall2(int, acm_op, cmd, arg);
67700 +}
67701 +
67702 +static inline int
67703 +HYPERVISOR_xen_version(
67704 +       int cmd, void *arg)
67705 +{
67706 +       return _hypercall2(int, xen_version, cmd, arg);
67707 +}
67708 +
67709 +static inline int
67710 +HYPERVISOR_console_io(
67711 +       int cmd, int count, char *str)
67712 +{
67713 +       return _hypercall3(int, console_io, cmd, count, str);
67714 +}
67715 +
67716 +static inline int
67717 +HYPERVISOR_physdev_op(
67718 +       int cmd, void *arg)
67719 +{
67720 +       int rc = _hypercall2(int, physdev_op, cmd, arg);
67721 +       if (unlikely(rc == -ENOSYS)) {
67722 +               struct physdev_op op;
67723 +               op.cmd = cmd;
67724 +               memcpy(&op.u, arg, sizeof(op.u));
67725 +               rc = _hypercall1(int, physdev_op_compat, &op);
67726 +               memcpy(arg, &op.u, sizeof(op.u));
67727 +       }
67728 +       return rc;
67729 +}
67730 +
67731 +static inline int
67732 +HYPERVISOR_grant_table_op(
67733 +       unsigned int cmd, void *uop, unsigned int count)
67734 +{
67735 +       return _hypercall3(int, grant_table_op, cmd, uop, count);
67736 +}
67737 +
67738 +static inline int
67739 +HYPERVISOR_update_va_mapping_otherdomain(
67740 +       unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
67741 +{
67742 +       unsigned long pte_hi = 0;
67743 +#ifdef CONFIG_X86_PAE
67744 +       pte_hi = new_val.pte_high;
67745 +#endif
67746 +       return _hypercall5(int, update_va_mapping_otherdomain, va,
67747 +                          new_val.pte_low, pte_hi, flags, domid);
67748 +}
67749 +
67750 +static inline int
67751 +HYPERVISOR_vm_assist(
67752 +       unsigned int cmd, unsigned int type)
67753 +{
67754 +       return _hypercall2(int, vm_assist, cmd, type);
67755 +}
67756 +
67757 +static inline int
67758 +HYPERVISOR_vcpu_op(
67759 +       int cmd, int vcpuid, void *extra_args)
67760 +{
67761 +       return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
67762 +}
67763 +
67764 +static inline int
67765 +HYPERVISOR_suspend(
67766 +       unsigned long srec)
67767 +{
67768 +       struct sched_shutdown sched_shutdown = {
67769 +               .reason = SHUTDOWN_suspend
67770 +       };
67771 +
67772 +       int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
67773 +                            &sched_shutdown, srec);
67774 +
67775 +       if (rc == -ENOSYS)
67776 +               rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
67777 +                                SHUTDOWN_suspend, srec);
67778 +
67779 +       return rc;
67780 +}
67781 +
67782 +static inline int
67783 +HYPERVISOR_nmi_op(
67784 +       unsigned long op, void *arg)
67785 +{
67786 +       return _hypercall2(int, nmi_op, op, arg);
67787 +}
67788 +
67789 +static inline int
67790 +HYPERVISOR_callback_op(
67791 +       int cmd, void *arg)
67792 +{
67793 +       return _hypercall2(int, callback_op, cmd, arg);
67794 +}
67795 +
67796 +static inline int
67797 +HYPERVISOR_xenoprof_op(
67798 +       int op, void *arg)
67799 +{
67800 +       return _hypercall2(int, xenoprof_op, op, arg);
67801 +}
67802 +
67803 +
67804 +#endif /* __HYPERCALL_H__ */
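Each _hypercallN() wrapper above calls into the guest's hypercall_page (32 bytes per hypercall slot), passing up to five arguments in EBX, ECX, EDX, ESI and EDI and taking the result back in EAX; the *_compat fallbacks cover hypervisors that return -ENOSYS for the newer multi-command ops. Using a wrapper is then an ordinary function call, for example (hypothetical helper, not part of the patch):

/* Write a message to the Xen console; CONSOLEIO_write comes from
 * xen/interface/xen.h. */
static void example_xen_puts(const char *msg)
{
	(void)HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), (char *)msg);
}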
67805 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/hypervisor.h new/include/asm-i386/mach-xen/asm/hypervisor.h
67806 --- linux-2.6/include/asm-i386/mach-xen/asm/hypervisor.h        1970-01-01 01:00:00.000000000 +0100
67807 +++ new/include/asm-i386/mach-xen/asm/hypervisor.h      2006-05-23 18:42:17.000000000 +0200
67808 @@ -0,0 +1,226 @@
67809 +/******************************************************************************
67810 + * hypervisor.h
67811 + * 
67812 + * Linux-specific hypervisor handling.
67813 + * 
67814 + * Copyright (c) 2002-2004, K A Fraser
67815 + * 
67816 + * This program is free software; you can redistribute it and/or
67817 + * modify it under the terms of the GNU General Public License version 2
67818 + * as published by the Free Software Foundation; or, when distributed
67819 + * separately from the Linux kernel or incorporated into other
67820 + * software packages, subject to the following license:
67821 + * 
67822 + * Permission is hereby granted, free of charge, to any person obtaining a copy
67823 + * of this source file (the "Software"), to deal in the Software without
67824 + * restriction, including without limitation the rights to use, copy, modify,
67825 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
67826 + * and to permit persons to whom the Software is furnished to do so, subject to
67827 + * the following conditions:
67828 + * 
67829 + * The above copyright notice and this permission notice shall be included in
67830 + * all copies or substantial portions of the Software.
67831 + * 
67832 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
67833 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
67834 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
67835 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
67836 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
67837 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
67838 + * IN THE SOFTWARE.
67839 + */
67840 +
67841 +#ifndef __HYPERVISOR_H__
67842 +#define __HYPERVISOR_H__
67843 +
67844 +#include <linux/config.h>
67845 +#include <linux/types.h>
67846 +#include <linux/kernel.h>
67847 +#include <linux/version.h>
67848 +#include <linux/errno.h>
67849 +#include <xen/interface/xen.h>
67850 +#include <xen/interface/dom0_ops.h>
67851 +#include <xen/interface/event_channel.h>
67852 +#include <xen/interface/physdev.h>
67853 +#include <xen/interface/sched.h>
67854 +#include <xen/interface/nmi.h>
67855 +#include <asm/ptrace.h>
67856 +#include <asm/page.h>
67857 +#if defined(__i386__)
67858 +#  ifdef CONFIG_X86_PAE
67859 +#   include <asm-generic/pgtable-nopud.h>
67860 +#  else
67861 +#   include <asm-generic/pgtable-nopmd.h>
67862 +#  endif
67863 +#endif
67864 +
67865 +extern shared_info_t *HYPERVISOR_shared_info;
67866 +
67867 +/* arch/xen/i386/kernel/setup.c */
67868 +extern start_info_t *xen_start_info;
67869 +
67870 +/* arch/xen/kernel/evtchn.c */
67871 +/* Force a proper event-channel callback from Xen. */
67872 +void force_evtchn_callback(void);
67873 +
67874 +/* arch/xen/kernel/process.c */
67875 +void xen_cpu_idle (void);
67876 +
67877 +/* arch/xen/i386/kernel/hypervisor.c */
67878 +void do_hypervisor_callback(struct pt_regs *regs);
67879 +
67880 +/* arch/xen/i386/mm/hypervisor.c */
67881 +/*
67882 + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should be already
67883 + * be MACHINE addresses.
67884 + */
67885 +
67886 +void xen_pt_switch(unsigned long ptr);
67887 +void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
67888 +void xen_load_gs(unsigned int selector); /* x86_64 only */
67889 +void xen_tlb_flush(void);
67890 +void xen_invlpg(unsigned long ptr);
67891 +
67892 +void xen_l1_entry_update(pte_t *ptr, pte_t val);
67893 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
67894 +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
67895 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
67896 +void xen_pgd_pin(unsigned long ptr);
67897 +void xen_pgd_unpin(unsigned long ptr);
67898 +
67899 +void xen_set_ldt(unsigned long ptr, unsigned long bytes);
67900 +void xen_machphys_update(unsigned long mfn, unsigned long pfn);
67901 +
67902 +#ifdef CONFIG_SMP
67903 +#include <linux/cpumask.h>
67904 +void xen_tlb_flush_all(void);
67905 +void xen_invlpg_all(unsigned long ptr);
67906 +void xen_tlb_flush_mask(cpumask_t *mask);
67907 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr);
67908 +#endif
67909 +
67910 +/* Returns zero on success else negative errno. */
67911 +int xen_create_contiguous_region(
67912 +    unsigned long vstart, unsigned int order, unsigned int address_bits);
67913 +void xen_destroy_contiguous_region(
67914 +    unsigned long vstart, unsigned int order);
67915 +
67916 +/* Turn jiffies into Xen system time. */
67917 +u64 jiffies_to_st(unsigned long jiffies);
67918 +
67919 +#include <asm/hypercall.h>
67920 +
67921 +#if defined(CONFIG_X86_64)
67922 +#define MULTI_UVMFLAGS_INDEX 2
67923 +#define MULTI_UVMDOMID_INDEX 3
67924 +#else
67925 +#define MULTI_UVMFLAGS_INDEX 3
67926 +#define MULTI_UVMDOMID_INDEX 4
67927 +#endif
67928 +
67929 +#define is_running_on_xen() 1
67930 +
67931 +static inline int
67932 +HYPERVISOR_yield(
67933 +       void)
67934 +{
67935 +       int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
67936 +
67937 +       if (rc == -ENOSYS)
67938 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
67939 +
67940 +       return rc;
67941 +}
67942 +
67943 +static inline int
67944 +HYPERVISOR_block(
67945 +       void)
67946 +{
67947 +       int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
67948 +
67949 +       if (rc == -ENOSYS)
67950 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
67951 +
67952 +       return rc;
67953 +}
67954 +
67955 +static inline int
67956 +HYPERVISOR_shutdown(
67957 +       unsigned int reason)
67958 +{
67959 +       struct sched_shutdown sched_shutdown = {
67960 +               .reason = reason
67961 +       };
67962 +
67963 +       int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
67964 +
67965 +       if (rc == -ENOSYS)
67966 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason);
67967 +
67968 +       return rc;
67969 +}
67970 +
67971 +static inline int
67972 +HYPERVISOR_poll(
67973 +       evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
67974 +{
67975 +       int rc;
67976 +       struct sched_poll sched_poll = {
67977 +               .nr_ports = nr_ports,
67978 +               .timeout = jiffies_to_st(timeout)
67979 +       };
67980 +       set_xen_guest_handle(sched_poll.ports, ports);
67981 +
67982 +       rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
67983 +       if (rc == -ENOSYS)
67984 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
67985 +
67986 +       return rc;
67987 +}
67988 +
67989 +static inline void
67990 +MULTI_update_va_mapping(
67991 +    multicall_entry_t *mcl, unsigned long va,
67992 +    pte_t new_val, unsigned long flags)
67993 +{
67994 +    mcl->op = __HYPERVISOR_update_va_mapping;
67995 +    mcl->args[0] = va;
67996 +#if defined(CONFIG_X86_64)
67997 +    mcl->args[1] = new_val.pte;
67998 +    mcl->args[2] = flags;
67999 +#elif defined(CONFIG_X86_PAE)
68000 +    mcl->args[1] = new_val.pte_low;
68001 +    mcl->args[2] = new_val.pte_high;
68002 +    mcl->args[3] = flags;
68003 +#else
68004 +    mcl->args[1] = new_val.pte_low;
68005 +    mcl->args[2] = 0;
68006 +    mcl->args[3] = flags;
68007 +#endif
68008 +}
68009 +
68010 +static inline void
68011 +MULTI_update_va_mapping_otherdomain(
68012 +    multicall_entry_t *mcl, unsigned long va,
68013 +    pte_t new_val, unsigned long flags, domid_t domid)
68014 +{
68015 +    mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
68016 +    mcl->args[0] = va;
68017 +#if defined(CONFIG_X86_64)
68018 +    mcl->args[1] = new_val.pte;
68019 +    mcl->args[2] = flags;
68020 +    mcl->args[3] = domid;
68021 +#elif defined(CONFIG_X86_PAE)
68022 +    mcl->args[1] = new_val.pte_low;
68023 +    mcl->args[2] = new_val.pte_high;
68024 +    mcl->args[3] = flags;
68025 +    mcl->args[4] = domid;
68026 +#else
68027 +    mcl->args[1] = new_val.pte_low;
68028 +    mcl->args[2] = 0;
68029 +    mcl->args[3] = flags;
68030 +    mcl->args[4] = domid;
68031 +#endif
68032 +}
68033 +
68034 +#endif /* __HYPERVISOR_H__ */
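MULTI_update_va_mapping() only fills in one multicall_entry_t; nothing reaches the hypervisor until the batch is passed to HYPERVISOR_multicall(). A two-entry batch looks roughly like this (hypothetical helper, not part of the patch; UVMF_INVLPG is from xen/interface/xen.h):

static void example_remap_two(unsigned long va0, pte_t pte0,
			      unsigned long va1, pte_t pte1)
{
	multicall_entry_t mcl[2];

	MULTI_update_va_mapping(&mcl[0], va0, pte0, UVMF_INVLPG);
	MULTI_update_va_mapping(&mcl[1], va1, pte1, UVMF_INVLPG);

	/* One hypervisor entry for both PTE updates plus their TLB flushes. */
	if (unlikely(HYPERVISOR_multicall(mcl, 2) < 0))
		BUG();
}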
68035 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/io.h new/include/asm-i386/mach-xen/asm/io.h
68036 --- linux-2.6/include/asm-i386/mach-xen/asm/io.h        1970-01-01 01:00:00.000000000 +0100
68037 +++ new/include/asm-i386/mach-xen/asm/io.h      2006-05-09 12:35:17.000000000 +0200
68038 @@ -0,0 +1,390 @@
68039 +#ifndef _ASM_IO_H
68040 +#define _ASM_IO_H
68041 +
68042 +#include <linux/config.h>
68043 +#include <linux/string.h>
68044 +#include <linux/compiler.h>
68045 +
68046 +/*
68047 + * This file contains the definitions for the x86 IO instructions
68048 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
68049 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
68050 + * versions of the single-IO instructions (inb_p/inw_p/..).
68051 + *
68052 + * This file is not meant to be obfuscating: it's just complicated
68053 + * to (a) handle it all in a way that makes gcc able to optimize it
68054 + * as well as possible and (b) trying to avoid writing the same thing
68055 + * over and over again with slight variations and possibly making a
68056 + * mistake somewhere.
68057 + */
68058 +
68059 +/*
68060 + * Thanks to James van Artsdalen for a better timing-fix than
68061 + * the two short jumps: using outb's to a nonexistent port seems
68062 + * to guarantee better timings even on fast machines.
68063 + *
68064 + * On the other hand, I'd like to be sure of a non-existent port:
68065 + * I feel a bit unsafe about using 0x80 (should be safe, though)
68066 + *
68067 + *             Linus
68068 + */
68069 +
68070 + /*
68071 +  *  Bit simplified and optimized by Jan Hubicka
68072 +  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
68073 +  *
68074 +  *  isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
68075 +  *  isa_read[wl] and isa_write[wl] fixed
68076 +  *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
68077 +  */
68078 +
68079 +#define IO_SPACE_LIMIT 0xffff
68080 +
68081 +#define XQUAD_PORTIO_BASE 0xfe400000
68082 +#define XQUAD_PORTIO_QUAD 0x40000  /* 256k per quad. */
68083 +
68084 +#ifdef __KERNEL__
68085 +
68086 +#include <asm-generic/iomap.h>
68087 +
68088 +#include <linux/vmalloc.h>
68089 +#include <asm/fixmap.h>
68090 +
68091 +/*
68092 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
68093 + * access
68094 + */
68095 +#define xlate_dev_mem_ptr(p)   __va(p)
68096 +
68097 +/*
68098 + * Convert a virtual cached pointer to an uncached pointer
68099 + */
68100 +#define xlate_dev_kmem_ptr(p)  p
68101 +
68102 +/**
68103 + *     virt_to_phys    -       map virtual addresses to physical
68104 + *     @address: address to remap
68105 + *
68106 + *     The returned physical address is the physical (CPU) mapping for
68107 + *     the memory address given. It is only valid to use this function on
68108 + *     addresses directly mapped or allocated via kmalloc. 
68109 + *
68110 + *     This function does not give bus mappings for DMA transfers. In
68111 + *     almost all conceivable cases a device driver should not be using
68112 + *     this function
68113 + */
68114 +
68115 +static inline unsigned long virt_to_phys(volatile void * address)
68116 +{
68117 +       return __pa(address);
68118 +}
68119 +
68120 +/**
68121 + *     phys_to_virt    -       map physical address to virtual
68122 + *     @address: address to remap
68123 + *
68124 + *     The returned virtual address is a current CPU mapping for
68125 + *     the memory address given. It is only valid to use this function on
68126 + *     addresses that have a kernel mapping
68127 + *
68128 + *     This function does not handle bus mappings for DMA transfers. In
68129 + *     almost all conceivable cases a device driver should not be using
68130 + *     this function
68131 + */
68132 +
68133 +static inline void * phys_to_virt(unsigned long address)
68134 +{
68135 +       return __va(address);
68136 +}
68137 +
68138 +/*
68139 + * Change "struct page" to physical address.
68140 + */
68141 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
68142 +#define page_to_phys(page)      (phys_to_machine(page_to_pseudophys(page)))
68143 +#define page_to_bus(page)       (phys_to_machine(page_to_pseudophys(page)))
68144 +
68145 +#define bio_to_pseudophys(bio)  (page_to_pseudophys(bio_page((bio))) + \
68146 +                                 (unsigned long) bio_offset((bio)))
68147 +#define bvec_to_pseudophys(bv)  (page_to_pseudophys((bv)->bv_page) + \
68148 +                                 (unsigned long) (bv)->bv_offset)
68149 +
68150 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)      \
68151 +       (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
68152 +        ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
68153 +         bvec_to_pseudophys((vec2))))
68154 +
68155 +extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
68156 +
68157 +/**
68158 + * ioremap     -   map bus memory into CPU space
68159 + * @offset:    bus address of the memory
68160 + * @size:      size of the resource to map
68161 + *
68162 + * ioremap performs a platform specific sequence of operations to
68163 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
68164 + * writew/writel functions and the other mmio helpers. The returned
68165 + * address is not guaranteed to be usable directly as a virtual
68166 + * address. 
68167 + */
68168 +
68169 +static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
68170 +{
68171 +       return __ioremap(offset, size, 0);
68172 +}
68173 +
68174 +extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
68175 +extern void iounmap(volatile void __iomem *addr);
68176 +
68177 +/*
68178 + * bt_ioremap() and bt_iounmap() are for temporary early boot-time
68179 + * mappings, before the real ioremap() is functional.
68180 + * A boot-time mapping is currently limited to at most 16 pages.
68181 + */
68182 +extern void *bt_ioremap(unsigned long offset, unsigned long size);
68183 +extern void bt_iounmap(void *addr, unsigned long size);
68184 +
68185 +/* Use early IO mappings for DMI because it's initialized early */
68186 +#define dmi_ioremap bt_ioremap
68187 +#define dmi_iounmap bt_iounmap
68188 +#define dmi_alloc alloc_bootmem
68189 +
68190 +/*
68191 + * ISA I/O bus memory addresses are 1:1 with the physical address.
68192 + */
68193 +#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x
68194 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
68195 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
68196 +
68197 +/*
68198 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
68199 + * are forbidden in portable PCI drivers.
68200 + *
68201 + * Allow them on x86 for legacy drivers, though.
68202 + */
68203 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
68204 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
68205 +
68206 +/*
68207 + * readX/writeX() are used to access memory mapped devices. On some
68208 + * architectures the memory mapped IO stuff needs to be accessed
68209 + * differently. On the x86 architecture, we just read/write the
68210 + * memory location directly.
68211 + */
68212 +
68213 +static inline unsigned char readb(const volatile void __iomem *addr)
68214 +{
68215 +       return *(volatile unsigned char __force *) addr;
68216 +}
68217 +static inline unsigned short readw(const volatile void __iomem *addr)
68218 +{
68219 +       return *(volatile unsigned short __force *) addr;
68220 +}
68221 +static inline unsigned int readl(const volatile void __iomem *addr)
68222 +{
68223 +       return *(volatile unsigned int __force *) addr;
68224 +}
68225 +#define readb_relaxed(addr) readb(addr)
68226 +#define readw_relaxed(addr) readw(addr)
68227 +#define readl_relaxed(addr) readl(addr)
68228 +#define __raw_readb readb
68229 +#define __raw_readw readw
68230 +#define __raw_readl readl
68231 +
68232 +static inline void writeb(unsigned char b, volatile void __iomem *addr)
68233 +{
68234 +       *(volatile unsigned char __force *) addr = b;
68235 +}
68236 +static inline void writew(unsigned short b, volatile void __iomem *addr)
68237 +{
68238 +       *(volatile unsigned short __force *) addr = b;
68239 +}
68240 +static inline void writel(unsigned int b, volatile void __iomem *addr)
68241 +{
68242 +       *(volatile unsigned int __force *) addr = b;
68243 +}
68244 +#define __raw_writeb writeb
68245 +#define __raw_writew writew
68246 +#define __raw_writel writel
68247 +
68248 +#define mmiowb()
68249 +
68250 +static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count)
68251 +{
68252 +       memset((void __force *) addr, val, count);
68253 +}
68254 +static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
68255 +{
68256 +       __memcpy(dst, (void __force *) src, count);
68257 +}
68258 +static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
68259 +{
68260 +       __memcpy((void __force *) dst, src, count);
68261 +}
68262 +
68263 +/*
68264 + * ISA space is 'always mapped' on a typical x86 system, no need to
68265 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
68266 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
68267 + * are physical addresses. The following constant pointer can be
68268 + * used as the IO-area pointer (it can be iounmapped as well, so the
68269 + * analogy with PCI is quite large):
68270 + */
68271 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
68272 +
68273 +/*
68274 + * Again, i386 does not require a mem-IO-specific function.
68275 + */
68276 +
68277 +#define eth_io_copy_and_sum(a,b,c,d)           eth_copy_and_sum((a),(void __force *)(b),(c),(d))
68278 +
68279 +/**
68280 + *     check_signature         -       find BIOS signatures
68281 + *     @io_addr: mmio address to check 
68282 + *     @signature:  signature block
68283 + *     @length: length of signature
68284 + *
68285 + *     Perform a signature comparison with the mmio address io_addr. This
68286 + *     address should have been obtained by ioremap.
68287 + *     Returns 1 on a match.
68288 + */
68289 +
68290 +static inline int check_signature(volatile void __iomem * io_addr,
68291 +       const unsigned char *signature, int length)
68292 +{
68293 +       int retval = 0;
68294 +       do {
68295 +               if (readb(io_addr) != *signature)
68296 +                       goto out;
68297 +               io_addr++;
68298 +               signature++;
68299 +               length--;
68300 +       } while (length);
68301 +       retval = 1;
68302 +out:
68303 +       return retval;
68304 +}
68305 +
68306 +/*
68307 + *     Cache management
68308 + *
68309 + *     This is needed for two cases
68310 + *     1. Out of order aware processors
68311 + *     2. Accidentally out of order processors (PPro errata #51)
68312 + */
68313 +
68314 +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
68315 +
68316 +static inline void flush_write_buffers(void)
68317 +{
68318 +       __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
68319 +}
68320 +
68321 +#define dma_cache_inv(_start,_size)            flush_write_buffers()
68322 +#define dma_cache_wback(_start,_size)          flush_write_buffers()
68323 +#define dma_cache_wback_inv(_start,_size)      flush_write_buffers()
68324 +
68325 +#else
68326 +
68327 +/* Nothing to do */
68328 +
68329 +#define dma_cache_inv(_start,_size)            do { } while (0)
68330 +#define dma_cache_wback(_start,_size)          do { } while (0)
68331 +#define dma_cache_wback_inv(_start,_size)      do { } while (0)
68332 +#define flush_write_buffers()
68333 +
68334 +#endif
68335 +
68336 +#endif /* __KERNEL__ */
68337 +
68338 +#ifdef SLOW_IO_BY_JUMPING
68339 +#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
68340 +#else
68341 +#define __SLOW_DOWN_IO "outb %%al,$0x80;"
68342 +#endif
68343 +
68344 +static inline void slow_down_io(void) {
68345 +       __asm__ __volatile__(
68346 +               __SLOW_DOWN_IO
68347 +#ifdef REALLY_SLOW_IO
68348 +               __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
68349 +#endif
68350 +               : : );
68351 +}
68352 +
68353 +#ifdef CONFIG_X86_NUMAQ
68354 +extern void *xquad_portio;    /* Where the IO area was mapped */
68355 +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
68356 +#define __BUILDIO(bwl,bw,type) \
68357 +static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
68358 +       if (xquad_portio) \
68359 +               write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
68360 +       else \
68361 +               out##bwl##_local(value, port); \
68362 +} \
68363 +static inline void out##bwl(unsigned type value, int port) { \
68364 +       out##bwl##_quad(value, port, 0); \
68365 +} \
68366 +static inline unsigned type in##bwl##_quad(int port, int quad) { \
68367 +       if (xquad_portio) \
68368 +               return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
68369 +       else \
68370 +               return in##bwl##_local(port); \
68371 +} \
68372 +static inline unsigned type in##bwl(int port) { \
68373 +       return in##bwl##_quad(port, 0); \
68374 +}
68375 +#else
68376 +#define __BUILDIO(bwl,bw,type) \
68377 +static inline void out##bwl(unsigned type value, int port) { \
68378 +       out##bwl##_local(value, port); \
68379 +} \
68380 +static inline unsigned type in##bwl(int port) { \
68381 +       return in##bwl##_local(port); \
68382 +}
68383 +#endif
68384 +
68385 +
68386 +#define BUILDIO(bwl,bw,type) \
68387 +static inline void out##bwl##_local(unsigned type value, int port) { \
68388 +       __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \
68389 +} \
68390 +static inline unsigned type in##bwl##_local(int port) { \
68391 +       unsigned type value; \
68392 +       __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \
68393 +       return value; \
68394 +} \
68395 +static inline void out##bwl##_local_p(unsigned type value, int port) { \
68396 +       out##bwl##_local(value, port); \
68397 +       slow_down_io(); \
68398 +} \
68399 +static inline unsigned type in##bwl##_local_p(int port) { \
68400 +       unsigned type value = in##bwl##_local(port); \
68401 +       slow_down_io(); \
68402 +       return value; \
68403 +} \
68404 +__BUILDIO(bwl,bw,type) \
68405 +static inline void out##bwl##_p(unsigned type value, int port) { \
68406 +       out##bwl(value, port); \
68407 +       slow_down_io(); \
68408 +} \
68409 +static inline unsigned type in##bwl##_p(int port) { \
68410 +       unsigned type value = in##bwl(port); \
68411 +       slow_down_io(); \
68412 +       return value; \
68413 +} \
68414 +static inline void outs##bwl(int port, const void *addr, unsigned long count) { \
68415 +       __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \
68416 +} \
68417 +static inline void ins##bwl(int port, void *addr, unsigned long count) { \
68418 +       __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \
68419 +}
68420 +
68421 +BUILDIO(b,b,char)
68422 +BUILDIO(w,w,short)
68423 +BUILDIO(l,,int)
68424 +
68425 +/* We will be supplying our own /dev/mem implementation */
68426 +#define ARCH_HAS_DEV_MEM
68427 +
68428 +#endif
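The io.h replacement above routes page_to_phys(), virt_to_bus() and friends through phys_to_machine(), so bus/DMA addresses are machine frames rather than the guest's pseudo-physical frames. A minimal stand-alone sketch of that frame translation, assuming a toy p2m table (toy_p2m, TOY_PAGE_SHIFT and every other name here are invented for illustration and are not part of the patch):

    #include <stdio.h>

    #define TOY_PAGE_SHIFT 12
    #define TOY_PAGE_SIZE  (1UL << TOY_PAGE_SHIFT)
    #define TOY_PAGE_MASK  (~(TOY_PAGE_SIZE - 1))

    /* Hypothetical p2m table: pseudo-physical frame number -> machine frame number. */
    static const unsigned long toy_p2m[4] = { 0x1a3, 0x007, 0x2ff, 0x050 };

    /* Mirrors the shape of phys_to_machine(): translate the frame, keep the offset. */
    static unsigned long toy_phys_to_machine(unsigned long phys)
    {
            unsigned long mfn = toy_p2m[phys >> TOY_PAGE_SHIFT];
            return (mfn << TOY_PAGE_SHIFT) | (phys & ~TOY_PAGE_MASK);
    }

    int main(void)
    {
            unsigned long pseudo_phys = (2UL << TOY_PAGE_SHIFT) + 0x123;
            printf("pseudo-phys %#lx -> machine %#lx\n",
                   pseudo_phys, toy_phys_to_machine(pseudo_phys));
            return 0;
    }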
68429 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/kmap_types.h new/include/asm-i386/mach-xen/asm/kmap_types.h
68430 --- linux-2.6/include/asm-i386/mach-xen/asm/kmap_types.h        1970-01-01 01:00:00.000000000 +0100
68431 +++ new/include/asm-i386/mach-xen/asm/kmap_types.h      2006-05-09 12:35:17.000000000 +0200
68432 @@ -0,0 +1,32 @@
68433 +#ifndef _ASM_KMAP_TYPES_H
68434 +#define _ASM_KMAP_TYPES_H
68435 +
68436 +#include <linux/config.h>
68437 +
68438 +#ifdef CONFIG_DEBUG_HIGHMEM
68439 +# define D(n) __KM_FENCE_##n ,
68440 +#else
68441 +# define D(n)
68442 +#endif
68443 +
68444 +enum km_type {
68445 +D(0)   KM_BOUNCE_READ,
68446 +D(1)   KM_SKB_SUNRPC_DATA,
68447 +D(2)   KM_SKB_DATA_SOFTIRQ,
68448 +D(3)   KM_USER0,
68449 +D(4)   KM_USER1,
68450 +D(5)   KM_BIO_SRC_IRQ,
68451 +D(6)   KM_BIO_DST_IRQ,
68452 +D(7)   KM_PTE0,
68453 +D(8)   KM_PTE1,
68454 +D(9)   KM_IRQ0,
68455 +D(10)  KM_IRQ1,
68456 +D(11)  KM_SOFTIRQ0,
68457 +D(12)  KM_SOFTIRQ1,
68458 +D(13)  KM_SWIOTLB,
68459 +D(14)  KM_TYPE_NR
68460 +};
68461 +
68462 +#undef D
68463 +
68464 +#endif
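With CONFIG_DEBUG_HIGHMEM the D(n) macro above interleaves __KM_FENCE_n markers between the real slots, so every KM_* constant shifts and unused guard entries sit between the atomic-kmap types. A hedged sketch of how the expansion changes the values (TOY_* names and the debug switch are purely illustrative):

    #include <stdio.h>

    /* Toy version of the D(n) trick: flip TOY_DEBUG to 0 to see the values collapse. */
    #define TOY_DEBUG 1

    #if TOY_DEBUG
    # define D(n) TOY_FENCE_##n,
    #else
    # define D(n)
    #endif

    enum toy_km_type {
            D(0) TOY_KM_USER0,
            D(1) TOY_KM_USER1,
            D(2) TOY_KM_TYPE_NR
    };

    int main(void)
    {
            /* With TOY_DEBUG: 1, 3, 5; without: 0, 1, 2. */
            printf("KM_USER0=%d KM_USER1=%d KM_TYPE_NR=%d\n",
                   TOY_KM_USER0, TOY_KM_USER1, TOY_KM_TYPE_NR);
            return 0;
    }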
68465 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/mmu_context.h new/include/asm-i386/mach-xen/asm/mmu_context.h
68466 --- linux-2.6/include/asm-i386/mach-xen/asm/mmu_context.h       1970-01-01 01:00:00.000000000 +0100
68467 +++ new/include/asm-i386/mach-xen/asm/mmu_context.h     2006-06-28 14:32:14.000000000 +0200
68468 @@ -0,0 +1,108 @@
68469 +#ifndef __I386_SCHED_H
68470 +#define __I386_SCHED_H
68471 +
68472 +#include <linux/config.h>
68473 +#include <asm/desc.h>
68474 +#include <asm/atomic.h>
68475 +#include <asm/pgalloc.h>
68476 +#include <asm/tlbflush.h>
68477 +
68478 +/*
68479 + * Used for LDT copy/destruction.
68480 + */
68481 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
68482 +void destroy_context(struct mm_struct *mm);
68483 +
68484 +
68485 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
68486 +{
68487 +#if 0 /* XEN: no lazy tlb */
68488 +       unsigned cpu = smp_processor_id();
68489 +       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
68490 +               per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
68491 +#endif
68492 +}
68493 +
68494 +#define prepare_arch_switch(next)      __prepare_arch_switch()
68495 +
68496 +static inline void __prepare_arch_switch(void)
68497 +{
68498 +       /*
68499 +        * Save away %fs and %gs. No need to save %es and %ds, as those
68500 +        * are always kernel segments while inside the kernel. Must
68501 +        * happen before reload of cr3/ldt (i.e., not in __switch_to).
68502 +        */
68503 +       asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
68504 +               : "=m" (current->thread.fs),
68505 +                 "=m" (current->thread.gs));
68506 +       asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
68507 +               : : "r" (0) );
68508 +}
68509 +
68510 +extern void mm_pin(struct mm_struct *mm);
68511 +extern void mm_unpin(struct mm_struct *mm);
68512 +void mm_pin_all(void);
68513 +
68514 +static inline void switch_mm(struct mm_struct *prev,
68515 +                            struct mm_struct *next,
68516 +                            struct task_struct *tsk)
68517 +{
68518 +       int cpu = smp_processor_id();
68519 +       struct mmuext_op _op[2], *op = _op;
68520 +
68521 +       if (likely(prev != next)) {
68522 +               BUG_ON(!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags));
68523 +
68524 +               /* stop flush ipis for the previous mm */
68525 +               cpu_clear(cpu, prev->cpu_vm_mask);
68526 +#if 0 /* XEN: no lazy tlb */
68527 +               per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
68528 +               per_cpu(cpu_tlbstate, cpu).active_mm = next;
68529 +#endif
68530 +               cpu_set(cpu, next->cpu_vm_mask);
68531 +
68532 +               /* Re-load page tables: load_cr3(next->pgd) */
68533 +               op->cmd = MMUEXT_NEW_BASEPTR;
68534 +               op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
68535 +               op++;
68536 +
68537 +               /*
68538 +                * load the LDT, if the LDT is different:
68539 +                */
68540 +               if (unlikely(prev->context.ldt != next->context.ldt)) {
68541 +                       /* load_LDT_nolock(&next->context, cpu) */
68542 +                       op->cmd = MMUEXT_SET_LDT;
68543 +                       op->arg1.linear_addr = (unsigned long)next->context.ldt;
68544 +                       op->arg2.nr_ents     = next->context.size;
68545 +                       op++;
68546 +               }
68547 +
68548 +               BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
68549 +       }
68550 +#if 0 /* XEN: no lazy tlb */
68551 +       else {
68552 +               per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
68553 +               BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
68554 +
68555 +               if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
68556 +                       /* We were in lazy tlb mode and leave_mm disabled 
68557 +                        * tlb flush IPI delivery. We must reload %cr3.
68558 +                        */
68559 +                       load_cr3(next->pgd);
68560 +                       load_LDT_nolock(&next->context, cpu);
68561 +               }
68562 +       }
68563 +#endif
68564 +}
68565 +
68566 +#define deactivate_mm(tsk, mm) \
68567 +       asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
68568 +
68569 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
68570 +{
68571 +       if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
68572 +               mm_pin(next);
68573 +       switch_mm(prev, next, NULL);
68574 +}
68575 +
68576 +#endif
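switch_mm() above batches the new-baseptr request and, when the LDT differs, the set-LDT request into one _op[2] array and issues a single HYPERVISOR_mmuext_op() hypercall instead of trapping into Xen twice. A self-contained sketch of that batch-then-submit pattern (toy_op, toy_submit and the constants are stand-ins, not the hypercall API):

    #include <stdio.h>

    enum toy_cmd { TOY_NEW_BASEPTR, TOY_SET_LDT };

    struct toy_op {
            enum toy_cmd cmd;
            unsigned long arg;
    };

    /* Stand-in for the hypercall; the patch uses HYPERVISOR_mmuext_op() here. */
    static int toy_submit(const struct toy_op *ops, int count)
    {
            for (int i = 0; i < count; i++)
                    printf("op %d: cmd=%d arg=%#lx\n", i, ops[i].cmd, ops[i].arg);
            return 0;
    }

    int main(void)
    {
            struct toy_op batch[2], *op = batch;
            int ldt_changed = 1;

            op->cmd = TOY_NEW_BASEPTR;   /* always reload the page-table base */
            op->arg = 0x1a3000;
            op++;

            if (ldt_changed) {           /* only queue the LDT op when it differs */
                    op->cmd = TOY_SET_LDT;
                    op->arg = 0xb0000000;
                    op++;
            }

            return toy_submit(batch, op - batch);  /* one submission for the whole batch */
    }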
68577 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/mmu.h new/include/asm-i386/mach-xen/asm/mmu.h
68578 --- linux-2.6/include/asm-i386/mach-xen/asm/mmu.h       1970-01-01 01:00:00.000000000 +0100
68579 +++ new/include/asm-i386/mach-xen/asm/mmu.h     2006-06-28 14:32:14.000000000 +0200
68580 @@ -0,0 +1,25 @@
68581 +#ifndef __i386_MMU_H
68582 +#define __i386_MMU_H
68583 +
68584 +#include <asm/semaphore.h>
68585 +/*
68586 + * The i386 doesn't have a mmu context, but
68587 + * we put the segment information here.
68588 + *
68589 + * cpu_vm_mask is used to optimize ldt flushing.
68590 + */
68591 +typedef struct { 
68592 +       int size;
68593 +       struct semaphore sem;
68594 +       void *ldt;
68595 +} mm_context_t;
68596 +
68597 +/* mm/memory.c:exit_mmap hook */
68598 +extern void _arch_exit_mmap(struct mm_struct *mm);
68599 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
68600 +
68601 +/* kernel/fork.c:dup_mmap hook */
68602 +extern void _arch_dup_mmap(struct mm_struct *mm);
68603 +#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
68604 +
68605 +#endif
68606 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/page.h new/include/asm-i386/mach-xen/asm/page.h
68607 --- linux-2.6/include/asm-i386/mach-xen/asm/page.h      1970-01-01 01:00:00.000000000 +0100
68608 +++ new/include/asm-i386/mach-xen/asm/page.h    2006-06-28 14:32:14.000000000 +0200
68609 @@ -0,0 +1,336 @@
68610 +#ifndef _I386_PAGE_H
68611 +#define _I386_PAGE_H
68612 +
68613 +/* PAGE_SHIFT determines the page size */
68614 +#define PAGE_SHIFT     12
68615 +#define PAGE_SIZE      (1UL << PAGE_SHIFT)
68616 +#define PAGE_MASK      (~(PAGE_SIZE-1))
68617 +
68618 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
68619 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
68620 +
68621 +#ifdef __KERNEL__
68622 +#ifndef __ASSEMBLY__
68623 +
68624 +#include <linux/config.h>
68625 +#include <linux/string.h>
68626 +#include <linux/types.h>
68627 +#include <linux/kernel.h>
68628 +#include <asm/bug.h>
68629 +#include <xen/interface/xen.h>
68630 +#include <xen/features.h>
68631 +#include <xen/foreign_page.h>
68632 +
68633 +#define arch_free_page(_page,_order)                   \
68634 +({     int foreign = PageForeign(_page);               \
68635 +       if (foreign)                                    \
68636 +               (PageForeignDestructor(_page))(_page);  \
68637 +       foreign;                                        \
68638 +})
68639 +#define HAVE_ARCH_FREE_PAGE
68640 +
68641 +#ifdef CONFIG_XEN_SCRUB_PAGES
68642 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
68643 +#else
68644 +#define scrub_pages(_p,_n) ((void)0)
68645 +#endif
68646 +
68647 +#ifdef CONFIG_X86_USE_3DNOW
68648 +
68649 +#include <asm/mmx.h>
68650 +
68651 +#define clear_page(page)       mmx_clear_page((void *)(page))
68652 +#define copy_page(to,from)     mmx_copy_page(to,from)
68653 +
68654 +#else
68655 +
68656 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
68657 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
68658 +
68659 +/*
68660 + *     On older X86 processors it's not a win to use MMX here, it seems.
68661 + *     Maybe the K6-III?
68662 + */
68663 +
68664 +#define clear_page(page)       memset((void *)(page), 0, PAGE_SIZE)
68665 +#define copy_page(to,from)     memcpy((void *)(to), (void *)(from), PAGE_SIZE)
68666 +
68667 +#endif
68668 +
68669 +#define clear_user_page(page, vaddr, pg)       clear_page(page)
68670 +#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
68671 +
68672 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
68673 +#define INVALID_P2M_ENTRY      (~0UL)
68674 +#define FOREIGN_FRAME_BIT      (1UL<<31)
68675 +#define FOREIGN_FRAME(m)       ((m) | FOREIGN_FRAME_BIT)
68676 +
68677 +extern unsigned long *phys_to_machine_mapping;
68678 +
68679 +#undef machine_to_phys_mapping
68680 +extern unsigned long *machine_to_phys_mapping;
68681 +extern unsigned int   machine_to_phys_order;
68682 +
68683 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
68684 +{
68685 +       if (xen_feature(XENFEAT_auto_translated_physmap))
68686 +               return pfn;
68687 +       return phys_to_machine_mapping[(unsigned int)(pfn)] &
68688 +               ~FOREIGN_FRAME_BIT;
68689 +}
68690 +
68691 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
68692 +{
68693 +       if (xen_feature(XENFEAT_auto_translated_physmap))
68694 +               return 1;
68695 +       return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
68696 +}
68697 +
68698 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
68699 +{
68700 +       extern unsigned long max_mapnr;
68701 +       unsigned long pfn;
68702 +
68703 +       if (xen_feature(XENFEAT_auto_translated_physmap))
68704 +               return mfn;
68705 +
68706 +       if (unlikely((mfn >> machine_to_phys_order) != 0))
68707 +               return max_mapnr;
68708 +
68709 +       /* The array access can fail (e.g., device space beyond end of RAM). */
68710 +       asm (
68711 +               "1:     movl %1,%0\n"
68712 +               "2:\n"
68713 +               ".section .fixup,\"ax\"\n"
68714 +               "3:     movl %2,%0\n"
68715 +               "       jmp  2b\n"
68716 +               ".previous\n"
68717 +               ".section __ex_table,\"a\"\n"
68718 +               "       .align 4\n"
68719 +               "       .long 1b,3b\n"
68720 +               ".previous"
68721 +               : "=r" (pfn)
68722 +               : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
68723 +
68724 +       return pfn;
68725 +}
68726 +
68727 +/*
68728 + * We detect special mappings in one of two ways:
68729 + *  1. If the MFN is an I/O page then Xen will set the m2p entry
68730 + *     to be outside our maximum possible pseudophys range.
68731 + *  2. If the MFN belongs to a different domain then we will certainly
68732 + *     not have MFN in our p2m table. Conversely, if the page is ours,
68733 + *     then we'll have p2m(m2p(MFN))==MFN.
68734 + * If we detect a special mapping then it doesn't have a 'struct page'.
68735 + * We force !pfn_valid() by returning an out-of-range pointer.
68736 + *
68737 + * NB. These checks require that, for any MFN that is not in our reservation,
68738 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
68739 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
68740 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
68741 + *
68742 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
68743 + *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
68744 + *      require. In all the cases we care about, the FOREIGN_FRAME bit is
68745 + *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
68746 + */
68747 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
68748 +{
68749 +       extern unsigned long max_mapnr;
68750 +       unsigned long pfn = mfn_to_pfn(mfn);
68751 +       if ((pfn < max_mapnr)
68752 +           && !xen_feature(XENFEAT_auto_translated_physmap)
68753 +           && (phys_to_machine_mapping[pfn] != mfn))
68754 +               return max_mapnr; /* force !pfn_valid() */
68755 +       return pfn;
68756 +}
68757 +
68758 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
68759 +{
68760 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
68761 +               BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
68762 +               return;
68763 +       }
68764 +       phys_to_machine_mapping[pfn] = mfn;
68765 +}
68766 +
68767 +/* Definitions for machine and pseudophysical addresses. */
68768 +#ifdef CONFIG_X86_PAE
68769 +typedef unsigned long long paddr_t;
68770 +typedef unsigned long long maddr_t;
68771 +#else
68772 +typedef unsigned long paddr_t;
68773 +typedef unsigned long maddr_t;
68774 +#endif
68775 +
68776 +static inline maddr_t phys_to_machine(paddr_t phys)
68777 +{
68778 +       maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
68779 +       machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
68780 +       return machine;
68781 +}
68782 +static inline paddr_t machine_to_phys(maddr_t machine)
68783 +{
68784 +       paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
68785 +       phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
68786 +       return phys;
68787 +}
68788 +
68789 +/*
68790 + * These are used to make use of C type-checking..
68791 + */
68792 +extern int nx_enabled;
68793 +#ifdef CONFIG_X86_PAE
68794 +extern unsigned long long __supported_pte_mask;
68795 +typedef struct { unsigned long pte_low, pte_high; } pte_t;
68796 +typedef struct { unsigned long long pmd; } pmd_t;
68797 +typedef struct { unsigned long long pgd; } pgd_t;
68798 +typedef struct { unsigned long long pgprot; } pgprot_t;
68799 +#define __pte(x) ({ unsigned long long _x = (x);        \
68800 +    if (_x & 1) _x = phys_to_machine(_x);               \
68801 +    ((pte_t) {(unsigned long)(_x), (unsigned long)(_x>>32)}); })
68802 +#define __pgd(x) ({ unsigned long long _x = (x); \
68803 +    (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); })
68804 +#define __pmd(x) ({ unsigned long long _x = (x); \
68805 +    (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); })
68806 +static inline unsigned long long pte_val(pte_t x)
68807 +{
68808 +       unsigned long long ret;
68809 +
68810 +       if (x.pte_low) {
68811 +               ret = x.pte_low | (unsigned long long)x.pte_high << 32;
68812 +               ret = machine_to_phys(ret) | 1;
68813 +       } else {
68814 +               ret = 0;
68815 +       }
68816 +       return ret;
68817 +}
68818 +static inline unsigned long long pmd_val(pmd_t x)
68819 +{
68820 +       unsigned long long ret = x.pmd;
68821 +       if (ret) ret = machine_to_phys(ret) | 1;
68822 +       return ret;
68823 +}
68824 +static inline unsigned long long pgd_val(pgd_t x)
68825 +{
68826 +       unsigned long long ret = x.pgd;
68827 +       if (ret) ret = machine_to_phys(ret) | 1;
68828 +       return ret;
68829 +}
68830 +static inline unsigned long long pte_val_ma(pte_t x)
68831 +{
68832 +       return (unsigned long long)x.pte_high << 32 | x.pte_low;
68833 +}
68834 +#define HPAGE_SHIFT    21
68835 +#else
68836 +typedef struct { unsigned long pte_low; } pte_t;
68837 +typedef struct { unsigned long pgd; } pgd_t;
68838 +typedef struct { unsigned long pgprot; } pgprot_t;
68839 +#define boot_pte_t pte_t /* or would you rather have a typedef */
68840 +#define pte_val(x)     (((x).pte_low & 1) ? machine_to_phys((x).pte_low) : \
68841 +                        (x).pte_low)
68842 +#define pte_val_ma(x)  ((x).pte_low)
68843 +#define __pte(x) ({ unsigned long _x = (x); \
68844 +    (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); })
68845 +#define __pgd(x) ({ unsigned long _x = (x); \
68846 +    (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); })
68847 +static inline unsigned long pgd_val(pgd_t x)
68848 +{
68849 +       unsigned long ret = x.pgd;
68850 +       if (ret) ret = machine_to_phys(ret) | 1;
68851 +       return ret;
68852 +}
68853 +#define HPAGE_SHIFT    22
68854 +#endif
68855 +#define PTE_MASK       PAGE_MASK
68856 +
68857 +#ifdef CONFIG_HUGETLB_PAGE
68858 +#define HPAGE_SIZE     ((1UL) << HPAGE_SHIFT)
68859 +#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
68860 +#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
68861 +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
68862 +#endif
68863 +
68864 +#define pgprot_val(x)  ((x).pgprot)
68865 +
68866 +#define __pte_ma(x)    ((pte_t) { (x) } )
68867 +#define __pgprot(x)    ((pgprot_t) { (x) } )
68868 +
68869 +#endif /* !__ASSEMBLY__ */
68870 +
68871 +/* to align the pointer to the (next) page boundary */
68872 +#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
68873 +
68874 +/*
68875 + * This handles the memory map.. We could make this a config
68876 + * option, but too many people screw it up, and too few need
68877 + * it.
68878 + *
68879 + * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
68880 + * a virtual address space of one gigabyte, which limits the
68881 + * amount of physical memory you can use to about 950MB. 
68882 + *
68883 + * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
68884 + * and CONFIG_HIGHMEM64G options in the kernel configuration.
68885 + */
68886 +
68887 +#ifndef __ASSEMBLY__
68888 +
68889 +/*
68890 + * This much address space is reserved for vmalloc() and iomap()
68891 + * as well as fixmap mappings.
68892 + */
68893 +extern unsigned int __VMALLOC_RESERVE;
68894 +
68895 +extern int sysctl_legacy_va_layout;
68896 +
68897 +extern int page_is_ram(unsigned long pagenr);
68898 +
68899 +#endif /* __ASSEMBLY__ */
68900 +
68901 +#ifdef __ASSEMBLY__
68902 +#define __PAGE_OFFSET          CONFIG_PAGE_OFFSET
68903 +#define __PHYSICAL_START       CONFIG_PHYSICAL_START
68904 +#else
68905 +#define __PAGE_OFFSET          ((unsigned long)CONFIG_PAGE_OFFSET)
68906 +#define __PHYSICAL_START       ((unsigned long)CONFIG_PHYSICAL_START)
68907 +#endif
68908 +#define __KERNEL_START         (__PAGE_OFFSET + __PHYSICAL_START)
68909 +
68910 +#ifdef CONFIG_XEN_COMPAT_030002
68911 +#undef LOAD_OFFSET
68912 +#define LOAD_OFFSET            0
68913 +#endif /* CONFIG_XEN_COMPAT_030002 */
68914 +
68915 +#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
68916 +#define VMALLOC_RESERVE                ((unsigned long)__VMALLOC_RESERVE)
68917 +#define MAXMEM                 (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
68918 +#define __pa(x)                        ((unsigned long)(x)-PAGE_OFFSET)
68919 +#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
68920 +#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
68921 +#ifdef CONFIG_FLATMEM
68922 +#define pfn_valid(pfn)         ((pfn) < max_mapnr)
68923 +#endif /* CONFIG_FLATMEM */
68924 +#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
68925 +
68926 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
68927 +
68928 +#define VM_DATA_DEFAULT_FLAGS \
68929 +       (VM_READ | VM_WRITE | \
68930 +       ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
68931 +                VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
68932 +
68933 +/* VIRT <-> MACHINE conversion */
68934 +#define virt_to_machine(v)     (phys_to_machine(__pa(v)))
68935 +#define virt_to_mfn(v)         (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
68936 +#define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
68937 +
68938 +#define __HAVE_ARCH_GATE_AREA 1
68939 +
68940 +#endif /* __KERNEL__ */
68941 +
68942 +#include <asm-generic/memory_model.h>
68943 +#include <asm-generic/page.h>
68944 +
68945 +#endif /* _I386_PAGE_H */
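mfn_to_local_pfn() above relies on the round trip p2m(m2p(MFN)) == MFN to decide whether a machine frame really belongs to this guest; when the back-translation does not match, the returned pfn is forced out of range so pfn_valid() fails. A toy model of that check, assuming invented tables and limits (toy_p2m, toy_m2p, TOY_MAX_PFN are not from the patch):

    #include <stdio.h>

    #define TOY_MAX_PFN 4UL   /* pretend the guest owns 4 frames */

    static const unsigned long toy_p2m[TOY_MAX_PFN] = { 0x1a3, 0x007, 0x2ff, 0x050 };

    /* Hypothetical m2p view: machine frame -> pfn, possibly stale or foreign. */
    static unsigned long toy_m2p(unsigned long mfn)
    {
            for (unsigned long pfn = 0; pfn < TOY_MAX_PFN; pfn++)
                    if (toy_p2m[pfn] == mfn)
                            return pfn;
            return 1;  /* stale entry: a pfn whose p2m does not map back to mfn */
    }

    static unsigned long toy_mfn_to_local_pfn(unsigned long mfn)
    {
            unsigned long pfn = toy_m2p(mfn);
            if (pfn < TOY_MAX_PFN && toy_p2m[pfn] != mfn)
                    return TOY_MAX_PFN;   /* force !pfn_valid() */
            return pfn;
    }

    int main(void)
    {
            printf("own frame 0x2ff -> pfn %lu\n", toy_mfn_to_local_pfn(0x2ff));
            printf("foreign frame 0x999 -> pfn %lu (>= max, so invalid)\n",
                   toy_mfn_to_local_pfn(0x999));
            return 0;
    }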
68946 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/param.h new/include/asm-i386/mach-xen/asm/param.h
68947 --- linux-2.6/include/asm-i386/mach-xen/asm/param.h     1970-01-01 01:00:00.000000000 +0100
68948 +++ new/include/asm-i386/mach-xen/asm/param.h   2006-05-09 12:35:17.000000000 +0200
68949 @@ -0,0 +1,24 @@
68950 +#ifndef _ASMi386_PARAM_H
68951 +#define _ASMi386_PARAM_H
68952 +
68953 +#ifdef __KERNEL__
68954 +# include <linux/config.h>
68955 +# define HZ            CONFIG_HZ       /* Internal kernel timer frequency */
68956 +# define USER_HZ       100             /* .. some user interfaces are in "ticks" */
68957 +# define CLOCKS_PER_SEC                (USER_HZ)       /* like times() */
68958 +#endif
68959 +
68960 +#ifndef HZ
68961 +#define HZ 100
68962 +#endif
68963 +
68964 +#define EXEC_PAGESIZE  4096
68965 +
68966 +#ifndef NOGROUP
68967 +#define NOGROUP                (-1)
68968 +#endif
68969 +
68970 +#define MAXHOSTNAMELEN 64      /* max length of hostname */
68971 +#define COMMAND_LINE_SIZE 256
68972 +
68973 +#endif
68974 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/pci.h new/include/asm-i386/mach-xen/asm/pci.h
68975 --- linux-2.6/include/asm-i386/mach-xen/asm/pci.h       1970-01-01 01:00:00.000000000 +0100
68976 +++ new/include/asm-i386/mach-xen/asm/pci.h     2006-05-09 12:35:17.000000000 +0200
68977 @@ -0,0 +1,154 @@
68978 +#ifndef __i386_PCI_H
68979 +#define __i386_PCI_H
68980 +
68981 +#include <linux/config.h>
68982 +
68983 +#ifdef __KERNEL__
68984 +#include <linux/mm.h>          /* for struct page */
68985 +
68986 +/* Can be used to override the logic in pci_scan_bus for skipping
68987 +   already-configured bus numbers - to be used for buggy BIOSes
68988 +   or architectures with incomplete PCI setup by the loader */
68989 +
68990 +#ifdef CONFIG_PCI
68991 +extern unsigned int pcibios_assign_all_busses(void);
68992 +#else
68993 +#define pcibios_assign_all_busses()    0
68994 +#endif
68995 +#define pcibios_scan_all_fns(a, b)     0
68996 +
68997 +extern unsigned long pci_mem_start;
68998 +#define PCIBIOS_MIN_IO         0x1000
68999 +#define PCIBIOS_MIN_MEM                (pci_mem_start)
69000 +
69001 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
69002 +
69003 +void pcibios_config_init(void);
69004 +struct pci_bus * pcibios_scan_root(int bus);
69005 +
69006 +void pcibios_set_master(struct pci_dev *dev);
69007 +void pcibios_penalize_isa_irq(int irq, int active);
69008 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
69009 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
69010 +
69011 +/* Dynamic DMA mapping stuff.
69012 + * i386 has everything mapped statically.
69013 + */
69014 +
69015 +#include <linux/types.h>
69016 +#include <linux/slab.h>
69017 +#include <asm/scatterlist.h>
69018 +#include <linux/string.h>
69019 +#include <asm/io.h>
69020 +
69021 +struct pci_dev;
69022 +
69023 +#ifdef CONFIG_SWIOTLB
69024 +
69025 +
69026 +/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */
69027 +#define PCI_DMA_BUS_IS_PHYS    (0)
69028 +
69029 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
69030 +       dma_addr_t ADDR_NAME;
69031 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)                \
69032 +       __u32 LEN_NAME;
69033 +#define pci_unmap_addr(PTR, ADDR_NAME)                 \
69034 +       ((PTR)->ADDR_NAME)
69035 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)                \
69036 +       (((PTR)->ADDR_NAME) = (VAL))
69037 +#define pci_unmap_len(PTR, LEN_NAME)                   \
69038 +       ((PTR)->LEN_NAME)
69039 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
69040 +       (((PTR)->LEN_NAME) = (VAL))
69041 +
69042 +#else
69043 +
69044 +/* The PCI address space does equal the physical memory
69045 + * address space.  The networking and block device layers use
69046 + * this boolean for bounce buffer decisions.
69047 + */
69048 +#define PCI_DMA_BUS_IS_PHYS    (1)
69049 +
69050 +/* pci_unmap_{page,single} is a nop so... */
69051 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
69052 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
69053 +#define pci_unmap_addr(PTR, ADDR_NAME)         (0)
69054 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)        do { } while (0)
69055 +#define pci_unmap_len(PTR, LEN_NAME)           (0)
69056 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)  do { } while (0)
69057 +
69058 +#endif
69059 +
69060 +/* This is always fine. */
69061 +#define pci_dac_dma_supported(pci_dev, mask)   (1)
69062 +
69063 +static inline dma64_addr_t
69064 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
69065 +{
69066 +       return ((dma64_addr_t) page_to_phys(page) +
69067 +               (dma64_addr_t) offset);
69068 +}
69069 +
69070 +static inline struct page *
69071 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
69072 +{
69073 +       return pfn_to_page(dma_addr >> PAGE_SHIFT);
69074 +}
69075 +
69076 +static inline unsigned long
69077 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
69078 +{
69079 +       return (dma_addr & ~PAGE_MASK);
69080 +}
69081 +
69082 +static inline void
69083 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
69084 +{
69085 +}
69086 +
69087 +static inline void
69088 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
69089 +{
69090 +       flush_write_buffers();
69091 +}
69092 +
69093 +#define HAVE_PCI_MMAP
69094 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
69095 +                              enum pci_mmap_state mmap_state, int write_combine);
69096 +
69097 +
69098 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
69099 +{
69100 +}
69101 +
69102 +#ifdef CONFIG_PCI
69103 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
69104 +                                       enum pci_dma_burst_strategy *strat,
69105 +                                       unsigned long *strategy_parameter)
69106 +{
69107 +       *strat = PCI_DMA_BURST_INFINITY;
69108 +       *strategy_parameter = ~0UL;
69109 +}
69110 +#endif
69111 +
69112 +#endif /* __KERNEL__ */
69113 +
69114 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
69115 +#include <xen/pcifront.h>
69116 +#endif /* CONFIG_XEN_PCIDEV_FRONTEND */
69117 +
69118 +/* implement the pci_ DMA API in terms of the generic device dma_ one */
69119 +#include <asm-generic/pci-dma-compat.h>
69120 +
69121 +/* generic pci stuff */
69122 +#include <asm-generic/pci.h>
69123 +
69124 +/* On Xen we have to scan all functions since Xen hides bridges from
69125 + * us.  If a bridge is at fn=0 and that slot has a multifunction
69126 + * device, we won't find the additional devices without scanning all
69127 + * functions. */
69128 +#undef pcibios_scan_all_fns
69129 +#define pcibios_scan_all_fns(a, b)     1
69130 +
69131 +#endif /* __i386_PCI_H */
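The DECLARE_PCI_UNMAP_ADDR()/pci_unmap_addr() macros above let a driver carry the DMA handle only when unmapping actually needs it (the SWIOTLB case here); in the 1:1 case they compile away and the descriptor stays smaller. A compile-time sketch of that pattern under an assumed TOY_NEED_UNMAP switch (all names are illustrative):

    #include <stdio.h>

    #define TOY_NEED_UNMAP 1   /* flip to 0 to see the field and the stores vanish */

    #if TOY_NEED_UNMAP
    # define TOY_DECLARE_UNMAP_ADDR(name)   unsigned long name;
    # define toy_unmap_addr_set(p, name, v) ((p)->name = (v))
    # define toy_unmap_addr(p, name)        ((p)->name)
    #else
    # define TOY_DECLARE_UNMAP_ADDR(name)
    # define toy_unmap_addr_set(p, name, v) do { } while (0)
    # define toy_unmap_addr(p, name)        (0UL)
    #endif

    struct toy_rx_desc {
            void *buf;
            TOY_DECLARE_UNMAP_ADDR(dma)
    };

    int main(void)
    {
            struct toy_rx_desc d = { .buf = 0 };
            toy_unmap_addr_set(&d, dma, 0xdeadb000UL);
            printf("sizeof(desc)=%zu dma=%#lx\n", sizeof(d), toy_unmap_addr(&d, dma));
            return 0;
    }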
69132 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/pgalloc.h new/include/asm-i386/mach-xen/asm/pgalloc.h
69133 --- linux-2.6/include/asm-i386/mach-xen/asm/pgalloc.h   1970-01-01 01:00:00.000000000 +0100
69134 +++ new/include/asm-i386/mach-xen/asm/pgalloc.h 2006-05-09 12:35:17.000000000 +0200
69135 @@ -0,0 +1,64 @@
69136 +#ifndef _I386_PGALLOC_H
69137 +#define _I386_PGALLOC_H
69138 +
69139 +#include <linux/config.h>
69140 +#include <asm/fixmap.h>
69141 +#include <linux/threads.h>
69142 +#include <linux/mm.h>          /* for struct page */
69143 +#include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
69144 +
69145 +/* Is this pagetable pinned? */
69146 +#define PG_pinned      PG_arch_1
69147 +
69148 +#define pmd_populate_kernel(mm, pmd, pte) \
69149 +               set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
69150 +
69151 +#define pmd_populate(mm, pmd, pte)                                     \
69152 +do {                                                                   \
69153 +       if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) {     \
69154 +               if (!PageHighMem(pte))                                  \
69155 +                       BUG_ON(HYPERVISOR_update_va_mapping(            \
69156 +                         (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\
69157 +                         pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));\
69158 +               set_pmd(pmd, __pmd(_PAGE_TABLE +                        \
69159 +                       ((unsigned long long)page_to_pfn(pte) <<        \
69160 +                               (unsigned long long) PAGE_SHIFT)));     \
69161 +       } else {                                                        \
69162 +               *(pmd) = __pmd(_PAGE_TABLE +                            \
69163 +                       ((unsigned long long)page_to_pfn(pte) <<        \
69164 +                               (unsigned long long) PAGE_SHIFT));      \
69165 +       }                                                               \
69166 +} while (0)
69167 +
69168 +/*
69169 + * Allocate and free page tables.
69170 + */
69171 +extern pgd_t *pgd_alloc(struct mm_struct *);
69172 +extern void pgd_free(pgd_t *pgd);
69173 +
69174 +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
69175 +extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
69176 +
69177 +static inline void pte_free_kernel(pte_t *pte)
69178 +{
69179 +       free_page((unsigned long)pte);
69180 +       make_page_writable(pte, XENFEAT_writable_page_tables);
69181 +}
69182 +
69183 +extern void pte_free(struct page *pte);
69184 +
69185 +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
69186 +
69187 +#ifdef CONFIG_X86_PAE
69188 +/*
69189 + * In the PAE case we free the pmds as part of the pgd.
69190 + */
69191 +#define pmd_alloc_one(mm, addr)                ({ BUG(); ((pmd_t *)2); })
69192 +#define pmd_free(x)                    do { } while (0)
69193 +#define __pmd_free_tlb(tlb,x)          do { } while (0)
69194 +#define pud_populate(mm, pmd, pte)     BUG()
69195 +#endif
69196 +
69197 +#define check_pgt_cache()      do { } while (0)
69198 +
69199 +#endif /* _I386_PGALLOC_H */
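pmd_populate() above has to respect the pinned-pagetable rule: once a pgd carries PG_pinned, a pte page linked beneath it must first be remapped read-only (the HYPERVISOR_update_va_mapping call) before the pmd entry is written; unpinned page tables take the plain store. A small sketch of that ordering decision, with invented stand-ins rather than the real hypercall API:

    #include <stdio.h>
    #include <stdbool.h>

    struct toy_pte_page { bool readonly; };
    struct toy_mm      { bool pgd_pinned; unsigned long pmd_entry; };

    static void toy_make_readonly(struct toy_pte_page *pte)
    {
            pte->readonly = true;            /* stands in for the update_va_mapping call */
    }

    static void toy_pmd_populate(struct toy_mm *mm, struct toy_pte_page *pte,
                                 unsigned long pte_frame)
    {
            if (mm->pgd_pinned)
                    toy_make_readonly(pte);  /* must happen before the entry is linked */
            mm->pmd_entry = pte_frame | 0x1; /* "present" bit, as in _PAGE_TABLE */
    }

    int main(void)
    {
            struct toy_mm mm = { .pgd_pinned = true };
            struct toy_pte_page pte = { .readonly = false };

            toy_pmd_populate(&mm, &pte, 0x2ff000);
            printf("pte readonly=%d pmd entry=%#lx\n", pte.readonly, mm.pmd_entry);
            return 0;
    }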
69200 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h new/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h
69201 --- linux-2.6/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h       1970-01-01 01:00:00.000000000 +0100
69202 +++ new/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h     2006-05-09 12:35:17.000000000 +0200
69203 @@ -0,0 +1,21 @@
69204 +#ifndef _I386_PGTABLE_2LEVEL_DEFS_H
69205 +#define _I386_PGTABLE_2LEVEL_DEFS_H
69206 +
69207 +#define HAVE_SHARED_KERNEL_PMD 0
69208 +
69209 +/*
69210 + * traditional i386 two-level paging structure:
69211 + */
69212 +
69213 +#define PGDIR_SHIFT    22
69214 +#define PTRS_PER_PGD   1024
69215 +#define PTRS_PER_PGD_NO_HV     (HYPERVISOR_VIRT_START >> PGDIR_SHIFT)
69216 +
69217 +/*
69218 + * the i386 is two-level, so we don't really have any
69219 + * PMD directory physically.
69220 + */
69221 +
69222 +#define PTRS_PER_PTE   1024
69223 +
69224 +#endif /* _I386_PGTABLE_2LEVEL_DEFS_H */
69225 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/pgtable-2level.h new/include/asm-i386/mach-xen/asm/pgtable-2level.h
69226 --- linux-2.6/include/asm-i386/mach-xen/asm/pgtable-2level.h    1970-01-01 01:00:00.000000000 +0100
69227 +++ new/include/asm-i386/mach-xen/asm/pgtable-2level.h  2006-05-23 18:37:11.000000000 +0200
69228 @@ -0,0 +1,88 @@
69229 +#ifndef _I386_PGTABLE_2LEVEL_H
69230 +#define _I386_PGTABLE_2LEVEL_H
69231 +
69232 +#include <asm-generic/pgtable-nopmd.h>
69233 +
69234 +#define pte_ERROR(e) \
69235 +       printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
69236 +#define pgd_ERROR(e) \
69237 +       printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
69238 +
69239 +/*
69240 + * Certain architectures need to do special things when PTEs
69241 + * within a page table are directly modified.  Thus, the following
69242 + * hook is made available.
69243 + */
69244 +#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
69245 +
69246 +#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
69247 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
69248 +           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
69249 +               set_pte((ptep), (pteval));                              \
69250 +} while (0)
69251 +
69252 +#define set_pte_at_sync(_mm,addr,ptep,pteval) do {                     \
69253 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
69254 +           HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
69255 +               set_pte((ptep), (pteval));                              \
69256 +               xen_invlpg((addr));                                     \
69257 +       }                                                               \
69258 +} while (0)
69259 +
69260 +#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
69261 +
69262 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
69263 +
69264 +#define pte_clear(mm,addr,xp)  do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
69265 +#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
69266 +
69267 +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte_low, 0))
69268 +#define pte_same(a, b)         ((a).pte_low == (b).pte_low)
69269 +#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
69270 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
69271 +
69272 +#define pte_page(_pte) pfn_to_page(pte_pfn(_pte))
69273 +
69274 +#define pte_none(x)            (!(x).pte_low)
69275 +#define pfn_pte(pfn, prot)     __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
69276 +#define pfn_pte_ma(pfn, prot)  __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
69277 +#define pfn_pmd(pfn, prot)     __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
69278 +
69279 +/*
69280 + * All present user pages are user-executable:
69281 + */
69282 +static inline int pte_exec(pte_t pte)
69283 +{
69284 +       return pte_user(pte);
69285 +}
69286 +
69287 +/*
69288 + * All present pages are kernel-executable:
69289 + */
69290 +static inline int pte_exec_kernel(pte_t pte)
69291 +{
69292 +       return 1;
69293 +}
69294 +
69295 +/*
69296 + * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
69297 + * into this range:
69298 + */
69299 +#define PTE_FILE_MAX_BITS      29
69300 +
69301 +#define pte_to_pgoff(pte) \
69302 +       ((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
69303 +
69304 +#define pgoff_to_pte(off) \
69305 +       ((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
69306 +
69307 +/* Encode and de-code a swap entry */
69308 +#define __swp_type(x)                  (((x).val >> 1) & 0x1f)
69309 +#define __swp_offset(x)                        ((x).val >> 8)
69310 +#define __swp_entry(type, offset)      ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
69311 +#define __pte_to_swp_entry(pte)                ((swp_entry_t) { (pte).pte_low })
69312 +#define __swp_entry_to_pte(x)          ((pte_t) { (x).val })
69313 +
69314 +void vmalloc_sync_all(void);
69315 +
69316 +#endif /* _I386_PGTABLE_2LEVEL_H */
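The swap-entry macros above pack a 5-bit type at bit 1 and the offset from bit 8 upward, leaving bits 0, 6 and 7 clear (the bits the file-pte comment above says are taken). A quick round-trip check of that encoding, as plain C independent of the kernel headers:

    #include <stdio.h>

    /* Same layout as __swp_entry()/__swp_type()/__swp_offset() in the 2-level case. */
    static unsigned long toy_swp_entry(unsigned long type, unsigned long offset)
    {
            return (type << 1) | (offset << 8);
    }

    static unsigned long toy_swp_type(unsigned long val)   { return (val >> 1) & 0x1f; }
    static unsigned long toy_swp_offset(unsigned long val) { return val >> 8; }

    int main(void)
    {
            unsigned long e = toy_swp_entry(3, 0x12345);
            printf("type=%lu offset=%#lx reserved bits: bit0=%lu bit6=%lu bit7=%lu\n",
                   toy_swp_type(e), toy_swp_offset(e),
                   e & 1, (e >> 6) & 1, (e >> 7) & 1);
            return 0;
    }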
69317 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h new/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h
69318 --- linux-2.6/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h       1970-01-01 01:00:00.000000000 +0100
69319 +++ new/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h     2006-05-09 12:35:17.000000000 +0200
69320 @@ -0,0 +1,25 @@
69321 +#ifndef _I386_PGTABLE_3LEVEL_DEFS_H
69322 +#define _I386_PGTABLE_3LEVEL_DEFS_H
69323 +
69324 +#define HAVE_SHARED_KERNEL_PMD 0
69325 +
69326 +/*
69327 + * PGDIR_SHIFT determines what a top-level page table entry can map
69328 + */
69329 +#define PGDIR_SHIFT    30
69330 +#define PTRS_PER_PGD   4
69331 +#define PTRS_PER_PGD_NO_HV 4
69332 +
69333 +/*
69334 + * PMD_SHIFT determines the size of the area a middle-level
69335 + * page table can map
69336 + */
69337 +#define PMD_SHIFT      21
69338 +#define PTRS_PER_PMD   512
69339 +
69340 +/*
69341 + * entries per page directory level
69342 + */
69343 +#define PTRS_PER_PTE   512
69344 +
69345 +#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */
69346 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/pgtable-3level.h new/include/asm-i386/mach-xen/asm/pgtable-3level.h
69347 --- linux-2.6/include/asm-i386/mach-xen/asm/pgtable-3level.h    1970-01-01 01:00:00.000000000 +0100
69348 +++ new/include/asm-i386/mach-xen/asm/pgtable-3level.h  2006-06-28 14:32:14.000000000 +0200
69349 @@ -0,0 +1,197 @@
69350 +#ifndef _I386_PGTABLE_3LEVEL_H
69351 +#define _I386_PGTABLE_3LEVEL_H
69352 +
69353 +#include <asm-generic/pgtable-nopud.h>
69354 +
69355 +/*
69356 + * Intel Physical Address Extension (PAE) Mode - three-level page
69357 + * tables on PPro+ CPUs.
69358 + *
69359 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
69360 + */
69361 +
69362 +#define pte_ERROR(e) \
69363 +       printk("%s:%d: bad pte %p(%08lx%08lx).\n", __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
69364 +#define pmd_ERROR(e) \
69365 +       printk("%s:%d: bad pmd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
69366 +#define pgd_ERROR(e) \
69367 +       printk("%s:%d: bad pgd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
69368 +
69369 +#define pud_none(pud)                          0
69370 +#define pud_bad(pud)                           0
69371 +#define pud_present(pud)                       1
69372 +
69373 +/*
69374 + * Is the pte executable?
69375 + */
69376 +static inline int pte_x(pte_t pte)
69377 +{
69378 +       return !(pte_val(pte) & _PAGE_NX);
69379 +}
69380 +
69381 +/*
69382 + * All present user-pages with !NX bit are user-executable:
69383 + */
69384 +static inline int pte_exec(pte_t pte)
69385 +{
69386 +       return pte_user(pte) && pte_x(pte);
69387 +}
69388 +/*
69389 + * All present pages with !NX bit are kernel-executable:
69390 + */
69391 +static inline int pte_exec_kernel(pte_t pte)
69392 +{
69393 +       return pte_x(pte);
69394 +}
69395 +
69396 +/* Rules for using set_pte: the pte being assigned *must* be
69397 + * either not present or in a state where the hardware will
69398 + * not attempt to update the pte.  In places where this is
69399 + * not possible, use pte_get_and_clear to obtain the old pte
69400 + * value and then use set_pte to update it.  -ben
69401 + */
69402 +#define __HAVE_ARCH_SET_PTE_ATOMIC
69403 +
69404 +#if 1
69405 +/* use writable pagetables */
69406 +static inline void set_pte(pte_t *ptep, pte_t pte)
69407 +{
69408 +       ptep->pte_high = pte.pte_high;
69409 +       smp_wmb();
69410 +       ptep->pte_low = pte.pte_low;
69411 +}
69412 +# define set_pte_atomic(pteptr,pteval) \
69413 +               set_64bit((unsigned long long *)(pteptr),pte_val_ma(pteval))
69414 +#else
69415 +/* no writable pagetables */
69416 +# define set_pte(pteptr,pteval)                                \
69417 +               xen_l1_entry_update((pteptr), (pteval))
69418 +# define set_pte_atomic(pteptr,pteval) set_pte(pteptr,pteval)
69419 +#endif
69420 +
69421 +#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
69422 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
69423 +           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
69424 +               set_pte((ptep), (pteval));                              \
69425 +} while (0)
69426 +
69427 +#define set_pte_at_sync(_mm,addr,ptep,pteval) do {                     \
69428 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
69429 +           HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
69430 +               set_pte((ptep), (pteval));                              \
69431 +               xen_invlpg((addr));                                     \
69432 +       }                                                               \
69433 +} while (0)
69434 +
69435 +#define set_pmd(pmdptr,pmdval)                         \
69436 +               xen_l2_entry_update((pmdptr), (pmdval))
69437 +#define set_pud(pudptr,pudval) \
69438 +               xen_l3_entry_update((pudptr), (pudval))
69439 +
69440 +/*
69441 + * Pentium-II erratum A13: in PAE mode we explicitly have to flush
69442 + * the TLB via cr3 if the top-level pgd is changed...
69443 + * We do not let the generic code free and clear pgd entries due to
69444 + * this erratum.
69445 + */
69446 +static inline void pud_clear (pud_t * pud) { }
69447 +
69448 +#define pud_page(pud) \
69449 +((struct page *) __va(pud_val(pud) & PAGE_MASK))
69450 +
69451 +#define pud_page_kernel(pud) \
69452 +((unsigned long) __va(pud_val(pud) & PAGE_MASK))
69453 +
69454 +
69455 +/* Find an entry in the second-level page table.. */
69456 +#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
69457 +                       pmd_index(address))
69458 +
69459 +/*
69460 + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
69461 + * entry, so clear the bottom half first and enforce ordering with a compiler
69462 + * barrier.
69463 + */
69464 +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
69465 +{
69466 +       ptep->pte_low = 0;
69467 +       smp_wmb();
69468 +       ptep->pte_high = 0;
69469 +}
69470 +
69471 +#define pmd_clear(xp)do { set_pmd(xp, __pmd(0)); } while (0)
69472 +
69473 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
69474 +{
69475 +       pte_t res;
69476 +
69477 +       /* xchg acts as a barrier before the setting of the high bits */
69478 +       res.pte_low = xchg(&ptep->pte_low, 0);
69479 +       res.pte_high = ptep->pte_high;
69480 +       ptep->pte_high = 0;
69481 +
69482 +       return res;
69483 +}
69484 +
69485 +static inline int pte_same(pte_t a, pte_t b)
69486 +{
69487 +       return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
69488 +}
69489 +
69490 +#define pte_page(x)    pfn_to_page(pte_pfn(x))
69491 +
69492 +static inline int pte_none(pte_t pte)
69493 +{
69494 +       return !pte.pte_low && !pte.pte_high;
69495 +}
69496 +
69497 +#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |\
69498 +                      (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
69499 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
69500 +
69501 +extern unsigned long long __supported_pte_mask;
69502 +
69503 +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
69504 +{
69505 +       pte_t pte;
69506 +
69507 +       pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
69508 +                                       (pgprot_val(pgprot) >> 32);
69509 +       pte.pte_high &= (__supported_pte_mask >> 32);
69510 +       pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
69511 +                                                       __supported_pte_mask;
69512 +       return pte;
69513 +}
69514 +
69515 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
69516 +{
69517 +       return pfn_pte_ma(pfn_to_mfn(page_nr), pgprot);
69518 +}
69519 +
69520 +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
69521 +{
69522 +       BUG(); panic("needs review");
69523 +       return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | \
69524 +                       pgprot_val(pgprot)) & __supported_pte_mask);
69525 +}
69526 +
69527 +/*
69528 + * Bits 0, 6 and 7 are taken in the low part of the pte,
69529 + * put the 32 bits of offset into the high part.
69530 + */
69531 +#define pte_to_pgoff(pte) ((pte).pte_high)
69532 +#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
69533 +#define PTE_FILE_MAX_BITS       32
69534 +
69535 +/* Encode and de-code a swap entry */
69536 +#define __swp_type(x)                  (((x).val) & 0x1f)
69537 +#define __swp_offset(x)                        ((x).val >> 5)
69538 +#define __swp_entry(type, offset)      ((swp_entry_t){(type) | (offset) << 5})
69539 +#define __pte_to_swp_entry(pte)                ((swp_entry_t){ (pte).pte_high })
69540 +#define __swp_entry_to_pte(x)          ((pte_t){ 0, (x).val })
69541 +
69542 +#define __pmd_free_tlb(tlb, x)         do { } while (0)
69543 +
69544 +#define vmalloc_sync_all() ((void)0)
69545 +
69546 +#endif /* _I386_PGTABLE_3LEVEL_H */
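The PAE set_pte()/pte_clear() above are careful about ordering: a 64-bit pte is written as two 32-bit halves on a 32-bit CPU, so the half holding the present bit is stored last on set and first on clear, with a write barrier in between, so a half-written entry is never visible as present. A userspace sketch of that ordering, using a compiler barrier where the patch uses smp_wmb() (names are illustrative):

    #include <stdio.h>

    struct toy_pte { volatile unsigned int low, high; };

    #define toy_barrier() __asm__ __volatile__("" ::: "memory")

    static void toy_set_pte(struct toy_pte *p, unsigned int low, unsigned int high)
    {
            p->high = high;        /* upper half first: entry still not present */
            toy_barrier();
            p->low = low;          /* present bit lives in the low half, written last */
    }

    static void toy_clear_pte(struct toy_pte *p)
    {
            p->low = 0;            /* drop the present bit first */
            toy_barrier();
            p->high = 0;
    }

    int main(void)
    {
            struct toy_pte p = { 0, 0 };
            toy_set_pte(&p, 0x2ff067, 0x1);
            printf("pte = %08x%08x\n", p.high, p.low);
            toy_clear_pte(&p);
            printf("pte = %08x%08x\n", p.high, p.low);
            return 0;
    }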
69547 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/pgtable.h new/include/asm-i386/mach-xen/asm/pgtable.h
69548 --- linux-2.6/include/asm-i386/mach-xen/asm/pgtable.h   1970-01-01 01:00:00.000000000 +0100
69549 +++ new/include/asm-i386/mach-xen/asm/pgtable.h 2006-05-23 18:37:12.000000000 +0200
69550 @@ -0,0 +1,509 @@
69551 +#ifndef _I386_PGTABLE_H
69552 +#define _I386_PGTABLE_H
69553 +
69554 +#include <linux/config.h>
69555 +#include <asm/hypervisor.h>
69556 +
69557 +/*
69558 + * The Linux memory management assumes a three-level page table setup. On
69559 + * the i386, we use that, but "fold" the mid level into the top-level page
69560 + * table, so that we physically have the same two-level page table as the
69561 + * i386 mmu expects.
69562 + *
69563 + * This file contains the functions and defines necessary to modify and use
69564 + * the i386 page table tree.
69565 + */
69566 +#ifndef __ASSEMBLY__
69567 +#include <asm/processor.h>
69568 +#include <asm/fixmap.h>
69569 +#include <linux/threads.h>
69570 +
69571 +#ifndef _I386_BITOPS_H
69572 +#include <asm/bitops.h>
69573 +#endif
69574 +
69575 +#include <linux/slab.h>
69576 +#include <linux/list.h>
69577 +#include <linux/spinlock.h>
69578 +
69579 +struct mm_struct;
69580 +struct vm_area_struct;
69581 +
69582 +/*
69583 + * ZERO_PAGE is a global shared page that is always zero: used
69584 + * for zero-mapped memory areas etc..
69585 + */
69586 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
69587 +extern unsigned long empty_zero_page[1024];
69588 +extern pgd_t *swapper_pg_dir;
69589 +extern kmem_cache_t *pgd_cache;
69590 +extern kmem_cache_t *pmd_cache;
69591 +extern spinlock_t pgd_lock;
69592 +extern struct page *pgd_list;
69593 +
69594 +void pmd_ctor(void *, kmem_cache_t *, unsigned long);
69595 +void pgd_ctor(void *, kmem_cache_t *, unsigned long);
69596 +void pgd_dtor(void *, kmem_cache_t *, unsigned long);
69597 +void pgtable_cache_init(void);
69598 +void paging_init(void);
69599 +
69600 +/*
69601 + * The Linux x86 paging architecture is 'compile-time dual-mode', it
69602 + * implements both the traditional 2-level x86 page tables and the
69603 + * newer 3-level PAE-mode page tables.
69604 + */
69605 +#ifdef CONFIG_X86_PAE
69606 +# include <asm/pgtable-3level-defs.h>
69607 +# define PMD_SIZE      (1UL << PMD_SHIFT)
69608 +# define PMD_MASK      (~(PMD_SIZE-1))
69609 +#else
69610 +# include <asm/pgtable-2level-defs.h>
69611 +#endif
69612 +
69613 +#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
69614 +#define PGDIR_MASK     (~(PGDIR_SIZE-1))
69615 +
69616 +#define USER_PTRS_PER_PGD      (TASK_SIZE/PGDIR_SIZE)
69617 +#define FIRST_USER_ADDRESS     0
69618 +
69619 +#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
69620 +#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
69621 +
69622 +#define TWOLEVEL_PGDIR_SHIFT   22
69623 +#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
69624 +#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
69625 +
69626 +/* Just any arbitrary offset to the start of the vmalloc VM area: the
69627 + * current 8MB value just means that there will be an 8MB "hole" after the
69628 + * physical memory until the kernel virtual memory starts.  That means that
69629 + * any out-of-bounds memory accesses will hopefully be caught.
69630 + * The vmalloc() routines leave a hole of 4kB between each vmalloced
69631 + * area for the same reason. ;)
69632 + */
69633 +#define VMALLOC_OFFSET (8*1024*1024)
69634 +#define VMALLOC_START  (((unsigned long) high_memory + vmalloc_earlyreserve + \
69635 +                       2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
69636 +#ifdef CONFIG_HIGHMEM
69637 +# define VMALLOC_END   (PKMAP_BASE-2*PAGE_SIZE)
69638 +#else
69639 +# define VMALLOC_END   (FIXADDR_START-2*PAGE_SIZE)
69640 +#endif
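As a sanity check of the arithmetic above, a small host sketch (the end-of-lowmem address is made up): adding 2*VMALLOC_OFFSET-1 to high_memory + vmalloc_earlyreserve and masking down leaves an 8-16MB gap and an 8MB-aligned VMALLOC_START.

#include <stdio.h>

#define VMALLOC_OFFSET (8 * 1024 * 1024)

int main(void)
{
        unsigned long high_memory = 0xc8312000UL;    /* assumed end of lowmem mapping */
        unsigned long vmalloc_earlyreserve = 0;

        unsigned long vmalloc_start = (high_memory + vmalloc_earlyreserve +
                        2 * VMALLOC_OFFSET - 1) & ~(VMALLOC_OFFSET - 1);

        printf("gap = %lu MB, VMALLOC_START = 0x%08lx\n",
               (vmalloc_start - high_memory) >> 20, vmalloc_start);
        return 0;
}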
69641 +
69642 +/*
69643 + * _PAGE_PSE set in the page directory entry just means that
69644 + * the page directory entry points directly to a 4MB-aligned block of
69645 + * memory. 
69646 + */
69647 +#define _PAGE_BIT_PRESENT      0
69648 +#define _PAGE_BIT_RW           1
69649 +#define _PAGE_BIT_USER         2
69650 +#define _PAGE_BIT_PWT          3
69651 +#define _PAGE_BIT_PCD          4
69652 +#define _PAGE_BIT_ACCESSED     5
69653 +#define _PAGE_BIT_DIRTY                6
69654 +#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page, Pentium+, if present.. */
69655 +#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
69656 +#define _PAGE_BIT_UNUSED1      9       /* available for programmer */
69657 +#define _PAGE_BIT_UNUSED2      10
69658 +#define _PAGE_BIT_UNUSED3      11
69659 +#define _PAGE_BIT_NX           63
69660 +
69661 +#define _PAGE_PRESENT  0x001
69662 +#define _PAGE_RW       0x002
69663 +#define _PAGE_USER     0x004
69664 +#define _PAGE_PWT      0x008
69665 +#define _PAGE_PCD      0x010
69666 +#define _PAGE_ACCESSED 0x020
69667 +#define _PAGE_DIRTY    0x040
69668 +#define _PAGE_PSE      0x080   /* 4 MB (or 2MB) page, Pentium+, if present.. */
69669 +#define _PAGE_GLOBAL   0x100   /* Global TLB entry PPro+ */
69670 +#define _PAGE_UNUSED1  0x200   /* available for programmer */
69671 +#define _PAGE_UNUSED2  0x400
69672 +#define _PAGE_UNUSED3  0x800
69673 +
69674 +/* If _PAGE_PRESENT is clear, we use these: */
69675 +#define _PAGE_FILE     0x040   /* nonlinear file mapping, saved PTE; unset:swap */
69676 +#define _PAGE_PROTNONE 0x080   /* if the user mapped it with PROT_NONE;
69677 +                                  pte_present gives true */
69678 +#ifdef CONFIG_X86_PAE
69679 +#define _PAGE_NX       (1ULL<<_PAGE_BIT_NX)
69680 +#else
69681 +#define _PAGE_NX       0
69682 +#endif
69683 +
69684 +#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
69685 +#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
69686 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
69687 +
69688 +#define PAGE_NONE \
69689 +       __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
69690 +#define PAGE_SHARED \
69691 +       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
69692 +
69693 +#define PAGE_SHARED_EXEC \
69694 +       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
69695 +#define PAGE_COPY_NOEXEC \
69696 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
69697 +#define PAGE_COPY_EXEC \
69698 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
69699 +#define PAGE_COPY \
69700 +       PAGE_COPY_NOEXEC
69701 +#define PAGE_READONLY \
69702 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
69703 +#define PAGE_READONLY_EXEC \
69704 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
69705 +
69706 +#define _PAGE_KERNEL \
69707 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
69708 +#define _PAGE_KERNEL_EXEC \
69709 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
69710 +
69711 +extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
69712 +#define __PAGE_KERNEL_RO               (__PAGE_KERNEL & ~_PAGE_RW)
69713 +#define __PAGE_KERNEL_NOCACHE          (__PAGE_KERNEL | _PAGE_PCD)
69714 +#define __PAGE_KERNEL_LARGE            (__PAGE_KERNEL | _PAGE_PSE)
69715 +#define __PAGE_KERNEL_LARGE_EXEC       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
69716 +
69717 +#define PAGE_KERNEL            __pgprot(__PAGE_KERNEL)
69718 +#define PAGE_KERNEL_RO         __pgprot(__PAGE_KERNEL_RO)
69719 +#define PAGE_KERNEL_EXEC       __pgprot(__PAGE_KERNEL_EXEC)
69720 +#define PAGE_KERNEL_NOCACHE    __pgprot(__PAGE_KERNEL_NOCACHE)
69721 +#define PAGE_KERNEL_LARGE      __pgprot(__PAGE_KERNEL_LARGE)
69722 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
69723 +
69724 +/*
69725 + * The i386 can't do page protection for execute, and considers that
69726 + * the same are read. Also, write permissions imply read permissions.
69727 + * This is the closest we can get..
69728 + */
69729 +#define __P000 PAGE_NONE
69730 +#define __P001 PAGE_READONLY
69731 +#define __P010 PAGE_COPY
69732 +#define __P011 PAGE_COPY
69733 +#define __P100 PAGE_READONLY_EXEC
69734 +#define __P101 PAGE_READONLY_EXEC
69735 +#define __P110 PAGE_COPY_EXEC
69736 +#define __P111 PAGE_COPY_EXEC
69737 +
69738 +#define __S000 PAGE_NONE
69739 +#define __S001 PAGE_READONLY
69740 +#define __S010 PAGE_SHARED
69741 +#define __S011 PAGE_SHARED
69742 +#define __S100 PAGE_READONLY_EXEC
69743 +#define __S101 PAGE_READONLY_EXEC
69744 +#define __S110 PAGE_SHARED_EXEC
69745 +#define __S111 PAGE_SHARED_EXEC
69746 +
69747 +/*
69748 + * Define this if things work differently on an i386 and an i486:
69749 + * it will (on an i486) warn about kernel memory accesses that are
69750 + * done without an 'access_ok(VERIFY_WRITE,..)'
69751 + */
69752 +#undef TEST_ACCESS_OK
69753 +
69754 +/* The boot page tables (all created as a single array) */
69755 +extern unsigned long pg0[];
69756 +
69757 +#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
69758 +
69759 +/* To avoid harmful races, pmd_none(x) should check only the lower word when PAE is enabled */
69760 +#define pmd_none(x)    (!(unsigned long)pmd_val(x))
69761 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
69762 +   can temporarily clear it. */
69763 +#define pmd_present(x) (pmd_val(x))
69764 +#define pmd_bad(x)     ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
69765 +
69766 +
69767 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
69768 +
69769 +/*
69770 + * The following only work if pte_present() is true.
69771 + * Undefined behaviour if not..
69772 + */
69773 +static inline int pte_user(pte_t pte)          { return (pte).pte_low & _PAGE_USER; }
69774 +static inline int pte_read(pte_t pte)          { return (pte).pte_low & _PAGE_USER; }
69775 +static inline int pte_dirty(pte_t pte)         { return (pte).pte_low & _PAGE_DIRTY; }
69776 +static inline int pte_young(pte_t pte)         { return (pte).pte_low & _PAGE_ACCESSED; }
69777 +static inline int pte_write(pte_t pte)         { return (pte).pte_low & _PAGE_RW; }
69778 +static inline int pte_huge(pte_t pte)          { return (pte).pte_low & _PAGE_PSE; }
69779 +
69780 +/*
69781 + * The following only works if pte_present() is not true.
69782 + */
69783 +static inline int pte_file(pte_t pte)          { return (pte).pte_low & _PAGE_FILE; }
69784 +
69785 +static inline pte_t pte_rdprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_USER; return pte; }
69786 +static inline pte_t pte_exprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_USER; return pte; }
69787 +static inline pte_t pte_mkclean(pte_t pte)     { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
69788 +static inline pte_t pte_mkold(pte_t pte)       { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
69789 +static inline pte_t pte_wrprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_RW; return pte; }
69790 +static inline pte_t pte_mkread(pte_t pte)      { (pte).pte_low |= _PAGE_USER; return pte; }
69791 +static inline pte_t pte_mkexec(pte_t pte)      { (pte).pte_low |= _PAGE_USER; return pte; }
69792 +static inline pte_t pte_mkdirty(pte_t pte)     { (pte).pte_low |= _PAGE_DIRTY; return pte; }
69793 +static inline pte_t pte_mkyoung(pte_t pte)     { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
69794 +static inline pte_t pte_mkwrite(pte_t pte)     { (pte).pte_low |= _PAGE_RW; return pte; }
69795 +static inline pte_t pte_mkhuge(pte_t pte)      { (pte).pte_low |= _PAGE_PSE; return pte; }
69796 +
69797 +#ifdef CONFIG_X86_PAE
69798 +# include <asm/pgtable-3level.h>
69799 +#else
69800 +# include <asm/pgtable-2level.h>
69801 +#endif
69802 +
69803 +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
69804 +{
69805 +       if (!pte_dirty(*ptep))
69806 +               return 0;
69807 +       return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
69808 +}
69809 +
69810 +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
69811 +{
69812 +       if (!pte_young(*ptep))
69813 +               return 0;
69814 +       return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low);
69815 +}
69816 +
69817 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
69818 +{
69819 +       pte_t pte;
69820 +       if (full) {
69821 +               pte = *ptep;
69822 +               pte_clear(mm, addr, ptep);
69823 +       } else {
69824 +               pte = ptep_get_and_clear(mm, addr, ptep);
69825 +       }
69826 +       return pte;
69827 +}
69828 +
69829 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
69830 +{
69831 +       if (pte_write(*ptep))
69832 +               clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
69833 +}
69834 +
69835 +/*
69836 + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
69837 + *
69838 + *  dst - pointer to pgd range anywhere on a pgd page
69839 + *  src - ""
69840 + *  count - the number of pgds to copy.
69841 + *
69842 + * dst and src can be on the same page, but the range must not overlap,
69843 + * and must not cross a page boundary.
69844 + */
69845 +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
69846 +{
69847 +       memcpy(dst, src, count * sizeof(pgd_t));
69848 +}
69849 +
69850 +/*
69851 + * Macro to mark a page protection value as "uncacheable".  On processors which do not support
69852 + * it, this is a no-op.
69853 + */
69854 +#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3)                                          \
69855 +                                ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
69856 +
69857 +/*
69858 + * Conversion functions: convert a page and protection to a page entry,
69859 + * and a page entry and page directory to the page they refer to.
69860 + */
69861 +
69862 +#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
69863 +
69864 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
69865 +{
69866 +       pte.pte_low &= _PAGE_CHG_MASK;
69867 +       pte.pte_low |= pgprot_val(newprot);
69868 +#ifdef CONFIG_X86_PAE
69869 +       /*
69870 +        * Chop off the NX bit (if present), and add the NX portion of
69871 +        * the newprot (if present):
69872 +        */
69873 +       pte.pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
69874 +       pte.pte_high |= (pgprot_val(newprot) >> 32) & \
69875 +                                       (__supported_pte_mask >> 32);
69876 +#endif
69877 +       return pte;
69878 +}
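A host-side sketch of the PAE half of pte_modify() above (values invented, and the __supported_pte_mask filtering is omitted): the old NX bit is cleared from pte_high and only re-added if the new protection carries it.

#include <stdint.h>
#include <stdio.h>

#define _PAGE_BIT_NX 63

int main(void)
{
        uint32_t pte_high = 0x80000005;              /* old NX bit set plus some frame bits */
        uint64_t newprot  = 0x23;                    /* present|rw|accessed, no NX */

        pte_high &= ~(1u << (_PAGE_BIT_NX - 32));    /* chop off the old NX bit */
        pte_high |= (uint32_t)(newprot >> 32);       /* add NX back only if newprot has it */

        printf("pte_high = 0x%08x\n", pte_high);
        return 0;
}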
69879 +
69880 +#define pmd_large(pmd) \
69881 +((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
69882 +
69883 +/*
69884 + * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
69885 + *
69886 + * this macro returns the index of the entry in the pgd page which would
69887 + * control the given virtual address
69888 + */
69889 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
69890 +#define pgd_index_k(addr) pgd_index(addr)
69891 +
69892 +/*
69893 + * pgd_offset() returns a (pgd_t *)
69894 + * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
69895 + */
69896 +#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
69897 +
69898 +/*
69899 + * a shortcut which implies the use of the kernel's pgd, instead
69900 + * of a process's
69901 + */
69902 +#define pgd_offset_k(address) pgd_offset(&init_mm, address)
69903 +
69904 +/*
69905 + * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
69906 + *
69907 + * this macro returns the index of the entry in the pmd page which would
69908 + * control the given virtual address
69909 + */
69910 +#define pmd_index(address) \
69911 +               (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
69912 +
69913 +/*
69914 + * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
69915 + *
69916 + * this macro returns the index of the entry in the pte page which would
69917 + * control the given virtual address
69918 + */
69919 +#define pte_index(address) \
69920 +               (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
69921 +#define pte_offset_kernel(dir, address) \
69922 +       ((pte_t *) pmd_page_kernel(*(dir)) +  pte_index(address))
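For orientation, a minimal host sketch of how the index macros above slice a 32-bit virtual address under PAE (PGDIR_SHIFT 30, PMD_SHIFT 21, PAGE_SHIFT 12, per pgtable-3level-defs.h); the sample address is arbitrary.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT   12
#define PMD_SHIFT    21
#define PGDIR_SHIFT  30
#define PTRS_PER_PGD 4
#define PTRS_PER_PMD 512
#define PTRS_PER_PTE 512

int main(void)
{
        uint32_t addr = 0xc0123456;      /* some kernel virtual address */

        printf("pgd_index = %u\n", (addr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1));
        printf("pmd_index = %u\n", (addr >> PMD_SHIFT)   & (PTRS_PER_PMD - 1));
        printf("pte_index = %u\n", (addr >> PAGE_SHIFT)  & (PTRS_PER_PTE - 1));
        return 0;
}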
69923 +
69924 +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
69925 +
69926 +#define pmd_page_kernel(pmd) \
69927 +               ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
69928 +
69929 +/*
69930 + * Helper function that returns the kernel pagetable entry controlling
69931 + * the virtual address 'address'. NULL means no pagetable entry present.
69932 + * NOTE: the return type is pte_t but if the pmd is PSE then we return it
69933 + * as a pte too.
69934 + */
69935 +extern pte_t *lookup_address(unsigned long address);
69936 +
69937 +/*
69938 + * Make a given kernel text page executable/non-executable.
69939 + * Returns the previous executability setting of that page (which
69940 + * is used to restore the previous state). Used by the SMP bootup code.
69941 + * NOTE: this is an __init function for security reasons.
69942 + */
69943 +#ifdef CONFIG_X86_PAE
69944 + extern int set_kernel_exec(unsigned long vaddr, int enable);
69945 +#else
69946 + static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
69947 +#endif
69948 +
69949 +extern void noexec_setup(const char *str);
69950 +
69951 +#if defined(CONFIG_HIGHPTE)
69952 +#define pte_offset_map(dir, address) \
69953 +       ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
69954 +        pte_index(address))
69955 +#define pte_offset_map_nested(dir, address) \
69956 +       ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \
69957 +        pte_index(address))
69958 +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
69959 +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
69960 +#else
69961 +#define pte_offset_map(dir, address) \
69962 +       ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
69963 +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
69964 +#define pte_unmap(pte) do { } while (0)
69965 +#define pte_unmap_nested(pte) do { } while (0)
69966 +#endif
69967 +
69968 +/*
69969 + * The i386 doesn't have any external MMU info: the kernel page
69970 + * tables contain all the necessary information.
69971 + *
69972 + * Also, we only update the dirty/accessed state if we set
69973 + * the dirty bit by hand in the kernel, since the hardware
69974 + * will do the accessed bit for us, and we don't want to
69975 + * race with other CPU's that might be updating the dirty
69976 + * bit at the same time.
69977 + */
69978 +#define update_mmu_cache(vma,address,pte) do { } while (0)
69979 +#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
69980 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
69981 +       do {                                                              \
69982 +               if (__dirty) {                                            \
69983 +                       if ( likely((__vma)->vm_mm == current->mm) ) {    \
69984 +                           BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \
69985 +                       } else {                                          \
69986 +                            xen_l1_entry_update((__ptep), (__entry)); \
69987 +                           flush_tlb_page((__vma), (__address));         \
69988 +                       }                                                 \
69989 +               }                                                         \
69990 +       } while (0)
69991 +
69992 +#define __HAVE_ARCH_PTEP_ESTABLISH
69993 +#define ptep_establish(__vma, __address, __ptep, __entry)              \
69994 +do {                                                                   \
69995 +       ptep_set_access_flags(__vma, __address, __ptep, __entry, 1);    \
69996 +} while (0)
69997 +
69998 +#include <xen/features.h>
69999 +void make_lowmem_page_readonly(void *va, unsigned int feature);
70000 +void make_lowmem_page_writable(void *va, unsigned int feature);
70001 +void make_page_readonly(void *va, unsigned int feature);
70002 +void make_page_writable(void *va, unsigned int feature);
70003 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
70004 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
70005 +
70006 +#define virt_to_ptep(__va)                                             \
70007 +({                                                                     \
70008 +       pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));             \
70009 +       pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));        \
70010 +       pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));        \
70011 +       pte_offset_kernel(__pmd, (unsigned long)(__va));                \
70012 +})
70013 +
70014 +#define arbitrary_virt_to_machine(__va)                                        \
70015 +({                                                                     \
70016 +       maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
70017 +       m | ((unsigned long)(__va) & (PAGE_SIZE-1));                    \
70018 +})
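A standalone sketch of the address assembly done by arbitrary_virt_to_machine() above: the machine frame number from the PTE is shifted up by PAGE_SHIFT and combined with the offset of the virtual address within its page (the mfn and va below are made up).

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
        uint64_t mfn = 0x12345;                  /* frame number, as pte_mfn() would return */
        uint32_t va  = 0xc0101abc;               /* some kernel virtual address */

        uint64_t maddr = (mfn << PAGE_SHIFT) | (va & (PAGE_SIZE - 1));
        printf("machine address = 0x%llx\n", (unsigned long long)maddr);
        return 0;
}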
70019 +
70020 +#endif /* !__ASSEMBLY__ */
70021 +
70022 +#ifdef CONFIG_FLATMEM
70023 +#define kern_addr_valid(addr)  (1)
70024 +#endif /* CONFIG_FLATMEM */
70025 +
70026 +int direct_remap_pfn_range(struct vm_area_struct *vma,
70027 +                           unsigned long address, 
70028 +                           unsigned long mfn,
70029 +                           unsigned long size, 
70030 +                           pgprot_t prot,
70031 +                           domid_t  domid);
70032 +int direct_kernel_remap_pfn_range(unsigned long address, 
70033 +                                 unsigned long mfn,
70034 +                                 unsigned long size, 
70035 +                                 pgprot_t prot,
70036 +                                 domid_t  domid);
70037 +int create_lookup_pte_addr(struct mm_struct *mm,
70038 +                           unsigned long address,
70039 +                           uint64_t *ptep);
70040 +int touch_pte_range(struct mm_struct *mm,
70041 +                    unsigned long address,
70042 +                    unsigned long size);
70043 +
70044 +#define io_remap_pfn_range(vma,from,pfn,size,prot) \
70045 +direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
70046 +
70047 +#define MK_IOSPACE_PFN(space, pfn)     (pfn)
70048 +#define GET_IOSPACE(pfn)               0
70049 +#define GET_PFN(pfn)                   (pfn)
70050 +
70051 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
70052 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
70053 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
70054 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
70055 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
70056 +#define __HAVE_ARCH_PTE_SAME
70057 +#include <asm-generic/pgtable.h>
70058 +
70059 +#endif /* _I386_PGTABLE_H */
70060 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/processor.h new/include/asm-i386/mach-xen/asm/processor.h
70061 --- linux-2.6/include/asm-i386/mach-xen/asm/processor.h 1970-01-01 01:00:00.000000000 +0100
70062 +++ new/include/asm-i386/mach-xen/asm/processor.h       2006-05-23 18:37:12.000000000 +0200
70063 @@ -0,0 +1,753 @@
70064 +/*
70065 + * include/asm-i386/processor.h
70066 + *
70067 + * Copyright (C) 1994 Linus Torvalds
70068 + */
70069 +
70070 +#ifndef __ASM_I386_PROCESSOR_H
70071 +#define __ASM_I386_PROCESSOR_H
70072 +
70073 +#include <asm/vm86.h>
70074 +#include <asm/math_emu.h>
70075 +#include <asm/segment.h>
70076 +#include <asm/page.h>
70077 +#include <asm/types.h>
70078 +#include <asm/sigcontext.h>
70079 +#include <asm/cpufeature.h>
70080 +#include <asm/msr.h>
70081 +#include <asm/system.h>
70082 +#include <linux/cache.h>
70083 +#include <linux/config.h>
70084 +#include <linux/threads.h>
70085 +#include <asm/percpu.h>
70086 +#include <linux/cpumask.h>
70087 +#include <xen/interface/physdev.h>
70088 +
70089 +/* flag for disabling the tsc */
70090 +extern int tsc_disable;
70091 +
70092 +struct desc_struct {
70093 +       unsigned long a,b;
70094 +};
70095 +
70096 +#define desc_empty(desc) \
70097 +               (!((desc)->a | (desc)->b))
70098 +
70099 +#define desc_equal(desc1, desc2) \
70100 +               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
70101 +/*
70102 + * Default implementation of macro that returns current
70103 + * instruction pointer ("program counter").
70104 + */
70105 +#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
70106 +
70107 +/*
70108 + *  CPU type and hardware bug flags. Kept separately for each CPU.
70109 + *  Members of this structure are referenced in head.S, so think twice
70110 + *  before touching them. [mj]
70111 + */
70112 +
70113 +struct cpuinfo_x86 {
70114 +       __u8    x86;            /* CPU family */
70115 +       __u8    x86_vendor;     /* CPU vendor */
70116 +       __u8    x86_model;
70117 +       __u8    x86_mask;
70118 +       char    wp_works_ok;    /* It doesn't on 386's */
70119 +       char    hlt_works_ok;   /* Problems on some 486Dx4's and old 386's */
70120 +       char    hard_math;
70121 +       char    rfu;
70122 +       int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
70123 +       unsigned long   x86_capability[NCAPINTS];
70124 +       char    x86_vendor_id[16];
70125 +       char    x86_model_id[64];
70126 +       int     x86_cache_size;  /* in KB - valid for CPUS which support this
70127 +                                   call  */
70128 +       int     x86_cache_alignment;    /* In bytes */
70129 +       char    fdiv_bug;
70130 +       char    f00f_bug;
70131 +       char    coma_bug;
70132 +       char    pad0;
70133 +       int     x86_power;
70134 +       unsigned long loops_per_jiffy;
70135 +#ifdef CONFIG_SMP
70136 +       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
70137 +#endif
70138 +       unsigned char x86_max_cores;    /* cpuid returned max cores value */
70139 +       unsigned char booted_cores;     /* number of cores as seen by OS */
70140 +       unsigned char apicid;
70141 +} __attribute__((__aligned__(SMP_CACHE_BYTES)));
70142 +
70143 +#define X86_VENDOR_INTEL 0
70144 +#define X86_VENDOR_CYRIX 1
70145 +#define X86_VENDOR_AMD 2
70146 +#define X86_VENDOR_UMC 3
70147 +#define X86_VENDOR_NEXGEN 4
70148 +#define X86_VENDOR_CENTAUR 5
70149 +#define X86_VENDOR_RISE 6
70150 +#define X86_VENDOR_TRANSMETA 7
70151 +#define X86_VENDOR_NSC 8
70152 +#define X86_VENDOR_NUM 9
70153 +#define X86_VENDOR_UNKNOWN 0xff
70154 +
70155 +/*
70156 + * capabilities of CPUs
70157 + */
70158 +
70159 +extern struct cpuinfo_x86 boot_cpu_data;
70160 +extern struct cpuinfo_x86 new_cpu_data;
70161 +#ifndef CONFIG_X86_NO_TSS
70162 +extern struct tss_struct doublefault_tss;
70163 +DECLARE_PER_CPU(struct tss_struct, init_tss);
70164 +#endif
70165 +
70166 +#ifdef CONFIG_SMP
70167 +extern struct cpuinfo_x86 cpu_data[];
70168 +#define current_cpu_data cpu_data[smp_processor_id()]
70169 +#else
70170 +#define cpu_data (&boot_cpu_data)
70171 +#define current_cpu_data boot_cpu_data
70172 +#endif
70173 +
70174 +extern int phys_proc_id[NR_CPUS];
70175 +extern int cpu_core_id[NR_CPUS];
70176 +extern int cpu_llc_id[NR_CPUS];
70177 +extern char ignore_fpu_irq;
70178 +
70179 +extern void identify_cpu(struct cpuinfo_x86 *);
70180 +extern void print_cpu_info(struct cpuinfo_x86 *);
70181 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
70182 +
70183 +#ifdef CONFIG_X86_HT
70184 +extern void detect_ht(struct cpuinfo_x86 *c);
70185 +#else
70186 +static inline void detect_ht(struct cpuinfo_x86 *c) {}
70187 +#endif
70188 +
70189 +/*
70190 + * EFLAGS bits
70191 + */
70192 +#define X86_EFLAGS_CF  0x00000001 /* Carry Flag */
70193 +#define X86_EFLAGS_PF  0x00000004 /* Parity Flag */
70194 +#define X86_EFLAGS_AF  0x00000010 /* Auxiliary carry Flag */
70195 +#define X86_EFLAGS_ZF  0x00000040 /* Zero Flag */
70196 +#define X86_EFLAGS_SF  0x00000080 /* Sign Flag */
70197 +#define X86_EFLAGS_TF  0x00000100 /* Trap Flag */
70198 +#define X86_EFLAGS_IF  0x00000200 /* Interrupt Flag */
70199 +#define X86_EFLAGS_DF  0x00000400 /* Direction Flag */
70200 +#define X86_EFLAGS_OF  0x00000800 /* Overflow Flag */
70201 +#define X86_EFLAGS_IOPL        0x00003000 /* IOPL mask */
70202 +#define X86_EFLAGS_NT  0x00004000 /* Nested Task */
70203 +#define X86_EFLAGS_RF  0x00010000 /* Resume Flag */
70204 +#define X86_EFLAGS_VM  0x00020000 /* Virtual Mode */
70205 +#define X86_EFLAGS_AC  0x00040000 /* Alignment Check */
70206 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
70207 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
70208 +#define X86_EFLAGS_ID  0x00200000 /* CPUID detection flag */
70209 +
70210 +/*
70211 + * Generic CPUID function
70212 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
70213 + * resulting in stale register contents being returned.
70214 + */
70215 +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
70216 +{
70217 +       __asm__(XEN_CPUID
70218 +               : "=a" (*eax),
70219 +                 "=b" (*ebx),
70220 +                 "=c" (*ecx),
70221 +                 "=d" (*edx)
70222 +               : "0" (op), "c"(0));
70223 +}
70224 +
70225 +/* Some CPUID calls want 'count' to be placed in ecx */
70226 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
70227 +               int *edx)
70228 +{
70229 +       __asm__(XEN_CPUID
70230 +               : "=a" (*eax),
70231 +                 "=b" (*ebx),
70232 +                 "=c" (*ecx),
70233 +                 "=d" (*edx)
70234 +               : "0" (op), "c" (count));
70235 +}
70236 +
70237 +/*
70238 + * CPUID functions returning a single datum
70239 + */
70240 +static inline unsigned int cpuid_eax(unsigned int op)
70241 +{
70242 +       unsigned int eax;
70243 +
70244 +       __asm__(XEN_CPUID
70245 +               : "=a" (eax)
70246 +               : "0" (op)
70247 +               : "bx", "cx", "dx");
70248 +       return eax;
70249 +}
70250 +static inline unsigned int cpuid_ebx(unsigned int op)
70251 +{
70252 +       unsigned int eax, ebx;
70253 +
70254 +       __asm__(XEN_CPUID
70255 +               : "=a" (eax), "=b" (ebx)
70256 +               : "0" (op)
70257 +               : "cx", "dx" );
70258 +       return ebx;
70259 +}
70260 +static inline unsigned int cpuid_ecx(unsigned int op)
70261 +{
70262 +       unsigned int eax, ecx;
70263 +
70264 +       __asm__(XEN_CPUID
70265 +               : "=a" (eax), "=c" (ecx)
70266 +               : "0" (op)
70267 +               : "bx", "dx" );
70268 +       return ecx;
70269 +}
70270 +static inline unsigned int cpuid_edx(unsigned int op)
70271 +{
70272 +       unsigned int eax, edx;
70273 +
70274 +       __asm__(XEN_CPUID
70275 +               : "=a" (eax), "=d" (edx)
70276 +               : "0" (op)
70277 +               : "bx", "cx");
70278 +       return edx;
70279 +}
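A hedged, standalone illustration of the cpuid() wrapper pattern above, using the plain CPUID instruction instead of XEN_CPUID (which the patch emits so the hypervisor can trap and filter leaves); x86 with GCC/Clang is assumed. Leaf 0 returns the vendor string in ebx, edx, ecx.

#include <stdio.h>
#include <string.h>

static void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx,
                  unsigned int *ecx, unsigned int *edx)
{
        __asm__("cpuid"
                : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                : "0" (op), "c" (0));            /* clear ecx, as the comment explains */
}

int main(void)
{
        unsigned int a, b, c, d;
        char vendor[13];

        cpuid(0, &a, &b, &c, &d);
        memcpy(vendor + 0, &b, 4);               /* vendor string order: ebx, edx, ecx */
        memcpy(vendor + 4, &d, 4);
        memcpy(vendor + 8, &c, 4);
        vendor[12] = '\0';
        printf("max leaf %u, vendor \"%s\"\n", a, vendor);
        return 0;
}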
70280 +
70281 +#define load_cr3(pgdir) write_cr3(__pa(pgdir))
70282 +
70283 +/*
70284 + * Intel CPU features in CR4
70285 + */
70286 +#define X86_CR4_VME            0x0001  /* enable vm86 extensions */
70287 +#define X86_CR4_PVI            0x0002  /* virtual interrupts flag enable */
70288 +#define X86_CR4_TSD            0x0004  /* disable time stamp at ipl 3 */
70289 +#define X86_CR4_DE             0x0008  /* enable debugging extensions */
70290 +#define X86_CR4_PSE            0x0010  /* enable page size extensions */
70291 +#define X86_CR4_PAE            0x0020  /* enable physical address extensions */
70292 +#define X86_CR4_MCE            0x0040  /* Machine check enable */
70293 +#define X86_CR4_PGE            0x0080  /* enable global pages */
70294 +#define X86_CR4_PCE            0x0100  /* enable performance counters at ipl 3 */
70295 +#define X86_CR4_OSFXSR         0x0200  /* enable fast FPU save and restore */
70296 +#define X86_CR4_OSXMMEXCPT     0x0400  /* enable unmasked SSE exceptions */
70297 +
70298 +/*
70299 + * Save the cr4 feature set we're using (ie
70300 + * Pentium 4MB enable and PPro Global page
70301 + * enable), so that any CPU's that boot up
70302 + * after us can get the correct flags.
70303 + */
70304 +extern unsigned long mmu_cr4_features;
70305 +
70306 +static inline void set_in_cr4 (unsigned long mask)
70307 +{
70308 +       unsigned cr4;
70309 +       mmu_cr4_features |= mask;
70310 +       cr4 = read_cr4();
70311 +       cr4 |= mask;
70312 +       write_cr4(cr4);
70313 +}
70314 +
70315 +static inline void clear_in_cr4 (unsigned long mask)
70316 +{
70317 +       unsigned cr4;
70318 +       mmu_cr4_features &= ~mask;
70319 +       cr4 = read_cr4();
70320 +       cr4 &= ~mask;
70321 +       write_cr4(cr4);
70322 +}
70323 +
70324 +/*
70325 + *      NSC/Cyrix CPU configuration register indexes
70326 + */
70327 +
70328 +#define CX86_PCR0 0x20
70329 +#define CX86_GCR  0xb8
70330 +#define CX86_CCR0 0xc0
70331 +#define CX86_CCR1 0xc1
70332 +#define CX86_CCR2 0xc2
70333 +#define CX86_CCR3 0xc3
70334 +#define CX86_CCR4 0xe8
70335 +#define CX86_CCR5 0xe9
70336 +#define CX86_CCR6 0xea
70337 +#define CX86_CCR7 0xeb
70338 +#define CX86_PCR1 0xf0
70339 +#define CX86_DIR0 0xfe
70340 +#define CX86_DIR1 0xff
70341 +#define CX86_ARR_BASE 0xc4
70342 +#define CX86_RCR_BASE 0xdc
70343 +
70344 +/*
70345 + *      NSC/Cyrix CPU indexed register access macros
70346 + */
70347 +
70348 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
70349 +
70350 +#define setCx86(reg, data) do { \
70351 +       outb((reg), 0x22); \
70352 +       outb((data), 0x23); \
70353 +} while (0)
70354 +
70355 +/* Stop speculative execution */
70356 +static inline void sync_core(void)
70357 +{
70358 +       int tmp;
70359 +       asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
70360 +}
70361 +
70362 +static inline void __monitor(const void *eax, unsigned long ecx,
70363 +               unsigned long edx)
70364 +{
70365 +       /* "monitor %eax,%ecx,%edx;" */
70366 +       asm volatile(
70367 +               ".byte 0x0f,0x01,0xc8;"
70368 +               : :"a" (eax), "c" (ecx), "d"(edx));
70369 +}
70370 +
70371 +static inline void __mwait(unsigned long eax, unsigned long ecx)
70372 +{
70373 +       /* "mwait %eax,%ecx;" */
70374 +       asm volatile(
70375 +               ".byte 0x0f,0x01,0xc9;"
70376 +               : :"a" (eax), "c" (ecx));
70377 +}
70378 +
70379 +/* from system description table in BIOS.  Mostly for MCA use, but
70380 +others may find it useful. */
70381 +extern unsigned int machine_id;
70382 +extern unsigned int machine_submodel_id;
70383 +extern unsigned int BIOS_revision;
70384 +extern unsigned int mca_pentium_flag;
70385 +
70386 +/* Boot loader type from the setup header */
70387 +extern int bootloader_type;
70388 +
70389 +/*
70390 + * User space process size: 3GB (default).
70391 + */
70392 +#define TASK_SIZE      (PAGE_OFFSET)
70393 +
70394 +/* This decides where the kernel will search for a free chunk of vm
70395 + * space during mmap's.
70396 + */
70397 +#define TASK_UNMAPPED_BASE     (PAGE_ALIGN(TASK_SIZE / 3))
70398 +
70399 +#define HAVE_ARCH_PICK_MMAP_LAYOUT
70400 +
70401 +/*
70402 + * Size of io_bitmap.
70403 + */
70404 +#define IO_BITMAP_BITS  65536
70405 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
70406 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
70407 +#ifndef CONFIG_X86_NO_TSS
70408 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
70409 +#endif
70410 +#define INVALID_IO_BITMAP_OFFSET 0x8000
70411 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
70412 +
70413 +struct i387_fsave_struct {
70414 +       long    cwd;
70415 +       long    swd;
70416 +       long    twd;
70417 +       long    fip;
70418 +       long    fcs;
70419 +       long    foo;
70420 +       long    fos;
70421 +       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
70422 +       long    status;         /* software status information */
70423 +};
70424 +
70425 +struct i387_fxsave_struct {
70426 +       unsigned short  cwd;
70427 +       unsigned short  swd;
70428 +       unsigned short  twd;
70429 +       unsigned short  fop;
70430 +       long    fip;
70431 +       long    fcs;
70432 +       long    foo;
70433 +       long    fos;
70434 +       long    mxcsr;
70435 +       long    mxcsr_mask;
70436 +       long    st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
70437 +       long    xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
70438 +       long    padding[56];
70439 +} __attribute__ ((aligned (16)));
70440 +
70441 +struct i387_soft_struct {
70442 +       long    cwd;
70443 +       long    swd;
70444 +       long    twd;
70445 +       long    fip;
70446 +       long    fcs;
70447 +       long    foo;
70448 +       long    fos;
70449 +       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
70450 +       unsigned char   ftop, changed, lookahead, no_update, rm, alimit;
70451 +       struct info     *info;
70452 +       unsigned long   entry_eip;
70453 +};
70454 +
70455 +union i387_union {
70456 +       struct i387_fsave_struct        fsave;
70457 +       struct i387_fxsave_struct       fxsave;
70458 +       struct i387_soft_struct soft;
70459 +};
70460 +
70461 +typedef struct {
70462 +       unsigned long seg;
70463 +} mm_segment_t;
70464 +
70465 +struct thread_struct;
70466 +
70467 +#ifndef CONFIG_X86_NO_TSS
70468 +struct tss_struct {
70469 +       unsigned short  back_link,__blh;
70470 +       unsigned long   esp0;
70471 +       unsigned short  ss0,__ss0h;
70472 +       unsigned long   esp1;
70473 +       unsigned short  ss1,__ss1h;     /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
70474 +       unsigned long   esp2;
70475 +       unsigned short  ss2,__ss2h;
70476 +       unsigned long   __cr3;
70477 +       unsigned long   eip;
70478 +       unsigned long   eflags;
70479 +       unsigned long   eax,ecx,edx,ebx;
70480 +       unsigned long   esp;
70481 +       unsigned long   ebp;
70482 +       unsigned long   esi;
70483 +       unsigned long   edi;
70484 +       unsigned short  es, __esh;
70485 +       unsigned short  cs, __csh;
70486 +       unsigned short  ss, __ssh;
70487 +       unsigned short  ds, __dsh;
70488 +       unsigned short  fs, __fsh;
70489 +       unsigned short  gs, __gsh;
70490 +       unsigned short  ldt, __ldth;
70491 +       unsigned short  trace, io_bitmap_base;
70492 +       /*
70493 +        * The extra 1 is there because the CPU will access an
70494 +        * additional byte beyond the end of the IO permission
70495 +        * bitmap. The extra byte must be all 1 bits, and must
70496 +        * be within the limit.
70497 +        */
70498 +       unsigned long   io_bitmap[IO_BITMAP_LONGS + 1];
70499 +       /*
70500 +        * Cache the current maximum and the last task that used the bitmap:
70501 +        */
70502 +       unsigned long io_bitmap_max;
70503 +       struct thread_struct *io_bitmap_owner;
70504 +       /*
70505 +        * pads the TSS to be cacheline-aligned (size is 0x100)
70506 +        */
70507 +       unsigned long __cacheline_filler[35];
70508 +       /*
70509 +        * .. and then another 0x100 bytes for emergency kernel stack
70510 +        */
70511 +       unsigned long stack[64];
70512 +} __attribute__((packed));
70513 +#endif
70514 +
70515 +#define ARCH_MIN_TASKALIGN     16
70516 +
70517 +struct thread_struct {
70518 +/* cached TLS descriptors. */
70519 +       struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
70520 +       unsigned long   esp0;
70521 +       unsigned long   sysenter_cs;
70522 +       unsigned long   eip;
70523 +       unsigned long   esp;
70524 +       unsigned long   fs;
70525 +       unsigned long   gs;
70526 +/* Hardware debugging registers */
70527 +       unsigned long   debugreg[8];  /* %%db0-7 debug registers */
70528 +/* fault info */
70529 +       unsigned long   cr2, trap_no, error_code;
70530 +/* floating point info */
70531 +       union i387_union        i387;
70532 +/* virtual 86 mode info */
70533 +       struct vm86_struct __user * vm86_info;
70534 +       unsigned long           screen_bitmap;
70535 +       unsigned long           v86flags, v86mask, saved_esp0;
70536 +       unsigned int            saved_fs, saved_gs;
70537 +/* IO permissions */
70538 +       unsigned long   *io_bitmap_ptr;
70539 +       unsigned long   iopl;
70540 +/* max allowed port in the bitmap, in bytes: */
70541 +       unsigned long   io_bitmap_max;
70542 +};
70543 +
70544 +#define INIT_THREAD  {                                                 \
70545 +       .vm86_info = NULL,                                              \
70546 +       .sysenter_cs = __KERNEL_CS,                                     \
70547 +       .io_bitmap_ptr = NULL,                                          \
70548 +}
70549 +
70550 +#ifndef CONFIG_X86_NO_TSS
70551 +/*
70552 + * Note that the .io_bitmap member must be extra-big. This is because
70553 + * the CPU will access an additional byte beyond the end of the IO
70554 + * permission bitmap. The extra byte must be all 1 bits, and must
70555 + * be within the limit.
70556 + */
70557 +#define INIT_TSS  {                                                    \
70558 +       .esp0           = sizeof(init_stack) + (long)&init_stack,       \
70559 +       .ss0            = __KERNEL_DS,                                  \
70560 +       .ss1            = __KERNEL_CS,                                  \
70561 +       .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,                     \
70562 +       .io_bitmap      = { [ 0 ... IO_BITMAP_LONGS] = ~0 },            \
70563 +}
70564 +
70565 +static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
70566 +{
70567 +       tss->esp0 = thread->esp0;
70568 +       /* This can only happen when SEP is enabled, no need to test "SEP"arately */
70569 +       if (unlikely(tss->ss1 != thread->sysenter_cs)) {
70570 +               tss->ss1 = thread->sysenter_cs;
70571 +               wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
70572 +       }
70573 +}
70574 +#define load_esp0(tss, thread) \
70575 +       __load_esp0(tss, thread)
70576 +#else
70577 +#define load_esp0(tss, thread) \
70578 +       HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)
70579 +#endif
70580 +
70581 +#define start_thread(regs, new_eip, new_esp) do {              \
70582 +       __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0));       \
70583 +       set_fs(USER_DS);                                        \
70584 +       regs->xds = __USER_DS;                                  \
70585 +       regs->xes = __USER_DS;                                  \
70586 +       regs->xss = __USER_DS;                                  \
70587 +       regs->xcs = __USER_CS;                                  \
70588 +       regs->eip = new_eip;                                    \
70589 +       regs->esp = new_esp;                                    \
70590 +} while (0)
70591 +
70592 +/*
70593 + * These special macros can be used to get or set a debugging register
70594 + */
70595 +#define get_debugreg(var, register)                            \
70596 +               (var) = HYPERVISOR_get_debugreg((register))
70597 +#define set_debugreg(value, register)                  \
70598 +               HYPERVISOR_set_debugreg((register), (value))
70599 +
70600 +/*
70601 + * Set IOPL bits in EFLAGS from given mask
70602 + */
70603 +static inline void set_iopl_mask(unsigned mask)
70604 +{
70605 +       struct physdev_set_iopl set_iopl;
70606 +
70607 +       /* Force the change at ring 0. */
70608 +       set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
70609 +       HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
70610 +}
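A small sketch of the mask-to-IOPL translation above (no hypercall is made here, values illustrative): IOPL occupies EFLAGS bits 12-13, so (mask >> 12) & 3 recovers the requested level, and a zero mask is forced to 1 so a Xen guest kernel running in ring 1 keeps its port access.

#include <stdio.h>

int main(void)
{
        unsigned masks[] = { 0x0000, 0x3000 };   /* EFLAGS IOPL field: cleared / set to 3 */
        int i;

        for (i = 0; i < 2; i++) {
                unsigned mask = masks[i];
                unsigned iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
                printf("mask 0x%04x -> iopl %u\n", mask, iopl);
        }
        return 0;
}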
70611 +
70612 +/* Forward declaration, a strange C thing */
70613 +struct task_struct;
70614 +struct mm_struct;
70615 +
70616 +/* Free all resources held by a thread. */
70617 +extern void release_thread(struct task_struct *);
70618 +
70619 +/* Prepare to copy thread state - unlazy all lazy status */
70620 +extern void prepare_to_copy(struct task_struct *tsk);
70621 +
70622 +/*
70623 + * create a kernel thread without removing it from tasklists
70624 + */
70625 +extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
70626 +
70627 +extern unsigned long thread_saved_pc(struct task_struct *tsk);
70628 +void show_trace(struct task_struct *task, unsigned long *stack);
70629 +
70630 +unsigned long get_wchan(struct task_struct *p);
70631 +
70632 +#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
70633 +#define KSTK_TOP(info)                                                 \
70634 +({                                                                     \
70635 +       unsigned long *__ptr = (unsigned long *)(info);                 \
70636 +       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
70637 +})
70638 +
70639 +/*
70640 + * The below -8 is to reserve 8 bytes on top of the ring0 stack.
70641 + * This is necessary to guarantee that the entire "struct pt_regs"
70642 + * is accessible even if the CPU hasn't stored the SS/ESP registers
70643 + * on the stack (interrupt gate does not save these registers
70644 + * when switching to the same priv ring).
70645 + * Therefore beware: accessing the xss/esp fields of the
70646 + * "struct pt_regs" is possible, but they may contain the
70647 + * completely wrong values.
70648 + */
70649 +#define task_pt_regs(task)                                             \
70650 +({                                                                     \
70651 +       struct pt_regs *__regs__;                                       \
70652 +       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
70653 +       __regs__ - 1;                                                   \
70654 +})
70655 +
70656 +#define KSTK_EIP(task) (task_pt_regs(task)->eip)
70657 +#define KSTK_ESP(task) (task_pt_regs(task)->esp)
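A standalone sketch of the KSTK_TOP()/task_pt_regs() arithmetic above, reusing the struct pt_regs layout from ptrace.h further down in this patch; THREAD_SIZE 8192 and the static array standing in for the thread stack are assumptions for illustration only.

#include <stdint.h>
#include <stdio.h>

#define THREAD_SIZE 8192

struct pt_regs {
        long ebx, ecx, edx, esi, edi, ebp, eax;
        int  xds, xes;
        long orig_eax, eip;
        int  xcs;
        long eflags, esp;
        int  xss;
};

int main(void)
{
        static unsigned char stack[THREAD_SIZE];            /* stands in for the thread stack */
        uintptr_t top = (uintptr_t)stack + THREAD_SIZE;     /* KSTK_TOP() */
        struct pt_regs *regs = (struct pt_regs *)(top - 8) - 1;  /* task_pt_regs() */

        printf("stack top %p, pt_regs at %p (%u bytes below top)\n",
               (void *)top, (void *)regs, (unsigned)(top - (uintptr_t)regs));
        return 0;
}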
70658 +
70659 +
70660 +struct microcode_header {
70661 +       unsigned int hdrver;
70662 +       unsigned int rev;
70663 +       unsigned int date;
70664 +       unsigned int sig;
70665 +       unsigned int cksum;
70666 +       unsigned int ldrver;
70667 +       unsigned int pf;
70668 +       unsigned int datasize;
70669 +       unsigned int totalsize;
70670 +       unsigned int reserved[3];
70671 +};
70672 +
70673 +struct microcode {
70674 +       struct microcode_header hdr;
70675 +       unsigned int bits[0];
70676 +};
70677 +
70678 +typedef struct microcode microcode_t;
70679 +typedef struct microcode_header microcode_header_t;
70680 +
70681 +/* microcode format is extended from prescott processors */
70682 +struct extended_signature {
70683 +       unsigned int sig;
70684 +       unsigned int pf;
70685 +       unsigned int cksum;
70686 +};
70687 +
70688 +struct extended_sigtable {
70689 +       unsigned int count;
70690 +       unsigned int cksum;
70691 +       unsigned int reserved[3];
70692 +       struct extended_signature sigs[0];
70693 +};
70694 +
70695 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
70696 +static inline void rep_nop(void)
70697 +{
70698 +       __asm__ __volatile__("rep;nop": : :"memory");
70699 +}
70700 +
70701 +#define cpu_relax()    rep_nop()
70702 +
70703 +/* generic versions from gas */
70704 +#define GENERIC_NOP1   ".byte 0x90\n"
70705 +#define GENERIC_NOP2           ".byte 0x89,0xf6\n"
70706 +#define GENERIC_NOP3        ".byte 0x8d,0x76,0x00\n"
70707 +#define GENERIC_NOP4        ".byte 0x8d,0x74,0x26,0x00\n"
70708 +#define GENERIC_NOP5        GENERIC_NOP1 GENERIC_NOP4
70709 +#define GENERIC_NOP6   ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
70710 +#define GENERIC_NOP7   ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
70711 +#define GENERIC_NOP8   GENERIC_NOP1 GENERIC_NOP7
70712 +
70713 +/* Opteron nops */
70714 +#define K8_NOP1 GENERIC_NOP1
70715 +#define K8_NOP2        ".byte 0x66,0x90\n" 
70716 +#define K8_NOP3        ".byte 0x66,0x66,0x90\n" 
70717 +#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n" 
70718 +#define K8_NOP5        K8_NOP3 K8_NOP2 
70719 +#define K8_NOP6        K8_NOP3 K8_NOP3
70720 +#define K8_NOP7        K8_NOP4 K8_NOP3
70721 +#define K8_NOP8        K8_NOP4 K8_NOP4
70722 +
70723 +/* K7 nops */
70724 +/* uses eax dependencies (arbitrary choice) */
70725 +#define K7_NOP1  GENERIC_NOP1
70726 +#define K7_NOP2        ".byte 0x8b,0xc0\n" 
70727 +#define K7_NOP3        ".byte 0x8d,0x04,0x20\n"
70728 +#define K7_NOP4        ".byte 0x8d,0x44,0x20,0x00\n"
70729 +#define K7_NOP5        K7_NOP4 ASM_NOP1
70730 +#define K7_NOP6        ".byte 0x8d,0x80,0,0,0,0\n"
70731 +#define K7_NOP7        ".byte 0x8D,0x04,0x05,0,0,0,0\n"
70732 +#define K7_NOP8        K7_NOP7 ASM_NOP1
70733 +
70734 +#ifdef CONFIG_MK8
70735 +#define ASM_NOP1 K8_NOP1
70736 +#define ASM_NOP2 K8_NOP2
70737 +#define ASM_NOP3 K8_NOP3
70738 +#define ASM_NOP4 K8_NOP4
70739 +#define ASM_NOP5 K8_NOP5
70740 +#define ASM_NOP6 K8_NOP6
70741 +#define ASM_NOP7 K8_NOP7
70742 +#define ASM_NOP8 K8_NOP8
70743 +#elif defined(CONFIG_MK7)
70744 +#define ASM_NOP1 K7_NOP1
70745 +#define ASM_NOP2 K7_NOP2
70746 +#define ASM_NOP3 K7_NOP3
70747 +#define ASM_NOP4 K7_NOP4
70748 +#define ASM_NOP5 K7_NOP5
70749 +#define ASM_NOP6 K7_NOP6
70750 +#define ASM_NOP7 K7_NOP7
70751 +#define ASM_NOP8 K7_NOP8
70752 +#else
70753 +#define ASM_NOP1 GENERIC_NOP1
70754 +#define ASM_NOP2 GENERIC_NOP2
70755 +#define ASM_NOP3 GENERIC_NOP3
70756 +#define ASM_NOP4 GENERIC_NOP4
70757 +#define ASM_NOP5 GENERIC_NOP5
70758 +#define ASM_NOP6 GENERIC_NOP6
70759 +#define ASM_NOP7 GENERIC_NOP7
70760 +#define ASM_NOP8 GENERIC_NOP8
70761 +#endif
70762 +
70763 +#define ASM_NOP_MAX 8
70764 +
70765 +/* Prefetch instructions for Pentium III and AMD Athlon */
70766 +/* It's not worth caring about 3dnow! prefetches for the K6
70767 +   because they are microcoded there and very slow.
70768 +   However, we don't currently do prefetches for pre-XP Athlons;
70769 +   that should be fixed. */
70770 +#define ARCH_HAS_PREFETCH
70771 +static inline void prefetch(const void *x)
70772 +{
70773 +       alternative_input(ASM_NOP4,
70774 +                         "prefetchnta (%1)",
70775 +                         X86_FEATURE_XMM,
70776 +                         "r" (x));
70777 +}
70778 +
70779 +#define ARCH_HAS_PREFETCH
70780 +#define ARCH_HAS_PREFETCHW
70781 +#define ARCH_HAS_SPINLOCK_PREFETCH
70782 +
70783 +/* 3dnow! prefetch to get an exclusive cache line. Useful for 
70784 +   spinlocks to avoid one state transition in the cache coherency protocol. */
70785 +static inline void prefetchw(const void *x)
70786 +{
70787 +       alternative_input(ASM_NOP4,
70788 +                         "prefetchw (%1)",
70789 +                         X86_FEATURE_3DNOW,
70790 +                         "r" (x));
70791 +}
70792 +#define spin_lock_prefetch(x)  prefetchw(x)
70793 +
70794 +extern void select_idle_routine(const struct cpuinfo_x86 *c);
70795 +
70796 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
70797 +
70798 +extern unsigned long boot_option_idle_override;
70799 +extern void enable_sep_cpu(void);
70800 +extern int sysenter_setup(void);
70801 +
70802 +#ifdef CONFIG_MTRR
70803 +extern void mtrr_ap_init(void);
70804 +extern void mtrr_bp_init(void);
70805 +#else
70806 +#define mtrr_ap_init() do {} while (0)
70807 +#define mtrr_bp_init() do {} while (0)
70808 +#endif
70809 +
70810 +#ifdef CONFIG_X86_MCE
70811 +extern void mcheck_init(struct cpuinfo_x86 *c);
70812 +#else
70813 +#define mcheck_init(c) do {} while(0)
70814 +#endif
70815 +
70816 +#endif /* __ASM_I386_PROCESSOR_H */
70817 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/ptrace.h new/include/asm-i386/mach-xen/asm/ptrace.h
70818 --- linux-2.6/include/asm-i386/mach-xen/asm/ptrace.h    1970-01-01 01:00:00.000000000 +0100
70819 +++ new/include/asm-i386/mach-xen/asm/ptrace.h  2006-05-09 12:35:17.000000000 +0200
70820 @@ -0,0 +1,90 @@
70821 +#ifndef _I386_PTRACE_H
70822 +#define _I386_PTRACE_H
70823 +
70824 +#define EBX 0
70825 +#define ECX 1
70826 +#define EDX 2
70827 +#define ESI 3
70828 +#define EDI 4
70829 +#define EBP 5
70830 +#define EAX 6
70831 +#define DS 7
70832 +#define ES 8
70833 +#define FS 9
70834 +#define GS 10
70835 +#define ORIG_EAX 11
70836 +#define EIP 12
70837 +#define CS  13
70838 +#define EFL 14
70839 +#define UESP 15
70840 +#define SS   16
70841 +#define FRAME_SIZE 17
70842 +
70843 +/* this struct defines the way the registers are stored on the 
70844 +   stack during a system call. */
70845 +
70846 +struct pt_regs {
70847 +       long ebx;
70848 +       long ecx;
70849 +       long edx;
70850 +       long esi;
70851 +       long edi;
70852 +       long ebp;
70853 +       long eax;
70854 +       int  xds;
70855 +       int  xes;
70856 +       long orig_eax;
70857 +       long eip;
70858 +       int  xcs;
70859 +       long eflags;
70860 +       long esp;
70861 +       int  xss;
70862 +};
70863 +
70864 +/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
70865 +#define PTRACE_GETREGS            12
70866 +#define PTRACE_SETREGS            13
70867 +#define PTRACE_GETFPREGS          14
70868 +#define PTRACE_SETFPREGS          15
70869 +#define PTRACE_GETFPXREGS         18
70870 +#define PTRACE_SETFPXREGS         19
70871 +
70872 +#define PTRACE_OLDSETOPTIONS         21
70873 +
70874 +#define PTRACE_GET_THREAD_AREA    25
70875 +#define PTRACE_SET_THREAD_AREA    26
70876 +
70877 +#define PTRACE_SYSEMU            31
70878 +#define PTRACE_SYSEMU_SINGLESTEP  32
70879 +
70880 +#ifdef __KERNEL__
70881 +
70882 +#include <asm/vm86.h>
70883 +
70884 +struct task_struct;
70885 +extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code);
70886 +
70887 +/*
70888 + * user_mode_vm(regs) determines whether a register set came from user mode.
70889 + * This is true if V8086 mode was enabled OR if the register set was from
70890 + * protected mode with RPL-3 CS value.  This tricky test checks that with
70891 + * one comparison.  Many places in the kernel can bypass this full check
70892 + * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
70893 + */
70894 +static inline int user_mode(struct pt_regs *regs)
70895 +{
70896 +       return (regs->xcs & 2) != 0;
70897 +}
70898 +static inline int user_mode_vm(struct pt_regs *regs)
70899 +{
70900 +       return ((regs->xcs & 2) | (regs->eflags & VM_MASK)) != 0;
70901 +}
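A tiny sketch of the ring test above (selector values illustrative): because a Xen PV kernel runs in ring 1, user mode is detected by bit 1 of the CS privilege level (rings 2 and 3) rather than by the native (cs & 3) == 3 check.

#include <stdio.h>

static int user_mode(int xcs)
{
        return (xcs & 2) != 0;        /* same test as above */
}

int main(void)
{
        printf("ring-1 kernel cs 0x61 -> user_mode = %d\n", user_mode(0x61));
        printf("ring-3 user   cs 0x73 -> user_mode = %d\n", user_mode(0x73));
        return 0;
}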
70902 +#define instruction_pointer(regs) ((regs)->eip)
70903 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
70904 +extern unsigned long profile_pc(struct pt_regs *regs);
70905 +#else
70906 +#define profile_pc(regs) instruction_pointer(regs)
70907 +#endif
70908 +#endif /* __KERNEL__ */
70909 +
70910 +#endif
70911 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/scatterlist.h new/include/asm-i386/mach-xen/asm/scatterlist.h
70912 --- linux-2.6/include/asm-i386/mach-xen/asm/scatterlist.h       1970-01-01 01:00:00.000000000 +0100
70913 +++ new/include/asm-i386/mach-xen/asm/scatterlist.h     2006-05-09 12:35:17.000000000 +0200
70914 @@ -0,0 +1,22 @@
70915 +#ifndef _I386_SCATTERLIST_H
70916 +#define _I386_SCATTERLIST_H
70917 +
70918 +struct scatterlist {
70919 +    struct page                *page;
70920 +    unsigned int       offset;
70921 +    unsigned int       length;
70922 +    dma_addr_t         dma_address;
70923 +    unsigned int       dma_length;
70924 +};
70925 +
70926 +/* These macros should be used after a pci_map_sg call has been done
70927 + * to get bus addresses of each of the SG entries and their lengths.
70928 + * You should only work with the number of sg entries pci_map_sg
70929 + * returns.
70930 + */
70931 +#define sg_dma_address(sg)     ((sg)->dma_address)
70932 +#define sg_dma_len(sg)         ((sg)->dma_length)
70933 +
70934 +#define ISA_DMA_THRESHOLD (0x00ffffff)
70935 +
70936 +#endif /* !(_I386_SCATTERLIST_H) */
70937 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/segment.h new/include/asm-i386/mach-xen/asm/segment.h
70938 --- linux-2.6/include/asm-i386/mach-xen/asm/segment.h   1970-01-01 01:00:00.000000000 +0100
70939 +++ new/include/asm-i386/mach-xen/asm/segment.h 2006-05-09 12:35:17.000000000 +0200
70940 @@ -0,0 +1,117 @@
70941 +#ifndef _ASM_SEGMENT_H
70942 +#define _ASM_SEGMENT_H
70943 +
70944 +/*
70945 + * The layout of the per-CPU GDT under Linux:
70946 + *
70947 + *   0 - null
70948 + *   1 - reserved
70949 + *   2 - reserved
70950 + *   3 - reserved
70951 + *
70952 + *   4 - unused                        <==== new cacheline
70953 + *   5 - unused
70954 + *
70955 + *  ------- start of TLS (Thread-Local Storage) segments:
70956 + *
70957 + *   6 - TLS segment #1                        [ glibc's TLS segment ]
70958 + *   7 - TLS segment #2                        [ Wine's %fs Win32 segment ]
70959 + *   8 - TLS segment #3
70960 + *   9 - reserved
70961 + *  10 - reserved
70962 + *  11 - reserved
70963 + *
70964 + *  ------- start of kernel segments:
70965 + *
70966 + *  12 - kernel code segment           <==== new cacheline
70967 + *  13 - kernel data segment
70968 + *  14 - default user CS
70969 + *  15 - default user DS
70970 + *  16 - TSS
70971 + *  17 - LDT
70972 + *  18 - PNPBIOS support (16->32 gate)
70973 + *  19 - PNPBIOS support
70974 + *  20 - PNPBIOS support
70975 + *  21 - PNPBIOS support
70976 + *  22 - PNPBIOS support
70977 + *  23 - APM BIOS support
70978 + *  24 - APM BIOS support
70979 + *  25 - APM BIOS support 
70980 + *
70981 + *  26 - ESPFIX small SS
70982 + *  27 - unused
70983 + *  28 - unused
70984 + *  29 - unused
70985 + *  30 - unused
70986 + *  31 - TSS for double fault handler
70987 + */
70988 +#define GDT_ENTRY_TLS_ENTRIES  3
70989 +#define GDT_ENTRY_TLS_MIN      6
70990 +#define GDT_ENTRY_TLS_MAX      (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
70991 +
70992 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
70993 +
70994 +#define GDT_ENTRY_DEFAULT_USER_CS      14
70995 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
70996 +
70997 +#define GDT_ENTRY_DEFAULT_USER_DS      15
70998 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
70999 +
71000 +#define GDT_ENTRY_KERNEL_BASE  12
71001 +
71002 +#define GDT_ENTRY_KERNEL_CS            (GDT_ENTRY_KERNEL_BASE + 0)
71003 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
71004 +#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
71005 +
71006 +#define GDT_ENTRY_KERNEL_DS            (GDT_ENTRY_KERNEL_BASE + 1)
71007 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
71008 +#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
71009 +
71010 +#define GDT_ENTRY_TSS                  (GDT_ENTRY_KERNEL_BASE + 4)
71011 +#define GDT_ENTRY_LDT                  (GDT_ENTRY_KERNEL_BASE + 5)
71012 +
71013 +#define GDT_ENTRY_PNPBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 6)
71014 +#define GDT_ENTRY_APMBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 11)
71015 +
71016 +#define GDT_ENTRY_ESPFIX_SS            (GDT_ENTRY_KERNEL_BASE + 14)
71017 +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
71018 +
71019 +#define GDT_ENTRY_DOUBLEFAULT_TSS      31
71020 +
71021 +/*
71022 + * The GDT has 32 entries
71023 + */
71024 +#define GDT_ENTRIES 32
71025 +
71026 +#define GDT_SIZE (GDT_ENTRIES * 8)
71027 +
71028 +/* Simple and small GDT entries for booting only */
71029 +
71030 +#define GDT_ENTRY_BOOT_CS              2
71031 +#define __BOOT_CS      (GDT_ENTRY_BOOT_CS * 8)
71032 +
71033 +#define GDT_ENTRY_BOOT_DS              (GDT_ENTRY_BOOT_CS + 1)
71034 +#define __BOOT_DS      (GDT_ENTRY_BOOT_DS * 8)
71035 +
71036 +/* The PnP BIOS entries in the GDT */
71037 +#define GDT_ENTRY_PNPBIOS_CS32         (GDT_ENTRY_PNPBIOS_BASE + 0)
71038 +#define GDT_ENTRY_PNPBIOS_CS16         (GDT_ENTRY_PNPBIOS_BASE + 1)
71039 +#define GDT_ENTRY_PNPBIOS_DS           (GDT_ENTRY_PNPBIOS_BASE + 2)
71040 +#define GDT_ENTRY_PNPBIOS_TS1          (GDT_ENTRY_PNPBIOS_BASE + 3)
71041 +#define GDT_ENTRY_PNPBIOS_TS2          (GDT_ENTRY_PNPBIOS_BASE + 4)
71042 +
71043 +/* The PnP BIOS selectors */
71044 +#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8)        /* segment for calling fn */
71045 +#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8)        /* code segment for BIOS */
71046 +#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)  /* data segment for BIOS */
71047 +#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
71048 +#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
71049 +
71050 +/*
71051 + * The interrupt descriptor table has room for 256 idt's,
71052 + * the global descriptor table is dependent on the number
71053 + * of tasks we can have..
71054 + */
71055 +#define IDT_ENTRIES 256
71056 +
71057 +#endif
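
As a quick sanity check of the selector arithmetic encoded above (illustrative user-space C, not part of the patch): a selector is (index << 3) | TI | RPL, and GET_KERNEL_CS() ORs in RPL 1 because a Xen guest kernel normally runs in ring 1 unless the hypervisor advertises XENFEAT_supervisor_mode_kernel.

#include <stdio.h>

#define SEL(index, rpl)        (((index) << 3) | (rpl))

int main(void)
{
        printf("__KERNEL_CS     = %#x\n", SEL(12, 0));      /* 0x60 */
        printf("__KERNEL_DS     = %#x\n", SEL(13, 0));      /* 0x68 */
        printf("__USER_CS       = %#x\n", SEL(14, 3));      /* 0x73 */
        printf("__USER_DS       = %#x\n", SEL(15, 3));      /* 0x7b */
        /* ring-1 kernel on Xen without supervisor_mode_kernel: */
        printf("GET_KERNEL_CS() = %#x\n", SEL(12, 0) | 1);  /* 0x61 */
        return 0;
}
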
71058 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/setup.h new/include/asm-i386/mach-xen/asm/setup.h
71059 --- linux-2.6/include/asm-i386/mach-xen/asm/setup.h     1970-01-01 01:00:00.000000000 +0100
71060 +++ new/include/asm-i386/mach-xen/asm/setup.h   2006-05-09 12:35:17.000000000 +0200
71061 @@ -0,0 +1,64 @@
71062 +/*
71063 + *     Just a placeholder. We don't want to have to test x86 before
71064 + *     we include stuff.
71065 + */
71066 +
71067 +#ifndef _i386_SETUP_H
71068 +#define _i386_SETUP_H
71069 +
71070 +#include <linux/pfn.h>
71071 +
71072 +/*
71073 + * Reserved space for vmalloc and iomap - defined in asm/page.h
71074 + */
71075 +#define MAXMEM_PFN     PFN_DOWN(MAXMEM)
71076 +#define MAX_NONPAE_PFN (1 << 20)
71077 +
71078 +#define PARAM_SIZE 4096
71079 +#define COMMAND_LINE_SIZE 256
71080 +
71081 +#define OLD_CL_MAGIC_ADDR      0x90020
71082 +#define OLD_CL_MAGIC           0xA33F
71083 +#define OLD_CL_BASE_ADDR       0x90000
71084 +#define OLD_CL_OFFSET          0x90022
71085 +#define NEW_CL_POINTER         0x228   /* Relative to real mode data */
71086 +
71087 +#ifndef __ASSEMBLY__
71088 +/*
71089 + * This is set up by the setup-routine at boot-time
71090 + */
71091 +extern unsigned char boot_params[PARAM_SIZE];
71092 +
71093 +#define PARAM  (boot_params)
71094 +#define SCREEN_INFO (*(struct screen_info *) (PARAM+0))
71095 +#define EXT_MEM_K (*(unsigned short *) (PARAM+2))
71096 +#define ALT_MEM_K (*(unsigned long *) (PARAM+0x1e0))
71097 +#define E820_MAP_NR (*(char*) (PARAM+E820NR))
71098 +#define E820_MAP    ((struct e820entry *) (PARAM+E820MAP))
71099 +#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40))
71100 +#define IST_INFO   (*(struct ist_info *) (PARAM+0x60))
71101 +#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80))
71102 +#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0))
71103 +#define EFI_SYSTAB ((efi_system_table_t *) *((unsigned long *)(PARAM+0x1c4)))
71104 +#define EFI_MEMDESC_SIZE (*((unsigned long *) (PARAM+0x1c8)))
71105 +#define EFI_MEMDESC_VERSION (*((unsigned long *) (PARAM+0x1cc)))
71106 +#define EFI_MEMMAP ((void *) *((unsigned long *)(PARAM+0x1d0)))
71107 +#define EFI_MEMMAP_SIZE (*((unsigned long *) (PARAM+0x1d4)))
71108 +#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
71109 +#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
71110 +#define VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
71111 +#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
71112 +#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
71113 +#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
71114 +#define KERNEL_START (*(unsigned long *) (PARAM+0x214))
71115 +#define INITRD_START (__pa(xen_start_info->mod_start))
71116 +#define INITRD_SIZE (xen_start_info->mod_len)
71117 +#define EDID_INFO   (*(struct edid_info *) (PARAM+0x440))
71118 +#define EDD_NR     (*(unsigned char *) (PARAM+EDDNR))
71119 +#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
71120 +#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF))
71121 +#define EDD_BUF     ((struct edd_info *) (PARAM+EDDBUF))
71122 +
71123 +#endif /* __ASSEMBLY__ */
71124 +
71125 +#endif /* _i386_SETUP_H */
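
A hedged sketch of how early boot code uses these accessors: under Xen the initrd location comes from xen_start_info (via the INITRD_START/INITRD_SIZE definitions above) rather than the real-mode data page. example_locate_initrd() is a hypothetical helper condensed from the usual arch/i386 setup logic; initrd_start/initrd_end are the standard kernel globals.

#include <linux/init.h>
#include <linux/initrd.h>      /* initrd_start, initrd_end */
#include <asm/page.h>          /* PAGE_OFFSET */
#include <asm/setup.h>

static void __init example_locate_initrd(void)
{
        /* LOADER_TYPE is zero when no boot-loader information is present */
        if (!LOADER_TYPE || !INITRD_START)
                return;

        /* INITRD_START is a physical address; turn it into a virtual one */
        initrd_start = INITRD_START + PAGE_OFFSET;
        initrd_end   = initrd_start + INITRD_SIZE;
}
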
71126 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/smp.h new/include/asm-i386/mach-xen/asm/smp.h
71127 --- linux-2.6/include/asm-i386/mach-xen/asm/smp.h       1970-01-01 01:00:00.000000000 +0100
71128 +++ new/include/asm-i386/mach-xen/asm/smp.h     2006-05-09 12:35:17.000000000 +0200
71129 @@ -0,0 +1,104 @@
71130 +#ifndef __ASM_SMP_H
71131 +#define __ASM_SMP_H
71132 +
71133 +/*
71134 + * We need the APIC definitions automatically as part of 'smp.h'
71135 + */
71136 +#ifndef __ASSEMBLY__
71137 +#include <linux/config.h>
71138 +#include <linux/kernel.h>
71139 +#include <linux/threads.h>
71140 +#include <linux/cpumask.h>
71141 +#endif
71142 +
71143 +#ifdef CONFIG_X86_LOCAL_APIC
71144 +#ifndef __ASSEMBLY__
71145 +#include <asm/fixmap.h>
71146 +#include <asm/bitops.h>
71147 +#include <asm/mpspec.h>
71148 +#ifdef CONFIG_X86_IO_APIC
71149 +#include <asm/io_apic.h>
71150 +#endif
71151 +#include <asm/apic.h>
71152 +#endif
71153 +#endif
71154 +
71155 +#define BAD_APICID 0xFFu
71156 +#ifdef CONFIG_SMP
71157 +#ifndef __ASSEMBLY__
71158 +
71159 +/*
71160 + * Private routines/data
71161 + */
71162 +
71163 +extern void smp_alloc_memory(void);
71164 +extern int pic_mode;
71165 +extern int smp_num_siblings;
71166 +extern cpumask_t cpu_sibling_map[];
71167 +extern cpumask_t cpu_core_map[];
71168 +
71169 +extern void (*mtrr_hook) (void);
71170 +extern void zap_low_mappings (void);
71171 +extern void lock_ipi_call_lock(void);
71172 +extern void unlock_ipi_call_lock(void);
71173 +
71174 +#define MAX_APICID 256
71175 +extern u8 x86_cpu_to_apicid[];
71176 +
71177 +#define cpu_physical_id(cpu)   x86_cpu_to_apicid[cpu]
71178 +
71179 +#ifdef CONFIG_HOTPLUG_CPU
71180 +extern void cpu_exit_clear(void);
71181 +extern void cpu_uninit(void);
71182 +#endif
71183 +
71184 +/*
71185 + * This function is needed by all SMP systems. It must _always_ be valid
71186 + * from the initial startup. We map APIC_BASE very early in page_setup(),
71187 + * so this is correct in the x86 case.
71188 + */
71189 +#define raw_smp_processor_id() (current_thread_info()->cpu)
71190 +
71191 +extern cpumask_t cpu_possible_map;
71192 +#define cpu_callin_map cpu_possible_map
71193 +
71194 +/* We don't mark CPUs online until __cpu_up(), so we need another measure */
71195 +static inline int num_booting_cpus(void)
71196 +{
71197 +       return cpus_weight(cpu_possible_map);
71198 +}
71199 +
71200 +#ifdef CONFIG_X86_LOCAL_APIC
71201 +
71202 +#ifdef APIC_DEFINITION
71203 +extern int hard_smp_processor_id(void);
71204 +#else
71205 +#include <mach_apicdef.h>
71206 +static inline int hard_smp_processor_id(void)
71207 +{
71208 +       /* we don't want to mark this access volatile - bad code generation */
71209 +       return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
71210 +}
71211 +#endif
71212 +
71213 +static __inline int logical_smp_processor_id(void)
71214 +{
71215 +       /* we don't want to mark this access volatile - bad code generation */
71216 +       return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
71217 +}
71218 +
71219 +#endif
71220 +
71221 +extern int __cpu_disable(void);
71222 +extern void __cpu_die(unsigned int cpu);
71223 +extern void prefill_possible_map(void);
71224 +#endif /* !__ASSEMBLY__ */
71225 +
71226 +#else /* CONFIG_SMP */
71227 +
71228 +#define cpu_physical_id(cpu)           boot_cpu_physical_apicid
71229 +
71230 +#define NO_PROC_ID             0xFF            /* No processor magic marker */
71231 +
71232 +#endif
71233 +#endif
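
Illustrative sketch (not part of the patch) of the helpers declared above in use; example_report_cpus() is hypothetical, and for_each_cpu_mask() is the 2.6.16-era cpumask iterator.

#include <linux/kernel.h>
#include <linux/cpumask.h>
#include <asm/smp.h>

static void example_report_cpus(void)
{
        int cpu;

        printk(KERN_INFO "cpu%d: %d cpus are booting\n",
               raw_smp_processor_id(), num_booting_cpus());

        /* cpu_possible_map doubles as cpu_callin_map in this header */
        for_each_cpu_mask(cpu, cpu_possible_map)
                printk(KERN_INFO "  cpu%d -> apicid %u\n",
                       cpu, cpu_physical_id(cpu));
}
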
71234 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/spinlock.h new/include/asm-i386/mach-xen/asm/spinlock.h
71235 --- linux-2.6/include/asm-i386/mach-xen/asm/spinlock.h  1970-01-01 01:00:00.000000000 +0100
71236 +++ new/include/asm-i386/mach-xen/asm/spinlock.h        2006-05-09 12:35:17.000000000 +0200
71237 @@ -0,0 +1,200 @@
71238 +#ifndef __ASM_SPINLOCK_H
71239 +#define __ASM_SPINLOCK_H
71240 +
71241 +#include <asm/atomic.h>
71242 +#include <asm/rwlock.h>
71243 +#include <asm/page.h>
71244 +#include <linux/config.h>
71245 +#include <linux/compiler.h>
71246 +
71247 +/*
71248 + * Your basic SMP spinlocks, allowing only a single CPU anywhere
71249 + *
71250 + * Simple spin lock operations.  There are two variants, one clears IRQ's
71251 + * on the local processor, one does not.
71252 + *
71253 + * We make no fairness assumptions. They have a cost.
71254 + *
71255 + * (the type definitions are in asm/spinlock_types.h)
71256 + */
71257 +
71258 +#define __raw_spin_is_locked(x) \
71259 +               (*(volatile signed char *)(&(x)->slock) <= 0)
71260 +
71261 +#define __raw_spin_lock_string \
71262 +       "\n1:\t" \
71263 +       "lock ; decb %0\n\t" \
71264 +       "jns 3f\n" \
71265 +       "2:\t" \
71266 +       "rep;nop\n\t" \
71267 +       "cmpb $0,%0\n\t" \
71268 +       "jle 2b\n\t" \
71269 +       "jmp 1b\n" \
71270 +       "3:\n\t"
71271 +
71272 +#define __raw_spin_lock_string_flags \
71273 +       "\n1:\t" \
71274 +       "lock ; decb %0\n\t" \
71275 +       "jns 5f\n" \
71276 +       "2:\t" \
71277 +       "testl $0x200, %1\n\t" \
71278 +       "jz 4f\n\t" \
71279 +       "#sti\n" \
71280 +       "3:\t" \
71281 +       "rep;nop\n\t" \
71282 +       "cmpb $0, %0\n\t" \
71283 +       "jle 3b\n\t" \
71284 +       "#cli\n\t" \
71285 +       "jmp 1b\n" \
71286 +       "4:\t" \
71287 +       "rep;nop\n\t" \
71288 +       "cmpb $0, %0\n\t" \
71289 +       "jg 1b\n\t" \
71290 +       "jmp 4b\n" \
71291 +       "5:\n\t"
71292 +
71293 +#define __raw_spin_lock_string_up \
71294 +       "\n\tdecb %0"
71295 +
71296 +static inline void __raw_spin_lock(raw_spinlock_t *lock)
71297 +{
71298 +       alternative_smp(
71299 +               __raw_spin_lock_string,
71300 +               __raw_spin_lock_string_up,
71301 +               "=m" (lock->slock) : : "memory");
71302 +}
71303 +
71304 +static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
71305 +{
71306 +       alternative_smp(
71307 +               __raw_spin_lock_string_flags,
71308 +               __raw_spin_lock_string_up,
71309 +               "=m" (lock->slock) : "r" (flags) : "memory");
71310 +}
71311 +
71312 +static inline int __raw_spin_trylock(raw_spinlock_t *lock)
71313 +{
71314 +       char oldval;
71315 +       __asm__ __volatile__(
71316 +               "xchgb %b0,%1"
71317 +               :"=q" (oldval), "=m" (lock->slock)
71318 +               :"0" (0) : "memory");
71319 +       return oldval > 0;
71320 +}
71321 +
71322 +/*
71323 + * __raw_spin_unlock based on writing $1 to the low byte.
71324 + * This method works. Despite all the confusion.
71325 + * (except on PPro SMP or if we are using OOSTORE, so we use xchgb there)
71326 + * (PPro errata 66, 92)
71327 + */
71328 +
71329 +#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
71330 +
71331 +#define __raw_spin_unlock_string \
71332 +       "movb $1,%0" \
71333 +               :"=m" (lock->slock) : : "memory"
71334 +
71335 +
71336 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
71337 +{
71338 +       __asm__ __volatile__(
71339 +               __raw_spin_unlock_string
71340 +       );
71341 +}
71342 +
71343 +#else
71344 +
71345 +#define __raw_spin_unlock_string \
71346 +       "xchgb %b0, %1" \
71347 +               :"=q" (oldval), "=m" (lock->slock) \
71348 +               :"0" (oldval) : "memory"
71349 +
71350 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
71351 +{
71352 +       char oldval = 1;
71353 +
71354 +       __asm__ __volatile__(
71355 +               __raw_spin_unlock_string
71356 +       );
71357 +}
71358 +
71359 +#endif
71360 +
71361 +#define __raw_spin_unlock_wait(lock) \
71362 +       do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
71363 +
71364 +/*
71365 + * Read-write spinlocks, allowing multiple readers
71366 + * but only one writer.
71367 + *
71368 + * NOTE! it is quite common to have readers in interrupts
71369 + * but no interrupt writers. For those circumstances we
71370 + * can "mix" irq-safe locks - any writer needs to get a
71371 + * irq-safe write-lock, but readers can get non-irqsafe
71372 + * read-locks.
71373 + *
71374 + * On x86, we implement read-write locks as a 32-bit counter
71375 + * with the high bit (sign) being the "contended" bit.
71376 + *
71377 + * The inline assembly is non-obvious. Think about it.
71378 + *
71379 + * Changed to use the same technique as rw semaphores.  See
71380 + * semaphore.h for details.  -ben
71381 + *
71382 + * the helpers are in arch/i386/kernel/semaphore.c
71383 + */
71384 +
71385 +/**
71386 + * read_can_lock - would read_trylock() succeed?
71387 + * @lock: the rwlock in question.
71388 + */
71389 +#define __raw_read_can_lock(x)         ((int)(x)->lock > 0)
71390 +
71391 +/**
71392 + * write_can_lock - would write_trylock() succeed?
71393 + * @lock: the rwlock in question.
71394 + */
71395 +#define __raw_write_can_lock(x)                ((x)->lock == RW_LOCK_BIAS)
71396 +
71397 +static inline void __raw_read_lock(raw_rwlock_t *rw)
71398 +{
71399 +       __build_read_lock(rw, "__read_lock_failed");
71400 +}
71401 +
71402 +static inline void __raw_write_lock(raw_rwlock_t *rw)
71403 +{
71404 +       __build_write_lock(rw, "__write_lock_failed");
71405 +}
71406 +
71407 +static inline int __raw_read_trylock(raw_rwlock_t *lock)
71408 +{
71409 +       atomic_t *count = (atomic_t *)lock;
71410 +       atomic_dec(count);
71411 +       if (atomic_read(count) >= 0)
71412 +               return 1;
71413 +       atomic_inc(count);
71414 +       return 0;
71415 +}
71416 +
71417 +static inline int __raw_write_trylock(raw_rwlock_t *lock)
71418 +{
71419 +       atomic_t *count = (atomic_t *)lock;
71420 +       if (atomic_sub_and_test(RW_LOCK_BIAS, count))
71421 +               return 1;
71422 +       atomic_add(RW_LOCK_BIAS, count);
71423 +       return 0;
71424 +}
71425 +
71426 +static inline void __raw_read_unlock(raw_rwlock_t *rw)
71427 +{
71428 +       asm volatile(LOCK_PREFIX "incl %0" :"=m" (rw->lock) : : "memory");
71429 +}
71430 +
71431 +static inline void __raw_write_unlock(raw_rwlock_t *rw)
71432 +{
71433 +       asm volatile(LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ", %0"
71434 +                                : "=m" (rw->lock) : : "memory");
71435 +}
71436 +
71437 +#endif /* __ASM_SPINLOCK_H */
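
For readers decoding the inline assembly above, a hedged C-level restatement of the protocol: the lock byte holds 1 when free, lock;decb claims it, any value <= 0 means the lock is held, and unlock stores 1 again. The hypothetical helper below spins on __raw_spin_trylock() with cpu_relax(), which corresponds to the rep;nop polite-wait in the string templates.

#include <linux/spinlock.h>
#include <asm/processor.h>     /* cpu_relax() */

static void example_lock_poll(raw_spinlock_t *lock)
{
        /* try the xchgb-based fast path; back off with rep;nop while held */
        while (!__raw_spin_trylock(lock))
                cpu_relax();

        /* ... critical section ... */

        __raw_spin_unlock(lock);        /* movb $1 (or xchgb on PPro/OOSTORE) */
}
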
71438 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/swiotlb.h new/include/asm-i386/mach-xen/asm/swiotlb.h
71439 --- linux-2.6/include/asm-i386/mach-xen/asm/swiotlb.h   1970-01-01 01:00:00.000000000 +0100
71440 +++ new/include/asm-i386/mach-xen/asm/swiotlb.h 2006-05-09 12:35:17.000000000 +0200
71441 @@ -0,0 +1,43 @@
71442 +#ifndef _ASM_SWIOTLB_H
71443 +#define _ASM_SWIOTLB_H 1
71444 +
71445 +#include <linux/config.h>
71446 +
71447 +/* SWIOTLB interface */
71448 +
71449 +extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
71450 +                                     int dir);
71451 +extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
71452 +                                 size_t size, int dir);
71453 +extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
71454 +                                        dma_addr_t dev_addr,
71455 +                                        size_t size, int dir);
71456 +extern void swiotlb_sync_single_for_device(struct device *hwdev,
71457 +                                           dma_addr_t dev_addr,
71458 +                                           size_t size, int dir);
71459 +extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
71460 +                                    struct scatterlist *sg, int nelems,
71461 +                                    int dir);
71462 +extern void swiotlb_sync_sg_for_device(struct device *hwdev,
71463 +                                       struct scatterlist *sg, int nelems,
71464 +                                       int dir);
71465 +extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
71466 +                     int nents, int direction);
71467 +extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
71468 +                        int nents, int direction);
71469 +extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
71470 +extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
71471 +                                   unsigned long offset, size_t size,
71472 +                                   enum dma_data_direction direction);
71473 +extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
71474 +                               size_t size, enum dma_data_direction direction);
71475 +extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
71476 +extern void swiotlb_init(void);
71477 +
71478 +#ifdef CONFIG_SWIOTLB
71479 +extern int swiotlb;
71480 +#else
71481 +#define swiotlb 0
71482 +#endif
71483 +
71484 +#endif
71485 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/synch_bitops.h new/include/asm-i386/mach-xen/asm/synch_bitops.h
71486 --- linux-2.6/include/asm-i386/mach-xen/asm/synch_bitops.h      1970-01-01 01:00:00.000000000 +0100
71487 +++ new/include/asm-i386/mach-xen/asm/synch_bitops.h    2006-07-07 15:10:03.000000000 +0200
71488 @@ -0,0 +1,143 @@
71489 +#ifndef __XEN_SYNCH_BITOPS_H__
71490 +#define __XEN_SYNCH_BITOPS_H__
71491 +
71492 +/*
71493 + * Copyright 1992, Linus Torvalds.
71494 + * Heavily modified to provide guaranteed strong synchronisation
71495 + * when communicating with Xen or other guest OSes running on other CPUs.
71496 + */
71497 +
71498 +#include <linux/config.h>
71499 +
71500 +#define ADDR (*(volatile long *) addr)
71501 +
71502 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
71503 +{
71504 +    __asm__ __volatile__ ( 
71505 +        "lock btsl %1,%0"
71506 +        : "+m" (ADDR) : "Ir" (nr) : "memory" );
71507 +}
71508 +
71509 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
71510 +{
71511 +    __asm__ __volatile__ (
71512 +        "lock btrl %1,%0"
71513 +        : "+m" (ADDR) : "Ir" (nr) : "memory" );
71514 +}
71515 +
71516 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
71517 +{
71518 +    __asm__ __volatile__ (
71519 +        "lock btcl %1,%0"
71520 +        : "+m" (ADDR) : "Ir" (nr) : "memory" );
71521 +}
71522 +
71523 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
71524 +{
71525 +    int oldbit;
71526 +    __asm__ __volatile__ (
71527 +        "lock btsl %2,%1\n\tsbbl %0,%0"
71528 +        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
71529 +    return oldbit;
71530 +}
71531 +
71532 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
71533 +{
71534 +    int oldbit;
71535 +    __asm__ __volatile__ (
71536 +        "lock btrl %2,%1\n\tsbbl %0,%0"
71537 +        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
71538 +    return oldbit;
71539 +}
71540 +
71541 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
71542 +{
71543 +    int oldbit;
71544 +
71545 +    __asm__ __volatile__ (
71546 +        "lock btcl %2,%1\n\tsbbl %0,%0"
71547 +        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
71548 +    return oldbit;
71549 +}
71550 +
71551 +struct __synch_xchg_dummy { unsigned long a[100]; };
71552 +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
71553 +
71554 +#define synch_cmpxchg(ptr, old, new) \
71555 +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
71556 +                                     (unsigned long)(old), \
71557 +                                     (unsigned long)(new), \
71558 +                                     sizeof(*(ptr))))
71559 +
71560 +static inline unsigned long __synch_cmpxchg(volatile void *ptr,
71561 +                                           unsigned long old,
71562 +                                           unsigned long new, int size)
71563 +{
71564 +       unsigned long prev;
71565 +       switch (size) {
71566 +       case 1:
71567 +               __asm__ __volatile__("lock; cmpxchgb %b1,%2"
71568 +                                    : "=a"(prev)
71569 +                                    : "q"(new), "m"(*__synch_xg(ptr)),
71570 +                                      "0"(old)
71571 +                                    : "memory");
71572 +               return prev;
71573 +       case 2:
71574 +               __asm__ __volatile__("lock; cmpxchgw %w1,%2"
71575 +                                    : "=a"(prev)
71576 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
71577 +                                      "0"(old)
71578 +                                    : "memory");
71579 +               return prev;
71580 +#ifdef CONFIG_X86_64
71581 +       case 4:
71582 +               __asm__ __volatile__("lock; cmpxchgl %k1,%2"
71583 +                                    : "=a"(prev)
71584 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
71585 +                                      "0"(old)
71586 +                                    : "memory");
71587 +               return prev;
71588 +       case 8:
71589 +               __asm__ __volatile__("lock; cmpxchgq %1,%2"
71590 +                                    : "=a"(prev)
71591 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
71592 +                                      "0"(old)
71593 +                                    : "memory");
71594 +               return prev;
71595 +#else
71596 +       case 4:
71597 +               __asm__ __volatile__("lock; cmpxchgl %1,%2"
71598 +                                    : "=a"(prev)
71599 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
71600 +                                      "0"(old)
71601 +                                    : "memory");
71602 +               return prev;
71603 +#endif
71604 +       }
71605 +       return old;
71606 +}
71607 +
71608 +static __always_inline int synch_const_test_bit(int nr,
71609 +                                               const volatile void * addr)
71610 +{
71611 +    return ((1UL << (nr & 31)) & 
71612 +            (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
71613 +}
71614 +
71615 +static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
71616 +{
71617 +    int oldbit;
71618 +    __asm__ __volatile__ (
71619 +        "btl %2,%1\n\tsbbl %0,%0"
71620 +        : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) );
71621 +    return oldbit;
71622 +}
71623 +
71624 +#define synch_test_bit(nr,addr) \
71625 +(__builtin_constant_p(nr) ? \
71626 + synch_const_test_bit((nr),(addr)) : \
71627 + synch_var_test_bit((nr),(addr)))
71628 +
71629 +#define synch_cmpxchg_subword synch_cmpxchg
71630 +
71631 +#endif /* __XEN_SYNCH_BITOPS_H__ */
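
A hedged sketch of why these locked bitops exist: event-channel code must do read-modify-write on bitmaps that Xen updates from other physical CPUs, so plain non-atomic bitops are not enough. Field names follow the public Xen shared_info layout; example_poll_events() is hypothetical and greatly simplified (the real upcall also consults evtchn_mask and the per-vcpu selector word).

#include <asm/synch_bitops.h>
#include <asm/hypervisor.h>            /* HYPERVISOR_shared_info */

static void example_poll_events(void)
{
        shared_info_t *s = HYPERVISOR_shared_info;
        int port;

        for (port = 0; port < 64; port++) {
                /* locked btr: safe against Xen setting the bit concurrently */
                if (synch_test_and_clear_bit(port, &s->evtchn_pending[0]))
                        ; /* dispatch the handler bound to this event channel */
        }
}
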
71632 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/system.h new/include/asm-i386/mach-xen/asm/system.h
71633 --- linux-2.6/include/asm-i386/mach-xen/asm/system.h    1970-01-01 01:00:00.000000000 +0100
71634 +++ new/include/asm-i386/mach-xen/asm/system.h  2006-06-07 13:15:16.000000000 +0200
71635 @@ -0,0 +1,578 @@
71636 +#ifndef __ASM_SYSTEM_H
71637 +#define __ASM_SYSTEM_H
71638 +
71639 +#include <linux/config.h>
71640 +#include <linux/kernel.h>
71641 +#include <asm/segment.h>
71642 +#include <asm/cpufeature.h>
71643 +#include <linux/bitops.h> /* for LOCK_PREFIX */
71644 +#include <asm/synch_bitops.h>
71645 +#include <asm/hypervisor.h>
71646 +
71647 +#ifdef __KERNEL__
71648 +
71649 +#ifdef CONFIG_SMP
71650 +#define __vcpu_id smp_processor_id()
71651 +#else
71652 +#define __vcpu_id 0
71653 +#endif
71654 +
71655 +struct task_struct;    /* one of the stranger aspects of C forward declarations.. */
71656 +extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
71657 +
71658 +#define switch_to(prev,next,last) do {                                 \
71659 +       unsigned long esi,edi;                                          \
71660 +       asm volatile("pushl %%ebp\n\t"                                  \
71661 +                    "movl %%esp,%0\n\t"        /* save ESP */          \
71662 +                    "movl %5,%%esp\n\t"        /* restore ESP */       \
71663 +                    "movl $1f,%1\n\t"          /* save EIP */          \
71664 +                    "pushl %6\n\t"             /* restore EIP */       \
71665 +                    "jmp __switch_to\n"                                \
71666 +                    "1:\t"                                             \
71667 +                    "popl %%ebp\n\t"                                   \
71668 +                    :"=m" (prev->thread.esp),"=m" (prev->thread.eip),  \
71669 +                     "=a" (last),"=S" (esi),"=D" (edi)                 \
71670 +                    :"m" (next->thread.esp),"m" (next->thread.eip),    \
71671 +                     "2" (prev), "d" (next));                          \
71672 +} while (0)
71673 +
71674 +#define _set_base(addr,base) do { unsigned long __pr; \
71675 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
71676 +       "rorl $16,%%edx\n\t" \
71677 +       "movb %%dl,%2\n\t" \
71678 +       "movb %%dh,%3" \
71679 +       :"=&d" (__pr) \
71680 +       :"m" (*((addr)+2)), \
71681 +        "m" (*((addr)+4)), \
71682 +        "m" (*((addr)+7)), \
71683 +         "0" (base) \
71684 +        ); } while(0)
71685 +
71686 +#define _set_limit(addr,limit) do { unsigned long __lr; \
71687 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
71688 +       "rorl $16,%%edx\n\t" \
71689 +       "movb %2,%%dh\n\t" \
71690 +       "andb $0xf0,%%dh\n\t" \
71691 +       "orb %%dh,%%dl\n\t" \
71692 +       "movb %%dl,%2" \
71693 +       :"=&d" (__lr) \
71694 +       :"m" (*(addr)), \
71695 +        "m" (*((addr)+6)), \
71696 +        "0" (limit) \
71697 +        ); } while(0)
71698 +
71699 +#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
71700 +#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
71701 +
71702 +/*
71703 + * Load a segment. Fall back on loading the zero
71704 + * segment if something goes wrong..
71705 + */
71706 +#define loadsegment(seg,value)                 \
71707 +       asm volatile("\n"                       \
71708 +               "1:\t"                          \
71709 +               "mov %0,%%" #seg "\n"           \
71710 +               "2:\n"                          \
71711 +               ".section .fixup,\"ax\"\n"      \
71712 +               "3:\t"                          \
71713 +               "pushl $0\n\t"                  \
71714 +               "popl %%" #seg "\n\t"           \
71715 +               "jmp 2b\n"                      \
71716 +               ".previous\n"                   \
71717 +               ".section __ex_table,\"a\"\n\t" \
71718 +               ".align 4\n\t"                  \
71719 +               ".long 1b,3b\n"                 \
71720 +               ".previous"                     \
71721 +               : :"rm" (value))
71722 +
71723 +/*
71724 + * Save a segment register away
71725 + */
71726 +#define savesegment(seg, value) \
71727 +       asm volatile("mov %%" #seg ",%0":"=rm" (value))
71728 +
71729 +/*
71730 + * Clear and set 'TS' bit respectively
71731 + */
71732 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
71733 +#define read_cr0() ({ \
71734 +       unsigned int __dummy; \
71735 +       __asm__ __volatile__( \
71736 +               "movl %%cr0,%0\n\t" \
71737 +               :"=r" (__dummy)); \
71738 +       __dummy; \
71739 +})
71740 +#define write_cr0(x) \
71741 +       __asm__ __volatile__("movl %0,%%cr0": :"r" (x));
71742 +
71743 +#define read_cr2() \
71744 +       (HYPERVISOR_shared_info->vcpu_info[smp_processor_id()].arch.cr2)
71745 +#define write_cr2(x) \
71746 +       __asm__ __volatile__("movl %0,%%cr2": :"r" (x));
71747 +
71748 +#define read_cr3() ({ \
71749 +       unsigned int __dummy; \
71750 +       __asm__ ( \
71751 +               "movl %%cr3,%0\n\t" \
71752 +               :"=r" (__dummy)); \
71753 +       __dummy = xen_cr3_to_pfn(__dummy); \
71754 +       mfn_to_pfn(__dummy) << PAGE_SHIFT; \
71755 +})
71756 +#define write_cr3(x) ({                                                \
71757 +       unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT);   \
71758 +       __dummy = xen_pfn_to_cr3(__dummy);                      \
71759 +       __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy));  \
71760 +})
71761 +
71762 +#define read_cr4() ({ \
71763 +       unsigned int __dummy; \
71764 +       __asm__( \
71765 +               "movl %%cr4,%0\n\t" \
71766 +               :"=r" (__dummy)); \
71767 +       __dummy; \
71768 +})
71769 +
71770 +#define read_cr4_safe() ({                           \
71771 +       unsigned int __dummy;                         \
71772 +       /* This could fault if %cr4 does not exist */ \
71773 +       __asm__("1: movl %%cr4, %0              \n"   \
71774 +               "2:                             \n"   \
71775 +               ".section __ex_table,\"a\"      \n"   \
71776 +               ".long 1b,2b                    \n"   \
71777 +               ".previous                      \n"   \
71778 +               : "=r" (__dummy): "0" (0));           \
71779 +       __dummy;                                      \
71780 +})
71781 +
71782 +#define write_cr4(x) \
71783 +       __asm__ __volatile__("movl %0,%%cr4": :"r" (x));
71784 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
71785 +
71786 +#endif /* __KERNEL__ */
71787 +
71788 +#define wbinvd() \
71789 +       __asm__ __volatile__ ("wbinvd": : :"memory");
71790 +
71791 +static inline unsigned long get_limit(unsigned long segment)
71792 +{
71793 +       unsigned long __limit;
71794 +       __asm__("lsll %1,%0"
71795 +               :"=r" (__limit):"r" (segment));
71796 +       return __limit+1;
71797 +}
71798 +
71799 +#define nop() __asm__ __volatile__ ("nop")
71800 +
71801 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
71802 +
71803 +#define tas(ptr) (xchg((ptr),1))
71804 +
71805 +struct __xchg_dummy { unsigned long a[100]; };
71806 +#define __xg(x) ((struct __xchg_dummy *)(x))
71807 +
71808 +
71809 +#ifdef CONFIG_X86_CMPXCHG64
71810 +
71811 +/*
71812 + * The semantics of CMPXCHG8B are a bit strange, which is why
71813 + * there is a loop and the loading of %%eax and %%edx has to
71814 + * be inside. This inlines well in most cases, the cached
71815 + * cost is around ~38 cycles. (in the future we might want
71816 + * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
71817 + * might have an implicit FPU-save as a cost, so it's not
71818 + * clear which path to go.)
71819 + *
71820 + * cmpxchg8b must be used with the lock prefix here to allow
71821 + * the instruction to be executed atomically, see page 3-102
71822 + * of the instruction set reference 24319102.pdf. We need
71823 + * the reader side to see the coherent 64bit value.
71824 + */
71825 +static inline void __set_64bit (unsigned long long * ptr,
71826 +               unsigned int low, unsigned int high)
71827 +{
71828 +       __asm__ __volatile__ (
71829 +               "\n1:\t"
71830 +               "movl (%0), %%eax\n\t"
71831 +               "movl 4(%0), %%edx\n\t"
71832 +               "lock cmpxchg8b (%0)\n\t"
71833 +               "jnz 1b"
71834 +               : /* no outputs */
71835 +               :       "D"(ptr),
71836 +                       "b"(low),
71837 +                       "c"(high)
71838 +               :       "ax","dx","memory");
71839 +}
71840 +
71841 +static inline void __set_64bit_constant (unsigned long long *ptr,
71842 +                                                unsigned long long value)
71843 +{
71844 +       __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
71845 +}
71846 +#define ll_low(x)      *(((unsigned int*)&(x))+0)
71847 +#define ll_high(x)     *(((unsigned int*)&(x))+1)
71848 +
71849 +static inline void __set_64bit_var (unsigned long long *ptr,
71850 +                        unsigned long long value)
71851 +{
71852 +       __set_64bit(ptr,ll_low(value), ll_high(value));
71853 +}
71854 +
71855 +#define set_64bit(ptr,value) \
71856 +(__builtin_constant_p(value) ? \
71857 + __set_64bit_constant(ptr, value) : \
71858 + __set_64bit_var(ptr, value) )
71859 +
71860 +#define _set_64bit(ptr,value) \
71861 +(__builtin_constant_p(value) ? \
71862 + __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
71863 + __set_64bit(ptr, ll_low(value), ll_high(value)) )
71864 +
71865 +#endif
71866 +
71867 +/*
71868 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
71869 + * Note 2: xchg has side effect, so that attribute volatile is necessary,
71870 + *       but generally the primitive is invalid, *ptr is output argument. --ANK
71871 + */
71872 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
71873 +{
71874 +       switch (size) {
71875 +               case 1:
71876 +                       __asm__ __volatile__("xchgb %b0,%1"
71877 +                               :"=q" (x)
71878 +                               :"m" (*__xg(ptr)), "0" (x)
71879 +                               :"memory");
71880 +                       break;
71881 +               case 2:
71882 +                       __asm__ __volatile__("xchgw %w0,%1"
71883 +                               :"=r" (x)
71884 +                               :"m" (*__xg(ptr)), "0" (x)
71885 +                               :"memory");
71886 +                       break;
71887 +               case 4:
71888 +                       __asm__ __volatile__("xchgl %0,%1"
71889 +                               :"=r" (x)
71890 +                               :"m" (*__xg(ptr)), "0" (x)
71891 +                               :"memory");
71892 +                       break;
71893 +       }
71894 +       return x;
71895 +}
71896 +
71897 +/*
71898 + * Atomic compare and exchange.  Compare OLD with MEM, if identical,
71899 + * store NEW in MEM.  Return the initial value in MEM.  Success is
71900 + * indicated by comparing RETURN with OLD.
71901 + */
71902 +
71903 +#ifdef CONFIG_X86_CMPXCHG
71904 +#define __HAVE_ARCH_CMPXCHG 1
71905 +#define cmpxchg(ptr,o,n)\
71906 +       ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
71907 +                                       (unsigned long)(n),sizeof(*(ptr))))
71908 +#endif
71909 +
71910 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
71911 +                                     unsigned long new, int size)
71912 +{
71913 +       unsigned long prev;
71914 +       switch (size) {
71915 +       case 1:
71916 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
71917 +                                    : "=a"(prev)
71918 +                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
71919 +                                    : "memory");
71920 +               return prev;
71921 +       case 2:
71922 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
71923 +                                    : "=a"(prev)
71924 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
71925 +                                    : "memory");
71926 +               return prev;
71927 +       case 4:
71928 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
71929 +                                    : "=a"(prev)
71930 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
71931 +                                    : "memory");
71932 +               return prev;
71933 +       }
71934 +       return old;
71935 +}
71936 +
71937 +#ifndef CONFIG_X86_CMPXCHG
71938 +/*
71939 + * Building a kernel capable of running on an 80386. It may be necessary to
71940 + * simulate the cmpxchg on the 80386 CPU. For that purpose we define
71941 + * a function for each of the sizes we support.
71942 + */
71943 +
71944 +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
71945 +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
71946 +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
71947 +
71948 +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
71949 +                                     unsigned long new, int size)
71950 +{
71951 +       switch (size) {
71952 +       case 1:
71953 +               return cmpxchg_386_u8(ptr, old, new);
71954 +       case 2:
71955 +               return cmpxchg_386_u16(ptr, old, new);
71956 +       case 4:
71957 +               return cmpxchg_386_u32(ptr, old, new);
71958 +       }
71959 +       return old;
71960 +}
71961 +
71962 +#define cmpxchg(ptr,o,n)                                               \
71963 +({                                                                     \
71964 +       __typeof__(*(ptr)) __ret;                                       \
71965 +       if (likely(boot_cpu_data.x86 > 3))                              \
71966 +               __ret = __cmpxchg((ptr), (unsigned long)(o),            \
71967 +                                       (unsigned long)(n), sizeof(*(ptr))); \
71968 +       else                                                            \
71969 +               __ret = cmpxchg_386((ptr), (unsigned long)(o),          \
71970 +                                       (unsigned long)(n), sizeof(*(ptr))); \
71971 +       __ret;                                                          \
71972 +})
71973 +#endif
71974 +
71975 +#ifdef CONFIG_X86_CMPXCHG64
71976 +
71977 +static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
71978 +                                     unsigned long long new)
71979 +{
71980 +       unsigned long long prev;
71981 +       __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
71982 +                            : "=A"(prev)
71983 +                            : "b"((unsigned long)new),
71984 +                              "c"((unsigned long)(new >> 32)),
71985 +                              "m"(*__xg(ptr)),
71986 +                              "0"(old)
71987 +                            : "memory");
71988 +       return prev;
71989 +}
71990 +
71991 +#define cmpxchg64(ptr,o,n)\
71992 +       ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
71993 +                                       (unsigned long long)(n)))
71994 +
71995 +#endif
71996 +    
71997 +/*
71998 + * Force strict CPU ordering.
71999 + * And yes, this is required on UP too when we're talking
72000 + * to devices.
72001 + *
72002 + * For now, "wmb()" doesn't actually do anything, as all
72003 + * Intel CPU's follow what Intel calls a *Processor Order*,
72004 + * in which all writes are seen in the program order even
72005 + * outside the CPU.
72006 + *
72007 + * I expect future Intel CPU's to have a weaker ordering,
72008 + * but I'd also expect them to finally get their act together
72009 + * and add some real memory barriers if so.
72010 + *
72011 + * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
72012 + * nop for these.
72013 + */
72014 +
72015 +
72016 +/* 
72017 + * Actually only lfence would be needed for mb() because all stores done 
72018 + * by the kernel should be already ordered. But keep a full barrier for now. 
72019 + */
72020 +
72021 +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
72022 +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
72023 +
72024 +/**
72025 + * read_barrier_depends - Flush all pending reads that subsequent reads
72026 + * depend on.
72027 + *
72028 + * No data-dependent reads from memory-like regions are ever reordered
72029 + * over this barrier.  All reads preceding this primitive are guaranteed
72030 + * to access memory (but not necessarily other CPUs' caches) before any
72031 + * reads following this primitive that depend on the data returned by
72032 + * any of the preceding reads.  This primitive is much lighter weight than
72033 + * rmb() on most CPUs, and is never heavier weight than is
72034 + * rmb().
72035 + *
72036 + * These ordering constraints are respected by both the local CPU
72037 + * and the compiler.
72038 + *
72039 + * Ordering is not guaranteed by anything other than these primitives,
72040 + * not even by data dependencies.  See the documentation for
72041 + * memory_barrier() for examples and URLs to more information.
72042 + *
72043 + * For example, the following code would force ordering (the initial
72044 + * value of "a" is zero, "b" is one, and "p" is "&a"):
72045 + *
72046 + * <programlisting>
72047 + *     CPU 0                           CPU 1
72048 + *
72049 + *     b = 2;
72050 + *     memory_barrier();
72051 + *     p = &b;                         q = p;
72052 + *                                     read_barrier_depends();
72053 + *                                     d = *q;
72054 + * </programlisting>
72055 + *
72056 + * because the read of "*q" depends on the read of "p" and these
72057 + * two reads are separated by a read_barrier_depends().  However,
72058 + * the following code, with the same initial values for "a" and "b":
72059 + *
72060 + * <programlisting>
72061 + *     CPU 0                           CPU 1
72062 + *
72063 + *     a = 2;
72064 + *     memory_barrier();
72065 + *     b = 3;                          y = b;
72066 + *                                     read_barrier_depends();
72067 + *                                     x = a;
72068 + * </programlisting>
72069 + *
72070 + * does not enforce ordering, since there is no data dependency between
72071 + * the read of "a" and the read of "b".  Therefore, on some CPUs, such
72072 + * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
72073 + * in cases like this where there are no data dependencies.
72074 + **/
72075 +
72076 +#define read_barrier_depends() do { } while(0)
72077 +
72078 +#ifdef CONFIG_X86_OOSTORE
72079 +/* Actually there are currently no OOO-store capable CPUs that do SSE,
72080 +   but make it a possibility already. */
72081 +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
72082 +#else
72083 +#define wmb()  __asm__ __volatile__ ("": : :"memory")
72084 +#endif
72085 +
72086 +#ifdef CONFIG_SMP
72087 +#define smp_mb()       mb()
72088 +#define smp_rmb()      rmb()
72089 +#define smp_wmb()      wmb()
72090 +#define smp_read_barrier_depends()     read_barrier_depends()
72091 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
72092 +#else
72093 +#define smp_mb()       barrier()
72094 +#define smp_rmb()      barrier()
72095 +#define smp_wmb()      barrier()
72096 +#define smp_read_barrier_depends()     do { } while(0)
72097 +#define set_mb(var, value) do { var = value; barrier(); } while (0)
72098 +#endif
72099 +
72100 +#define set_wmb(var, value) do { var = value; wmb(); } while (0)
72101 +
72102 +/* interrupt control.. */
72103 +
72104 +/* 
72105 + * The use of 'barrier' in the following reflects their use as local-lock
72106 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
72107 + * critical operations are executed. All critical operations must complete
72108 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
72109 + * includes these barriers, for example.
72110 + */
72111 +
72112 +#define __cli()                                                                \
72113 +do {                                                                   \
72114 +       vcpu_info_t *_vcpu;                                             \
72115 +       preempt_disable();                                              \
72116 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
72117 +       _vcpu->evtchn_upcall_mask = 1;                                  \
72118 +       preempt_enable_no_resched();                                    \
72119 +       barrier();                                                      \
72120 +} while (0)
72121 +
72122 +#define __sti()                                                                \
72123 +do {                                                                   \
72124 +       vcpu_info_t *_vcpu;                                             \
72125 +       barrier();                                                      \
72126 +       preempt_disable();                                              \
72127 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
72128 +       _vcpu->evtchn_upcall_mask = 0;                                  \
72129 +       barrier(); /* unmask then check (avoid races) */                \
72130 +       if (unlikely(_vcpu->evtchn_upcall_pending))                     \
72131 +               force_evtchn_callback();                                \
72132 +       preempt_enable();                                               \
72133 +} while (0)
72134 +
72135 +#define __save_flags(x)                                                        \
72136 +do {                                                                   \
72137 +       vcpu_info_t *_vcpu;                                             \
72138 +       preempt_disable();                                              \
72139 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
72140 +       (x) = _vcpu->evtchn_upcall_mask;                                \
72141 +       preempt_enable();                                               \
72142 +} while (0)
72143 +
72144 +#define __restore_flags(x)                                             \
72145 +do {                                                                   \
72146 +       vcpu_info_t *_vcpu;                                             \
72147 +       barrier();                                                      \
72148 +       preempt_disable();                                              \
72149 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
72150 +       if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {                   \
72151 +               barrier(); /* unmask then check (avoid races) */        \
72152 +               if (unlikely(_vcpu->evtchn_upcall_pending))             \
72153 +                       force_evtchn_callback();                        \
72154 +               preempt_enable();                                       \
72155 +       } else                                                          \
72156 +               preempt_enable_no_resched();                            \
72157 +} while (0)
72158 +
72159 +void safe_halt(void);
72160 +void halt(void);
72161 +
72162 +#define __save_and_cli(x)                                              \
72163 +do {                                                                   \
72164 +       vcpu_info_t *_vcpu;                                             \
72165 +       preempt_disable();                                              \
72166 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
72167 +       (x) = _vcpu->evtchn_upcall_mask;                                \
72168 +       _vcpu->evtchn_upcall_mask = 1;                                  \
72169 +       preempt_enable_no_resched();                                    \
72170 +       barrier();                                                      \
72171 +} while (0)
72172 +
72173 +#define local_irq_save(x)      __save_and_cli(x)
72174 +#define local_irq_restore(x)   __restore_flags(x)
72175 +#define local_save_flags(x)    __save_flags(x)
72176 +#define local_irq_disable()    __cli()
72177 +#define local_irq_enable()     __sti()
72178 +
72179 +/* Cannot use preempt_enable() here as we would recurse in preempt_sched(). */
72180 +#define irqs_disabled()                                                        \
72181 +({     int ___x;                                                       \
72182 +       vcpu_info_t *_vcpu;                                             \
72183 +       preempt_disable();                                              \
72184 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
72185 +       ___x = (_vcpu->evtchn_upcall_mask != 0);                        \
72186 +       preempt_enable_no_resched();                                    \
72187 +       ___x; })
72188 +
72189 +/*
72190 + * disable hlt during certain critical i/o operations
72191 + */
72192 +#define HAVE_DISABLE_HLT
72193 +void disable_hlt(void);
72194 +void enable_hlt(void);
72195 +
72196 +extern int es7000_plat;
72197 +void cpu_idle_wait(void);
72198 +
72199 +/*
72200 + * On SMP systems, when the scheduler does migration-cost autodetection,
72201 + * it needs a way to flush as much of the CPU's caches as possible:
72202 + */
72203 +static inline void sched_cacheflush(void)
72204 +{
72205 +       wbinvd();
72206 +}
72207 +
72208 +extern unsigned long arch_align_stack(unsigned long sp);
72209 +extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
72210 +
72211 +void default_idle(void);
72212 +
72213 +#endif
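
Worth spelling out (illustrative only): with the definitions above, local_irq_save()/local_irq_restore() never execute cli/sti. They toggle the per-vcpu evtchn_upcall_mask in shared memory, and restoring an unmasked state may have to call force_evtchn_callback() to pick up upcalls that arrived while masked. The usual driver pattern is unchanged; example_protected_update() is a hypothetical caller.

#include <asm/system.h>

static unsigned long example_protected_update(unsigned long *counter)
{
        unsigned long flags, old;

        local_irq_save(flags);          /* set evtchn_upcall_mask, save old value */
        old = *counter;
        *counter = old + 1;             /* must not be interrupted by an upcall */
        local_irq_restore(flags);       /* may invoke force_evtchn_callback() */

        return old;
}
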
72214 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/tlbflush.h new/include/asm-i386/mach-xen/asm/tlbflush.h
72215 --- linux-2.6/include/asm-i386/mach-xen/asm/tlbflush.h  1970-01-01 01:00:00.000000000 +0100
72216 +++ new/include/asm-i386/mach-xen/asm/tlbflush.h        2006-05-09 12:35:17.000000000 +0200
72217 @@ -0,0 +1,102 @@
72218 +#ifndef _I386_TLBFLUSH_H
72219 +#define _I386_TLBFLUSH_H
72220 +
72221 +#include <linux/config.h>
72222 +#include <linux/mm.h>
72223 +#include <asm/processor.h>
72224 +
72225 +#define __flush_tlb() xen_tlb_flush()
72226 +#define __flush_tlb_global() xen_tlb_flush()
72227 +#define __flush_tlb_all() xen_tlb_flush()
72228 +
72229 +extern unsigned long pgkern_mask;
72230 +
72231 +#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
72232 +
72233 +#define __flush_tlb_single(addr) xen_invlpg(addr)
72234 +
72235 +#define __flush_tlb_one(addr) __flush_tlb_single(addr)
72236 +
72237 +/*
72238 + * TLB flushing:
72239 + *
72240 + *  - flush_tlb() flushes the current mm struct TLBs
72241 + *  - flush_tlb_all() flushes all processes TLBs
72242 + *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
72243 + *  - flush_tlb_page(vma, vmaddr) flushes one page
72244 + *  - flush_tlb_range(vma, start, end) flushes a range of pages
72245 + *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
72246 + *  - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
72247 + *
72248 + * ..but the i386 has somewhat limited tlb flushing capabilities,
72249 + * and page-granular flushes are available only on i486 and up.
72250 + */
72251 +
72252 +#ifndef CONFIG_SMP
72253 +
72254 +#define flush_tlb() __flush_tlb()
72255 +#define flush_tlb_all() __flush_tlb_all()
72256 +#define local_flush_tlb() __flush_tlb()
72257 +
72258 +static inline void flush_tlb_mm(struct mm_struct *mm)
72259 +{
72260 +       if (mm == current->active_mm)
72261 +               __flush_tlb();
72262 +}
72263 +
72264 +static inline void flush_tlb_page(struct vm_area_struct *vma,
72265 +       unsigned long addr)
72266 +{
72267 +       if (vma->vm_mm == current->active_mm)
72268 +               __flush_tlb_one(addr);
72269 +}
72270 +
72271 +static inline void flush_tlb_range(struct vm_area_struct *vma,
72272 +       unsigned long start, unsigned long end)
72273 +{
72274 +       if (vma->vm_mm == current->active_mm)
72275 +               __flush_tlb();
72276 +}
72277 +
72278 +#else
72279 +
72280 +#include <asm/smp.h>
72281 +
72282 +#define local_flush_tlb() \
72283 +       __flush_tlb()
72284 +
72285 +extern void flush_tlb_all(void);
72286 +extern void flush_tlb_current_task(void);
72287 +extern void flush_tlb_mm(struct mm_struct *);
72288 +extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
72289 +
72290 +#define flush_tlb()    flush_tlb_current_task()
72291 +
72292 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
72293 +{
72294 +       flush_tlb_mm(vma->vm_mm);
72295 +}
72296 +
72297 +#define TLBSTATE_OK    1
72298 +#define TLBSTATE_LAZY  2
72299 +
72300 +struct tlb_state
72301 +{
72302 +       struct mm_struct *active_mm;
72303 +       int state;
72304 +       char __cacheline_padding[L1_CACHE_BYTES-8];
72305 +};
72306 +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
72307 +
72308 +
72309 +#endif
72310 +
72311 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
72312 +
72313 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
72314 +                                     unsigned long start, unsigned long end)
72315 +{
72316 +       /* i386 does not keep any page table caches in TLB */
72317 +}
72318 +
72319 +#endif /* _I386_TLBFLUSH_H */
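
A short, hedged illustration of the hooks above: generic mm code calls flush_tlb_page() after changing a PTE, and on this Xen port the flush ultimately becomes a hypercall (xen_invlpg()/xen_tlb_flush()) rather than a raw invlpg. example_update_pte() is hypothetical.

#include <linux/mm.h>
#include <asm/tlbflush.h>

static void example_update_pte(struct vm_area_struct *vma, unsigned long addr,
                               pte_t *ptep, pte_t newpte)
{
        set_pte(ptep, newpte);          /* install the new translation */
        flush_tlb_page(vma, addr);      /* drop the stale entry; hypercall on Xen */
}
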
72320 diff -urNp linux-2.6/include/asm-i386/mach-xen/asm/vga.h new/include/asm-i386/mach-xen/asm/vga.h
72321 --- linux-2.6/include/asm-i386/mach-xen/asm/vga.h       1970-01-01 01:00:00.000000000 +0100
72322 +++ new/include/asm-i386/mach-xen/asm/vga.h     2006-05-09 12:35:17.000000000 +0200
72323 @@ -0,0 +1,20 @@
72324 +/*
72325 + *     Access to VGA videoram
72326 + *
72327 + *     (c) 1998 Martin Mares <mj@ucw.cz>
72328 + */
72329 +
72330 +#ifndef _LINUX_ASM_VGA_H_
72331 +#define _LINUX_ASM_VGA_H_
72332 +
72333 +/*
72334 + *     On the PC, we can just recalculate addresses and then
72335 + *     access the videoram directly without any black magic.
72336 + */
72337 +
72338 +#define VGA_MAP_MEM(x) (unsigned long)isa_bus_to_virt(x)
72339 +
72340 +#define vga_readb(x) (*(x))
72341 +#define vga_writeb(x,y) (*(y) = (x))
72342 +
72343 +#endif
72344 diff -urNp linux-2.6/include/asm-i386/mach-xen/irq_vectors.h new/include/asm-i386/mach-xen/irq_vectors.h
72345 --- linux-2.6/include/asm-i386/mach-xen/irq_vectors.h   1970-01-01 01:00:00.000000000 +0100
72346 +++ new/include/asm-i386/mach-xen/irq_vectors.h 2006-05-09 12:35:17.000000000 +0200
72347 @@ -0,0 +1,125 @@
72348 +/*
72349 + * This file should contain #defines for all of the interrupt vector
72350 + * numbers used by this architecture.
72351 + *
72352 + * In addition, there are some standard defines:
72353 + *
72354 + *     FIRST_EXTERNAL_VECTOR:
72355 + *             The first free place for external interrupts
72356 + *
72357 + *     SYSCALL_VECTOR:
72358 + *             The IRQ vector a syscall makes the user to kernel transition
72359 + *             The IRQ vector through which a syscall makes the user-to-kernel
72360 + *             transition.
72361 + *     TIMER_IRQ:
72362 + *             The IRQ number the timer interrupt comes in at.
72363 + *
72364 + *     NR_IRQS:
72365 + *             The total number of interrupt vectors (including all the
72366 + *             architecture specific interrupts) needed.
72367 + *
72368 + */                    
72369 +#ifndef _ASM_IRQ_VECTORS_H
72370 +#define _ASM_IRQ_VECTORS_H
72371 +
72372 +/*
72373 + * IDT vectors usable for external interrupt sources start
72374 + * at 0x20:
72375 + */
72376 +#define FIRST_EXTERNAL_VECTOR  0x20
72377 +
72378 +#define SYSCALL_VECTOR         0x80
72379 +
72380 +/*
72381 + * Vectors 0x20-0x2f are used for ISA interrupts.
72382 + */
72383 +
72384 +#if 0
72385 +/*
72386 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
72387 + *
72388 + *  some of the following vectors are 'rare', they are merged
72389 + *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
72390 + *  TLB, reschedule and local APIC vectors are performance-critical.
72391 + *
72392 + *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
72393 + */
72394 +#define SPURIOUS_APIC_VECTOR   0xff
72395 +#define ERROR_APIC_VECTOR      0xfe
72396 +#define INVALIDATE_TLB_VECTOR  0xfd
72397 +#define RESCHEDULE_VECTOR      0xfc
72398 +#define CALL_FUNCTION_VECTOR   0xfb
72399 +
72400 +#define THERMAL_APIC_VECTOR    0xf0
72401 +/*
72402 + * Local APIC timer IRQ vector is on a different priority level,
72403 + * to work around the 'lost local interrupt if more than 2 IRQ
72404 + * sources per level' errata.
72405 + */
72406 +#define LOCAL_TIMER_VECTOR     0xef
72407 +#endif
72408 +
72409 +#define SPURIOUS_APIC_VECTOR   0xff
72410 +#define ERROR_APIC_VECTOR      0xfe
72411 +
72412 +/*
72413 + * First APIC vector available to drivers: (vectors 0x30-0xee)
72414 + * we start at 0x31 to spread out vectors evenly between priority
72415 + * levels. (0x80 is the syscall vector)
72416 + */
72417 +#define FIRST_DEVICE_VECTOR    0x31
72418 +#define FIRST_SYSTEM_VECTOR    0xef
72419 +
72420 +/*
72421 + * 16 8259A IRQs, 208 potential APIC interrupt sources.
72422 + * Right now the APIC is mostly only used for SMP.
72423 + * 256 vectors is an architectural limit. (we can have
72424 + * more than 256 devices theoretically, but they will
72425 + * have to use shared interrupts)
72426 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
72427 + * the usable vector space is 0x20-0xff (224 vectors)
72428 + */
72429 +
72430 +#define RESCHEDULE_VECTOR      0
72431 +#define CALL_FUNCTION_VECTOR   1
72432 +#define NR_IPIS                        2
72433 +
72434 +/*
72435 + * The maximum number of vectors supported by i386 processors
72436 + * is limited to 256. For processors other than i386, NR_VECTORS
72437 + * should be changed accordingly.
72438 + */
72439 +#define NR_VECTORS 256
72440 +
72441 +#define FPU_IRQ                        13
72442 +
72443 +#define        FIRST_VM86_IRQ          3
72444 +#define LAST_VM86_IRQ          15
72445 +#define invalid_vm86_irq(irq)  ((irq) < 3 || (irq) > 15)
72446 +
72447 +/*
72448 + * The flat IRQ space is divided into two regions:
72449 + *  1. A one-to-one mapping of real physical IRQs. This space is only used
72450 + *     if we have physical device-access privilege. This region is at the 
72451 + *     start of the IRQ space so that existing device drivers do not need
72452 + *     to be modified to translate physical IRQ numbers into our IRQ space.
72453 + *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
72454 + *     are bound using the provided bind/unbind functions.
72455 + */
72456 +
72457 +#define PIRQ_BASE              0
72458 +#define NR_PIRQS               256
72459 +
72460 +#define DYNIRQ_BASE            (PIRQ_BASE + NR_PIRQS)
72461 +#define NR_DYNIRQS             256
72462 +
72463 +#define NR_IRQS                        (NR_PIRQS + NR_DYNIRQS)
72464 +#define NR_IRQ_VECTORS         NR_IRQS
72465 +
72466 +#define pirq_to_irq(_x)                ((_x) + PIRQ_BASE)
72467 +#define irq_to_pirq(_x)                ((_x) - PIRQ_BASE)
72468 +
72469 +#define dynirq_to_irq(_x)      ((_x) + DYNIRQ_BASE)
72470 +#define irq_to_dynirq(_x)      ((_x) - DYNIRQ_BASE)
72471 +
72472 +#endif /* _ASM_IRQ_VECTORS_H */
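The flat IRQ space defined above is simply two back-to-back blocks of 256 numbers: physical IRQs (PIRQs) first, dynamically bound event-channel IRQs (DYNIRQs) after them. A small standalone sketch of the round-trip behaviour of the conversion macros, with the constants copied from the header:

#include <assert.h>

#define PIRQ_BASE		0
#define NR_PIRQS		256
#define DYNIRQ_BASE		(PIRQ_BASE + NR_PIRQS)
#define NR_DYNIRQS		256
#define NR_IRQS			(NR_PIRQS + NR_DYNIRQS)

#define pirq_to_irq(_x)		((_x) + PIRQ_BASE)
#define irq_to_pirq(_x)		((_x) - PIRQ_BASE)
#define dynirq_to_irq(_x)	((_x) + DYNIRQ_BASE)
#define irq_to_dynirq(_x)	((_x) - DYNIRQ_BASE)

int main(void)
{
	/* Physical IRQ 14 (IDE) keeps its familiar number. */
	assert(pirq_to_irq(14) == 14);

	/* The first dynamically bound IRQ lands just above the PIRQ block. */
	assert(dynirq_to_irq(0) == 256);
	assert(irq_to_dynirq(dynirq_to_irq(0)) == 0);

	/* Both blocks together fill the flat IRQ space. */
	assert(NR_IRQS == 512);
	return 0;
}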
72473 diff -urNp linux-2.6/include/asm-i386/mach-xen/mach_traps.h new/include/asm-i386/mach-xen/mach_traps.h
72474 --- linux-2.6/include/asm-i386/mach-xen/mach_traps.h    1970-01-01 01:00:00.000000000 +0100
72475 +++ new/include/asm-i386/mach-xen/mach_traps.h  2006-05-09 12:35:17.000000000 +0200
72476 @@ -0,0 +1,33 @@
72477 +/*
72478 + *  include/asm-xen/asm-i386/mach-xen/mach_traps.h
72479 + *
72480 + *  Machine specific NMI handling for Xen
72481 + */
72482 +#ifndef _MACH_TRAPS_H
72483 +#define _MACH_TRAPS_H
72484 +
72485 +#include <linux/bitops.h>
72486 +#include <xen/interface/nmi.h>
72487 +
72488 +static inline void clear_mem_error(unsigned char reason) {}
72489 +static inline void clear_io_check_error(unsigned char reason) {}
72490 +
72491 +static inline unsigned char get_nmi_reason(void)
72492 +{
72493 +       shared_info_t *s = HYPERVISOR_shared_info;
72494 +       unsigned char reason = 0;
72495 +
72496 +       /* construct a value which looks like it came from
72497 +        * port 0x61.
72498 +        */
72499 +       if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
72500 +               reason |= 0x40;
72501 +       if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
72502 +               reason |= 0x80;
72503 +
72504 +        return reason;
72505 +}
72506 +
72507 +static inline void reassert_nmi(void) {}
72508 +
72509 +#endif /* !_MACH_TRAPS_H */
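get_nmi_reason() above fakes up the value a native kernel would read from port 0x61: bit 6 (0x40) reports an I/O check error, bit 7 (0x80) a memory parity error. A minimal sketch of how an NMI-path caller might decode that byte (the function names here are illustrative, not taken from the kernel):

#include <stdio.h>

/* Decode a port-0x61-style NMI reason byte as produced by get_nmi_reason(). */
static void decode_nmi_reason(unsigned char reason)
{
	if (reason & 0x80)
		printf("NMI: memory parity error\n");
	if (reason & 0x40)
		printf("NMI: I/O check error\n");
	if (!(reason & 0xc0))
		printf("NMI: unknown reason\n");
}

int main(void)
{
	decode_nmi_reason(0x40);	/* as set for _XEN_NMIREASON_io_error */
	decode_nmi_reason(0x80);	/* as set for _XEN_NMIREASON_parity_error */
	return 0;
}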
72510 diff -urNp linux-2.6/include/asm-i386/mach-xen/setup_arch_post.h new/include/asm-i386/mach-xen/setup_arch_post.h
72511 --- linux-2.6/include/asm-i386/mach-xen/setup_arch_post.h       1970-01-01 01:00:00.000000000 +0100
72512 +++ new/include/asm-i386/mach-xen/setup_arch_post.h     2006-06-28 14:32:14.000000000 +0200
72513 @@ -0,0 +1,101 @@
72514 +/**
72515 + * machine_specific_memory_setup - Hook for machine specific memory setup.
72516 + *
72517 + * Description:
72518 + *     This is included late in kernel/setup.c so that it can make
72519 + *     use of all of the static functions.
72520 + **/
72521 +
72522 +#include <xen/interface/callback.h>
72523 +#include <xen/interface/memory.h>
72524 +
72525 +static char * __init machine_specific_memory_setup(void)
72526 +{
72527 +       int rc;
72528 +       struct xen_memory_map memmap;
72529 +       /*
72530 +        * This is rather large for a stack variable but this early in
72531 +        * the boot process we know we have plenty of slack space.
72532 +        */
72533 +       struct e820entry map[E820MAX];
72534 +
72535 +       memmap.nr_entries = E820MAX;
72536 +       set_xen_guest_handle(memmap.buffer, map);
72537 +
72538 +       rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
72539 +       if ( rc == -ENOSYS ) {
72540 +               memmap.nr_entries = 1;
72541 +               map[0].addr = 0ULL;
72542 +               map[0].size = PFN_PHYS(xen_start_info->nr_pages);
72543 +               /* 8MB slack (to balance backend allocations). */
72544 +               map[0].size += 8ULL << 20;
72545 +               map[0].type = E820_RAM;
72546 +               rc = 0;
72547 +       }
72548 +       BUG_ON(rc);
72549 +
72550 +       sanitize_e820_map(map, (char *)&memmap.nr_entries);
72551 +
72552 +       BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
72553 +
72554 +       return "Xen";
72555 +}
72556 +
72557 +extern void hypervisor_callback(void);
72558 +extern void failsafe_callback(void);
72559 +extern void nmi(void);
72560 +
72561 +unsigned long *machine_to_phys_mapping;
72562 +EXPORT_SYMBOL(machine_to_phys_mapping);
72563 +unsigned int machine_to_phys_order;
72564 +EXPORT_SYMBOL(machine_to_phys_order);
72565 +
72566 +static void __init machine_specific_arch_setup(void)
72567 +{
72568 +       int ret;
72569 +       struct xen_machphys_mapping mapping;
72570 +       unsigned long machine_to_phys_nr_ents;
72571 +       struct xen_platform_parameters pp;
72572 +       struct callback_register event = {
72573 +               .type = CALLBACKTYPE_event,
72574 +               .address = { __KERNEL_CS, (unsigned long)hypervisor_callback },
72575 +       };
72576 +       struct callback_register failsafe = {
72577 +               .type = CALLBACKTYPE_failsafe,
72578 +               .address = { __KERNEL_CS, (unsigned long)failsafe_callback },
72579 +       };
72580 +       struct callback_register nmi_cb = {
72581 +               .type = CALLBACKTYPE_nmi,
72582 +               .address = { __KERNEL_CS, (unsigned long)nmi },
72583 +       };
72584 +
72585 +       ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
72586 +       if (ret == 0)
72587 +               ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
72588 +       if (ret == -ENOSYS)
72589 +               ret = HYPERVISOR_set_callbacks(
72590 +                       event.address.cs, event.address.eip,
72591 +                       failsafe.address.cs, failsafe.address.eip);
72592 +       BUG_ON(ret);
72593 +
72594 +       ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
72595 +       if (ret == -ENOSYS) {
72596 +               struct xennmi_callback cb;
72597 +
72598 +               cb.handler_address = nmi_cb.address.eip;
72599 +               HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
72600 +       }
72601 +
72602 +       if (HYPERVISOR_xen_version(XENVER_platform_parameters,
72603 +                                  &pp) == 0)
72604 +               set_fixaddr_top(pp.virt_start - PAGE_SIZE);
72605 +
72606 +       machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START;
72607 +       machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
72608 +       if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
72609 +               machine_to_phys_mapping = (unsigned long *)mapping.v_start;
72610 +               machine_to_phys_nr_ents = mapping.max_mfn + 1;
72611 +       }
72612 +       while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
72613 +               machine_to_phys_order++;
72614 +}
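The loop at the end of machine_specific_arch_setup() computes machine_to_phys_order as the smallest power-of-two exponent covering machine_to_phys_nr_ents, i.e. ceil(log2(nr_ents)). A standalone sketch of that calculation:

#include <assert.h>

/* Smallest order such that (1UL << order) >= nr_ents, as in the loop above. */
static unsigned int order_for(unsigned long nr_ents)
{
	unsigned int order = 0;

	while ((1UL << order) < nr_ents)
		order++;
	return order;
}

int main(void)
{
	assert(order_for(1) == 0);
	assert(order_for(2) == 1);
	assert(order_for(3) == 2);
	/* e.g. 4GB of RAM with 4kB pages: 1M machine frames -> order 20 */
	assert(order_for(1UL << 20) == 20);
	return 0;
}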
72615 diff -urNp linux-2.6/include/asm-i386/mach-xen/setup_arch_pre.h new/include/asm-i386/mach-xen/setup_arch_pre.h
72616 --- linux-2.6/include/asm-i386/mach-xen/setup_arch_pre.h        1970-01-01 01:00:00.000000000 +0100
72617 +++ new/include/asm-i386/mach-xen/setup_arch_pre.h      2006-05-09 12:35:17.000000000 +0200
72618 @@ -0,0 +1,5 @@
72619 +/* Hook to call BIOS initialisation function */
72620 +
72621 +#define ARCH_SETUP machine_specific_arch_setup();
72622 +
72623 +static void __init machine_specific_arch_setup(void);
72624 diff -urNp linux-2.6/include/asm-i386/page.h new/include/asm-i386/page.h
72625 --- linux-2.6/include/asm-i386/page.h   2006-07-03 14:15:09.000000000 +0200
72626 +++ new/include/asm-i386/page.h 2006-05-09 12:35:17.000000000 +0200
72627 @@ -121,7 +121,7 @@ extern int page_is_ram(unsigned long pag
72628  
72629  #define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
72630  #define VMALLOC_RESERVE                ((unsigned long)__VMALLOC_RESERVE)
72631 -#define MAXMEM                 (-__PAGE_OFFSET-__VMALLOC_RESERVE)
72632 +#define MAXMEM                 (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
72633  #define __pa(x)                        ((unsigned long)(x)-PAGE_OFFSET)
72634  #define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
72635  #define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
72636 @@ -137,6 +137,8 @@ extern int page_is_ram(unsigned long pag
72637         ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
72638                  VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
72639  
72640 +#define __HAVE_ARCH_GATE_AREA 1
72641 +
72642  #endif /* __KERNEL__ */
72643  
72644  #include <asm-generic/memory_model.h>
72645 diff -urNp linux-2.6/include/asm-i386/pgtable-2level-defs.h new/include/asm-i386/pgtable-2level-defs.h
72646 --- linux-2.6/include/asm-i386/pgtable-2level-defs.h    2006-07-03 14:15:09.000000000 +0200
72647 +++ new/include/asm-i386/pgtable-2level-defs.h  2006-05-09 12:35:17.000000000 +0200
72648 @@ -1,6 +1,8 @@
72649  #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
72650  #define _I386_PGTABLE_2LEVEL_DEFS_H
72651  
72652 +#define HAVE_SHARED_KERNEL_PMD 0
72653 +
72654  /*
72655   * traditional i386 two-level paging structure:
72656   */
72657 diff -urNp linux-2.6/include/asm-i386/pgtable-3level-defs.h new/include/asm-i386/pgtable-3level-defs.h
72658 --- linux-2.6/include/asm-i386/pgtable-3level-defs.h    2006-07-03 14:15:09.000000000 +0200
72659 +++ new/include/asm-i386/pgtable-3level-defs.h  2006-05-09 12:35:18.000000000 +0200
72660 @@ -1,6 +1,8 @@
72661  #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
72662  #define _I386_PGTABLE_3LEVEL_DEFS_H
72663  
72664 +#define HAVE_SHARED_KERNEL_PMD 1
72665 +
72666  /*
72667   * PGDIR_SHIFT determines what a top-level page table entry can map
72668   */
72669 diff -urNp linux-2.6/include/asm-ia64/agp.h new/include/asm-ia64/agp.h
72670 --- linux-2.6/include/asm-ia64/agp.h    2006-07-03 14:15:09.000000000 +0200
72671 +++ new/include/asm-ia64/agp.h  2006-05-09 12:35:18.000000000 +0200
72672 @@ -19,13 +19,44 @@
72673  #define flush_agp_cache()              mb()
72674  
72675  /* Convert a physical address to an address suitable for the GART. */
72676 +#ifndef CONFIG_XEN_IA64_DOM0_VP
72677  #define phys_to_gart(x) (x)
72678  #define gart_to_phys(x) (x)
72679 +#else
72680 +#define phys_to_gart(x) phys_to_machine_for_dma(x)
72681 +#define gart_to_phys(x) machine_to_phys_for_dma(x)
72682 +#endif
72683  
72684  /* GATT allocation. Returns/accepts GATT kernel virtual address. */
72685 +#ifndef CONFIG_XEN_IA64_DOM0_VP
72686  #define alloc_gatt_pages(order)                \
72687         ((char *)__get_free_pages(GFP_KERNEL, (order)))
72688  #define free_gatt_pages(table, order)  \
72689         free_pages((unsigned long)(table), (order))
72690 +#else
72691 +#include <asm/hypervisor.h>
72692 +static inline char*
72693 +alloc_gatt_pages(unsigned int order)
72694 +{
72695 +       unsigned long error;
72696 +       unsigned long ret = __get_free_pages(GFP_KERNEL, (order));
72697 +       if (ret == 0) {
72698 +               goto out;
72699 +       }
72700 +       error = xen_create_contiguous_region(ret, order, 0);
72701 +       if (error) {
72702 +               free_pages(ret, order);
72703 +               ret = 0;
72704 +       }
72705 +out:
72706 +       return (char*)ret;
72707 +}
72708 +static inline void
72709 +free_gatt_pages(void* table, unsigned int order)
72710 +{
72711 +       xen_destroy_contiguous_region((unsigned long)table, order);
72712 +       free_pages((unsigned long)table, order);
72713 +}
72714 +#endif /* CONFIG_XEN_IA64_DOM0_VP */
72715  
72716  #endif /* _ASM_IA64_AGP_H */
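The DOM0_VP variant of alloc_gatt_pages() follows a common Xen pattern: allocate ordinary pages first, ask Xen to exchange them for machine-contiguous memory, and undo the allocation if that exchange fails. A hedged standalone sketch of the same allocate / make-contiguous / roll-back shape, with the page allocator and the Xen call stubbed out:

#include <stdlib.h>

/* Stand-ins for __get_free_pages() and xen_create_contiguous_region();
 * the real calls live in the kernel and the Xen hypervisor interface. */
static unsigned long fake_get_free_pages(unsigned int order)
{
	return (unsigned long)malloc(4096UL << order);
}
static void fake_free_pages(unsigned long addr, unsigned int order)
{
	(void)order;
	free((void *)addr);
}
static int fake_make_contiguous(unsigned long addr, unsigned int order)
{
	(void)addr; (void)order;
	return 0;			/* pretend Xen granted contiguous memory */
}

static char *alloc_contiguous(unsigned int order)
{
	unsigned long ret = fake_get_free_pages(order);

	if (ret == 0)
		return NULL;			/* allocation failed */
	if (fake_make_contiguous(ret, order)) {
		fake_free_pages(ret, order);	/* roll back on failure */
		return NULL;
	}
	return (char *)ret;
}

int main(void)
{
	char *gatt = alloc_contiguous(0);
	if (gatt)
		fake_free_pages((unsigned long)gatt, 0);
	return 0;
}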
72717 diff -urNp linux-2.6/include/asm-ia64/dma-mapping.h new/include/asm-ia64/dma-mapping.h
72718 --- linux-2.6/include/asm-ia64/dma-mapping.h    2006-07-03 14:15:09.000000000 +0200
72719 +++ new/include/asm-ia64/dma-mapping.h  2006-06-28 14:32:14.000000000 +0200
72720 @@ -7,7 +7,14 @@
72721   */
72722  #include <linux/config.h>
72723  #include <asm/machvec.h>
72724 +#ifdef CONFIG_XEN_IA64_DOM0_VP
72725 +/* Needed for arch/i386/kernel/swiotlb.c and arch/i386/kernel/pci-dma-xen.c */
72726 +#include <asm/hypervisor.h>
72727 +/* Needed for arch/i386/kernel/swiotlb.c */
72728 +#include <asm-i386/mach-xen/asm/swiotlb.h>
72729 +#endif
72730  
72731 +#ifndef CONFIG_XEN_IA64_DOM0_VP
72732  #define dma_alloc_coherent     platform_dma_alloc_coherent
72733  #define dma_alloc_noncoherent  platform_dma_alloc_coherent     /* coherent mem. is cheap */
72734  #define dma_free_coherent      platform_dma_free_coherent
72735 @@ -21,6 +28,46 @@
72736  #define dma_sync_single_for_device platform_dma_sync_single_for_device
72737  #define dma_sync_sg_for_device platform_dma_sync_sg_for_device
72738  #define dma_mapping_error      platform_dma_mapping_error
72739 +#else
72740 +int dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
72741 +               enum dma_data_direction direction);
72742 +void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
72743 +                  enum dma_data_direction direction);
72744 +int dma_supported(struct device *dev, u64 mask);
72745 +void *dma_alloc_coherent(struct device *dev, size_t size,
72746 +                         dma_addr_t *dma_handle, gfp_t gfp);
72747 +void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
72748 +                       dma_addr_t dma_handle);
72749 +dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
72750 +                          enum dma_data_direction direction);
72751 +void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
72752 +                      enum dma_data_direction direction);
72753 +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
72754 +                             size_t size, enum dma_data_direction direction);
72755 +void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
72756 +                                size_t size,
72757 +                                enum dma_data_direction direction);
72758 +int dma_mapping_error(dma_addr_t dma_addr);
72759 +
72760 +#define flush_write_buffers()  do { } while (0)
72761 +static inline void
72762 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
72763 +                    enum dma_data_direction direction)
72764 +{
72765 +       if (swiotlb)
72766 +               swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
72767 +       flush_write_buffers();
72768 +}
72769 +
72770 +static inline void
72771 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
72772 +                       enum dma_data_direction direction)
72773 +{
72774 +       if (swiotlb)
72775 +               swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
72776 +       flush_write_buffers();
72777 +}
72778 +#endif
72779  
72780  #define dma_map_page(dev, pg, off, size, dir)                          \
72781         dma_map_single(dev, page_address(pg) + (off), (size), (dir))
72782 @@ -62,4 +109,29 @@ dma_cache_sync (void *vaddr, size_t size
72783  
72784  #define dma_is_consistent(dma_handle)  (1)     /* all we do is coherent memory... */
72785  
72786 +#ifdef CONFIG_XEN_IA64_DOM0_VP
72787 +/* arch/i386/kernel/swiotlb.o requires */
72788 +void contiguous_bitmap_init(unsigned long end_pfn);
72789 +
72790 +static inline int
72791 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
72792 +{
72793 +       dma_addr_t mask = DMA_64BIT_MASK;
72794 +       /* If the device has a mask, use it, otherwise default to 64 bits */
72795 +       if (hwdev && hwdev->dma_mask)
72796 +               mask = *hwdev->dma_mask;
72797 +       return (addr & ~mask) != 0;
72798 +}
72799 +
72800 +static inline int
72801 +range_straddles_page_boundary(void *p, size_t size)
72802 +{
72803 +       extern unsigned long *contiguous_bitmap;
72804 +       return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE) &&
72805 +               !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap));
72806 +}
72807 +#else
72808 +#define contiguous_bitmap_init(end_pfn)        ((void)end_pfn)
72809 +#endif
72810 +
72811  #endif /* _ASM_IA64_DMA_MAPPING_H */
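range_straddles_page_boundary() above flags a buffer as needing bounce treatment when it crosses a page boundary and the pages behind it are not machine-contiguous (per contiguous_bitmap). The boundary test itself is just offset-within-page plus length. A small standalone sketch of that arithmetic (PAGE_SHIFT of 14, i.e. 16kB pages, is an assumed but common ia64 configuration):

#include <assert.h>

#define PAGE_SHIFT	14			/* assumed: 16kB ia64 pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Does [p, p+size) cross a page boundary? (contiguity check omitted) */
static int straddles(unsigned long p, unsigned long size)
{
	return ((p & ~PAGE_MASK) + size) > PAGE_SIZE;
}

int main(void)
{
	assert(!straddles(0x0000, 16));		/* well inside one page */
	assert(!straddles(PAGE_SIZE - 16, 16));	/* ends exactly on the boundary */
	assert(straddles(PAGE_SIZE - 16, 17));	/* spills into the next page */
	return 0;
}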
72812 diff -urNp linux-2.6/include/asm-ia64/fixmap.h new/include/asm-ia64/fixmap.h
72813 --- linux-2.6/include/asm-ia64/fixmap.h 1970-01-01 01:00:00.000000000 +0100
72814 +++ new/include/asm-ia64/fixmap.h       2006-05-09 12:35:18.000000000 +0200
72815 @@ -0,0 +1,2 @@
72816 +#define clear_fixmap(x)        do {} while (0)
72817 +#define        set_fixmap(x,y) do {} while (0)
72818 diff -urNp linux-2.6/include/asm-ia64/gcc_intrin.h new/include/asm-ia64/gcc_intrin.h
72819 --- linux-2.6/include/asm-ia64/gcc_intrin.h     2006-07-03 14:15:09.000000000 +0200
72820 +++ new/include/asm-ia64/gcc_intrin.h   2006-05-09 12:35:18.000000000 +0200
72821 @@ -26,7 +26,7 @@ extern void ia64_bad_param_for_getreg (v
72822  
72823  register unsigned long ia64_r13 asm ("r13") __attribute_used__;
72824  
72825 -#define ia64_setreg(regnum, val)                                               \
72826 +#define __ia64_setreg(regnum, val)                                             \
72827  ({                                                                             \
72828         switch (regnum) {                                                       \
72829             case _IA64_REG_PSR_L:                                               \
72830 @@ -55,7 +55,7 @@ register unsigned long ia64_r13 asm ("r1
72831         }                                                                       \
72832  })
72833  
72834 -#define ia64_getreg(regnum)                                                    \
72835 +#define __ia64_getreg(regnum)                                                  \
72836  ({                                                                             \
72837         __u64 ia64_intri_res;                                                   \
72838                                                                                 \
72839 @@ -92,7 +92,7 @@ register unsigned long ia64_r13 asm ("r1
72840  
72841  #define ia64_hint_pause 0
72842  
72843 -#define ia64_hint(mode)                                                \
72844 +#define __ia64_hint(mode)                                              \
72845  ({                                                             \
72846         switch (mode) {                                         \
72847         case ia64_hint_pause:                                   \
72848 @@ -374,7 +374,7 @@ register unsigned long ia64_r13 asm ("r1
72849  
72850  #define ia64_invala() asm volatile ("invala" ::: "memory")
72851  
72852 -#define ia64_thash(addr)                                                       \
72853 +#define __ia64_thash(addr)                                                     \
72854  ({                                                                             \
72855         __u64 ia64_intri_res;                                                   \
72856         asm volatile ("thash %0=%1" : "=r"(ia64_intri_res) : "r" (addr));       \
72857 @@ -394,18 +394,18 @@ register unsigned long ia64_r13 asm ("r1
72858  
72859  #define ia64_nop(x)    asm volatile ("nop %0"::"i"(x));
72860  
72861 -#define ia64_itci(addr)        asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
72862 +#define __ia64_itci(addr)      asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
72863  
72864 -#define ia64_itcd(addr)        asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
72865 +#define __ia64_itcd(addr)      asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
72866  
72867  
72868 -#define ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1"                                \
72869 +#define __ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1"                      \
72870                                              :: "r"(trnum), "r"(addr) : "memory")
72871  
72872 -#define ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1"                                \
72873 +#define __ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1"                      \
72874                                              :: "r"(trnum), "r"(addr) : "memory")
72875  
72876 -#define ia64_tpa(addr)                                                         \
72877 +#define __ia64_tpa(addr)                                                       \
72878  ({                                                                             \
72879         __u64 ia64_pa;                                                          \
72880         asm volatile ("tpa %0 = %1" : "=r"(ia64_pa) : "r"(addr) : "memory");    \
72881 @@ -415,22 +415,22 @@ register unsigned long ia64_r13 asm ("r1
72882  #define __ia64_set_dbr(index, val)                                             \
72883         asm volatile ("mov dbr[%0]=%1" :: "r"(index), "r"(val) : "memory")
72884  
72885 -#define ia64_set_ibr(index, val)                                               \
72886 +#define __ia64_set_ibr(index, val)                                             \
72887         asm volatile ("mov ibr[%0]=%1" :: "r"(index), "r"(val) : "memory")
72888  
72889 -#define ia64_set_pkr(index, val)                                               \
72890 +#define __ia64_set_pkr(index, val)                                             \
72891         asm volatile ("mov pkr[%0]=%1" :: "r"(index), "r"(val) : "memory")
72892  
72893 -#define ia64_set_pmc(index, val)                                               \
72894 +#define __ia64_set_pmc(index, val)                                             \
72895         asm volatile ("mov pmc[%0]=%1" :: "r"(index), "r"(val) : "memory")
72896  
72897 -#define ia64_set_pmd(index, val)                                               \
72898 +#define __ia64_set_pmd(index, val)                                             \
72899         asm volatile ("mov pmd[%0]=%1" :: "r"(index), "r"(val) : "memory")
72900  
72901 -#define ia64_set_rr(index, val)                                                        \
72902 +#define __ia64_set_rr(index, val)                                                      \
72903         asm volatile ("mov rr[%0]=%1" :: "r"(index), "r"(val) : "memory");
72904  
72905 -#define ia64_get_cpuid(index)                                                          \
72906 +#define __ia64_get_cpuid(index)                                                                \
72907  ({                                                                                     \
72908         __u64 ia64_intri_res;                                                           \
72909         asm volatile ("mov %0=cpuid[%r1]" : "=r"(ia64_intri_res) : "rO"(index));        \
72910 @@ -444,21 +444,21 @@ register unsigned long ia64_r13 asm ("r1
72911         ia64_intri_res;                                                         \
72912  })
72913  
72914 -#define ia64_get_ibr(index)                                                    \
72915 +#define __ia64_get_ibr(index)                                                  \
72916  ({                                                                             \
72917         __u64 ia64_intri_res;                                                   \
72918         asm volatile ("mov %0=ibr[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
72919         ia64_intri_res;                                                         \
72920  })
72921  
72922 -#define ia64_get_pkr(index)                                                    \
72923 +#define __ia64_get_pkr(index)                                                  \
72924  ({                                                                             \
72925         __u64 ia64_intri_res;                                                   \
72926         asm volatile ("mov %0=pkr[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
72927         ia64_intri_res;                                                         \
72928  })
72929  
72930 -#define ia64_get_pmc(index)                                                    \
72931 +#define __ia64_get_pmc(index)                                                  \
72932  ({                                                                             \
72933         __u64 ia64_intri_res;                                                   \
72934         asm volatile ("mov %0=pmc[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
72935 @@ -466,48 +466,48 @@ register unsigned long ia64_r13 asm ("r1
72936  })
72937  
72938  
72939 -#define ia64_get_pmd(index)                                                    \
72940 +#define __ia64_get_pmd(index)                                                  \
72941  ({                                                                             \
72942         __u64 ia64_intri_res;                                                   \
72943         asm volatile ("mov %0=pmd[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
72944         ia64_intri_res;                                                         \
72945  })
72946  
72947 -#define ia64_get_rr(index)                                                     \
72948 +#define __ia64_get_rr(index)                                                   \
72949  ({                                                                             \
72950         __u64 ia64_intri_res;                                                   \
72951         asm volatile ("mov %0=rr[%1]" : "=r"(ia64_intri_res) : "r" (index));    \
72952         ia64_intri_res;                                                         \
72953  })
72954  
72955 -#define ia64_fc(addr)  asm volatile ("fc %0" :: "r"(addr) : "memory")
72956 +#define __ia64_fc(addr)        asm volatile ("fc %0" :: "r"(addr) : "memory")
72957  
72958  
72959  #define ia64_sync_i()  asm volatile (";; sync.i" ::: "memory")
72960  
72961 -#define ia64_ssm(mask) asm volatile ("ssm %0":: "i"((mask)) : "memory")
72962 -#define ia64_rsm(mask) asm volatile ("rsm %0":: "i"((mask)) : "memory")
72963 +#define __ia64_ssm(mask)       asm volatile ("ssm %0":: "i"((mask)) : "memory")
72964 +#define __ia64_rsm(mask)       asm volatile ("rsm %0":: "i"((mask)) : "memory")
72965  #define ia64_sum(mask) asm volatile ("sum %0":: "i"((mask)) : "memory")
72966  #define ia64_rum(mask) asm volatile ("rum %0":: "i"((mask)) : "memory")
72967  
72968 -#define ia64_ptce(addr)        asm volatile ("ptc.e %0" :: "r"(addr))
72969 +#define __ia64_ptce(addr)      asm volatile ("ptc.e %0" :: "r"(addr))
72970  
72971 -#define ia64_ptcga(addr, size)                                                 \
72972 +#define __ia64_ptcga(addr, size)                                                       \
72973  do {                                                                           \
72974         asm volatile ("ptc.ga %0,%1" :: "r"(addr), "r"(size) : "memory");       \
72975         ia64_dv_serialize_data();                                               \
72976  } while (0)
72977  
72978 -#define ia64_ptcl(addr, size)                                                  \
72979 +#define __ia64_ptcl(addr, size)                                                        \
72980  do {                                                                           \
72981         asm volatile ("ptc.l %0,%1" :: "r"(addr), "r"(size) : "memory");        \
72982         ia64_dv_serialize_data();                                               \
72983  } while (0)
72984  
72985 -#define ia64_ptri(addr, size)                                          \
72986 +#define __ia64_ptri(addr, size)                                                \
72987         asm volatile ("ptr.i %0,%1" :: "r"(addr), "r"(size) : "memory")
72988  
72989 -#define ia64_ptrd(addr, size)                                          \
72990 +#define __ia64_ptrd(addr, size)                                                \
72991         asm volatile ("ptr.d %0,%1" :: "r"(addr), "r"(size) : "memory")
72992  
72993  /* Values for lfhint in ia64_lfetch and ia64_lfetch_fault */
72994 @@ -589,7 +589,7 @@ do {                                                                                \
72995          }                                                              \
72996  })
72997  
72998 -#define ia64_intrin_local_irq_restore(x)                       \
72999 +#define __ia64_intrin_local_irq_restore(x)                     \
73000  do {                                                           \
73001         asm volatile (";;   cmp.ne p6,p7=%0,r0;;"               \
73002                       "(p6) ssm psr.i;"                         \
73003 @@ -598,4 +598,6 @@ do {                                                                \
73004                       :: "r"((x)) : "p6", "p7", "memory");      \
73005  } while (0)
73006  
73007 +#define __ia64_get_psr_i()     (__ia64_getreg(_IA64_REG_PSR) & 0x4000UL)
73008 +
73009  #endif /* _ASM_IA64_GCC_INTRIN_H */
73010 diff -urNp linux-2.6/include/asm-ia64/hw_irq.h new/include/asm-ia64/hw_irq.h
73011 --- linux-2.6/include/asm-ia64/hw_irq.h 2006-07-03 14:15:09.000000000 +0200
73012 +++ new/include/asm-ia64/hw_irq.h       2006-06-28 14:32:14.000000000 +0200
73013 @@ -15,7 +15,11 @@
73014  #include <asm/ptrace.h>
73015  #include <asm/smp.h>
73016  
73017 +#ifndef CONFIG_XEN
73018  typedef u8 ia64_vector;
73019 +#else
73020 +typedef u16 ia64_vector;
73021 +#endif
73022  
73023  /*
73024   * 0 special
73025 @@ -89,6 +93,13 @@ extern void register_percpu_irq (ia64_ve
73026  static inline void
73027  hw_resend_irq (struct hw_interrupt_type *h, unsigned int vector)
73028  {
73029 +#ifdef CONFIG_XEN
73030 +       extern void resend_irq_on_evtchn(struct hw_interrupt_type *h,
73031 +                                        unsigned int i);
73032 +       if (is_running_on_xen())
73033 +               resend_irq_on_evtchn(h, vector);
73034 +       else
73035 +#endif /* CONFIG_XEN */
73036         platform_send_ipi(smp_processor_id(), vector, IA64_IPI_DM_INT, 0);
73037  }
73038  
73039 diff -urNp linux-2.6/include/asm-ia64/hypercall.h new/include/asm-ia64/hypercall.h
73040 --- linux-2.6/include/asm-ia64/hypercall.h      1970-01-01 01:00:00.000000000 +0100
73041 +++ new/include/asm-ia64/hypercall.h    2006-06-07 13:15:16.000000000 +0200
73042 @@ -0,0 +1,533 @@
73043 +/******************************************************************************
73044 + * hypercall.h
73045 + * 
73046 + * Linux-specific hypervisor handling.
73047 + * 
73048 + * Copyright (c) 2002-2004, K A Fraser
73049 + * 
73050 + * This program is free software; you can redistribute it and/or
73051 + * modify it under the terms of the GNU General Public License version 2
73052 + * as published by the Free Software Foundation; or, when distributed
73053 + * separately from the Linux kernel or incorporated into other
73054 + * software packages, subject to the following license:
73055 + * 
73056 + * Permission is hereby granted, free of charge, to any person obtaining a copy
73057 + * of this source file (the "Software"), to deal in the Software without
73058 + * restriction, including without limitation the rights to use, copy, modify,
73059 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
73060 + * and to permit persons to whom the Software is furnished to do so, subject to
73061 + * the following conditions:
73062 + * 
73063 + * The above copyright notice and this permission notice shall be included in
73064 + * all copies or substantial portions of the Software.
73065 + * 
73066 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
73067 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
73068 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
73069 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
73070 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
73071 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
73072 + * IN THE SOFTWARE.
73073 + */
73074 +
73075 +#ifndef __HYPERCALL_H__
73076 +#define __HYPERCALL_H__
73077 +
73078 +#include <linux/string.h> /* memcpy() */
73079 +
73080 +#ifndef __HYPERVISOR_H__
73081 +# error "please don't include this file directly"
73082 +#endif
73083 +
73084 +/*
73085 + * Assembler stubs for hyper-calls.
73086 + */
73087 +
73088 +#define _hypercall0(type, name)                                        \
73089 +({                                                             \
73090 +       long __res;                                             \
73091 +       __asm__ __volatile__ (";;\n"                            \
73092 +                             "mov r2=%1\n"                     \
73093 +                             "break 0x1000 ;;\n"               \
73094 +                             "mov %0=r8 ;;\n"                  \
73095 +                             : "=r" (__res)                    \
73096 +                             : "i" (__HYPERVISOR_##name)       \
73097 +                             : "r2","r8",                      \
73098 +                               "memory" );                     \
73099 +       (type)__res;                                            \
73100 +})
73101 +
73102 +#define _hypercall1(type, name, a1)                            \
73103 +({                                                             \
73104 +       long __res;                                             \
73105 +       __asm__ __volatile__ (";;\n"                            \
73106 +                             "mov r14=%2\n"                    \
73107 +                             "mov r2=%1\n"                     \
73108 +                             "break 0x1000 ;;\n"               \
73109 +                             "mov %0=r8 ;;\n"                  \
73110 +                             : "=r" (__res)                    \
73111 +                             : "i" (__HYPERVISOR_##name),      \
73112 +                               "r" ((unsigned long)(a1))       \
73113 +                             : "r14","r2","r8",                \
73114 +                               "memory" );                     \
73115 +       (type)__res;                                            \
73116 +})
73117 +
73118 +#define _hypercall2(type, name, a1, a2)                                \
73119 +({                                                             \
73120 +       long __res;                                             \
73121 +       __asm__ __volatile__ (";;\n"                            \
73122 +                             "mov r14=%2\n"                    \
73123 +                             "mov r15=%3\n"                    \
73124 +                             "mov r2=%1\n"                     \
73125 +                             "break 0x1000 ;;\n"               \
73126 +                             "mov %0=r8 ;;\n"                  \
73127 +                             : "=r" (__res)                    \
73128 +                             : "i" (__HYPERVISOR_##name),      \
73129 +                               "r" ((unsigned long)(a1)),      \
73130 +                               "r" ((unsigned long)(a2))       \
73131 +                             : "r14","r15","r2","r8",          \
73132 +                               "memory" );                     \
73133 +       (type)__res;                                            \
73134 +})
73135 +
73136 +#define _hypercall3(type, name, a1, a2, a3)                    \
73137 +({                                                             \
73138 +       long __res;                                             \
73139 +       __asm__ __volatile__ (";;\n"                            \
73140 +                             "mov r14=%2\n"                    \
73141 +                             "mov r15=%3\n"                    \
73142 +                             "mov r16=%4\n"                    \
73143 +                             "mov r2=%1\n"                     \
73144 +                             "break 0x1000 ;;\n"               \
73145 +                             "mov %0=r8 ;;\n"                  \
73146 +                             : "=r" (__res)                    \
73147 +                             : "i" (__HYPERVISOR_##name),      \
73148 +                               "r" ((unsigned long)(a1)),      \
73149 +                               "r" ((unsigned long)(a2)),      \
73150 +                               "r" ((unsigned long)(a3))       \
73151 +                             : "r14","r15","r16","r2","r8",    \
73152 +                               "memory" );                     \
73153 +       (type)__res;                                            \
73154 +})
73155 +
73156 +#define _hypercall4(type, name, a1, a2, a3, a4)                        \
73157 +({                                                             \
73158 +       long __res;                                             \
73159 +       __asm__ __volatile__ (";;\n"                            \
73160 +                             "mov r14=%2\n"                    \
73161 +                             "mov r15=%3\n"                    \
73162 +                             "mov r16=%4\n"                    \
73163 +                             "mov r17=%5\n"                    \
73164 +                             "mov r2=%1\n"                     \
73165 +                             "break 0x1000 ;;\n"               \
73166 +                             "mov %0=r8 ;;\n"                  \
73167 +                             : "=r" (__res)                    \
73168 +                             : "i" (__HYPERVISOR_##name),      \
73169 +                               "r" ((unsigned long)(a1)),      \
73170 +                               "r" ((unsigned long)(a2)),      \
73171 +                               "r" ((unsigned long)(a3)),      \
73172 +                               "r" ((unsigned long)(a4))       \
73173 +                             : "r14","r15","r16","r2","r8",    \
73174 +                               "r17","memory" );               \
73175 +       (type)__res;                                            \
73176 +})
73177 +
73178 +#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
73179 +({                                                             \
73180 +       long __res;                                             \
73181 +       __asm__ __volatile__ (";;\n"                            \
73182 +                             "mov r14=%2\n"                    \
73183 +                             "mov r15=%3\n"                    \
73184 +                             "mov r16=%4\n"                    \
73185 +                             "mov r17=%5\n"                    \
73186 +                             "mov r18=%6\n"                    \
73187 +                             "mov r2=%1\n"                     \
73188 +                             "break 0x1000 ;;\n"               \
73189 +                             "mov %0=r8 ;;\n"                  \
73190 +                             : "=r" (__res)                    \
73191 +                             : "i" (__HYPERVISOR_##name),      \
73192 +                               "r" ((unsigned long)(a1)),      \
73193 +                               "r" ((unsigned long)(a2)),      \
73194 +                               "r" ((unsigned long)(a3)),      \
73195 +                               "r" ((unsigned long)(a4)),      \
73196 +                               "r" ((unsigned long)(a5))       \
73197 +                             : "r14","r15","r16","r2","r8",    \
73198 +                               "r17","r18","memory" );         \
73199 +       (type)__res;                                            \
73200 +})
73201 +
73202 +static inline int
73203 +HYPERVISOR_sched_op_compat(
73204 +    int cmd, unsigned long arg)
73205 +{
73206 +       return _hypercall2(int, sched_op_compat, cmd, arg);
73207 +}
73208 +
73209 +static inline int
73210 +HYPERVISOR_sched_op(
73211 +       int cmd, void *arg)
73212 +{
73213 +       return _hypercall2(int, sched_op, cmd, arg);
73214 +}
73215 +
73216 +static inline long
73217 +HYPERVISOR_set_timer_op(
73218 +    u64 timeout)
73219 +{
73220 +    unsigned long timeout_hi = (unsigned long)(timeout>>32);
73221 +    unsigned long timeout_lo = (unsigned long)timeout;
73222 +    return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
73223 +}
73224 +
73225 +static inline int
73226 +HYPERVISOR_dom0_op(
73227 +    dom0_op_t *dom0_op)
73228 +{
73229 +    dom0_op->interface_version = DOM0_INTERFACE_VERSION;
73230 +    return _hypercall1(int, dom0_op, dom0_op);
73231 +}
73232 +
73233 +static inline int
73234 +HYPERVISOR_multicall(
73235 +    void *call_list, int nr_calls)
73236 +{
73237 +    return _hypercall2(int, multicall, call_list, nr_calls);
73238 +}
73239 +
73240 +#ifndef CONFIG_XEN_IA64_DOM0_VP
73241 +static inline int
73242 +HYPERVISOR_memory_op(
73243 +    unsigned int cmd, void *arg)
73244 +{
73245 +    return _hypercall2(int, memory_op, cmd, arg);
73246 +}
73247 +#else
73248 +//XXX xen/ia64 copy_from_guest() is broken.
73249 +//    This is a temporary workaround until it is fixed.
73250 +static inline int
73251 +____HYPERVISOR_memory_op(
73252 +    unsigned int cmd, void *arg)
73253 +{
73254 +    return _hypercall2(int, memory_op, cmd, arg);
73255 +}
73256 +
73257 +#include <xen/interface/memory.h>
73258 +int ia64_xenmem_reservation_op(unsigned long op,
73259 +                  struct xen_memory_reservation* reservation__);
73260 +static inline int
73261 +HYPERVISOR_memory_op(
73262 +    unsigned int cmd, void *arg)
73263 +{
73264 +    switch (cmd) {
73265 +    case XENMEM_increase_reservation:
73266 +    case XENMEM_decrease_reservation:
73267 +    case XENMEM_populate_physmap:
73268 +        return ia64_xenmem_reservation_op(cmd, 
73269 +                                          (struct xen_memory_reservation*)arg);
73270 +    default:
73271 +        return ____HYPERVISOR_memory_op(cmd, arg);
73272 +    }
73273 +    /* NOTREACHED */
73274 +}
73275 +#endif
73276 +
73277 +static inline int
73278 +HYPERVISOR_event_channel_op(
73279 +    int cmd, void *arg)
73280 +{
73281 +    int rc = _hypercall2(int, event_channel_op, cmd, arg);
73282 +    if (unlikely(rc == -ENOSYS)) {
73283 +        struct evtchn_op op;
73284 +        op.cmd = cmd;
73285 +        memcpy(&op.u, arg, sizeof(op.u));
73286 +        rc = _hypercall1(int, event_channel_op_compat, &op);
73287 +    }
73288 +    return rc;
73289 +}
73290 +
73291 +static inline int
73292 +HYPERVISOR_acm_op(
73293 +       unsigned int cmd, void *arg)
73294 +{
73295 +    return _hypercall2(int, acm_op, cmd, arg);
73296 +}
73297 +
73298 +static inline int
73299 +HYPERVISOR_xen_version(
73300 +    int cmd, void *arg)
73301 +{
73302 +    return _hypercall2(int, xen_version, cmd, arg);
73303 +}
73304 +
73305 +static inline int
73306 +HYPERVISOR_console_io(
73307 +    int cmd, int count, char *str)
73308 +{
73309 +    return _hypercall3(int, console_io, cmd, count, str);
73310 +}
73311 +
73312 +static inline int
73313 +HYPERVISOR_physdev_op(
73314 +    int cmd, void *arg)
73315 +{
73316 +    int rc = _hypercall2(int, physdev_op, cmd, arg);
73317 +    if (unlikely(rc == -ENOSYS)) {
73318 +        struct physdev_op op;
73319 +        op.cmd = cmd;
73320 +        memcpy(&op.u, arg, sizeof(op.u));
73321 +        rc = _hypercall1(int, physdev_op_compat, &op);
73322 +    }
73323 +    return rc;
73324 +}
73325 +
73326 +//XXX This uses the __HYPERVISOR_grant_table_op hypercall constant.
73327 +static inline int
73328 +____HYPERVISOR_grant_table_op(
73329 +    unsigned int cmd, void *uop, unsigned int count)
73330 +{
73331 +    return _hypercall3(int, grant_table_op, cmd, uop, count);
73332 +}
73333 +#ifndef CONFIG_XEN_IA64_DOM0_VP
73334 +#define HYPERVISOR_grant_table_op(cmd, uop, count) \
73335 +       ____HYPERVISOR_grant_table_op((cmd), (uop), (count))
73336 +#else
73337 +int HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count);
73338 +#endif
73339 +
73340 +static inline int
73341 +HYPERVISOR_vcpu_op(
73342 +       int cmd, int vcpuid, void *extra_args)
73343 +{
73344 +    return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
73345 +}
73346 +
73347 +static inline int
73348 +HYPERVISOR_suspend(
73349 +       unsigned long srec)
73350 +{
73351 +       struct sched_shutdown sched_shutdown = {
73352 +               .reason = SHUTDOWN_suspend
73353 +       };
73354 +
73355 +       int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
73356 +                            &sched_shutdown, srec);
73357 +
73358 +       if (rc == -ENOSYS)
73359 +               rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
73360 +                                SHUTDOWN_suspend, srec);
73361 +
73362 +       return rc;
73363 +}
73364 +
73365 +static inline int
73366 +HYPERVISOR_callback_op(
73367 +       int cmd, void *arg)
73368 +{
73369 +       return _hypercall2(int, callback_op, cmd, arg);
73370 +}
73371 +
73372 +extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
73373 +static inline void exit_idle(void) {}
73374 +#define do_IRQ(irq, regs) ({                   \
73375 +       irq_enter();                            \
73376 +       __do_IRQ((irq), (regs));                \
73377 +       irq_exit();                             \
73378 +})
73379 +
73380 +#ifdef CONFIG_XEN_IA64_DOM0_VP
73381 +#include <linux/err.h>
73382 +#include <asm/xen/privop.h>
73383 +
73384 +#define _hypercall_imm1(type, name, imm, a1)                   \
73385 +({                                                             \
73386 +       long __res;                                             \
73387 +       __asm__ __volatile__ (";;\n"                            \
73388 +                             "mov r14=%2\n"                    \
73389 +                             "mov r15=%3\n"                    \
73390 +                             "mov r2=%1\n"                     \
73391 +                             "break 0x1000 ;;\n"               \
73392 +                             "mov %0=r8 ;;\n"                  \
73393 +                             : "=r" (__res)                    \
73394 +                             : "i" (__HYPERVISOR_##name),      \
73395 +                               "i" (imm),                      \
73396 +                               "r" ((unsigned long)(a1))       \
73397 +                             : "r14","r15","r2","r8",          \
73398 +                               "memory" );                     \
73399 +       (type)__res;                                            \
73400 +})
73401 +
73402 +#define _hypercall_imm2(type, name, imm, a1, a2)               \
73403 +({                                                             \
73404 +       long __res;                                             \
73405 +       __asm__ __volatile__ (";;\n"                            \
73406 +                             "mov r14=%2\n"                    \
73407 +                             "mov r15=%3\n"                    \
73408 +                             "mov r16=%4\n"                    \
73409 +                             "mov r2=%1\n"                     \
73410 +                             "break 0x1000 ;;\n"               \
73411 +                             "mov %0=r8 ;;\n"                  \
73412 +                             : "=r" (__res)                    \
73413 +                             : "i" (__HYPERVISOR_##name),      \
73414 +                               "i" (imm),                      \
73415 +                               "r" ((unsigned long)(a1)),      \
73416 +                               "r" ((unsigned long)(a2))       \
73417 +                             : "r14","r15","r16","r2","r8",    \
73418 +                               "memory" );                     \
73419 +       (type)__res;                                            \
73420 +})
73421 +
73422 +#define _hypercall_imm3(type, name, imm, a1, a2, a3)           \
73423 +({                                                             \
73424 +       long __res;                                             \
73425 +       __asm__ __volatile__ (";;\n"                            \
73426 +                             "mov r14=%2\n"                    \
73427 +                             "mov r15=%3\n"                    \
73428 +                             "mov r16=%4\n"                    \
73429 +                             "mov r17=%5\n"                    \
73430 +                             "mov r2=%1\n"                     \
73431 +                             "break 0x1000 ;;\n"               \
73432 +                             "mov %0=r8 ;;\n"                  \
73433 +                             : "=r" (__res)                    \
73434 +                             : "i" (__HYPERVISOR_##name),      \
73435 +                               "i" (imm),                      \
73436 +                               "r" ((unsigned long)(a1)),      \
73437 +                               "r" ((unsigned long)(a2)),      \
73438 +                               "r" ((unsigned long)(a3))       \
73439 +                             : "r14","r15","r16","r17",        \
73440 +                               "r2","r8",                      \
73441 +                               "memory" );                     \
73442 +       (type)__res;                                            \
73443 +})
73444 +
73445 +#define _hypercall_imm4(type, name, imm, a1, a2, a3, a4)       \
73446 +({                                                             \
73447 +       long __res;                                             \
73448 +       __asm__ __volatile__ (";;\n"                            \
73449 +                             "mov r14=%2\n"                    \
73450 +                             "mov r15=%3\n"                    \
73451 +                             "mov r16=%4\n"                    \
73452 +                             "mov r17=%5\n"                    \
73453 +                             "mov r18=%6\n"                    \
73454 +                             "mov r2=%1\n"                     \
73455 +                             "break 0x1000 ;;\n"               \
73456 +                             "mov %0=r8 ;;\n"                  \
73457 +                             : "=r" (__res)                    \
73458 +                             : "i" (__HYPERVISOR_##name),      \
73459 +                               "i" (imm),                      \
73460 +                               "r" ((unsigned long)(a1)),      \
73461 +                               "r" ((unsigned long)(a2)),      \
73462 +                               "r" ((unsigned long)(a3)),      \
73463 +                               "r" ((unsigned long)(a4))       \
73464 +                             : "r14","r15","r16","r17","r18",  \
73465 +                               "r2","r8",                      \
73466 +                               "memory" );                     \
73467 +       (type)__res;                                            \
73468 +})
73469 +
73470 +static inline unsigned long
73471 +__HYPERVISOR_ioremap(unsigned long ioaddr, unsigned long size)
73472 +{
73473 +       return _hypercall_imm2(unsigned long, ia64_dom0vp_op,
73474 +                              IA64_DOM0VP_ioremap, ioaddr, size);
73475 +}
73476 +
73477 +static inline unsigned long
73478 +HYPERVISOR_ioremap(unsigned long ioaddr, unsigned long size)
73479 +{
73480 +       unsigned long ret = ioaddr;
73481 +       if (is_running_on_xen()) {
73482 +               ret = __HYPERVISOR_ioremap(ioaddr, size);
73483 +               if (unlikely(ret == -ENOSYS))
73484 +                       panic("hypercall %s failed with %ld. "
73485 +                             "Please check for a Xen/Linux config mismatch\n",
73486 +                             __func__, -ret);
73487 +               else if (unlikely(IS_ERR_VALUE(ret)))
73488 +                       ret = ioaddr;
73489 +       }
73490 +       return ret;
73491 +}
73492 +
73493 +static inline unsigned long
73494 +__HYPERVISOR_phystomach(unsigned long gpfn)
73495 +{
73496 +       return _hypercall_imm1(unsigned long, ia64_dom0vp_op,
73497 +                              IA64_DOM0VP_phystomach, gpfn);
73498 +}
73499 +
73500 +static inline unsigned long
73501 +HYPERVISOR_phystomach(unsigned long gpfn)
73502 +{
73503 +       unsigned long ret = gpfn;
73504 +       if (is_running_on_xen()) {
73505 +               ret = __HYPERVISOR_phystomach(gpfn);
73506 +       }
73507 +       return ret;
73508 +}
73509 +
73510 +static inline unsigned long
73511 +__HYPERVISOR_machtophys(unsigned long mfn)
73512 +{
73513 +       return _hypercall_imm1(unsigned long, ia64_dom0vp_op,
73514 +                              IA64_DOM0VP_machtophys, mfn);
73515 +}
73516 +
73517 +static inline unsigned long
73518 +HYPERVISOR_machtophys(unsigned long mfn)
73519 +{
73520 +       unsigned long ret = mfn;
73521 +       if (is_running_on_xen()) {
73522 +               ret = __HYPERVISOR_machtophys(mfn);
73523 +       }
73524 +       return ret;
73525 +}
73526 +
73527 +static inline unsigned long
73528 +__HYPERVISOR_zap_physmap(unsigned long gpfn, unsigned int extent_order)
73529 +{
73530 +       return _hypercall_imm2(unsigned long, ia64_dom0vp_op,
73531 +                              IA64_DOM0VP_zap_physmap, gpfn, extent_order);
73532 +}
73533 +
73534 +static inline unsigned long
73535 +HYPERVISOR_zap_physmap(unsigned long gpfn, unsigned int extent_order)
73536 +{
73537 +       unsigned long ret = 0;
73538 +       if (is_running_on_xen()) {
73539 +               ret = __HYPERVISOR_zap_physmap(gpfn, extent_order);
73540 +       }
73541 +       return ret;
73542 +}
73543 +
73544 +static inline unsigned long
73545 +__HYPERVISOR_add_physmap(unsigned long gpfn, unsigned long mfn,
73546 +                        unsigned long flags, domid_t domid)
73547 +{
73548 +       return _hypercall_imm4(unsigned long, ia64_dom0vp_op,
73549 +                              IA64_DOM0VP_add_physmap, gpfn, mfn, flags,
73550 +                              domid);
73551 +}
73552 +
73553 +static inline unsigned long
73554 +HYPERVISOR_add_physmap(unsigned long gpfn, unsigned long mfn,
73555 +                      unsigned long flags, domid_t domid)
73556 +{
73557 +       unsigned long ret = 0;
73558 +       BUG_ON(!is_running_on_xen());//XXX
73559 +       if (is_running_on_xen()) {
73560 +               ret = __HYPERVISOR_add_physmap(gpfn, mfn, flags, domid);
73561 +       }
73562 +       return ret;
73563 +}
73564 +
73565 +// for balloon driver
73566 +#define HYPERVISOR_update_va_mapping(va, new_val, flags) (0)
73567 +
73568 +#else
73569 +#define HYPERVISOR_ioremap(ioaddr, size)               (ioaddr)
73570 +#define HYPERVISOR_phystomach(gpfn)                    (gpfn)
73571 +#define HYPERVISOR_machtophys(mfn)                     (mfn)
73572 +#define HYPERVISOR_zap_physmap(gpfn, extent_order)     (0)
73573 +#define HYPERVISOR_add_physmap(gpfn, mfn, flags)       (0)
73574 +#endif
73575 +#endif /* __HYPERCALL_H__ */
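
The dom0vp wrappers above all follow one shape: a raw __HYPERVISOR_* stub built on the _hypercall_imm* macros, plus a guarded HYPERVISOR_* wrapper that only issues the hypercall when is_running_on_xen() and otherwise returns an identity (or zero) result. A minimal caller-side sketch follows; ioremap_region() is a hypothetical helper invented for illustration and is not part of the patch.

    /* Hedged sketch, not part of the patch: map an MMIO range through the
     * guarded wrapper.  Natively HYPERVISOR_ioremap() returns ioaddr
     * unchanged; on Xen it returns the machine address, falling back to
     * ioaddr again if the hypercall reports an error. */
    static void __iomem *ioremap_region(unsigned long ioaddr, unsigned long size)
    {
            unsigned long maddr = HYPERVISOR_ioremap(ioaddr, size);

            return (void __iomem *)(__IA64_UNCACHED_OFFSET | maddr);
    }

This is the same pattern the ioremap() definition in asm-ia64/io.h, further down in this patch, relies on.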
73576 diff -urNp linux-2.6/include/asm-ia64/hypervisor.h new/include/asm-ia64/hypervisor.h
73577 --- linux-2.6/include/asm-ia64/hypervisor.h     1970-01-01 01:00:00.000000000 +0100
73578 +++ new/include/asm-ia64/hypervisor.h   2006-06-28 14:32:14.000000000 +0200
73579 @@ -0,0 +1,207 @@
73580 +/******************************************************************************
73581 + * hypervisor.h
73582 + * 
73583 + * Linux-specific hypervisor handling.
73584 + * 
73585 + * Copyright (c) 2002-2004, K A Fraser
73586 + * 
73587 + * This program is free software; you can redistribute it and/or
73588 + * modify it under the terms of the GNU General Public License version 2
73589 + * as published by the Free Software Foundation; or, when distributed
73590 + * separately from the Linux kernel or incorporated into other
73591 + * software packages, subject to the following license:
73592 + * 
73593 + * Permission is hereby granted, free of charge, to any person obtaining a copy
73594 + * of this source file (the "Software"), to deal in the Software without
73595 + * restriction, including without limitation the rights to use, copy, modify,
73596 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
73597 + * and to permit persons to whom the Software is furnished to do so, subject to
73598 + * the following conditions:
73599 + * 
73600 + * The above copyright notice and this permission notice shall be included in
73601 + * all copies or substantial portions of the Software.
73602 + * 
73603 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
73604 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
73605 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
73606 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
73607 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
73608 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
73609 + * IN THE SOFTWARE.
73610 + */
73611 +
73612 +#ifndef __HYPERVISOR_H__
73613 +#define __HYPERVISOR_H__
73614 +
73615 +#ifndef CONFIG_XEN
73616 +#define is_running_on_xen()                    (0)
73617 +#define HYPERVISOR_ioremap(offset, size)       (offset)
73618 +#else
73619 +extern int running_on_xen;
73620 +#define is_running_on_xen()                    (running_on_xen)
73621 +#endif
73622 +
73623 +#ifdef CONFIG_XEN
73624 +#include <linux/config.h>
73625 +#include <linux/types.h>
73626 +#include <linux/kernel.h>
73627 +#include <linux/version.h>
73628 +#include <linux/errno.h>
73629 +#include <xen/interface/xen.h>
73630 +#include <xen/interface/dom0_ops.h>
73631 +#include <xen/interface/event_channel.h>
73632 +#include <xen/interface/physdev.h>
73633 +#include <xen/interface/sched.h>
73634 +#include <asm/hypercall.h>
73635 +#include <asm/ptrace.h>
73636 +#include <asm/page.h>
73637 +
73638 +extern shared_info_t *HYPERVISOR_shared_info;
73639 +extern start_info_t *xen_start_info;
73640 +
73641 +void force_evtchn_callback(void);
73642 +
73643 +/* Turn jiffies into Xen system time. XXX Implement me. */
73644 +#define jiffies_to_st(j)       0
73645 +
73646 +static inline int
73647 +HYPERVISOR_yield(
73648 +       void)
73649 +{
73650 +       int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
73651 +
73652 +       if (rc == -ENOSYS)
73653 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
73654 +
73655 +       return rc;
73656 +}
73657 +
73658 +static inline int
73659 +HYPERVISOR_block(
73660 +       void)
73661 +{
73662 +       int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
73663 +
73664 +       if (rc == -ENOSYS)
73665 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
73666 +
73667 +       return rc;
73668 +}
73669 +
73670 +static inline int
73671 +HYPERVISOR_shutdown(
73672 +       unsigned int reason)
73673 +{
73674 +       struct sched_shutdown sched_shutdown = {
73675 +               .reason = reason
73676 +       };
73677 +
73678 +       int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
73679 +
73680 +       if (rc == -ENOSYS)
73681 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason);
73682 +
73683 +       return rc;
73684 +}
73685 +
73686 +static inline int
73687 +HYPERVISOR_poll(
73688 +       evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
73689 +{
73690 +       struct sched_poll sched_poll = {
73691 +               .nr_ports = nr_ports,
73692 +               .timeout = jiffies_to_st(timeout)
73693 +       };
73694 +
73695 +       int rc;
73696 +
73697 +       set_xen_guest_handle(sched_poll.ports, ports);
73698 +       rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
73699 +       if (rc == -ENOSYS)
73700 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
73701 +
73702 +       return rc;
73703 +}
73704 +
73705 +#include <asm/hypercall.h>
73706 +
73707 +// for drivers/xen/privcmd/privcmd.c
73708 +#define machine_to_phys_mapping 0
73709 +#ifndef CONFIG_XEN_IA64_DOM0_VP
73710 +#define direct_remap_pfn_range(a,b,c,d,e,f) remap_pfn_range(a,b,c,d,e)
73711 +#define        pfn_to_mfn(x)   (x)
73712 +#define        mfn_to_pfn(x)   (x)
73713 +#else
73714 +struct vm_area_struct;
73715 +int direct_remap_pfn_range(struct vm_area_struct *vma,
73716 +                          unsigned long address,
73717 +                          unsigned long mfn,
73718 +                          unsigned long size,
73719 +                          pgprot_t prot,
73720 +                          domid_t  domid);
73721 +struct file;
73722 +int privcmd_mmap(struct file * file, struct vm_area_struct * vma);
73723 +#define HAVE_ARCH_PRIVCMD_MMAP
73724 +#endif
73725 +
73726 +// for drivers/xen/balloon/balloon.c
73727 +#ifdef CONFIG_XEN_SCRUB_PAGES
73728 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
73729 +#else
73730 +#define scrub_pages(_p,_n) ((void)0)
73731 +#endif
73732 +#define        pte_mfn(_x)     pte_pfn(_x)
73733 +#define __pte_ma(_x)   ((pte_t) {(_x)})
73734 +#define phys_to_machine_mapping_valid(_x)      (1)
73735 +#define pfn_pte_ma(_x,_y)      __pte_ma(0)
73736 +#ifndef CONFIG_XEN_IA64_DOM0_VP //XXX
73737 +#define set_phys_to_machine(_x,_y)     do {} while (0)
73738 +#define xen_machphys_update(_x,_y)     do {} while (0)
73739 +#endif
73740 +
73741 +#ifdef CONFIG_XEN_IA64_DOM0_VP
73742 +int __xen_create_contiguous_region(unsigned long vstart, unsigned int order, unsigned int address_bits);
73743 +static inline int
73744 +xen_create_contiguous_region(unsigned long vstart,
73745 +                             unsigned int order, unsigned int address_bits)
73746 +{
73747 +       int ret = 0;
73748 +       if (is_running_on_xen()) {
73749 +               ret = __xen_create_contiguous_region(vstart, order,
73750 +                                                    address_bits);
73751 +       }
73752 +       return ret;
73753 +}
73754 +
73755 +void __xen_destroy_contiguous_region(unsigned long vstart, unsigned int order);
73756 +static inline void
73757 +xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
73758 +{
73759 +       if (is_running_on_xen())
73760 +               __xen_destroy_contiguous_region(vstart, order);
73761 +}
73762 +
73763 +// for netfront.c, netback.c
73764 +#define MULTI_UVMFLAGS_INDEX 0 //XXX any value
73765 +
73766 +static inline void
73767 +MULTI_update_va_mapping(
73768 +       multicall_entry_t *mcl, unsigned long va,
73769 +       pte_t new_val, unsigned long flags)
73770 +{
73771 +       mcl->op = __HYPERVISOR_update_va_mapping;
73772 +       mcl->result = 0;
73773 +}
73774 +
73775 +#else
73776 +#define xen_create_contiguous_region(vstart, order, address_bits)      (0)
73777 +#define xen_destroy_contiguous_region(vstart, order)   do {} while (0)
73778 +#endif
73779 +
73780 +// for debug
73781 +asmlinkage int xprintk(const char *fmt, ...);
73782 +#define xprintd(fmt, ...)      xprintk("%s:%d " fmt, __func__, __LINE__, \
73783 +                                       ##__VA_ARGS__)
73784 +#endif /* CONFIG_XEN */
73785 +
73786 +#endif /* __HYPERVISOR_H__ */
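
Each scheduler wrapper above (yield, block, shutdown, poll) first tries the current HYPERVISOR_sched_op() interface and retries with HYPERVISOR_sched_op_compat() when the hypervisor answers -ENOSYS, so one binary runs on both old and new hypervisors. A hedged usage sketch with an illustrative event-channel port:

    /* Hedged sketch, not from the patch: poll a single event channel.
     * The timeout argument is in jiffies, but jiffies_to_st() above is
     * still a stub returning 0, so it is effectively ignored for now. */
    static void wait_for_event(evtchn_port_t port)
    {
            evtchn_port_t ports[1] = { port };

            if (HYPERVISOR_poll(ports, 1, HZ) != 0)
                    HYPERVISOR_yield();
    }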
73787 diff -urNp linux-2.6/include/asm-ia64/intel_intrin.h new/include/asm-ia64/intel_intrin.h
73788 --- linux-2.6/include/asm-ia64/intel_intrin.h   2006-07-03 14:15:09.000000000 +0200
73789 +++ new/include/asm-ia64/intel_intrin.h 2006-05-09 12:35:18.000000000 +0200
73790 @@ -16,8 +16,10 @@
73791                          * intrinsic
73792                          */
73793  
73794 -#define ia64_getreg            __getReg
73795 -#define ia64_setreg            __setReg
73796 +#define __ia64_getreg          __getReg
73797 +#define __ia64_setreg          __setReg
73798 +
73799 +#define __ia64_hint(x)
73800  
73801  #define ia64_hint              __hint
73802  #define ia64_hint_pause                __hint_pause
73803 @@ -33,16 +35,16 @@
73804  #define ia64_getf_exp          __getf_exp
73805  #define ia64_shrp              _m64_shrp
73806  
73807 -#define ia64_tpa               __tpa
73808 +#define __ia64_tpa             __tpa
73809  #define ia64_invala            __invala
73810  #define ia64_invala_gr         __invala_gr
73811  #define ia64_invala_fr         __invala_fr
73812  #define ia64_nop               __nop
73813  #define ia64_sum               __sum
73814 -#define ia64_ssm               __ssm
73815 +#define __ia64_ssm             __ssm
73816  #define ia64_rum               __rum
73817 -#define ia64_rsm               __rsm
73818 -#define ia64_fc                __fc
73819 +#define __ia64_rsm             __rsm
73820 +#define __ia64_fc              __fc
73821  
73822  #define ia64_ldfs              __ldfs
73823  #define ia64_ldfd              __ldfd
73824 @@ -80,24 +82,24 @@
73825  
73826  #define __ia64_set_dbr(index, val)     \
73827                 __setIndReg(_IA64_REG_INDR_DBR, index, val)
73828 -#define ia64_set_ibr(index, val)       \
73829 +#define __ia64_set_ibr(index, val)     \
73830                 __setIndReg(_IA64_REG_INDR_IBR, index, val)
73831 -#define ia64_set_pkr(index, val)       \
73832 +#define __ia64_set_pkr(index, val)     \
73833                 __setIndReg(_IA64_REG_INDR_PKR, index, val)
73834 -#define ia64_set_pmc(index, val)       \
73835 +#define __ia64_set_pmc(index, val)     \
73836                 __setIndReg(_IA64_REG_INDR_PMC, index, val)
73837 -#define ia64_set_pmd(index, val)       \
73838 +#define __ia64_set_pmd(index, val)     \
73839                 __setIndReg(_IA64_REG_INDR_PMD, index, val)
73840 -#define ia64_set_rr(index, val)        \
73841 +#define __ia64_set_rr(index, val)      \
73842                 __setIndReg(_IA64_REG_INDR_RR, index, val)
73843  
73844 -#define ia64_get_cpuid(index)  __getIndReg(_IA64_REG_INDR_CPUID, index)
73845 +#define __ia64_get_cpuid(index)        __getIndReg(_IA64_REG_INDR_CPUID, index)
73846  #define __ia64_get_dbr(index)  __getIndReg(_IA64_REG_INDR_DBR, index)
73847 -#define ia64_get_ibr(index)    __getIndReg(_IA64_REG_INDR_IBR, index)
73848 -#define ia64_get_pkr(index)    __getIndReg(_IA64_REG_INDR_PKR, index)
73849 -#define ia64_get_pmc(index)    __getIndReg(_IA64_REG_INDR_PMC, index)
73850 -#define ia64_get_pmd(index)    __getIndReg(_IA64_REG_INDR_PMD, index)
73851 -#define ia64_get_rr(index)     __getIndReg(_IA64_REG_INDR_RR, index)
73852 +#define __ia64_get_ibr(index)  __getIndReg(_IA64_REG_INDR_IBR, index)
73853 +#define __ia64_get_pkr(index)  __getIndReg(_IA64_REG_INDR_PKR, index)
73854 +#define __ia64_get_pmc(index)  __getIndReg(_IA64_REG_INDR_PMC, index)
73855 +#define __ia64_get_pmd(index)          __getIndReg(_IA64_REG_INDR_PMD, index)
73856 +#define __ia64_get_rr(index)   __getIndReg(_IA64_REG_INDR_RR, index)
73857  
73858  #define ia64_srlz_d            __dsrlz
73859  #define ia64_srlz_i            __isrlz
73860 @@ -116,18 +118,18 @@
73861  #define ia64_ld8_acq           __ld8_acq
73862  
73863  #define ia64_sync_i            __synci
73864 -#define ia64_thash             __thash
73865 -#define ia64_ttag              __ttag
73866 -#define ia64_itcd              __itcd
73867 -#define ia64_itci              __itci
73868 -#define ia64_itrd              __itrd
73869 -#define ia64_itri              __itri
73870 -#define ia64_ptce              __ptce
73871 -#define ia64_ptcl              __ptcl
73872 -#define ia64_ptcg              __ptcg
73873 -#define ia64_ptcga             __ptcga
73874 -#define ia64_ptri              __ptri
73875 -#define ia64_ptrd              __ptrd
73876 +#define __ia64_thash           __thash
73877 +#define __ia64_ttag            __ttag
73878 +#define __ia64_itcd            __itcd
73879 +#define __ia64_itci            __itci
73880 +#define __ia64_itrd            __itrd
73881 +#define __ia64_itri            __itri
73882 +#define __ia64_ptce            __ptce
73883 +#define __ia64_ptcl            __ptcl
73884 +#define __ia64_ptcg            __ptcg
73885 +#define __ia64_ptcga           __ptcga
73886 +#define __ia64_ptri            __ptri
73887 +#define __ia64_ptrd            __ptrd
73888  #define ia64_dep_mi            _m64_dep_mi
73889  
73890  /* Values for lfhint in __lfetch and __lfetch_fault */
73891 @@ -142,16 +144,18 @@
73892  #define ia64_lfetch_fault      __lfetch_fault
73893  #define ia64_lfetch_fault_excl __lfetch_fault_excl
73894  
73895 -#define ia64_intrin_local_irq_restore(x)               \
73896 +#define __ia64_intrin_local_irq_restore(x)             \
73897  do {                                                   \
73898         if ((x) != 0) {                                 \
73899 -               ia64_ssm(IA64_PSR_I);                   \
73900 +               __ia64_ssm(IA64_PSR_I);                 \
73901                 ia64_srlz_d();                          \
73902         } else {                                        \
73903 -               ia64_rsm(IA64_PSR_I);                   \
73904 +               __ia64_rsm(IA64_PSR_I);                 \
73905         }                                               \
73906  } while (0)
73907  
73908 +#define __ia64_get_psr_i()     (__ia64_getreg(_IA64_REG_PSR) & 0x4000UL)
73909 +
73910  #define __builtin_trap()       __break(0);
73911  
73912  #endif /* _ASM_IA64_INTEL_INTRIN_H */
73913 diff -urNp linux-2.6/include/asm-ia64/io.h new/include/asm-ia64/io.h
73914 --- linux-2.6/include/asm-ia64/io.h     2006-07-03 14:15:09.000000000 +0200
73915 +++ new/include/asm-ia64/io.h   2006-06-28 14:32:14.000000000 +0200
73916 @@ -66,9 +66,11 @@ extern unsigned int num_io_spaces;
73917  #define PIO_RESERVED           __IA64_UNCACHED_OFFSET
73918  #define HAVE_ARCH_PIO_SIZE
73919  
73920 +#include <asm/hypervisor.h>
73921  #include <asm/intrinsics.h>
73922  #include <asm/machvec.h>
73923  #include <asm/page.h>
73924 +#include <asm/privop.h>
73925  #include <asm/system.h>
73926  #include <asm-generic/iomap.h>
73927  
73928 @@ -95,9 +97,41 @@ extern int valid_mmap_phys_addr_range (u
73929   * The following two macros are deprecated and scheduled for removal.
73930   * Please use the PCI-DMA interface defined in <asm/pci.h> instead.
73931   */
73932 +#ifndef CONFIG_XEN_IA64_DOM0_VP
73933  #define bus_to_virt    phys_to_virt
73934  #define virt_to_bus    virt_to_phys
73935  #define page_to_bus    page_to_phys
73936 +#define page_to_phys(page)     (page_to_pfn(page) << PAGE_SHIFT)
73937 +#define page_to_pseudophys(page)       page_to_phys(page)
73938 +#else
73939 +#define bus_to_virt(bus)       \
73940 +       phys_to_virt(machine_to_phys_for_dma(bus))
73941 +#define virt_to_bus(virt)      \
73942 +       phys_to_machine_for_dma(virt_to_phys(virt))
73943 +#define page_to_bus(page)      \
73944 +       phys_to_machine_for_dma(page_to_pseudophys(page))
73945 +
73946 +#define page_to_pseudophys(page) \
73947 +       ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
73948 +
73949 +/*
73950 + * Drivers that use page_to_phys() for bus addresses are broken.
73951 + * This includes:
73952 + * drivers/ide/cris/ide-cris.c
73953 + * drivers/scsi/dec_esp.c
73954 + */
73955 +#define page_to_phys(page)     (page_to_pseudophys(page))
73956 +#define bvec_to_bus(bv)                (page_to_bus((bv)->bv_page) + \
73957 +                               (unsigned long) (bv)->bv_offset)
73958 +#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) +  \
73959 +                                (unsigned long) bio_offset((bio)))
73960 +#define bvec_to_pseudophys(bv)  (page_to_pseudophys((bv)->bv_page) +   \
73961 +                                (unsigned long) (bv)->bv_offset)
73962 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)                              \
73963 +       (((bvec_to_bus((vec1)) + (vec1)->bv_len) == bvec_to_bus((vec2))) && \
73964 +        ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) ==              \
73965 +         bvec_to_pseudophys((vec2))))
73966 +#endif
73967  
73968  # endif /* KERNEL */
73969  
73970 @@ -416,14 +450,26 @@ __writeq (unsigned long val, volatile vo
73971  # define outl_p                outl
73972  #endif
73973  
73974 -extern void __iomem * ioremap(unsigned long offset, unsigned long size);
73975 -extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
73976 +/*
73977 + * An "address" in IO memory space is not clearly either an integer or a pointer. We will
73978 + * accept both, thus the casts.
73979 + *
73980 + * On ia-64, we access the physical I/O memory space through the uncached kernel region.
73981 + */
73982 +static inline void __iomem *
73983 +ioremap (unsigned long offset, unsigned long size)
73984 +{
73985 +       offset = HYPERVISOR_ioremap(offset, size);
73986 +       return (void __iomem *) (__IA64_UNCACHED_OFFSET | (offset));
73987 +}
73988  
73989  static inline void
73990  iounmap (volatile void __iomem *addr)
73991  {
73992  }
73993  
73994 +#define ioremap_nocache(o,s)   ioremap(o,s)
73995 +
73996  /* Use normal IO mappings for DMI */
73997  #define dmi_ioremap ioremap
73998  #define dmi_iounmap(x,l) iounmap(x)
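
With CONFIG_XEN_IA64_DOM0_VP a bus address is a machine address, so bus_to_virt()/virt_to_bus()/page_to_bus() are rerouted through the pseudo-physical/machine translation and ioremap() asks the hypervisor before building the uncached mapping. A hedged sketch of what this means for a legacy-style driver (illustrative names; as the comment in the hunk says, new code should use the PCI-DMA API instead):

    /* Hedged sketch, illustrative names only.  Under DOM0_VP the value
     * handed to the device is phys_to_machine_for_dma(virt_to_phys(buf)),
     * i.e. an address the hardware can actually DMA to; natively it is
     * simply the physical address. */
    static void program_dma_address(void __iomem *dma_reg, void *buf)
    {
            unsigned long bus = virt_to_bus(buf);

            writeq(bus, dma_reg);
    }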
73999 diff -urNp linux-2.6/include/asm-ia64/iosapic.h new/include/asm-ia64/iosapic.h
74000 --- linux-2.6/include/asm-ia64/iosapic.h        2006-07-03 14:15:09.000000000 +0200
74001 +++ new/include/asm-ia64/iosapic.h      2006-05-09 12:35:18.000000000 +0200
74002 @@ -53,6 +53,7 @@
74003  
74004  #define NR_IOSAPICS                    256
74005  
74006 +#ifndef CONFIG_XEN
74007  static inline unsigned int iosapic_read(char __iomem *iosapic, unsigned int reg)
74008  {
74009         writel(reg, iosapic + IOSAPIC_REG_SELECT);
74010 @@ -64,6 +65,7 @@ static inline void iosapic_write(char __
74011         writel(reg, iosapic + IOSAPIC_REG_SELECT);
74012         writel(val, iosapic + IOSAPIC_WINDOW);
74013  }
74014 +#endif
74015  
74016  static inline void iosapic_eoi(char __iomem *iosapic, u32 vector)
74017  {
74018 diff -urNp linux-2.6/include/asm-ia64/irq.h new/include/asm-ia64/irq.h
74019 --- linux-2.6/include/asm-ia64/irq.h    2006-07-03 14:15:09.000000000 +0200
74020 +++ new/include/asm-ia64/irq.h  2006-06-07 13:15:16.000000000 +0200
74021 @@ -11,8 +11,39 @@
74022   * 02/29/00     D.Mosberger    moved most things into hw_irq.h
74023   */
74024  
74025 +#ifndef CONFIG_XEN
74026  #define NR_IRQS                256
74027  #define NR_IRQ_VECTORS NR_IRQS
74028 +#else
74029 +/*
74030 + * The flat IRQ space is divided into two regions:
74031 + *  1. A one-to-one mapping of real physical IRQs. This space is only used
74032 + *     if we have physical device-access privilege. This region is at the 
74033 + *     start of the IRQ space so that existing device drivers do not need
74034 + *     to be modified to translate physical IRQ numbers into our IRQ space.
74035 + *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
74036 + *     are bound using the provided bind/unbind functions.
74037 + */
74038 +
74039 +#define PIRQ_BASE              0
74040 +#define NR_PIRQS               256
74041 +
74042 +#define DYNIRQ_BASE            (PIRQ_BASE + NR_PIRQS)
74043 +#define NR_DYNIRQS             256
74044 +
74045 +#define NR_IRQS                        (NR_PIRQS + NR_DYNIRQS)
74046 +#define NR_IRQ_VECTORS         NR_IRQS
74047 +
74048 +#define pirq_to_irq(_x)                ((_x) + PIRQ_BASE)
74049 +#define irq_to_pirq(_x)                ((_x) - PIRQ_BASE)
74050 +
74051 +#define dynirq_to_irq(_x)      ((_x) + DYNIRQ_BASE)
74052 +#define irq_to_dynirq(_x)      ((_x) - DYNIRQ_BASE)
74053 +
74054 +#define RESCHEDULE_VECTOR      0
74055 +#define IPI_VECTOR             1
74056 +#define NR_IPIS                        2
74057 +#endif /* CONFIG_XEN */
74058  
74059  /*
74060   * IRQ line status macro IRQ_PER_CPU is used
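
The Xen branch of irq.h splits the flat IRQ space into a physical region (PIRQs) followed by a dynamic region for event-channel-backed IRQs, and the conversion macros are plain offsets from PIRQ_BASE and DYNIRQ_BASE. A short worked sketch of the arithmetic (the concrete numbers are only illustrative):

    /* Hedged sketch: with PIRQ_BASE = 0, NR_PIRQS = 256 and hence
     * DYNIRQ_BASE = 256, the conversions are simple offsets. */
    static void irq_space_example(void)
    {
            int irq_a = pirq_to_irq(5);    /* physical IRQ 5 -> flat IRQ 5   */
            int irq_b = dynirq_to_irq(5);  /* dynamic IRQ 5  -> flat IRQ 261 */

            BUG_ON(irq_to_pirq(irq_a) != 5);
            BUG_ON(irq_to_dynirq(irq_b) != 5);
    }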
74061 diff -urNp linux-2.6/include/asm-ia64/machvec.h new/include/asm-ia64/machvec.h
74062 --- linux-2.6/include/asm-ia64/machvec.h        2006-07-03 14:15:09.000000000 +0200
74063 +++ new/include/asm-ia64/machvec.h      2006-05-09 12:35:19.000000000 +0200
74064 @@ -257,6 +257,21 @@ extern void machvec_init (const char *na
74065  #  error Unknown configuration.  Update asm-ia64/machvec.h.
74066  # endif /* CONFIG_IA64_GENERIC */
74067  
74068 +#ifdef CONFIG_XEN_IA64_DOM0_VP
74069 +# define platform_dma_map_sg           dma_map_sg
74070 +# define platform_dma_unmap_sg         dma_unmap_sg
74071 +# define platform_dma_mapping_error    dma_mapping_error
74072 +# define platform_dma_supported                dma_supported
74073 +# define platform_dma_alloc_coherent   dma_alloc_coherent
74074 +# define platform_dma_free_coherent    dma_free_coherent
74075 +# define platform_dma_map_single       dma_map_single
74076 +# define platform_dma_unmap_single     dma_unmap_single
74077 +# define platform_dma_sync_single_for_cpu \
74078 +                                       dma_sync_single_for_cpu
74079 +# define platform_dma_sync_single_for_device \
74080 +                                       dma_sync_single_for_device
74081 +#endif
74082 +
74083  /*
74084   * Declare default routines which aren't declared anywhere else:
74085   */
74086 diff -urNp linux-2.6/include/asm-ia64/meminit.h new/include/asm-ia64/meminit.h
74087 --- linux-2.6/include/asm-ia64/meminit.h        2006-07-03 14:15:09.000000000 +0200
74088 +++ new/include/asm-ia64/meminit.h      2006-05-09 12:35:19.000000000 +0200
74089 @@ -17,10 +17,15 @@
74090   *     - command line string
74091   *     - kernel code & data
74092   *     - Kernel memory map built from EFI memory map
74093 + *     - xen start info
74094   *
74095   * More could be added if necessary
74096   */
74097 +#ifndef CONFIG_XEN
74098  #define IA64_MAX_RSVD_REGIONS 6
74099 +#else
74100 +#define IA64_MAX_RSVD_REGIONS 7
74101 +#endif
74102  
74103  struct rsvd_region {
74104         unsigned long start;    /* virtual address of beginning of element */
74105 diff -urNp linux-2.6/include/asm-ia64/page.h new/include/asm-ia64/page.h
74106 --- linux-2.6/include/asm-ia64/page.h   2006-07-03 14:15:09.000000000 +0200
74107 +++ new/include/asm-ia64/page.h 2006-06-28 14:32:14.000000000 +0200
74108 @@ -127,7 +127,6 @@ extern unsigned long max_low_pfn;
74109  # define pfn_valid(pfn)                (((pfn) >= min_low_pfn) && ((pfn) < max_low_pfn) && ia64_pfn_valid(pfn))
74110  #endif
74111  
74112 -#define page_to_phys(page)     (page_to_pfn(page) << PAGE_SHIFT)
74113  #define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
74114  #define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
74115  
74116 @@ -229,4 +228,127 @@ get_order (unsigned long size)
74117                                          (((current->personality & READ_IMPLIES_EXEC) != 0)     \
74118                                           ? VM_EXEC : 0))
74119  
74120 +#ifndef __ASSEMBLY__
74121 +#ifdef CONFIG_XEN
74122 +
74123 +#define INVALID_P2M_ENTRY      (~0UL)
74124 +
74125 +#ifndef CONFIG_XEN_IA64_DOM0_VP
74126 +
74127 +#define virt_to_machine(v) __pa(v)
74128 +#define machine_to_virt(m) __va(m)
74129 +#define virt_to_mfn(v) ((__pa(v)) >> PAGE_SHIFT)
74130 +#define mfn_to_virt(m) (__va((m) << PAGE_SHIFT))
74131 +
74132 +#else
74133 +
74134 +#include <linux/kernel.h>
74135 +#include <asm/hypervisor.h>
74136 +#include <xen/features.h>      // to compile netback, netfront
74137 +typedef unsigned long maddr_t; // to compile netback, netfront
74138 +
74139 +/*
74140 + * XXX hack!
74141 + * Linux/IA64 uses PG_arch_1.
74142 + * This hack will be removed once PG_foreign bit is taken.
74143 + * #include <xen/foreign_page.h>
74144 + */
74145 +#ifdef __ASM_XEN_FOREIGN_PAGE_H__
74146 +# error "don't include include/xen/foreign_page.h!"
74147 +#endif
74148 +
74149 +extern struct address_space xen_ia64_foreign_dummy_mapping;
74150 +#define PageForeign(page)      \
74151 +       ((page)->mapping == &xen_ia64_foreign_dummy_mapping)
74152 +
74153 +#define SetPageForeign(page, dtor) do {                                \
74154 +       set_page_private((page), (unsigned long)(dtor));        \
74155 +       (page)->mapping = &xen_ia64_foreign_dummy_mapping;      \
74156 +       smp_rmb();                                              \
74157 +} while (0)
74158 +
74159 +#define ClearPageForeign(page) do {    \
74160 +       (page)->mapping = NULL;         \
74161 +       smp_rmb();                      \
74162 +       set_page_private((page), 0);    \
74163 +} while (0)
74164 +
74165 +#define PageForeignDestructor(page)    \
74166 +       ( (void (*) (struct page *)) page_private(page) )
74167 +
74168 +#define arch_free_page(_page,_order)                   \
74169 +({      int foreign = PageForeign(_page);               \
74170 +       if (foreign)                                    \
74171 +               (PageForeignDestructor(_page))(_page);  \
74172 +       foreign;                                        \
74173 +})
74174 +#define HAVE_ARCH_FREE_PAGE
74175 +
74176 +/* XXX xen page size != page size */
74177 +
74178 +static inline unsigned long
74179 +pfn_to_mfn_for_dma(unsigned long pfn)
74180 +{
74181 +       unsigned long mfn;
74182 +       mfn = HYPERVISOR_phystomach(pfn);
74183 +       BUG_ON(mfn == 0); // XXX
74184 +       BUG_ON(mfn == INVALID_P2M_ENTRY); // XXX
74185 +       BUG_ON(mfn == INVALID_MFN);
74186 +       return mfn;
74187 +}
74188 +
74189 +static inline unsigned long
74190 +phys_to_machine_for_dma(unsigned long phys)
74191 +{
74192 +       unsigned long machine =
74193 +                     pfn_to_mfn_for_dma(phys >> PAGE_SHIFT) << PAGE_SHIFT;
74194 +       machine |= (phys & ~PAGE_MASK);
74195 +       return machine;
74196 +}
74197 +
74198 +static inline unsigned long
74199 +mfn_to_pfn_for_dma(unsigned long mfn)
74200 +{
74201 +       unsigned long pfn;
74202 +       pfn = HYPERVISOR_machtophys(mfn);
74203 +       BUG_ON(pfn == 0);
74204 +       //BUG_ON(pfn == INVALID_M2P_ENTRY);
74205 +       return pfn;
74206 +}
74207 +
74208 +static inline unsigned long
74209 +machine_to_phys_for_dma(unsigned long machine)
74210 +{
74211 +       unsigned long phys =
74212 +                     mfn_to_pfn_for_dma(machine >> PAGE_SHIFT) << PAGE_SHIFT;
74213 +       phys |= (machine & ~PAGE_MASK);
74214 +       return phys;
74215 +}
74216 +
74217 +#define set_phys_to_machine(pfn, mfn) do { } while (0)
74218 +#define xen_machphys_update(mfn, pfn) do { } while (0)
74219 +
74220 +/* XXX to compile set_phys_to_machine(vaddr, FOREIGN_FRAME(m)) */
74221 +#define FOREIGN_FRAME(m)        (INVALID_P2M_ENTRY)
74222 +
74223 +#define mfn_to_pfn(mfn)                        (mfn)
74224 +#define mfn_to_virt(mfn)               (__va((mfn) << PAGE_SHIFT))
74225 +#define pfn_to_mfn(pfn)                        (pfn)
74226 +#define virt_to_mfn(virt)              (__pa(virt) >> PAGE_SHIFT)
74227 +#define virt_to_machine(virt)          __pa(virt) // for tpmfront.c
74228 +
74229 +static inline unsigned long
74230 +mfn_to_local_pfn(unsigned long mfn)
74231 +{
74232 +       extern unsigned long max_mapnr;
74233 +       unsigned long pfn = mfn_to_pfn(mfn);
74234 +       if (!pfn_valid(pfn))
74235 +               return INVALID_P2M_ENTRY;
74236 +       return pfn;
74237 +}
74238 +
74239 +#endif /* CONFIG_XEN_IA64_DOM0_VP */
74240 +#endif /* CONFIG_XEN */
74241 +#endif /* __ASSEMBLY__ */
74242 +
74243  #endif /* _ASM_IA64_PAGE_H */
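
Under DOM0_VP a guest (pseudo-physical) address must be translated frame by frame to a machine address before a device can DMA to it; pfn_to_mfn_for_dma() and phys_to_machine_for_dma() wrap the dom0vp hypercall and preserve the offset within the page. A hedged sketch of the same arithmetic with a made-up frame number:

    /* Hedged sketch; the pfn and offset are invented for illustration. */
    static unsigned long translate_example(void)
    {
            unsigned long phys = (0x1234UL << PAGE_SHIFT) | 0x10;
            unsigned long mfn  = HYPERVISOR_phystomach(phys >> PAGE_SHIFT);

            /* same byte offset, hypervisor-assigned frame */
            return (mfn << PAGE_SHIFT) | (phys & ~PAGE_MASK);
    }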
74244 diff -urNp linux-2.6/include/asm-ia64/pal.h new/include/asm-ia64/pal.h
74245 --- linux-2.6/include/asm-ia64/pal.h    2006-07-03 14:15:09.000000000 +0200
74246 +++ new/include/asm-ia64/pal.h  2006-05-09 12:35:19.000000000 +0200
74247 @@ -82,6 +82,7 @@
74248  #ifndef __ASSEMBLY__
74249  
74250  #include <linux/types.h>
74251 +#include <asm/processor.h>
74252  #include <asm/fpu.h>
74253  
74254  /*
74255 diff -urNp linux-2.6/include/asm-ia64/pgalloc.h new/include/asm-ia64/pgalloc.h
74256 --- linux-2.6/include/asm-ia64/pgalloc.h        2006-07-03 14:15:09.000000000 +0200
74257 +++ new/include/asm-ia64/pgalloc.h      2006-05-09 12:35:19.000000000 +0200
74258 @@ -126,7 +126,7 @@ static inline void pmd_free(pmd_t * pmd)
74259  static inline void
74260  pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, struct page *pte)
74261  {
74262 -       pmd_val(*pmd_entry) = page_to_phys(pte);
74263 +       pmd_val(*pmd_entry) = page_to_pseudophys(pte);
74264  }
74265  
74266  static inline void
74267 diff -urNp linux-2.6/include/asm-ia64/privop.h new/include/asm-ia64/privop.h
74268 --- linux-2.6/include/asm-ia64/privop.h 1970-01-01 01:00:00.000000000 +0100
74269 +++ new/include/asm-ia64/privop.h       2006-06-28 14:32:14.000000000 +0200
74270 @@ -0,0 +1,59 @@
74271 +#ifndef _ASM_IA64_PRIVOP_H
74272 +#define _ASM_IA64_PRIVOP_H
74273 +
74274 +/*
74275 + * Copyright (C) 2005 Hewlett-Packard Co
74276 + *     Dan Magenheimer <dan.magenheimer@hp.com>
74277 + *
74278 + */
74279 +
74280 +#ifdef CONFIG_XEN
74281 +#include <asm/xen/privop.h>
74282 +#endif
74283 +
74284 +#ifndef __ASSEMBLY
74285 +
74286 +#ifndef IA64_PARAVIRTUALIZED
74287 +
74288 +#define ia64_getreg                    __ia64_getreg
74289 +#define ia64_setreg                    __ia64_setreg
74290 +#define ia64_hint                      __ia64_hint
74291 +#define ia64_thash                     __ia64_thash
74292 +#define ia64_itci                      __ia64_itci
74293 +#define ia64_itcd                      __ia64_itcd
74294 +#define ia64_itri                      __ia64_itri
74295 +#define ia64_itrd                      __ia64_itrd
74296 +#define ia64_tpa                       __ia64_tpa
74297 +#define ia64_set_ibr                   __ia64_set_ibr
74298 +#define ia64_set_pkr                   __ia64_set_pkr
74299 +#define ia64_set_pmc                   __ia64_set_pmc
74300 +#define ia64_set_pmd                   __ia64_set_pmd
74301 +#define ia64_set_rr                    __ia64_set_rr
74302 +#define ia64_get_cpuid                 __ia64_get_cpuid
74303 +#define ia64_get_ibr                   __ia64_get_ibr
74304 +#define ia64_get_pkr                   __ia64_get_pkr
74305 +#define ia64_get_pmc                   __ia64_get_pmc
74306 +#define ia64_get_pmd                   __ia64_get_pmd
74307 +#define ia64_get_rr                    __ia64_get_rr
74308 +#define ia64_fc                                __ia64_fc
74309 +#define ia64_ssm                       __ia64_ssm
74310 +#define ia64_rsm                       __ia64_rsm
74311 +#define ia64_ptce                      __ia64_ptce
74312 +#define ia64_ptcga                     __ia64_ptcga
74313 +#define ia64_ptcl                      __ia64_ptcl
74314 +#define ia64_ptri                      __ia64_ptri
74315 +#define ia64_ptrd                      __ia64_ptrd
74316 +#define ia64_get_psr_i                 __ia64_get_psr_i
74317 +#define ia64_intrin_local_irq_restore  __ia64_intrin_local_irq_restore
74318 +#define ia64_pal_halt_light            __ia64_pal_halt_light
74319 +#define ia64_leave_kernel              __ia64_leave_kernel
74320 +#define ia64_leave_syscall             __ia64_leave_syscall
74321 +#define ia64_trace_syscall             __ia64_trace_syscall
74322 +#define ia64_switch_to                 __ia64_switch_to
74323 +#define ia64_pal_call_static           __ia64_pal_call_static
74324 +
74325 +#endif /* !IA64_PARAVIRTUALIZED */
74326 +
74327 +#endif /* !__ASSEMBLY */
74328 +
74329 +#endif /* _ASM_IA64_PRIVOP_H */
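
asm-ia64/privop.h is the switch point for the paravirtualization: under CONFIG_XEN it pulls in asm/xen/privop.h, which defines IA64_PARAVIRTUALIZED and binds the generic ia64_* names to the xen_* versions; otherwise the block above maps every name straight onto the raw __ia64_* intrinsic. Callers stay oblivious either way, e.g. (hedged sketch, not from the patch):

    /* Hedged sketch: the same source resolves to __ia64_get_psr_i() in a
     * native build and to xen_get_psr_i() under CONFIG_XEN. */
    static inline int my_irqs_enabled(void)
    {
            return (ia64_get_psr_i() & IA64_PSR_I) != 0;
    }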
74330 diff -urNp linux-2.6/include/asm-ia64/processor.h new/include/asm-ia64/processor.h
74331 --- linux-2.6/include/asm-ia64/processor.h      2006-07-03 14:15:09.000000000 +0200
74332 +++ new/include/asm-ia64/processor.h    2006-05-09 12:35:19.000000000 +0200
74333 @@ -19,6 +19,7 @@
74334  #include <asm/kregs.h>
74335  #include <asm/ptrace.h>
74336  #include <asm/ustack.h>
74337 +#include <asm/privop.h>
74338  
74339  #define IA64_NUM_DBG_REGS      8
74340  /*
74341 diff -urNp linux-2.6/include/asm-ia64/synch_bitops.h new/include/asm-ia64/synch_bitops.h
74342 --- linux-2.6/include/asm-ia64/synch_bitops.h   1970-01-01 01:00:00.000000000 +0100
74343 +++ new/include/asm-ia64/synch_bitops.h 2006-07-07 15:10:03.000000000 +0200
74344 @@ -0,0 +1,63 @@
74345 +#ifndef __XEN_SYNCH_BITOPS_H__
74346 +#define __XEN_SYNCH_BITOPS_H__
74347 +
74348 +/*
74349 + * Copyright 1992, Linus Torvalds.
74350 + * Heavily modified to provide guaranteed strong synchronisation
74351 + * when communicating with Xen or other guest OSes running on other CPUs.
74352 + */
74353 +
74354 +#include <linux/config.h>
74355 +
74356 +#define ADDR (*(volatile long *) addr)
74357 +
74358 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
74359 +{
74360 +       set_bit(nr, addr);
74361 +}
74362 +
74363 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
74364 +{
74365 +       clear_bit(nr, addr);
74366 +}
74367 +
74368 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
74369 +{
74370 +       change_bit(nr, addr);
74371 +}
74372 +
74373 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
74374 +{
74375 +    return test_and_set_bit(nr, addr);
74376 +}
74377 +
74378 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
74379 +{
74380 +    return test_and_clear_bit(nr, addr);
74381 +}
74382 +
74383 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
74384 +{
74385 +    return test_and_change_bit(nr, addr);
74386 +}
74387 +
74388 +static __inline__ int synch_const_test_bit(int nr, const volatile void * addr)
74389 +{
74390 +    return test_bit(nr, addr);
74391 +}
74392 +
74393 +static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
74394 +{
74395 +    return test_bit(nr, addr);
74396 +}
74397 +
74398 +#define synch_cmpxchg  ia64_cmpxchg4_acq
74399 +
74400 +#define synch_test_bit(nr,addr) \
74401 +(__builtin_constant_p(nr) ? \
74402 + synch_const_test_bit((nr),(addr)) : \
74403 + synch_var_test_bit((nr),(addr)))
74404 +
74405 +#define synch_cmpxchg_subword synch_cmpxchg
74406 +
74407 +#endif /* __XEN_SYNCH_BITOPS_H__ */
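
On ia64 the regular atomic bitops already provide the acquire semantics needed when a bitfield is shared with the hypervisor or another guest, so the synch_* operations simply forward to them and synch_cmpxchg maps to ia64_cmpxchg4_acq. A hedged sketch of the kind of cross-domain flag these are intended for (the shared-page pointer is illustrative):

    /* Hedged sketch: claim a request slot whose flag word lives in a page
     * shared with another domain.  "flags" points into that shared page. */
    static int claim_slot(volatile void *flags, int slot)
    {
            /* non-zero means the slot was already claimed */
            return synch_test_and_set_bit(slot, flags);
    }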
74408 diff -urNp linux-2.6/include/asm-ia64/system.h new/include/asm-ia64/system.h
74409 --- linux-2.6/include/asm-ia64/system.h 2006-07-03 14:15:09.000000000 +0200
74410 +++ new/include/asm-ia64/system.h       2006-05-09 12:35:20.000000000 +0200
74411 @@ -125,7 +125,7 @@ extern struct ia64_boot_param {
74412  #define __local_irq_save(x)                    \
74413  do {                                           \
74414         ia64_stop();                            \
74415 -       (x) = ia64_getreg(_IA64_REG_PSR);       \
74416 +       (x) = ia64_get_psr_i();                 \
74417         ia64_stop();                            \
74418         ia64_rsm(IA64_PSR_I);                   \
74419  } while (0)
74420 @@ -173,7 +173,7 @@ do {                                                                \
74421  #endif /* !CONFIG_IA64_DEBUG_IRQ */
74422  
74423  #define local_irq_enable()     ({ ia64_stop(); ia64_ssm(IA64_PSR_I); ia64_srlz_d(); })
74424 -#define local_save_flags(flags)        ({ ia64_stop(); (flags) = ia64_getreg(_IA64_REG_PSR); })
74425 +#define local_save_flags(flags)        ({ ia64_stop(); (flags) = ia64_get_psr_i(); })
74426  
74427  #define irqs_disabled()                                \
74428  ({                                             \
74429 diff -urNp linux-2.6/include/asm-ia64/xen/privop.h new/include/asm-ia64/xen/privop.h
74430 --- linux-2.6/include/asm-ia64/xen/privop.h     1970-01-01 01:00:00.000000000 +0100
74431 +++ new/include/asm-ia64/xen/privop.h   2006-06-28 14:32:14.000000000 +0200
74432 @@ -0,0 +1,277 @@
74433 +#ifndef _ASM_IA64_XEN_PRIVOP_H
74434 +#define _ASM_IA64_XEN_PRIVOP_H
74435 +
74436 +/*
74437 + * Copyright (C) 2005 Hewlett-Packard Co
74438 + *     Dan Magenheimer <dan.magenheimer@hp.com>
74439 + *
74440 + * Paravirtualizations of privileged operations for Xen/ia64
74441 + *
74442 + */
74443 +
74444 +
74445 +#include <asm/xen/asm-xsi-offsets.h>
74446 +#include <xen/interface/arch-ia64.h>
74447 +
74448 +#define IA64_PARAVIRTUALIZED
74449 +
74450 +#ifdef __ASSEMBLY__
74451 +#define        XEN_HYPER_RFI                   break HYPERPRIVOP_RFI
74452 +#define        XEN_HYPER_RSM_PSR_DT            break HYPERPRIVOP_RSM_DT
74453 +#define        XEN_HYPER_SSM_PSR_DT            break HYPERPRIVOP_SSM_DT
74454 +#define        XEN_HYPER_COVER                 break HYPERPRIVOP_COVER
74455 +#define        XEN_HYPER_ITC_D                 break HYPERPRIVOP_ITC_D
74456 +#define        XEN_HYPER_ITC_I                 break HYPERPRIVOP_ITC_I
74457 +#define        XEN_HYPER_SSM_I                 break HYPERPRIVOP_SSM_I
74458 +#define        XEN_HYPER_GET_IVR               break HYPERPRIVOP_GET_IVR
74459 +#define        XEN_HYPER_GET_TPR               break HYPERPRIVOP_GET_TPR
74460 +#define        XEN_HYPER_SET_TPR               break HYPERPRIVOP_SET_TPR
74461 +#define        XEN_HYPER_EOI                   break HYPERPRIVOP_EOI
74462 +#define        XEN_HYPER_SET_ITM               break HYPERPRIVOP_SET_ITM
74463 +#define        XEN_HYPER_THASH                 break HYPERPRIVOP_THASH
74464 +#define        XEN_HYPER_PTC_GA                break HYPERPRIVOP_PTC_GA
74465 +#define        XEN_HYPER_ITR_D                 break HYPERPRIVOP_ITR_D
74466 +#define        XEN_HYPER_GET_RR                break HYPERPRIVOP_GET_RR
74467 +#define        XEN_HYPER_SET_RR                break HYPERPRIVOP_SET_RR
74468 +#define        XEN_HYPER_SET_KR                break HYPERPRIVOP_SET_KR
74469 +#define        XEN_HYPER_FC                    break HYPERPRIVOP_FC
74470 +#define        XEN_HYPER_GET_CPUID             break HYPERPRIVOP_GET_CPUID
74471 +#define        XEN_HYPER_GET_PMD               break HYPERPRIVOP_GET_PMD
74472 +#define        XEN_HYPER_GET_EFLAG             break HYPERPRIVOP_GET_EFLAG
74473 +#define        XEN_HYPER_SET_EFLAG             break HYPERPRIVOP_SET_EFLAG
74474 +#endif
74475 +
74476 +#ifndef __ASSEMBLY__
74477 +#define        XEN_HYPER_SSM_I         asm("break %0" : : "i" (HYPERPRIVOP_SSM_I))
74478 +#define        XEN_HYPER_GET_IVR       asm("break %0" : : "i" (HYPERPRIVOP_GET_IVR))
74479 +
74480 +/************************************************/
74481 +/* Instructions paravirtualized for correctness */
74482 +/************************************************/
74483 +
74484 +/* "fc" and "thash" are privilege-sensitive instructions, meaning they
74485 + *  may have different semantics depending on whether they are executed
74486 + *  at PL0 vs PL!=0.  When paravirtualized, these instructions mustn't
74487 + *  be allowed to execute directly, lest incorrect semantics result. */
74488 +extern unsigned long xen_fc(unsigned long addr);
74489 +#define ia64_fc(addr)                  xen_fc((unsigned long)(addr))
74490 +extern unsigned long xen_thash(unsigned long addr);
74491 +#define ia64_thash(addr)               xen_thash((unsigned long)(addr))
74492 +/* Note that "ttag" and "cover" are also privilege-sensitive; "ttag"
74493 + * is not currently used (though it may be in a long-format VHPT system!)
74494 + * and the semantics of cover only change if psr.ic is off which is very
74495 + * rare (and currently non-existent outside of assembly code) */
74496 +
74497 +/* There are also privilege-sensitive registers.  These registers are
74498 + * readable at any privilege level but only writable at PL0. */
74499 +extern unsigned long xen_get_cpuid(int index);
74500 +#define        ia64_get_cpuid(i)               xen_get_cpuid(i)
74501 +extern unsigned long xen_get_pmd(int index);
74502 +#define        ia64_get_pmd(i)                 xen_get_pmd(i)
74503 +extern unsigned long xen_get_eflag(void);      /* see xen_ia64_getreg */
74504 +extern void xen_set_eflag(unsigned long);      /* see xen_ia64_setreg */
74505 +
74506 +/************************************************/
74507 +/* Instructions paravirtualized for performance */
74508 +/************************************************/
74509 +
74510 +/* Xen uses memory-mapped virtual privileged registers for access to many
74511 + * performance-sensitive privileged registers.  Some, like the processor
74512 + * status register (psr), are broken up into multiple memory locations.
74513 + * Others, like "pend", are abstractions based on privileged registers.
74514 + * "Pend" is guaranteed to be set if reading cr.ivr would return a
74515 + * (non-spurious) interrupt. */
74516 +#define XSI_PSR_I                      \
74517 +       (*(uint64_t *)(XSI_PSR_I_ADDR))
74518 +#define xen_get_virtual_psr_i()                \
74519 +       (!(*(uint8_t *)(XSI_PSR_I)))
74520 +#define xen_set_virtual_psr_i(_val)    \
74521 +       ({ *(uint8_t *)(XSI_PSR_I) = (uint8_t)(_val) ? 0:1; })
74522 +#define xen_set_virtual_psr_ic(_val)   \
74523 +       ({ *(int *)(XSI_PSR_IC) = _val ? 1:0; })
74524 +#define xen_get_virtual_pend()         (*(int *)(XSI_PEND))
74525 +
74526 +/* Hyperprivops are "break" instructions with a well-defined API.
74527 + * In particular, the virtual psr.ic bit must be off; in this way
74528 + * it is guaranteed to never conflict with a linux break instruction.
74529 + * Normally, this is done in a xen stub but this one is frequent enough
74530 + * that we inline it */
74531 +#define xen_hyper_ssm_i()                                              \
74532 +({                                                                     \
74533 +       xen_set_virtual_psr_i(0);                                       \
74534 +       xen_set_virtual_psr_ic(0);                                      \
74535 +       XEN_HYPER_SSM_I;                                                \
74536 +})
74537 +
74538 +/* turning off interrupts can be paravirtualized simply by writing
74539 + * to a memory-mapped virtual psr.i bit (implemented as a 16-bit bool) */
74540 +#define xen_rsm_i()    xen_set_virtual_psr_i(0)
74541 +
74542 +/* turning on interrupts is a bit more complicated.. write to the
74543 + * memory-mapped virtual psr.i bit first (to avoid race condition),
74544 + * then if any interrupts were pending, we have to execute a hyperprivop
74545 + * to ensure the pending interrupt gets delivered; else we're done! */
74546 +#define xen_ssm_i()                                                    \
74547 +({                                                                     \
74548 +       int old = xen_get_virtual_psr_i();                              \
74549 +       xen_set_virtual_psr_i(1);                                       \
74550 +       if (!old && xen_get_virtual_pend()) xen_hyper_ssm_i();          \
74551 +})
74552 +
74553 +#define xen_ia64_intrin_local_irq_restore(x)                           \
74554 +{                                                                      \
74555 +     if (is_running_on_xen()) {                                                \
74556 +       if ((x) & IA64_PSR_I) { xen_ssm_i(); }                          \
74557 +       else { xen_rsm_i(); }                                           \
74558 +    }                                                                  \
74559 +    else __ia64_intrin_local_irq_restore((x));                         \
74560 +}
74561 +
74562 +#define        xen_get_psr_i()                                                 \
74563 +(                                                                      \
74564 +       (is_running_on_xen()) ?                                         \
74565 +               (xen_get_virtual_psr_i() ? IA64_PSR_I : 0)              \
74566 +               : __ia64_get_psr_i()                                    \
74567 +)
74568 +
74569 +#define xen_ia64_ssm(mask)                                             \
74570 +{                                                                      \
74571 +       if ((mask)==IA64_PSR_I) {                                       \
74572 +               if (is_running_on_xen()) { xen_ssm_i(); }                       \
74573 +               else { __ia64_ssm(mask); }                              \
74574 +       }                                                               \
74575 +       else { __ia64_ssm(mask); }                                      \
74576 +}
74577 +
74578 +#define xen_ia64_rsm(mask)                                             \
74579 +{                                                                      \
74580 +       if ((mask)==IA64_PSR_I) {                                       \
74581 +               if (is_running_on_xen()) { xen_rsm_i(); }                       \
74582 +               else { __ia64_rsm(mask); }                              \
74583 +       }                                                               \
74584 +       else { __ia64_rsm(mask); }                                      \
74585 +}
74586 +
74587 +
74588 +/* Although all privileged operations can be left to trap and will
74589 + * be properly handled by Xen, some are frequent enough that we use
74590 + * hyperprivops for performance. */
74591 +
74592 +extern unsigned long xen_get_ivr(void);
74593 +extern unsigned long xen_get_tpr(void);
74594 +extern void xen_set_itm(unsigned long);
74595 +extern void xen_set_tpr(unsigned long);
74596 +extern void xen_eoi(void);
74597 +extern void xen_set_rr(unsigned long index, unsigned long val);
74598 +extern unsigned long xen_get_rr(unsigned long index);
74599 +extern void xen_set_kr(unsigned long index, unsigned long val);
74600 +extern void xen_ptcga(unsigned long addr, unsigned long size);
74601 +
74602 +/* Note: It may look wrong to test for is_running_on_xen() in each case.
74603 + * However, regnum is always a constant so, as written, the compiler
74604 + * eliminates the switch statement, whereas is_running_on_xen() must be
74605 + * tested dynamically. */
74606 +#define xen_ia64_getreg(regnum)                                                \
74607 +({                                                                     \
74608 +       __u64 ia64_intri_res;                                           \
74609 +                                                                       \
74610 +       switch(regnum) {                                                \
74611 +       case _IA64_REG_CR_IVR:                                          \
74612 +               ia64_intri_res = (is_running_on_xen()) ?                        \
74613 +                       xen_get_ivr() :                                 \
74614 +                       __ia64_getreg(regnum);                          \
74615 +               break;                                                  \
74616 +       case _IA64_REG_CR_TPR:                                          \
74617 +               ia64_intri_res = (is_running_on_xen()) ?                        \
74618 +                       xen_get_tpr() :                                 \
74619 +                       __ia64_getreg(regnum);                          \
74620 +               break;                                                  \
74621 +       case _IA64_REG_AR_EFLAG:                                        \
74622 +               ia64_intri_res = (is_running_on_xen()) ?                        \
74623 +                       xen_get_eflag() :                               \
74624 +                       __ia64_getreg(regnum);                          \
74625 +               break;                                                  \
74626 +       default:                                                        \
74627 +               ia64_intri_res = __ia64_getreg(regnum);                 \
74628 +               break;                                                  \
74629 +       }                                                               \
74630 +       ia64_intri_res;                                                 \
74631 +})
74632 +
74633 +#define xen_ia64_setreg(regnum,val)                                    \
74634 +({                                                                     \
74635 +       switch(regnum) {                                                \
74636 +       case _IA64_REG_AR_KR0 ... _IA64_REG_AR_KR7:                     \
74637 +               (is_running_on_xen()) ?                                 \
74638 +                       xen_set_kr((regnum-_IA64_REG_AR_KR0), val) :    \
74639 +                       __ia64_setreg(regnum,val);                      \
74640 +               break;                                                  \
74641 +       case _IA64_REG_CR_ITM:                                          \
74642 +               (is_running_on_xen()) ?                                 \
74643 +                       xen_set_itm(val) :                              \
74644 +                       __ia64_setreg(regnum,val);                      \
74645 +               break;                                                  \
74646 +       case _IA64_REG_CR_TPR:                                          \
74647 +               (is_running_on_xen()) ?                                 \
74648 +                       xen_set_tpr(val) :                              \
74649 +                       __ia64_setreg(regnum,val);                      \
74650 +               break;                                                  \
74651 +       case _IA64_REG_CR_EOI:                                          \
74652 +               (is_running_on_xen()) ?                                 \
74653 +                       xen_eoi() :                                     \
74654 +                       __ia64_setreg(regnum,val);                      \
74655 +               break;                                                  \
74656 +       case _IA64_REG_AR_EFLAG:                                        \
74657 +               (is_running_on_xen()) ?                                 \
74658 +                       xen_set_eflag(val) :                            \
74659 +                       __ia64_setreg(regnum,val);                      \
74660 +               break;                                                  \
74661 +       default:                                                        \
74662 +               __ia64_setreg(regnum,val);                              \
74663 +               break;                                                  \
74664 +       }                                                               \
74665 +})
74666 +
74667 +#define ia64_ssm                       xen_ia64_ssm
74668 +#define ia64_rsm                       xen_ia64_rsm
74669 +#define ia64_intrin_local_irq_restore  xen_ia64_intrin_local_irq_restore
74670 +#define        ia64_ptcga                      xen_ptcga
74671 +#define        ia64_set_rr(index,val)          xen_set_rr(index,val)
74672 +#define        ia64_get_rr(index)              xen_get_rr(index)
74673 +#define ia64_getreg                    xen_ia64_getreg
74674 +#define ia64_setreg                    xen_ia64_setreg
74675 +#define        ia64_get_psr_i                  xen_get_psr_i
74676 +
74677 +/* the remainder of these are not performance-sensitive so it's
74678 + * OK to not paravirtualize and just take a privop trap and emulate */
74679 +#define ia64_hint                      __ia64_hint
74680 +#define ia64_set_pmd                   __ia64_set_pmd
74681 +#define ia64_itci                      __ia64_itci
74682 +#define ia64_itcd                      __ia64_itcd
74683 +#define ia64_itri                      __ia64_itri
74684 +#define ia64_itrd                      __ia64_itrd
74685 +#define ia64_tpa                       __ia64_tpa
74686 +#define ia64_set_ibr                   __ia64_set_ibr
74687 +#define ia64_set_pkr                   __ia64_set_pkr
74688 +#define ia64_set_pmc                   __ia64_set_pmc
74689 +#define ia64_get_ibr                   __ia64_get_ibr
74690 +#define ia64_get_pkr                   __ia64_get_pkr
74691 +#define ia64_get_pmc                   __ia64_get_pmc
74692 +#define ia64_ptce                      __ia64_ptce
74693 +#define ia64_ptcl                      __ia64_ptcl
74694 +#define ia64_ptri                      __ia64_ptri
74695 +#define ia64_ptrd                      __ia64_ptrd
74696 +
74697 +#endif /* !__ASSEMBLY__ */
74698 +
74699 +/* these routines utilize privilege-sensitive or performance-sensitive
74700 + * privileged instructions so the code must be replaced with
74701 + * paravirtualized versions */
74702 +#define ia64_pal_halt_light            xen_pal_halt_light
74703 +#define        ia64_leave_kernel               xen_leave_kernel
74704 +#define        ia64_leave_syscall              xen_leave_syscall
74705 +#define        ia64_trace_syscall              xen_trace_syscall
74706 +#define        ia64_switch_to                  xen_switch_to
74707 +#define        ia64_pal_call_static            xen_pal_call_static
74708 +
74709 +#endif /* _ASM_IA64_XEN_PRIVOP_H */
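
The interrupt-enable fast path described in the comments above avoids a hyperprivop in the common case: xen_rsm_i() merely clears the memory-mapped virtual psr.i bit, and xen_ssm_i() sets it first and only drops into the hyperprivop when an interrupt was already pending. A hedged sketch of how the usual critical-section idiom behaves once these aliases are in place:

    /* Hedged sketch, not from the patch. */
    static void critical_section_example(void)
    {
            unsigned long flags;

            local_irq_save(flags);          /* reads ia64_get_psr_i(), then rsm:
                                             * under Xen just clears the virtual
                                             * psr.i bit, no trap                */
            /* ... touch data shared with an interrupt handler ... */
            local_irq_restore(flags);       /* under Xen: xen_ssm_i() only if
                                             * psr.i was set, and a hyperprivop
                                             * only if an event is pending      */
    }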
74710 diff -urNp linux-2.6/include/asm-um/page.h new/include/asm-um/page.h
74711 --- linux-2.6/include/asm-um/page.h     2006-07-03 14:15:14.000000000 +0200
74712 +++ new/include/asm-um/page.h   2006-05-09 12:35:40.000000000 +0200
74713 @@ -115,7 +115,7 @@ extern unsigned long uml_physmem;
74714  extern struct page *arch_validate(struct page *page, gfp_t mask, int order);
74715  #define HAVE_ARCH_VALIDATE
74716  
74717 -extern void arch_free_page(struct page *page, int order);
74718 +extern int arch_free_page(struct page *page, int order);
74719  #define HAVE_ARCH_FREE_PAGE
74720  
74721  #include <asm-generic/memory_model.h>
74722 diff -urNp linux-2.6/include/asm-x86_64/apic.h new/include/asm-x86_64/apic.h
74723 --- linux-2.6/include/asm-x86_64/apic.h 2006-07-03 14:15:14.000000000 +0200
74724 +++ new/include/asm-x86_64/apic.h       2006-05-09 12:35:41.000000000 +0200
74725 @@ -105,11 +105,13 @@ extern int disable_timer_pin_1;
74726  
74727  extern void setup_threshold_lvt(unsigned long lvt_off);
74728  
74729 +#ifndef CONFIG_XEN
74730  void smp_send_timer_broadcast_ipi(void);
74731  void switch_APIC_timer_to_ipi(void *cpumask);
74732  void switch_ipi_to_APIC_timer(void *cpumask);
74733  
74734  #define ARCH_APICTIMER_STOPS_ON_C3     1
74735 +#endif
74736  
74737  #endif /* CONFIG_X86_LOCAL_APIC */
74738  
74739 diff -urNp linux-2.6/include/asm-x86_64/hw_irq.h new/include/asm-x86_64/hw_irq.h
74740 --- linux-2.6/include/asm-x86_64/hw_irq.h       2006-07-03 14:15:15.000000000 +0200
74741 +++ new/include/asm-x86_64/hw_irq.h     2006-05-09 12:35:41.000000000 +0200
74742 @@ -127,7 +127,7 @@ asmlinkage void IRQ_NAME(nr); \
74743  __asm__( \
74744  "\n.p2align\n" \
74745  "IRQ" #nr "_interrupt:\n\t" \
74746 -       "push $" #nr "-256 ; " \
74747 +       "push $~(" #nr ") ; " \
74748         "jmp common_interrupt");
74749  
74750  #if defined(CONFIG_X86_IO_APIC)
74751 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/arch_hooks.h new/include/asm-x86_64/mach-xen/asm/arch_hooks.h
74752 --- linux-2.6/include/asm-x86_64/mach-xen/asm/arch_hooks.h      1970-01-01 01:00:00.000000000 +0100
74753 +++ new/include/asm-x86_64/mach-xen/asm/arch_hooks.h    2006-05-09 12:35:41.000000000 +0200
74754 @@ -0,0 +1,27 @@
74755 +#ifndef _ASM_ARCH_HOOKS_H
74756 +#define _ASM_ARCH_HOOKS_H
74757 +
74758 +#include <linux/interrupt.h>
74759 +
74760 +/*
74761 + *     linux/include/asm/arch_hooks.h
74762 + *
74763 + *     define the architecture-specific hooks
74764 + */
74765 +
74766 +/* these aren't arch hooks, they are generic routines
74767 + * that can be used by the hooks */
74768 +extern void init_ISA_irqs(void);
74769 +extern void apic_intr_init(void);
74770 +extern void smp_intr_init(void);
74771 +extern irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs);
74772 +
74773 +/* these are the defined hooks */
74774 +extern void intr_init_hook(void);
74775 +extern void pre_intr_init_hook(void);
74776 +extern void pre_setup_arch_hook(void);
74777 +extern void trap_init_hook(void);
74778 +extern void time_init_hook(void);
74779 +extern void mca_nmi_hook(void);
74780 +
74781 +#endif
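
A machine type that pulls in this header is expected to supply definitions for the hooks it declares; a minimal do-nothing sketch (purely illustrative, not part of the patch; the behaviour described in the comments is an assumption) could look like:

/* Illustrative sketch only -- not part of the patch. */
void pre_setup_arch_hook(void)
{
	/* nothing to fix up before setup_arch() on this hypothetical platform */
}

void time_init_hook(void)
{
	/* a real implementation would register the timer interrupt here */
}
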
74782 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/bootsetup.h new/include/asm-x86_64/mach-xen/asm/bootsetup.h
74783 --- linux-2.6/include/asm-x86_64/mach-xen/asm/bootsetup.h       1970-01-01 01:00:00.000000000 +0100
74784 +++ new/include/asm-x86_64/mach-xen/asm/bootsetup.h     2006-05-09 12:35:41.000000000 +0200
74785 @@ -0,0 +1,42 @@
74786 +
74787 +#ifndef _X86_64_BOOTSETUP_H
74788 +#define _X86_64_BOOTSETUP_H 1
74789 +
74790 +#define BOOT_PARAM_SIZE                4096
74791 +extern char x86_boot_params[BOOT_PARAM_SIZE];
74792 +
74793 +/*
74794 + * This is set up by the setup-routine at boot-time
74795 + */
74796 +#define PARAM  ((unsigned char *)x86_boot_params)
74797 +#define SCREEN_INFO (*(struct screen_info *) (PARAM+0))
74798 +#define EXT_MEM_K (*(unsigned short *) (PARAM+2))
74799 +#define ALT_MEM_K (*(unsigned int *) (PARAM+0x1e0))
74800 +#define E820_MAP_NR (*(char*) (PARAM+E820NR))
74801 +#define E820_MAP    ((struct e820entry *) (PARAM+E820MAP))
74802 +#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40))
74803 +#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80))
74804 +#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0))
74805 +#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
74806 +#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
74807 +#define SAVED_VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
74808 +#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
74809 +#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
74810 +#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
74811 +#define KERNEL_START (*(unsigned int *) (PARAM+0x214))
74812 +
74813 +#define INITRD_START (__pa(xen_start_info->mod_start))
74814 +#define INITRD_SIZE (xen_start_info->mod_len)
74815 +#define EDID_INFO   (*(struct edid_info *) (PARAM+0x440))
74816 +
74817 +#define EDD_NR     (*(unsigned char *) (PARAM+EDDNR))
74818 +#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
74819 +#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF))
74820 +#define EDD_BUF     ((struct edd_info *) (PARAM+EDDBUF))
74821 +#define COMMAND_LINE saved_command_line
74822 +
74823 +#define RAMDISK_IMAGE_START_MASK       0x07FF
74824 +#define RAMDISK_PROMPT_FLAG            0x8000
74825 +#define RAMDISK_LOAD_FLAG              0x4000  
74826 +
74827 +#endif
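
These accessors all index into the same 4 KiB boot-parameter block; as a hedged illustration (not part of the patch), one of them boils down to a fixed-offset read like this:

/* Illustrative sketch only -- not part of the patch. */
extern char x86_boot_params[4096];

static unsigned short example_saved_video_mode(void)
{
	/* SAVED_VIDEO_MODE above reads a 16-bit field at offset 0x1FA */
	return *(unsigned short *)(x86_boot_params + 0x1FA);
}
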
74828 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/desc.h new/include/asm-x86_64/mach-xen/asm/desc.h
74829 --- linux-2.6/include/asm-x86_64/mach-xen/asm/desc.h    1970-01-01 01:00:00.000000000 +0100
74830 +++ new/include/asm-x86_64/mach-xen/asm/desc.h  2006-05-09 12:35:41.000000000 +0200
74831 @@ -0,0 +1,263 @@
74832 +/* Written 2000 by Andi Kleen */ 
74833 +#ifndef __ARCH_DESC_H
74834 +#define __ARCH_DESC_H
74835 +
74836 +#include <linux/threads.h>
74837 +#include <asm/ldt.h>
74838 +
74839 +#ifndef __ASSEMBLY__
74840 +
74841 +#include <linux/string.h>
74842 +#include <linux/smp.h>
74843 +
74844 +#include <asm/segment.h>
74845 +#include <asm/mmu.h>
74846 +
74847 +// 8 byte segment descriptor
74848 +struct desc_struct { 
74849 +       u16 limit0;
74850 +       u16 base0;
74851 +       unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
74852 +       unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
74853 +} __attribute__((packed)); 
74854 +
74855 +struct n_desc_struct { 
74856 +       unsigned int a,b;
74857 +};     
74858 +
74859 +enum { 
74860 +       GATE_INTERRUPT = 0xE, 
74861 +       GATE_TRAP = 0xF,        
74862 +       GATE_CALL = 0xC,
74863 +};     
74864 +
74865 +// 16byte gate
74866 +struct gate_struct {          
74867 +       u16 offset_low;
74868 +       u16 segment; 
74869 +       unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
74870 +       u16 offset_middle;
74871 +       u32 offset_high;
74872 +       u32 zero1; 
74873 +} __attribute__((packed));
74874 +
74875 +#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF) 
74876 +#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
74877 +#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
74878 +
74879 +enum { 
74880 +       DESC_TSS = 0x9,
74881 +       DESC_LDT = 0x2,
74882 +}; 
74883 +
74884 +// LDT or TSS descriptor in the GDT. 16 bytes.
74885 +struct ldttss_desc { 
74886 +       u16 limit0;
74887 +       u16 base0;
74888 +       unsigned base1 : 8, type : 5, dpl : 2, p : 1;
74889 +       unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
74890 +       u32 base3;
74891 +       u32 zero1; 
74892 +} __attribute__((packed)); 
74893 +
74894 +struct desc_ptr {
74895 +       unsigned short size;
74896 +       unsigned long address;
74897 +} __attribute__((packed)) ;
74898 +
74899 +extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
74900 +
74901 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
74902 +
74903 +#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
74904 +#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
74905 +
74906 +static inline void clear_LDT(void)
74907 +{
74908 +       int cpu = get_cpu();
74909 +
74910 +       /*
74911 +        * NB. We load the default_ldt for lcall7/27 handling on demand, as
74912 +        * it slows down context switching. No one uses it anyway.
74913 +        */
74914 +       cpu = cpu;              /* XXX avoid compiler warning */
74915 +       xen_set_ldt(0UL, 0);
74916 +       put_cpu();
74917 +}
74918 +
74919 +/*
74920 + * This is the ldt that every process will get unless we need
74921 + * something other than this.
74922 + */
74923 +extern struct desc_struct default_ldt[];
74924 +#ifndef CONFIG_X86_NO_IDT
74925 +extern struct gate_struct idt_table[]; 
74926 +#endif
74927 +extern struct desc_ptr cpu_gdt_descr[];
74928 +
74929 +/* the cpu gdt accessor */
74930 +#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
74931 +
74932 +static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)  
74933 +{
74934 +       struct gate_struct s;   
74935 +       s.offset_low = PTR_LOW(func); 
74936 +       s.segment = __KERNEL_CS;
74937 +       s.ist = ist; 
74938 +       s.p = 1;
74939 +       s.dpl = dpl; 
74940 +       s.zero0 = 0;
74941 +       s.zero1 = 0; 
74942 +       s.type = type; 
74943 +       s.offset_middle = PTR_MIDDLE(func); 
74944 +       s.offset_high = PTR_HIGH(func); 
74945 +       /* does not need to be atomic because it is only done once at setup time */ 
74946 +       memcpy(adr, &s, 16); 
74947 +} 
74948 +
74949 +#ifndef CONFIG_X86_NO_IDT
74950 +static inline void set_intr_gate(int nr, void *func) 
74951 +{ 
74952 +       BUG_ON((unsigned)nr > 0xFF);
74953 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); 
74954 +} 
74955 +
74956 +static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) 
74957 +{ 
74958 +       BUG_ON((unsigned)nr > 0xFF);
74959 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); 
74960 +} 
74961 +
74962 +static inline void set_system_gate(int nr, void *func) 
74963 +{ 
74964 +       BUG_ON((unsigned)nr > 0xFF);
74965 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); 
74966 +} 
74967 +
74968 +static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
74969 +{
74970 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
74971 +}
74972 +#endif
74973 +
74974 +static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type, 
74975 +                                        unsigned size) 
74976 +{ 
74977 +       struct ldttss_desc d;
74978 +       memset(&d,0,sizeof(d)); 
74979 +       d.limit0 = size & 0xFFFF;
74980 +       d.base0 = PTR_LOW(tss); 
74981 +       d.base1 = PTR_MIDDLE(tss) & 0xFF; 
74982 +       d.type = type;
74983 +       d.p = 1; 
74984 +       d.limit1 = (size >> 16) & 0xF;
74985 +       d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF; 
74986 +       d.base3 = PTR_HIGH(tss); 
74987 +       memcpy(ptr, &d, 16); 
74988 +}
74989 +
74990 +#ifndef CONFIG_X86_NO_TSS
74991 +static inline void set_tss_desc(unsigned cpu, void *addr)
74992 +{ 
74993 +       /*
74994 +        * sizeof(unsigned long) coming from an extra "long" at the end
74995 +        * of the iobitmap. See tss_struct definition in processor.h
74996 +        *
74997 +        * -1? seg base+limit should be pointing to the address of the
74998 +        * last valid byte
74999 +        */
75000 +       set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS], 
75001 +               (unsigned long)addr, DESC_TSS,
75002 +               IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
75003 +} 
75004 +#endif
75005 +
75006 +static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
75007 +{ 
75008 +       set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
75009 +                             DESC_LDT, size * 8 - 1);
75010 +}
75011 +
75012 +static inline void set_seg_base(unsigned cpu, int entry, void *base)
75013 +{ 
75014 +       struct desc_struct *d = &cpu_gdt(cpu)[entry];
75015 +       u32 addr = (u32)(u64)base;
75016 +       BUG_ON((u64)base >> 32); 
75017 +       d->base0 = addr & 0xffff;
75018 +       d->base1 = (addr >> 16) & 0xff;
75019 +       d->base2 = (addr >> 24) & 0xff;
75020 +} 
75021 +
75022 +#define LDT_entry_a(info) \
75023 +       ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
75024 +/* Don't allow setting of the lm bit. It is useless anyway because
75025 +   64-bit system calls require __USER_CS. */
75026 +#define LDT_entry_b(info) \
75027 +       (((info)->base_addr & 0xff000000) | \
75028 +       (((info)->base_addr & 0x00ff0000) >> 16) | \
75029 +       ((info)->limit & 0xf0000) | \
75030 +       (((info)->read_exec_only ^ 1) << 9) | \
75031 +       ((info)->contents << 10) | \
75032 +       (((info)->seg_not_present ^ 1) << 15) | \
75033 +       ((info)->seg_32bit << 22) | \
75034 +       ((info)->limit_in_pages << 23) | \
75035 +       ((info)->useable << 20) | \
75036 +       /* ((info)->lm << 21) | */ \
75037 +       0x7000)
75038 +
75039 +#define LDT_empty(info) (\
75040 +       (info)->base_addr       == 0    && \
75041 +       (info)->limit           == 0    && \
75042 +       (info)->contents        == 0    && \
75043 +       (info)->read_exec_only  == 1    && \
75044 +       (info)->seg_32bit       == 0    && \
75045 +       (info)->limit_in_pages  == 0    && \
75046 +       (info)->seg_not_present == 1    && \
75047 +       (info)->useable         == 0    && \
75048 +       (info)->lm              == 0)
75049 +
75050 +#if TLS_SIZE != 24
75051 +# error update this code.
75052 +#endif
75053 +
75054 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
75055 +{
75056 +#if 0
75057 +       u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
75058 +       gdt[0] = t->tls_array[0];
75059 +       gdt[1] = t->tls_array[1];
75060 +       gdt[2] = t->tls_array[2];
75061 +#endif
75062 +#define C(i) \
75063 +       HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), t->tls_array[i])
75064 +
75065 +       C(0); C(1); C(2);
75066 +#undef C
75067 +} 
75068 +
75069 +/*
75070 + * load one particular LDT into the current CPU
75071 + */
75072 +static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
75073 +{
75074 +       void *segments = pc->ldt;
75075 +       int count = pc->size;
75076 +
75077 +       if (likely(!count))
75078 +               segments = NULL;
75079 +
75080 +       xen_set_ldt((unsigned long)segments, count);
75081 +}
75082 +
75083 +static inline void load_LDT(mm_context_t *pc)
75084 +{
75085 +       int cpu = get_cpu();
75086 +       load_LDT_nolock(pc, cpu);
75087 +       put_cpu();
75088 +}
75089 +
75090 +extern struct desc_ptr idt_descr;
75091 +
75092 +#endif /* !__ASSEMBLY__ */
75093 +
75094 +#endif
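
The LDT_entry_a()/LDT_entry_b() macros above split a user_desc-style request into the two 32-bit words of a segment descriptor. A hedged sketch of their use (not part of the patch; the struct below is only a stand-in for struct user_desc):

/* Illustrative sketch only -- not part of the patch. */
struct example_ldt_req {
	unsigned int base_addr;
	unsigned int limit;
	unsigned int seg_32bit:1, contents:2, read_exec_only:1,
		     limit_in_pages:1, seg_not_present:1, useable:1;
};

static void example_pack_ldt(const struct example_ldt_req *info, u32 out[2])
{
	out[0] = LDT_entry_a(info);	/* low word: base[15:0], limit[15:0] */
	out[1] = LDT_entry_b(info);	/* high word: base[31:16], flags, DPL 3 */
}
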
75095 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/dma-mapping.h new/include/asm-x86_64/mach-xen/asm/dma-mapping.h
75096 --- linux-2.6/include/asm-x86_64/mach-xen/asm/dma-mapping.h     1970-01-01 01:00:00.000000000 +0100
75097 +++ new/include/asm-x86_64/mach-xen/asm/dma-mapping.h   2006-05-09 12:35:41.000000000 +0200
75098 @@ -0,0 +1,191 @@
75099 +#ifndef _X8664_DMA_MAPPING_H
75100 +#define _X8664_DMA_MAPPING_H 1
75101 +
75102 +/*
75103 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
75104 + * documentation.
75105 + */
75106 +
75107 +#include <linux/config.h>
75108 +
75109 +#include <asm/scatterlist.h>
75110 +#include <asm/io.h>
75111 +#include <asm/swiotlb.h>
75112 +
75113 +struct dma_mapping_ops {
75114 +       int             (*mapping_error)(dma_addr_t dma_addr);
75115 +       void*           (*alloc_coherent)(struct device *dev, size_t size,
75116 +                                dma_addr_t *dma_handle, gfp_t gfp);
75117 +       void            (*free_coherent)(struct device *dev, size_t size,
75118 +                                void *vaddr, dma_addr_t dma_handle);
75119 +       dma_addr_t      (*map_single)(struct device *hwdev, void *ptr,
75120 +                                size_t size, int direction);
75121 +       /* like map_single, but doesn't check the device mask */
75122 +       dma_addr_t      (*map_simple)(struct device *hwdev, char *ptr,
75123 +                                size_t size, int direction);
75124 +       void            (*unmap_single)(struct device *dev, dma_addr_t addr,
75125 +                               size_t size, int direction);
75126 +       void            (*sync_single_for_cpu)(struct device *hwdev,
75127 +                               dma_addr_t dma_handle, size_t size,
75128 +                               int direction);
75129 +       void            (*sync_single_for_device)(struct device *hwdev,
75130 +                                dma_addr_t dma_handle, size_t size,
75131 +                               int direction);
75132 +       void            (*sync_single_range_for_cpu)(struct device *hwdev,
75133 +                                dma_addr_t dma_handle, unsigned long offset,
75134 +                               size_t size, int direction);
75135 +       void            (*sync_single_range_for_device)(struct device *hwdev,
75136 +                               dma_addr_t dma_handle, unsigned long offset,
75137 +                               size_t size, int direction);
75138 +       void            (*sync_sg_for_cpu)(struct device *hwdev,
75139 +                                struct scatterlist *sg, int nelems,
75140 +                               int direction);
75141 +       void            (*sync_sg_for_device)(struct device *hwdev,
75142 +                               struct scatterlist *sg, int nelems,
75143 +                               int direction);
75144 +       int             (*map_sg)(struct device *hwdev, struct scatterlist *sg,
75145 +                               int nents, int direction);
75146 +       void            (*unmap_sg)(struct device *hwdev,
75147 +                               struct scatterlist *sg, int nents,
75148 +                               int direction);
75149 +       int             (*dma_supported)(struct device *hwdev, u64 mask);
75150 +       int             is_phys;
75151 +};
75152 +
75153 +extern dma_addr_t bad_dma_address;
75154 +extern struct dma_mapping_ops* dma_ops;
75155 +extern int iommu_merge;
75156 +
75157 +#if 0
75158 +static inline int dma_mapping_error(dma_addr_t dma_addr)
75159 +{
75160 +       if (dma_ops->mapping_error)
75161 +               return dma_ops->mapping_error(dma_addr);
75162 +
75163 +       return (dma_addr == bad_dma_address);
75164 +}
75165 +
75166 +extern void *dma_alloc_coherent(struct device *dev, size_t size,
75167 +                               dma_addr_t *dma_handle, gfp_t gfp);
75168 +extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
75169 +                             dma_addr_t dma_handle);
75170 +
75171 +static inline dma_addr_t
75172 +dma_map_single(struct device *hwdev, void *ptr, size_t size,
75173 +              int direction)
75174 +{
75175 +       return dma_ops->map_single(hwdev, ptr, size, direction);
75176 +}
75177 +
75178 +static inline void
75179 +dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
75180 +                int direction)
75181 +{
75182 +       dma_ops->unmap_single(dev, addr, size, direction);
75183 +}
75184 +
75185 +#define dma_map_page(dev,page,offset,size,dir) \
75186 +       dma_map_single((dev), page_address(page)+(offset), (size), (dir))
75187 +
75188 +#define dma_unmap_page dma_unmap_single
75189 +
75190 +static inline void
75191 +dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
75192 +                       size_t size, int direction)
75193 +{
75194 +       if (dma_ops->sync_single_for_cpu)
75195 +               dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
75196 +                                            direction);
75197 +       flush_write_buffers();
75198 +}
75199 +
75200 +static inline void
75201 +dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
75202 +                          size_t size, int direction)
75203 +{
75204 +       if (dma_ops->sync_single_for_device)
75205 +               dma_ops->sync_single_for_device(hwdev, dma_handle, size,
75206 +                                               direction);
75207 +       flush_write_buffers();
75208 +}
75209 +
75210 +static inline void
75211 +dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
75212 +                             unsigned long offset, size_t size, int direction)
75213 +{
75214 +       if (dma_ops->sync_single_range_for_cpu) {
75215 +               dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
75216 +       }
75217 +
75218 +       flush_write_buffers();
75219 +}
75220 +
75221 +static inline void
75222 +dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
75223 +                                unsigned long offset, size_t size, int direction)
75224 +{
75225 +       if (dma_ops->sync_single_range_for_device)
75226 +               dma_ops->sync_single_range_for_device(hwdev, dma_handle,
75227 +                                                     offset, size, direction);
75228 +
75229 +       flush_write_buffers();
75230 +}
75231 +
75232 +static inline void
75233 +dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
75234 +                   int nelems, int direction)
75235 +{
75236 +       if (dma_ops->sync_sg_for_cpu)
75237 +               dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
75238 +       flush_write_buffers();
75239 +}
75240 +
75241 +static inline void
75242 +dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
75243 +                      int nelems, int direction)
75244 +{
75245 +       if (dma_ops->sync_sg_for_device) {
75246 +               dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
75247 +       }
75248 +
75249 +       flush_write_buffers();
75250 +}
75251 +
75252 +static inline int
75253 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
75254 +{
75255 +       return dma_ops->map_sg(hwdev, sg, nents, direction);
75256 +}
75257 +
75258 +static inline void
75259 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
75260 +            int direction)
75261 +{
75262 +       dma_ops->unmap_sg(hwdev, sg, nents, direction);
75263 +}
75264 +
75265 +extern int dma_supported(struct device *hwdev, u64 mask);
75266 +
75267 +/* same for gart, swiotlb, and nommu */
75268 +static inline int dma_get_cache_alignment(void)
75269 +{
75270 +       return boot_cpu_data.x86_clflush_size;
75271 +}
75272 +
75273 +#define dma_is_consistent(h) 1
75274 +
75275 +extern int dma_set_mask(struct device *dev, u64 mask);
75276 +
75277 +static inline void
75278 +dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
75279 +{
75280 +       flush_write_buffers();
75281 +}
75282 +
75283 +extern struct device fallback_dev;
75284 +extern int panic_on_overflow;
75285 +#endif
75286 +
75287 +#endif /* _X8664_DMA_MAPPING_H */
75288 +
75289 +#include <asm-i386/mach-xen/asm/dma-mapping.h>
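
The dma_mapping_ops table declared above is what each backend (nommu, swiotlb, gart) fills in before pointing the global dma_ops at it. A hedged skeleton of such a backend (function names and the placeholder address translation are assumptions, not part of the patch):

/* Illustrative sketch only -- not part of the patch. */
static int example_mapping_error(dma_addr_t dma_addr)
{
	return dma_addr == bad_dma_address;
}

static dma_addr_t example_map_single(struct device *hwdev, void *ptr,
				     size_t size, int direction)
{
	return virt_to_bus(ptr);	/* placeholder address translation */
}

static struct dma_mapping_ops example_dma_ops = {
	.mapping_error	= example_mapping_error,
	.map_single	= example_map_single,
	.is_phys	= 1,
};
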
75290 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/dmi.h new/include/asm-x86_64/mach-xen/asm/dmi.h
75291 --- linux-2.6/include/asm-x86_64/mach-xen/asm/dmi.h     1970-01-01 01:00:00.000000000 +0100
75292 +++ new/include/asm-x86_64/mach-xen/asm/dmi.h   2006-05-09 12:35:41.000000000 +0200
75293 @@ -0,0 +1,29 @@
75294 +#ifndef _ASM_DMI_H
75295 +#define _ASM_DMI_H 1
75296 +
75297 +#include <asm/io.h>
75298 +
75299 +extern void *dmi_ioremap(unsigned long addr, unsigned long size);
75300 +extern void dmi_iounmap(void *addr, unsigned long size);
75301 +extern void *bt_ioremap(unsigned long addr, unsigned long size);
75302 +extern void bt_iounmap(void *addr, unsigned long size);
75303 +
75304 +#define DMI_MAX_DATA 2048
75305 +
75306 +extern int dmi_alloc_index;
75307 +extern char dmi_alloc_data[DMI_MAX_DATA];
75308 +
75309 +/* This is so early that there is no good way to allocate dynamic memory.
75310 +   Allocate data in a BSS array. */
75311 +static inline void *dmi_alloc(unsigned len)
75312 +{
75313 +       int idx = dmi_alloc_index;
75314 +       if ((dmi_alloc_index += len) > DMI_MAX_DATA)
75315 +               return NULL;
75316 +       return dmi_alloc_data + idx;
75317 +}
75318 +
75319 +#define dmi_ioremap bt_ioremap
75320 +#define dmi_iounmap bt_iounmap
75321 +
75322 +#endif
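
dmi_alloc() above is a one-way bump allocator over a fixed BSS buffer; a hedged usage sketch (the helper name is assumed, not part of the patch):

/* Illustrative sketch only -- not part of the patch. */
static void *example_dmi_dup(const void *src, unsigned len)
{
	void *dst = dmi_alloc(len);	/* NULL once the 2048-byte pool is used up */

	if (dst)
		memcpy(dst, src, len);
	return dst;
}
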
75323 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/e820.h new/include/asm-x86_64/mach-xen/asm/e820.h
75324 --- linux-2.6/include/asm-x86_64/mach-xen/asm/e820.h    1970-01-01 01:00:00.000000000 +0100
75325 +++ new/include/asm-x86_64/mach-xen/asm/e820.h  2006-06-05 15:54:33.000000000 +0200
75326 @@ -0,0 +1,63 @@
75327 +/*
75328 + * structures and definitions for the int 15, ax=e820 memory map
75329 + * scheme.
75330 + *
75331 + * In a nutshell, setup.S populates a scratch table in the
75332 + * empty_zero_block that contains a list of usable address/size
75333 + * duples. In setup.c, this information is transferred into the e820map,
75334 + * and in init.c/numa.c, that new information is used to mark pages
75335 + * reserved or not.
75336 + */
75337 +#ifndef __E820_HEADER
75338 +#define __E820_HEADER
75339 +
75340 +#include <linux/mmzone.h>
75341 +
75342 +#define E820MAP        0x2d0           /* our map */
75343 +#define E820MAX        128             /* number of entries in E820MAP */
75344 +#define E820NR 0x1e8           /* # entries in E820MAP */
75345 +
75346 +#define E820_RAM       1
75347 +#define E820_RESERVED  2
75348 +#define E820_ACPI      3 /* usable as RAM once ACPI tables have been read */
75349 +#define E820_NVS       4
75350 +
75351 +#define HIGH_MEMORY    (1024*1024)
75352 +
75353 +#define LOWMEMSIZE()   (0x9f000)
75354 +
75355 +#ifndef __ASSEMBLY__
75356 +struct e820entry {
75357 +       u64 addr;       /* start of memory segment */
75358 +       u64 size;       /* size of memory segment */
75359 +       u32 type;       /* type of memory segment */
75360 +} __attribute__((packed));
75361 +
75362 +struct e820map {
75363 +    int nr_map;
75364 +       struct e820entry map[E820MAX];
75365 +};
75366 +
75367 +extern unsigned long find_e820_area(unsigned long start, unsigned long end, 
75368 +                                   unsigned size);
75369 +extern void add_memory_region(unsigned long start, unsigned long size, 
75370 +                             int type);
75371 +extern void setup_memory_region(void);
75372 +extern void contig_e820_setup(void); 
75373 +extern unsigned long e820_end_of_ram(void);
75374 +extern void e820_reserve_resources(struct e820entry *e820, int nr_map);
75375 +extern void e820_print_map(char *who);
75376 +extern int e820_mapped(unsigned long start, unsigned long end, unsigned type);
75377 +
75378 +extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end);
75379 +extern void e820_setup_gap(struct e820entry *e820, int nr_map);
75380 +extern unsigned long e820_hole_size(unsigned long start_pfn,
75381 +                                   unsigned long end_pfn);
75382 +
75383 +extern void __init parse_memopt(char *p, char **end);
75384 +extern void __init parse_memmapopt(char *p, char **end);
75385 +
75386 +extern struct e820map e820;
75387 +#endif/*!__ASSEMBLY__*/
75388 +
75389 +#endif/*__E820_HEADER*/
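
As a hedged illustration of the structures above (not part of the patch), summing the usable RAM reported by an e820 map is a simple walk over map[0..nr_map):

/* Illustrative sketch only -- not part of the patch. */
static u64 example_e820_ram_bytes(const struct e820map *map)
{
	u64 total = 0;
	int i;

	for (i = 0; i < map->nr_map; i++)
		if (map->map[i].type == E820_RAM)
			total += map->map[i].size;
	return total;
}
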
75390 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/fixmap.h new/include/asm-x86_64/mach-xen/asm/fixmap.h
75391 --- linux-2.6/include/asm-x86_64/mach-xen/asm/fixmap.h  1970-01-01 01:00:00.000000000 +0100
75392 +++ new/include/asm-x86_64/mach-xen/asm/fixmap.h        2006-05-09 12:35:41.000000000 +0200
75393 @@ -0,0 +1,114 @@
75394 +/*
75395 + * fixmap.h: compile-time virtual memory allocation
75396 + *
75397 + * This file is subject to the terms and conditions of the GNU General Public
75398 + * License.  See the file "COPYING" in the main directory of this archive
75399 + * for more details.
75400 + *
75401 + * Copyright (C) 1998 Ingo Molnar
75402 + */
75403 +
75404 +#ifndef _ASM_FIXMAP_H
75405 +#define _ASM_FIXMAP_H
75406 +
75407 +#include <linux/config.h>
75408 +#include <linux/kernel.h>
75409 +#include <asm/apicdef.h>
75410 +#include <xen/gnttab.h>
75411 +#include <asm/page.h>
75412 +#include <asm/vsyscall.h>
75413 +#include <asm/vsyscall32.h>
75414 +#include <asm/acpi.h>
75415 +
75416 +/*
75417 + * Here we define all the compile-time 'special' virtual
75418 + * addresses. The point is to have a constant address at
75419 + * compile time, but to set the physical address only
75420 + * in the boot process.
75421 + *
75422 + * these 'compile-time allocated' memory buffers are
75423 + * fixed-size 4k pages. (or larger if used with an increment
75424 + * higher than 1) use fixmap_set(idx,phys) to associate
75425 + * physical memory with fixmap indices.
75426 + *
75427 + * TLB entries of such buffers will not be flushed across
75428 + * task switches.
75429 + */
75430 +
75431 +enum fixed_addresses {
75432 +       VSYSCALL_LAST_PAGE,
75433 +       VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
75434 +       VSYSCALL_HPET,
75435 +       FIX_HPET_BASE,
75436 +#ifdef CONFIG_X86_LOCAL_APIC
75437 +       FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
75438 +#endif
75439 +#ifdef CONFIG_X86_IO_APIC
75440 +       FIX_IO_APIC_BASE_0,
75441 +       FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
75442 +#endif
75443 +#ifdef CONFIG_ACPI
75444 +       FIX_ACPI_BEGIN,
75445 +       FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
75446 +#endif
75447 +       FIX_SHARED_INFO,
75448 +#define NR_FIX_ISAMAPS 256
75449 +       FIX_ISAMAP_END,
75450 +       FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
75451 +       __end_of_permanent_fixed_addresses,
75452 +       /* temporary boot-time mappings, used before ioremap() is functional */
75453 +#define NR_FIX_BTMAPS  16
75454 +       FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
75455 +       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
75456 +       __end_of_fixed_addresses
75457 +};
75458 +
75459 +extern void __set_fixmap (enum fixed_addresses idx,
75460 +                                       unsigned long phys, pgprot_t flags);
75461 +
75462 +#define set_fixmap(idx, phys) \
75463 +               __set_fixmap(idx, phys, PAGE_KERNEL)
75464 +/*
75465 + * Some hardware wants to get fixmapped without caching.
75466 + */
75467 +#define set_fixmap_nocache(idx, phys) \
75468 +               __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
75469 +
75470 +#define clear_fixmap(idx) \
75471 +                __set_fixmap(idx, 0, __pgprot(0))
75472 +
75473 +#define FIXADDR_TOP    (VSYSCALL_END-PAGE_SIZE)
75474 +#define FIXADDR_SIZE   (__end_of_fixed_addresses << PAGE_SHIFT)
75475 +#define FIXADDR_START  (FIXADDR_TOP - FIXADDR_SIZE)
75476 +
75477 +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
75478 +#define FIXADDR_USER_START     ((unsigned long)VSYSCALL32_VSYSCALL)
75479 +#define FIXADDR_USER_END       (FIXADDR_USER_START + PAGE_SIZE)
75480 +
75481 +#define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
75482 +
75483 +extern void __this_fixmap_does_not_exist(void);
75484 +
75485 +/*
75486 + * 'index to address' translation. If anyone tries to use the idx
75487 + * directly without translation, we catch the bug with a NULL-dereference
75488 + * kernel oops. Illegal ranges of incoming indices are caught too.
75489 + */
75490 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
75491 +{
75492 +       /*
75493 +        * this branch gets completely eliminated after inlining,
75494 +        * except when someone tries to use fixaddr indices in an
75495 +        * illegal way. (such as mixing up address types or using
75496 +        * out-of-range indices).
75497 +        *
75498 +        * If it doesn't get removed, the linker will complain
75499 +        * loudly with a reasonably clear error message..
75500 +        */
75501 +       if (idx >= __end_of_fixed_addresses)
75502 +               __this_fixmap_does_not_exist();
75503 +
75504 +        return __fix_to_virt(idx);
75505 +}
75506 +
75507 +#endif
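
A hedged sketch of how one of the fixmap slots above is meant to be used (the helper is hypothetical, not part of the patch): set_fixmap() binds the slot to a physical address once, and fix_to_virt() then yields its compile-time-constant virtual address.

/* Illustrative sketch only -- not part of the patch. */
static void *example_map_shared_info(unsigned long phys)
{
	set_fixmap(FIX_SHARED_INFO, phys);		/* bind the slot to 'phys' */
	return (void *)fix_to_virt(FIX_SHARED_INFO);	/* constant virtual address */
}
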
75508 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/floppy.h new/include/asm-x86_64/mach-xen/asm/floppy.h
75509 --- linux-2.6/include/asm-x86_64/mach-xen/asm/floppy.h  1970-01-01 01:00:00.000000000 +0100
75510 +++ new/include/asm-x86_64/mach-xen/asm/floppy.h        2006-05-09 12:35:41.000000000 +0200
75511 @@ -0,0 +1,206 @@
75512 +/*
75513 + * Architecture specific parts of the Floppy driver
75514 + *
75515 + * This file is subject to the terms and conditions of the GNU General Public
75516 + * License.  See the file "COPYING" in the main directory of this archive
75517 + * for more details.
75518 + *
75519 + * Copyright (C) 1995
75520 + *
75521 + * Modifications for Xen are Copyright (c) 2004, Keir Fraser.
75522 + */
75523 +#ifndef __ASM_XEN_X86_64_FLOPPY_H
75524 +#define __ASM_XEN_X86_64_FLOPPY_H
75525 +
75526 +#include <linux/vmalloc.h>
75527 +
75528 +/*
75529 + * The DMA channel used by the floppy controller cannot access data at
75530 + * addresses >= 16MB
75531 + *
75532 + * Went back to the 1MB limit, as some people had problems with the floppy
75533 + * driver otherwise. It doesn't matter much for performance anyway, as most
75534 + * floppy accesses go through the track buffer.
75535 + */
75536 +#define _CROSS_64KB(a,s,vdma) \
75537 +(!(vdma) && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
75538 +
75539 +/* XEN: Hit DMA paths on the head. This trick from asm-m68k/floppy.h. */
75540 +#include <asm/dma.h>
75541 +#undef MAX_DMA_ADDRESS
75542 +#define MAX_DMA_ADDRESS 0
75543 +#define CROSS_64KB(a,s) (0)
75544 +
75545 +#define fd_inb(port)                   inb_p(port)
75546 +#define fd_outb(value,port)            outb_p(value,port)
75547 +
75548 +#define fd_request_dma()        (0)
75549 +#define fd_free_dma()           ((void)0)
75550 +#define fd_enable_irq()         enable_irq(FLOPPY_IRQ)
75551 +#define fd_disable_irq()        disable_irq(FLOPPY_IRQ)
75552 +#define fd_free_irq()          free_irq(FLOPPY_IRQ, NULL)
75553 +#define fd_get_dma_residue()    vdma_get_dma_residue(FLOPPY_DMA)
75554 +/*
75555 + * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from
75556 + * softirq context via motor_off_callback. A generic bug we happen to trigger.
75557 + */
75558 +#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL|__GFP_NORETRY, get_order(size))
75559 +#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
75560 +#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io)
75561 +
75562 +static int virtual_dma_count;
75563 +static int virtual_dma_residue;
75564 +static char *virtual_dma_addr;
75565 +static int virtual_dma_mode;
75566 +static int doing_pdma;
75567 +
75568 +static irqreturn_t floppy_hardint(int irq, void *dev_id, struct pt_regs * regs)
75569 +{
75570 +       register unsigned char st;
75571 +
75572 +#undef TRACE_FLPY_INT
75573 +
75574 +#ifdef TRACE_FLPY_INT
75575 +       static int calls=0;
75576 +       static int bytes=0;
75577 +       static int dma_wait=0;
75578 +#endif
75579 +       if (!doing_pdma)
75580 +               return floppy_interrupt(irq, dev_id, regs);
75581 +
75582 +#ifdef TRACE_FLPY_INT
75583 +       if(!calls)
75584 +               bytes = virtual_dma_count;
75585 +#endif
75586 +
75587 +       {
75588 +               register int lcount;
75589 +               register char *lptr;
75590 +
75591 +               st = 1;
75592 +               for(lcount=virtual_dma_count, lptr=virtual_dma_addr; 
75593 +                   lcount; lcount--, lptr++) {
75594 +                       st=inb(virtual_dma_port+4) & 0xa0 ;
75595 +                       if(st != 0xa0) 
75596 +                               break;
75597 +                       if(virtual_dma_mode)
75598 +                               outb_p(*lptr, virtual_dma_port+5);
75599 +                       else
75600 +                               *lptr = inb_p(virtual_dma_port+5);
75601 +               }
75602 +               virtual_dma_count = lcount;
75603 +               virtual_dma_addr = lptr;
75604 +               st = inb(virtual_dma_port+4);
75605 +       }
75606 +
75607 +#ifdef TRACE_FLPY_INT
75608 +       calls++;
75609 +#endif
75610 +       if(st == 0x20)
75611 +               return IRQ_HANDLED;
75612 +       if(!(st & 0x20)) {
75613 +               virtual_dma_residue += virtual_dma_count;
75614 +               virtual_dma_count=0;
75615 +#ifdef TRACE_FLPY_INT
75616 +               printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", 
75617 +                      virtual_dma_count, virtual_dma_residue, calls, bytes,
75618 +                      dma_wait);
75619 +               calls = 0;
75620 +               dma_wait=0;
75621 +#endif
75622 +               doing_pdma = 0;
75623 +               floppy_interrupt(irq, dev_id, regs);
75624 +               return IRQ_HANDLED;
75625 +       }
75626 +#ifdef TRACE_FLPY_INT
75627 +       if(!virtual_dma_count)
75628 +               dma_wait++;
75629 +#endif
75630 +       return IRQ_HANDLED;
75631 +}
75632 +
75633 +static void fd_disable_dma(void)
75634 +{
75635 +       doing_pdma = 0;
75636 +       virtual_dma_residue += virtual_dma_count;
75637 +       virtual_dma_count=0;
75638 +}
75639 +
75640 +static int vdma_get_dma_residue(unsigned int dummy)
75641 +{
75642 +       return virtual_dma_count + virtual_dma_residue;
75643 +}
75644 +
75645 +
75646 +static int fd_request_irq(void)
75647 +{
75648 +       return request_irq(FLOPPY_IRQ, floppy_hardint,SA_INTERRUPT,
75649 +                                          "floppy", NULL);
75650 +}
75651 +
75652 +#if 0
75653 +static unsigned long vdma_mem_alloc(unsigned long size)
75654 +{
75655 +       return (unsigned long) vmalloc(size);
75656 +
75657 +}
75658 +
75659 +static void vdma_mem_free(unsigned long addr, unsigned long size)
75660 +{
75661 +       vfree((void *)addr);
75662 +}
75663 +#endif
75664 +
75665 +static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
75666 +{
75667 +       doing_pdma = 1;
75668 +       virtual_dma_port = io;
75669 +       virtual_dma_mode = (mode  == DMA_MODE_WRITE);
75670 +       virtual_dma_addr = addr;
75671 +       virtual_dma_count = size;
75672 +       virtual_dma_residue = 0;
75673 +       return 0;
75674 +}
75675 +
75676 +/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */
75677 +#define FDC1 xen_floppy_init()
75678 +static int FDC2 = -1;
75679 +
75680 +static int xen_floppy_init(void)
75681 +{
75682 +       use_virtual_dma = 1;
75683 +       can_use_virtual_dma = 1;
75684 +       return 0x3f0;
75685 +}
75686 +
75687 +/*
75688 + * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
75689 + * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
75690 + * coincides with another rtc CMOS user.               Paul G.
75691 + */
75692 +#define FLOPPY0_TYPE   ({                              \
75693 +       unsigned long flags;                            \
75694 +       unsigned char val;                              \
75695 +       spin_lock_irqsave(&rtc_lock, flags);            \
75696 +       val = (CMOS_READ(0x10) >> 4) & 15;              \
75697 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
75698 +       val;                                            \
75699 +})
75700 +
75701 +#define FLOPPY1_TYPE   ({                              \
75702 +       unsigned long flags;                            \
75703 +       unsigned char val;                              \
75704 +       spin_lock_irqsave(&rtc_lock, flags);            \
75705 +       val = CMOS_READ(0x10) & 15;                     \
75706 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
75707 +       val;                                            \
75708 +})
75709 +
75710 +#define N_FDC 2
75711 +#define N_DRIVE 8
75712 +
75713 +#define FLOPPY_MOTOR_MASK 0xf0
75714 +
75715 +#define EXTRA_FLOPPY_PARAMS
75716 +
75717 +#endif /* __ASM_XEN_X86_64_FLOPPY_H */
75718 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/hw_irq.h new/include/asm-x86_64/mach-xen/asm/hw_irq.h
75719 --- linux-2.6/include/asm-x86_64/mach-xen/asm/hw_irq.h  1970-01-01 01:00:00.000000000 +0100
75720 +++ new/include/asm-x86_64/mach-xen/asm/hw_irq.h        2006-06-28 14:32:14.000000000 +0200
75721 @@ -0,0 +1,145 @@
75722 +#ifndef _ASM_HW_IRQ_H
75723 +#define _ASM_HW_IRQ_H
75724 +
75725 +/*
75726 + *     linux/include/asm/hw_irq.h
75727 + *
75728 + *     (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
75729 + *
75730 + *     moved some of the old arch/i386/kernel/irq.h to here. VY
75731 + *
75732 + *     IRQ/IPI changes taken from work by Thomas Radke
75733 + *     <tomsoft@informatik.tu-chemnitz.de>
75734 + *
75735 + *     hacked by Andi Kleen for x86-64.
75736 + * 
75737 + *  $Id$
75738 + */
75739 +
75740 +#ifndef __ASSEMBLY__
75741 +#include <linux/config.h>
75742 +#include <asm/atomic.h>
75743 +#include <asm/irq.h>
75744 +#include <linux/profile.h>
75745 +#include <linux/smp.h>
75746 +
75747 +struct hw_interrupt_type;
75748 +#endif
75749 +
75750 +#define NMI_VECTOR             0x02
75751 +/*
75752 + * IDT vectors usable for external interrupt sources start
75753 + * at 0x20:
75754 + */
75755 +#define FIRST_EXTERNAL_VECTOR  0x20
75756 +
75757 +#define IA32_SYSCALL_VECTOR    0x80
75758 +
75759 +
75760 +/*
75761 + * Vectors 0x20-0x2f are used for ISA interrupts.
75762 + */
75763 +
75764 +/*
75765 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
75766 + *
75767 + *  some of the following vectors are 'rare', they are merged
75768 + *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
75769 + *  TLB, reschedule and local APIC vectors are performance-critical.
75770 + */
75771 +#ifndef CONFIG_XEN
75772 +#define SPURIOUS_APIC_VECTOR   0xff
75773 +#define ERROR_APIC_VECTOR      0xfe
75774 +#define RESCHEDULE_VECTOR      0xfd
75775 +#define CALL_FUNCTION_VECTOR   0xfc
75776 +/* fb free - please don't re-add KDB here because it's useless
75777 +   (hint - think what an NMI bit does to a vector) */
75778 +#define THERMAL_APIC_VECTOR    0xfa
75779 +#define THRESHOLD_APIC_VECTOR   0xf9
75780 +/* f8 free */
75781 +#define INVALIDATE_TLB_VECTOR_END      0xf7
75782 +#define INVALIDATE_TLB_VECTOR_START    0xf0    /* f0-f7 used for TLB flush */
75783 +
75784 +#define NUM_INVALIDATE_TLB_VECTORS     8
75785 +#endif
75786 +
75787 +/*
75788 + * Local APIC timer IRQ vector is on a different priority level,
75789 + * to work around the 'lost local interrupt if more than 2 IRQ
75790 + * sources per level' errata.
75791 + */
75792 +#define LOCAL_TIMER_VECTOR     0xef
75793 +
75794 +/*
75795 + * First APIC vector available to drivers: (vectors 0x30-0xee)
75796 + * we start at 0x31 to spread out vectors evenly between priority
75797 + * levels. (0x80 is the syscall vector)
75798 + */
75799 +#define FIRST_DEVICE_VECTOR    0x31
75800 +#define FIRST_SYSTEM_VECTOR    0xef   /* duplicated in irq.h */
75801 +
75802 +
75803 +#ifndef __ASSEMBLY__
75804 +extern u8 irq_vector[NR_IRQ_VECTORS];
75805 +#define IO_APIC_VECTOR(irq)    (irq_vector[irq])
75806 +#define AUTO_ASSIGN            -1
75807 +
75808 +/*
75809 + * Various low-level irq details needed by irq.c, process.c,
75810 + * time.c, io_apic.c and smp.c
75811 + *
75812 + * Interrupt entry/exit code at both C and assembly level
75813 + */
75814 +
75815 +extern void disable_8259A_irq(unsigned int irq);
75816 +extern void enable_8259A_irq(unsigned int irq);
75817 +extern int i8259A_irq_pending(unsigned int irq);
75818 +extern void make_8259A_irq(unsigned int irq);
75819 +extern void init_8259A(int aeoi);
75820 +extern void FASTCALL(send_IPI_self(int vector));
75821 +extern void init_VISWS_APIC_irqs(void);
75822 +extern void setup_IO_APIC(void);
75823 +extern void disable_IO_APIC(void);
75824 +extern void print_IO_APIC(void);
75825 +extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
75826 +extern void send_IPI(int dest, int vector);
75827 +extern void setup_ioapic_dest(void);
75828 +
75829 +extern unsigned long io_apic_irqs;
75830 +
75831 +extern atomic_t irq_err_count;
75832 +extern atomic_t irq_mis_count;
75833 +
75834 +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
75835 +
75836 +#define __STR(x) #x
75837 +#define STR(x) __STR(x)
75838 +
75839 +#include <asm/ptrace.h>
75840 +
75841 +#define IRQ_NAME2(nr) nr##_interrupt(void)
75842 +#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
75843 +
75844 +/*
75845 + *     SMP has a few special interrupts for IPI messages
75846 + */
75847 +
75848 +#define BUILD_IRQ(nr) \
75849 +asmlinkage void IRQ_NAME(nr); \
75850 +__asm__( \
75851 +"\n.p2align\n" \
75852 +"IRQ" #nr "_interrupt:\n\t" \
75853 +       "push $" #nr "-256 ; " \
75854 +       "jmp common_interrupt");
75855 +
75856 +extern void resend_irq_on_evtchn(struct hw_interrupt_type *h, unsigned int i);
75857 +static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
75858 +{
75859 +       resend_irq_on_evtchn(h, i);
75860 +}
75861 +
75862 +#define platform_legacy_irq(irq)       ((irq) < 16)
75863 +
75864 +#endif
75865 +
75866 +#endif /* _ASM_HW_IRQ_H */
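
For reference (illustration only, not part of the patch), BUILD_IRQ(5) from this header expands to roughly the stub below: the vector is pushed in the nr-256 encoding that the common entry path decodes, then control jumps to common_interrupt.

/* Roughly what BUILD_IRQ(5) expands to -- illustration only, not part of the patch. */
asmlinkage void IRQ5_interrupt(void);
__asm__(
"\n.p2align\n"
"IRQ5_interrupt:\n\t"
	"push $5-256 ; "
	"jmp common_interrupt");
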
75867 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/hypercall.h new/include/asm-x86_64/mach-xen/asm/hypercall.h
75868 --- linux-2.6/include/asm-x86_64/mach-xen/asm/hypercall.h       1970-01-01 01:00:00.000000000 +0100
75869 +++ new/include/asm-x86_64/mach-xen/asm/hypercall.h     2006-06-28 14:32:14.000000000 +0200
75870 @@ -0,0 +1,372 @@
75871 +/******************************************************************************
75872 + * hypercall.h
75873 + * 
75874 + * Linux-specific hypervisor handling.
75875 + * 
75876 + * Copyright (c) 2002-2004, K A Fraser
75877 + * 
75878 + * 64-bit updates:
75879 + *   Benjamin Liu <benjamin.liu@intel.com>
75880 + *   Jun Nakajima <jun.nakajima@intel.com>
75881 + * 
75882 + * This program is free software; you can redistribute it and/or
75883 + * modify it under the terms of the GNU General Public License version 2
75884 + * as published by the Free Software Foundation; or, when distributed
75885 + * separately from the Linux kernel or incorporated into other
75886 + * software packages, subject to the following license:
75887 + * 
75888 + * Permission is hereby granted, free of charge, to any person obtaining a copy
75889 + * of this source file (the "Software"), to deal in the Software without
75890 + * restriction, including without limitation the rights to use, copy, modify,
75891 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
75892 + * and to permit persons to whom the Software is furnished to do so, subject to
75893 + * the following conditions:
75894 + * 
75895 + * The above copyright notice and this permission notice shall be included in
75896 + * all copies or substantial portions of the Software.
75897 + * 
75898 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
75899 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
75900 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
75901 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
75902 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
75903 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
75904 + * IN THE SOFTWARE.
75905 + */
75906 +
75907 +#ifndef __HYPERCALL_H__
75908 +#define __HYPERCALL_H__
75909 +
75910 +#include <linux/string.h> /* memcpy() */
75911 +
75912 +#ifndef __HYPERVISOR_H__
75913 +# error "please don't include this file directly"
75914 +#endif
75915 +
75916 +#define __STR(x) #x
75917 +#define STR(x) __STR(x)
75918 +
75919 +#define _hypercall0(type, name)                        \
75920 +({                                             \
75921 +       long __res;                             \
75922 +       asm volatile (                          \
75923 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
75924 +               : "=a" (__res)                  \
75925 +               :                               \
75926 +               : "memory" );                   \
75927 +       (type)__res;                            \
75928 +})
75929 +
75930 +#define _hypercall1(type, name, a1)                            \
75931 +({                                                             \
75932 +       long __res, __ign1;                                     \
75933 +       asm volatile (                                          \
75934 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
75935 +               : "=a" (__res), "=D" (__ign1)                   \
75936 +               : "1" ((long)(a1))                              \
75937 +               : "memory" );                                   \
75938 +       (type)__res;                                            \
75939 +})
75940 +
75941 +#define _hypercall2(type, name, a1, a2)                                \
75942 +({                                                             \
75943 +       long __res, __ign1, __ign2;                             \
75944 +       asm volatile (                                          \
75945 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
75946 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2)    \
75947 +               : "1" ((long)(a1)), "2" ((long)(a2))            \
75948 +               : "memory" );                                   \
75949 +       (type)__res;                                            \
75950 +})
75951 +
75952 +#define _hypercall3(type, name, a1, a2, a3)                    \
75953 +({                                                             \
75954 +       long __res, __ign1, __ign2, __ign3;                     \
75955 +       asm volatile (                                          \
75956 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
75957 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2),   \
75958 +               "=d" (__ign3)                                   \
75959 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
75960 +               "3" ((long)(a3))                                \
75961 +               : "memory" );                                   \
75962 +       (type)__res;                                            \
75963 +})
75964 +
75965 +#define _hypercall4(type, name, a1, a2, a3, a4)                        \
75966 +({                                                             \
75967 +       long __res, __ign1, __ign2, __ign3;                     \
75968 +       asm volatile (                                          \
75969 +               "movq %7,%%r10; "                               \
75970 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
75971 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2),   \
75972 +               "=d" (__ign3)                                   \
75973 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
75974 +               "3" ((long)(a3)), "g" ((long)(a4))              \
75975 +               : "memory", "r10" );                            \
75976 +       (type)__res;                                            \
75977 +})
75978 +
75979 +#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
75980 +({                                                             \
75981 +       long __res, __ign1, __ign2, __ign3;                     \
75982 +       asm volatile (                                          \
75983 +               "movq %7,%%r10; movq %8,%%r8; "                 \
75984 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
75985 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2),   \
75986 +               "=d" (__ign3)                                   \
75987 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
75988 +               "3" ((long)(a3)), "g" ((long)(a4)),             \
75989 +               "g" ((long)(a5))                                \
75990 +               : "memory", "r10", "r8" );                      \
75991 +       (type)__res;                                            \
75992 +})
75993 +
75994 +static inline int
75995 +HYPERVISOR_set_trap_table(
75996 +       trap_info_t *table)
75997 +{
75998 +       return _hypercall1(int, set_trap_table, table);
75999 +}
76000 +
76001 +static inline int
76002 +HYPERVISOR_mmu_update(
76003 +       mmu_update_t *req, int count, int *success_count, domid_t domid)
76004 +{
76005 +       return _hypercall4(int, mmu_update, req, count, success_count, domid);
76006 +}
76007 +
76008 +static inline int
76009 +HYPERVISOR_mmuext_op(
76010 +       struct mmuext_op *op, int count, int *success_count, domid_t domid)
76011 +{
76012 +       return _hypercall4(int, mmuext_op, op, count, success_count, domid);
76013 +}
76014 +
76015 +static inline int
76016 +HYPERVISOR_set_gdt(
76017 +       unsigned long *frame_list, int entries)
76018 +{
76019 +       return _hypercall2(int, set_gdt, frame_list, entries);
76020 +}
76021 +
76022 +static inline int
76023 +HYPERVISOR_stack_switch(
76024 +       unsigned long ss, unsigned long esp)
76025 +{
76026 +       return _hypercall2(int, stack_switch, ss, esp);
76027 +}
76028 +
76029 +static inline int
76030 +HYPERVISOR_set_callbacks(
76031 +       unsigned long event_address, unsigned long failsafe_address, 
76032 +       unsigned long syscall_address)
76033 +{
76034 +       return _hypercall3(int, set_callbacks,
76035 +                          event_address, failsafe_address, syscall_address);
76036 +}
76037 +
76038 +static inline int
76039 +HYPERVISOR_fpu_taskswitch(
76040 +       int set)
76041 +{
76042 +       return _hypercall1(int, fpu_taskswitch, set);
76043 +}
76044 +
76045 +static inline int
76046 +HYPERVISOR_sched_op_compat(
76047 +       int cmd, unsigned long arg)
76048 +{
76049 +       return _hypercall2(int, sched_op_compat, cmd, arg);
76050 +}
76051 +
76052 +static inline int
76053 +HYPERVISOR_sched_op(
76054 +       int cmd, void *arg)
76055 +{
76056 +       return _hypercall2(int, sched_op, cmd, arg);
76057 +}
76058 +
76059 +static inline long
76060 +HYPERVISOR_set_timer_op(
76061 +       u64 timeout)
76062 +{
76063 +       return _hypercall1(long, set_timer_op, timeout);
76064 +}
76065 +
76066 +static inline int
76067 +HYPERVISOR_dom0_op(
76068 +       dom0_op_t *dom0_op)
76069 +{
76070 +       dom0_op->interface_version = DOM0_INTERFACE_VERSION;
76071 +       return _hypercall1(int, dom0_op, dom0_op);
76072 +}
76073 +
76074 +static inline int
76075 +HYPERVISOR_set_debugreg(
76076 +       int reg, unsigned long value)
76077 +{
76078 +       return _hypercall2(int, set_debugreg, reg, value);
76079 +}
76080 +
76081 +static inline unsigned long
76082 +HYPERVISOR_get_debugreg(
76083 +       int reg)
76084 +{
76085 +       return _hypercall1(unsigned long, get_debugreg, reg);
76086 +}
76087 +
76088 +static inline int
76089 +HYPERVISOR_update_descriptor(
76090 +       unsigned long ma, unsigned long word)
76091 +{
76092 +       return _hypercall2(int, update_descriptor, ma, word);
76093 +}
76094 +
76095 +static inline int
76096 +HYPERVISOR_memory_op(
76097 +       unsigned int cmd, void *arg)
76098 +{
76099 +       return _hypercall2(int, memory_op, cmd, arg);
76100 +}
76101 +
76102 +static inline int
76103 +HYPERVISOR_multicall(
76104 +       void *call_list, int nr_calls)
76105 +{
76106 +       return _hypercall2(int, multicall, call_list, nr_calls);
76107 +}
76108 +
76109 +static inline int
76110 +HYPERVISOR_update_va_mapping(
76111 +       unsigned long va, pte_t new_val, unsigned long flags)
76112 +{
76113 +       return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
76114 +}
76115 +
76116 +static inline int
76117 +HYPERVISOR_event_channel_op(
76118 +       int cmd, void *arg)
76119 +{
76120 +       int rc = _hypercall2(int, event_channel_op, cmd, arg);
76121 +       if (unlikely(rc == -ENOSYS)) {
76122 +               struct evtchn_op op;
76123 +               op.cmd = cmd;
76124 +               memcpy(&op.u, arg, sizeof(op.u));
76125 +               rc = _hypercall1(int, event_channel_op_compat, &op);
76126 +               memcpy(arg, &op.u, sizeof(op.u));
76127 +       }
76128 +       return rc;
76129 +}
76130 +
76131 +static inline int
76132 +HYPERVISOR_acm_op(
76133 +       int cmd, void *arg)
76134 +{
76135 +       return _hypercall2(int, acm_op, cmd, arg);
76136 +}
76137 +
76138 +static inline int
76139 +HYPERVISOR_xen_version(
76140 +       int cmd, void *arg)
76141 +{
76142 +       return _hypercall2(int, xen_version, cmd, arg);
76143 +}
76144 +
76145 +static inline int
76146 +HYPERVISOR_console_io(
76147 +       int cmd, int count, char *str)
76148 +{
76149 +       return _hypercall3(int, console_io, cmd, count, str);
76150 +}
76151 +
76152 +static inline int
76153 +HYPERVISOR_physdev_op(
76154 +       int cmd, void *arg)
76155 +{
76156 +       int rc = _hypercall2(int, physdev_op, cmd, arg);
76157 +       if (unlikely(rc == -ENOSYS)) {
76158 +               struct physdev_op op;
76159 +               op.cmd = cmd;
76160 +               memcpy(&op.u, arg, sizeof(op.u));
76161 +               rc = _hypercall1(int, physdev_op_compat, &op);
76162 +               memcpy(arg, &op.u, sizeof(op.u));
76163 +       }
76164 +       return rc;
76165 +}
76166 +
76167 +static inline int
76168 +HYPERVISOR_grant_table_op(
76169 +       unsigned int cmd, void *uop, unsigned int count)
76170 +{
76171 +       return _hypercall3(int, grant_table_op, cmd, uop, count);
76172 +}
76173 +
76174 +static inline int
76175 +HYPERVISOR_update_va_mapping_otherdomain(
76176 +       unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
76177 +{
76178 +       return _hypercall4(int, update_va_mapping_otherdomain, va,
76179 +                          new_val.pte, flags, domid);
76180 +}
76181 +
76182 +static inline int
76183 +HYPERVISOR_vm_assist(
76184 +       unsigned int cmd, unsigned int type)
76185 +{
76186 +       return _hypercall2(int, vm_assist, cmd, type);
76187 +}
76188 +
76189 +static inline int
76190 +HYPERVISOR_vcpu_op(
76191 +       int cmd, int vcpuid, void *extra_args)
76192 +{
76193 +       return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
76194 +}
76195 +
76196 +static inline int
76197 +HYPERVISOR_set_segment_base(
76198 +       int reg, unsigned long value)
76199 +{
76200 +       return _hypercall2(int, set_segment_base, reg, value);
76201 +}
76202 +
76203 +static inline int
76204 +HYPERVISOR_suspend(
76205 +       unsigned long srec)
76206 +{
76207 +       struct sched_shutdown sched_shutdown = {
76208 +               .reason = SHUTDOWN_suspend
76209 +       };
76210 +
76211 +       int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
76212 +                            &sched_shutdown, srec);
76213 +
76214 +       if (rc == -ENOSYS)
76215 +               rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
76216 +                                SHUTDOWN_suspend, srec);
76217 +
76218 +       return rc;
76219 +}
76220 +
76221 +static inline int
76222 +HYPERVISOR_nmi_op(
76223 +       unsigned long op, void *arg)
76224 +{
76225 +       return _hypercall2(int, nmi_op, op, arg);
76226 +}
76227 +
76228 +static inline int
76229 +HYPERVISOR_callback_op(
76230 +       int cmd, void *arg)
76231 +{
76232 +       return _hypercall2(int, callback_op, cmd, arg);
76233 +}
76234 +
76235 +static inline int
76236 +HYPERVISOR_xenoprof_op(
76237 +       int op, void *arg)
76238 +{
76239 +       return _hypercall2(int, xenoprof_op, op, arg);
76240 +}
76241 +
76242 +#endif /* __HYPERCALL_H__ */
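For illustration only (not part of the patch): the wrappers above are thin stubs over the _hypercallN macros, and HYPERVISOR_event_channel_op()/HYPERVISOR_physdev_op() transparently retry through the *_compat hypercalls when the newer multiplexed call returns -ENOSYS on an older hypervisor. A minimal, hypothetical caller might look like the sketch below; struct evtchn_alloc_unbound and EVTCHNOP_alloc_unbound are assumed to come from xen/interface/event_channel.h.

/* Hypothetical sketch: allocate an unbound event-channel port for a peer domain. */
static int example_alloc_unbound_port(domid_t remote)
{
	struct evtchn_alloc_unbound alloc = {
		.dom        = DOMID_SELF,	/* allocate in the local domain   */
		.remote_dom = remote,		/* peer allowed to bind the port  */
	};
	int rc;

	/* Falls back to event_channel_op_compat automatically on older Xen. */
	rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &alloc);
	if (rc)
		return rc;

	return alloc.port;			/* newly allocated local port */
}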
76243 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/hypervisor.h new/include/asm-x86_64/mach-xen/asm/hypervisor.h
76244 --- linux-2.6/include/asm-x86_64/mach-xen/asm/hypervisor.h      1970-01-01 01:00:00.000000000 +0100
76245 +++ new/include/asm-x86_64/mach-xen/asm/hypervisor.h    2006-05-09 12:35:41.000000000 +0200
76246 @@ -0,0 +1,2 @@
76247 +
76248 +#include <asm-i386/mach-xen/asm/hypervisor.h>
76249 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/io.h new/include/asm-x86_64/mach-xen/asm/io.h
76250 --- linux-2.6/include/asm-x86_64/mach-xen/asm/io.h      1970-01-01 01:00:00.000000000 +0100
76251 +++ new/include/asm-x86_64/mach-xen/asm/io.h    2006-05-09 12:35:41.000000000 +0200
76252 @@ -0,0 +1,328 @@
76253 +#ifndef _ASM_IO_H
76254 +#define _ASM_IO_H
76255 +
76256 +#include <linux/config.h>
76257 +#include <asm/fixmap.h>
76258 +
76259 +/*
76260 + * This file contains the definitions for the x86 IO instructions
76261 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
76262 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
76263 + * versions of the single-IO instructions (inb_p/inw_p/..).
76264 + *
76265 + * This file is not meant to be obfuscating: it's just complicated
76266 + * to (a) handle it all in a way that makes gcc able to optimize it
76267 + * as well as possible and (b) trying to avoid writing the same thing
76268 + * over and over again with slight variations and possibly making a
76269 + * mistake somewhere.
76270 + */
76271 +
76272 +/*
76273 + * Thanks to James van Artsdalen for a better timing-fix than
76274 + * the two short jumps: using outb's to a nonexistent port seems
76275 + * to guarantee better timings even on fast machines.
76276 + *
76277 + * On the other hand, I'd like to be sure of a non-existent port:
76278 + * I feel a bit unsafe about using 0x80 (should be safe, though)
76279 + *
76280 + *             Linus
76281 + */
76282 +
76283 + /*
76284 +  *  Bit simplified and optimized by Jan Hubicka
76285 +  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
76286 +  *
76287 +  *  isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
76288 +  *  isa_read[wl] and isa_write[wl] fixed
76289 +  *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
76290 +  */
76291 +
76292 +#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
76293 +
76294 +#ifdef REALLY_SLOW_IO
76295 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
76296 +#else
76297 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
76298 +#endif
76299 +
76300 +/*
76301 + * Talk about misusing macros..
76302 + */
76303 +#define __OUT1(s,x) \
76304 +static inline void out##s(unsigned x value, unsigned short port) {
76305 +
76306 +#define __OUT2(s,s1,s2) \
76307 +__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
76308 +
76309 +#define __OUT(s,s1,x) \
76310 +__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
76311 +__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
76312 +
76313 +#define __IN1(s) \
76314 +static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
76315 +
76316 +#define __IN2(s,s1,s2) \
76317 +__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
76318 +
76319 +#define __IN(s,s1,i...) \
76320 +__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
76321 +__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
76322 +
76323 +#define __INS(s) \
76324 +static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
76325 +{ __asm__ __volatile__ ("rep ; ins" #s \
76326 +: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
76327 +
76328 +#define __OUTS(s) \
76329 +static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
76330 +{ __asm__ __volatile__ ("rep ; outs" #s \
76331 +: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
76332 +
76333 +#define RETURN_TYPE unsigned char
76334 +__IN(b,"")
76335 +#undef RETURN_TYPE
76336 +#define RETURN_TYPE unsigned short
76337 +__IN(w,"")
76338 +#undef RETURN_TYPE
76339 +#define RETURN_TYPE unsigned int
76340 +__IN(l,"")
76341 +#undef RETURN_TYPE
76342 +
76343 +__OUT(b,"b",char)
76344 +__OUT(w,"w",short)
76345 +__OUT(l,,int)
76346 +
76347 +__INS(b)
76348 +__INS(w)
76349 +__INS(l)
76350 +
76351 +__OUTS(b)
76352 +__OUTS(w)
76353 +__OUTS(l)
76354 +
76355 +#define IO_SPACE_LIMIT 0xffff
76356 +
76357 +#if defined(__KERNEL__) && __x86_64__
76358 +
76359 +#include <linux/vmalloc.h>
76360 +
76361 +#ifndef __i386__
76362 +/*
76363 + * Change virtual addresses to physical addresses and vv.
76364 + * These are pretty trivial
76365 + */
76366 +static inline unsigned long virt_to_phys(volatile void * address)
76367 +{
76368 +       return __pa(address);
76369 +}
76370 +
76371 +static inline void * phys_to_virt(unsigned long address)
76372 +{
76373 +       return __va(address);
76374 +}
76375 +
76376 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
76377 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
76378 +#endif
76379 +
76380 +/*
76381 + * Change "struct page" to physical address.
76382 + */
76383 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
76384 +#define page_to_phys(page)      (phys_to_machine(page_to_pseudophys(page)))
76385 +#define page_to_bus(page)       (phys_to_machine(page_to_pseudophys(page)))
76386 +
76387 +#define bio_to_pseudophys(bio)  (page_to_pseudophys(bio_page((bio))) + \
76388 +                                 (unsigned long) bio_offset((bio)))
76389 +#define bvec_to_pseudophys(bv)  (page_to_pseudophys((bv)->bv_page) + \
76390 +                                 (unsigned long) (bv)->bv_offset)
76391 +
76392 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)      \
76393 +       (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
76394 +        ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
76395 +         bvec_to_pseudophys((vec2))))
76396 +
76397 +#include <asm-generic/iomap.h>
76398 +
76399 +extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
76400 +
76401 +static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
76402 +{
76403 +       return __ioremap(offset, size, 0);
76404 +}
76405 +
76406 +extern void *early_ioremap(unsigned long addr, unsigned long size);
76407 +extern void early_iounmap(void *addr, unsigned long size);
76408 +
76409 +/*
76410 + * This one maps high address device memory and turns off caching for that area.
76411 + * it's useful if some control registers are in such an area and write combining
76412 + * or read caching is not desirable:
76413 + */
76414 +extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
76415 +extern void iounmap(volatile void __iomem *addr);
76416 +
76417 +/*
76418 + * ISA I/O bus memory addresses are 1:1 with the physical address.
76419 + */
76420 +
76421 +#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x
76422 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
76423 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
76424 +
76425 +/*
76426 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
76427 + * are forbidden in portable PCI drivers.
76428 + *
76429 + * Allow them on x86 for legacy drivers, though.
76430 + */
76431 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
76432 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
76433 +
76434 +/*
76435 + * readX/writeX() are used to access memory mapped devices. On some
76436 + * architectures the memory mapped IO stuff needs to be accessed
76437 + * differently. On the x86 architecture, we just read/write the
76438 + * memory location directly.
76439 + */
76440 +
76441 +static inline __u8 __readb(const volatile void __iomem *addr)
76442 +{
76443 +       return *(__force volatile __u8 *)addr;
76444 +}
76445 +static inline __u16 __readw(const volatile void __iomem *addr)
76446 +{
76447 +       return *(__force volatile __u16 *)addr;
76448 +}
76449 +static __always_inline __u32 __readl(const volatile void __iomem *addr)
76450 +{
76451 +       return *(__force volatile __u32 *)addr;
76452 +}
76453 +static inline __u64 __readq(const volatile void __iomem *addr)
76454 +{
76455 +       return *(__force volatile __u64 *)addr;
76456 +}
76457 +#define readb(x) __readb(x)
76458 +#define readw(x) __readw(x)
76459 +#define readl(x) __readl(x)
76460 +#define readq(x) __readq(x)
76461 +#define readb_relaxed(a) readb(a)
76462 +#define readw_relaxed(a) readw(a)
76463 +#define readl_relaxed(a) readl(a)
76464 +#define readq_relaxed(a) readq(a)
76465 +#define __raw_readb readb
76466 +#define __raw_readw readw
76467 +#define __raw_readl readl
76468 +#define __raw_readq readq
76469 +
76470 +#define mmiowb()
76471 +
76472 +static inline void __writel(__u32 b, volatile void __iomem *addr)
76473 +{
76474 +       *(__force volatile __u32 *)addr = b;
76475 +}
76476 +static inline void __writeq(__u64 b, volatile void __iomem *addr)
76477 +{
76478 +       *(__force volatile __u64 *)addr = b;
76479 +}
76480 +static inline void __writeb(__u8 b, volatile void __iomem *addr)
76481 +{
76482 +       *(__force volatile __u8 *)addr = b;
76483 +}
76484 +static inline void __writew(__u16 b, volatile void __iomem *addr)
76485 +{
76486 +       *(__force volatile __u16 *)addr = b;
76487 +}
76488 +#define writeq(val,addr) __writeq((val),(addr))
76489 +#define writel(val,addr) __writel((val),(addr))
76490 +#define writew(val,addr) __writew((val),(addr))
76491 +#define writeb(val,addr) __writeb((val),(addr))
76492 +#define __raw_writeb writeb
76493 +#define __raw_writew writew
76494 +#define __raw_writel writel
76495 +#define __raw_writeq writeq
76496 +
76497 +void __memcpy_fromio(void*,unsigned long,unsigned);
76498 +void __memcpy_toio(unsigned long,const void*,unsigned);
76499 +
76500 +static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len)
76501 +{
76502 +       __memcpy_fromio(to,(unsigned long)from,len);
76503 +}
76504 +static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len)
76505 +{
76506 +       __memcpy_toio((unsigned long)to,from,len);
76507 +}
76508 +
76509 +void memset_io(volatile void __iomem *a, int b, size_t c);
76510 +
76511 +/*
76512 + * ISA space is 'always mapped' on a typical x86 system, no need to
76513 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
76514 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
76515 + * are physical addresses. The following constant pointer can be
76516 + * used as the IO-area pointer (it can be iounmapped as well, so the
76517 + * analogy with PCI is quite large):
76518 + */
76519 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
76520 +
76521 +/*
76522 + * Again, x86-64 does not require mem IO specific functions.
76523 + */
76524 +
76525 +#define eth_io_copy_and_sum(a,b,c,d)           eth_copy_and_sum((a),(void *)(b),(c),(d))
76526 +
76527 +/**
76528 + *     check_signature         -       find BIOS signatures
76529 + *     @io_addr: mmio address to check 
76530 + *     @signature:  signature block
76531 + *     @length: length of signature
76532 + *
76533 + *     Perform a signature comparison with the mmio address io_addr. This
76534 + *     address should have been obtained by ioremap.
76535 + *     Returns 1 on a match.
76536 + */
76537 +
76538 +static inline int check_signature(void __iomem *io_addr,
76539 +       const unsigned char *signature, int length)
76540 +{
76541 +       int retval = 0;
76542 +       do {
76543 +               if (readb(io_addr) != *signature)
76544 +                       goto out;
76545 +               io_addr++;
76546 +               signature++;
76547 +               length--;
76548 +       } while (length);
76549 +       retval = 1;
76550 +out:
76551 +       return retval;
76552 +}
76553 +
76554 +/* Nothing to do */
76555 +
76556 +#define dma_cache_inv(_start,_size)            do { } while (0)
76557 +#define dma_cache_wback(_start,_size)          do { } while (0)
76558 +#define dma_cache_wback_inv(_start,_size)      do { } while (0)
76559 +
76560 +#define flush_write_buffers() 
76561 +
76562 +extern int iommu_bio_merge;
76563 +#define BIO_VMERGE_BOUNDARY iommu_bio_merge
76564 +
76565 +/*
76566 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
76567 + * access
76568 + */
76569 +#define xlate_dev_mem_ptr(p)   __va(p)
76570 +
76571 +/*
76572 + * Convert a virtual cached pointer to an uncached pointer
76573 + */
76574 +#define xlate_dev_kmem_ptr(p)  p
76575 +
76576 +#endif /* __KERNEL__ */
76577 +
76578 +#define ARCH_HAS_DEV_MEM
76579 +
76580 +#endif
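For illustration only (not part of the patch): under Xen the port-I/O helpers generated by the __IN/__OUT macros above are used exactly as on bare metal, while MMIO goes through ioremap()/readl()/writel(), with machine-address translation handled by the virt_to_bus()/page_to_phys() definitions earlier in this header. The device base and register offsets in the sketch below are made-up values.

/* Hypothetical MMIO probe; the EX_* values are invented for the example. */
#define EX_MMIO_BASE	0xfebf0000UL
#define EX_REG_CTRL	0x00
#define EX_REG_STATUS	0x04

static int example_probe_mmio(void)
{
	void __iomem *regs = ioremap_nocache(EX_MMIO_BASE, PAGE_SIZE);
	unsigned int status;

	if (!regs)
		return -ENOMEM;

	writel(0x1, regs + EX_REG_CTRL);	/* enable the (imaginary) device */
	status = readl(regs + EX_REG_STATUS);	/* read back its status word     */

	iounmap(regs);
	return status ? 0 : -EIO;
}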
76581 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/irq.h new/include/asm-x86_64/mach-xen/asm/irq.h
76582 --- linux-2.6/include/asm-x86_64/mach-xen/asm/irq.h     1970-01-01 01:00:00.000000000 +0100
76583 +++ new/include/asm-x86_64/mach-xen/asm/irq.h   2006-05-09 12:35:41.000000000 +0200
76584 @@ -0,0 +1,39 @@
76585 +#ifndef _ASM_IRQ_H
76586 +#define _ASM_IRQ_H
76587 +
76588 +/*
76589 + *     linux/include/asm/irq.h
76590 + *
76591 + *     (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
76592 + *
76593 + *     IRQ/IPI changes taken from work by Thomas Radke
76594 + *     <tomsoft@informatik.tu-chemnitz.de>
76595 + */
76596 +
76597 +#include <linux/config.h>
76598 +#include <linux/sched.h>
76599 +/* include comes from machine specific directory */
76600 +#include "irq_vectors.h"
76601 +#include <asm/thread_info.h>
76602 +
76603 +static __inline__ int irq_canonicalize(int irq)
76604 +{
76605 +       return ((irq == 2) ? 9 : irq);
76606 +}
76607 +
76608 +#ifdef CONFIG_X86_LOCAL_APIC
76609 +#define ARCH_HAS_NMI_WATCHDOG          /* See include/linux/nmi.h */
76610 +#endif
76611 +
76612 +#define KDB_VECTOR     0xf9
76613 +
76614 +# define irq_ctx_init(cpu) do { } while (0)
76615 +
76616 +#ifdef CONFIG_HOTPLUG_CPU
76617 +#include <linux/cpumask.h>
76618 +extern void fixup_irqs(cpumask_t map);
76619 +#endif
76620 +
76621 +#define __ARCH_HAS_DO_SOFTIRQ 1
76622 +
76623 +#endif /* _ASM_IRQ_H */
76624 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/mmu_context.h new/include/asm-x86_64/mach-xen/asm/mmu_context.h
76625 --- linux-2.6/include/asm-x86_64/mach-xen/asm/mmu_context.h     1970-01-01 01:00:00.000000000 +0100
76626 +++ new/include/asm-x86_64/mach-xen/asm/mmu_context.h   2006-06-28 14:32:14.000000000 +0200
76627 @@ -0,0 +1,136 @@
76628 +#ifndef __X86_64_MMU_CONTEXT_H
76629 +#define __X86_64_MMU_CONTEXT_H
76630 +
76631 +#include <linux/config.h>
76632 +#include <asm/desc.h>
76633 +#include <asm/atomic.h>
76634 +#include <asm/pgalloc.h>
76635 +#include <asm/page.h>
76636 +#include <asm/pda.h>
76637 +#include <asm/pgtable.h>
76638 +#include <asm/tlbflush.h>
76639 +
76640 +/*
76641 + * possibly do the LDT unload here?
76642 + */
76643 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
76644 +void destroy_context(struct mm_struct *mm);
76645 +
76646 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
76647 +{
76648 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
76649 +       if (read_pda(mmu_state) == TLBSTATE_OK) 
76650 +               write_pda(mmu_state, TLBSTATE_LAZY);
76651 +#endif
76652 +}
76653 +
76654 +#define prepare_arch_switch(next)      __prepare_arch_switch()
76655 +
76656 +static inline void __prepare_arch_switch(void)
76657 +{
76658 +       /*
76659 +        * Save away %es, %ds, %fs and %gs. Must happen before reload
76660 +        * of cr3/ldt (i.e., not in __switch_to).
76661 +        */
76662 +       __asm__ __volatile__ (
76663 +               "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
76664 +               : "=m" (current->thread.es),
76665 +                 "=m" (current->thread.ds),
76666 +                 "=m" (current->thread.fsindex),
76667 +                 "=m" (current->thread.gsindex) );
76668 +
76669 +       if (current->thread.ds)
76670 +               __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
76671 +
76672 +       if (current->thread.es)
76673 +               __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
76674 +
76675 +       if (current->thread.fsindex) {
76676 +               __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
76677 +               current->thread.fs = 0;
76678 +       }
76679 +
76680 +       if (current->thread.gsindex) {
76681 +               load_gs_index(0);
76682 +               current->thread.gs = 0;
76683 +       }
76684 +}
76685 +
76686 +extern void mm_pin(struct mm_struct *mm);
76687 +extern void mm_unpin(struct mm_struct *mm);
76688 +void mm_pin_all(void);
76689 +
76690 +static inline void load_cr3(pgd_t *pgd)
76691 +{
76692 +       asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
76693 +                    "memory");
76694 +}
76695 +
76696 +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 
76697 +                            struct task_struct *tsk)
76698 +{
76699 +       unsigned cpu = smp_processor_id();
76700 +       struct mmuext_op _op[3], *op = _op;
76701 +
76702 +       if (likely(prev != next)) {
76703 +               BUG_ON(!next->context.pinned);
76704 +
76705 +               /* stop flush ipis for the previous mm */
76706 +               cpu_clear(cpu, prev->cpu_vm_mask);
76707 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
76708 +               write_pda(mmu_state, TLBSTATE_OK);
76709 +               write_pda(active_mm, next);
76710 +#endif
76711 +               cpu_set(cpu, next->cpu_vm_mask);
76712 +
76713 +               /* load_cr3(next->pgd) */
76714 +               op->cmd = MMUEXT_NEW_BASEPTR;
76715 +               op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
76716 +               op++;
76717 +
76718 +               /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
76719 +               op->cmd = MMUEXT_NEW_USER_BASEPTR;
76720 +               op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
76721 +               op++;
76722 +               
76723 +               if (unlikely(next->context.ldt != prev->context.ldt)) {
76724 +                       /* load_LDT_nolock(&next->context, cpu) */
76725 +                       op->cmd = MMUEXT_SET_LDT;
76726 +                       op->arg1.linear_addr = (unsigned long)next->context.ldt;
76727 +                       op->arg2.nr_ents     = next->context.size;
76728 +                       op++;
76729 +               }
76730 +
76731 +               BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
76732 +       }
76733 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
76734 +       else {
76735 +               write_pda(mmu_state, TLBSTATE_OK);
76736 +               if (read_pda(active_mm) != next)
76737 +                       out_of_line_bug();
76738 +               if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
76739 +                       /* We were in lazy tlb mode and leave_mm disabled 
76740 +                        * tlb flush IPI delivery. We must reload CR3
76741 +                        * to make sure to use no freed page tables.
76742 +                        */
76743 +                        load_cr3(next->pgd);
76744 +                        xen_new_user_pt(__pa(__user_pgd(next->pgd)));          
76745 +                       load_LDT_nolock(&next->context, cpu);
76746 +               }
76747 +       }
76748 +#endif
76749 +}
76750 +
76751 +#define deactivate_mm(tsk,mm)  do { \
76752 +       load_gs_index(0); \
76753 +       asm volatile("movl %0,%%fs"::"r"(0));  \
76754 +} while(0)
76755 +
76756 +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
76757 +{
76758 +       if (!next->context.pinned)
76759 +               mm_pin(next);
76760 +       switch_mm(prev, next, NULL);
76761 +}
76762 +
76763 +#endif
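For illustration only (not part of the patch): switch_mm() above replaces the usual load_cr3()/load_LDT_nolock() sequence with a single batched HYPERVISOR_mmuext_op() hypercall. The same batching pattern works for any MMUEXT_* commands; the sketch below assumes MMUEXT_INVLPG_LOCAL and MMUEXT_TLB_FLUSH_LOCAL from xen/interface/xen.h.

/* Hypothetical helper: flush one VA and then the whole local TLB in one trap. */
static void example_flush_va_then_tlb(unsigned long va)
{
	struct mmuext_op op[2];

	op[0].cmd              = MMUEXT_INVLPG_LOCAL;
	op[0].arg1.linear_addr = va & PAGE_MASK;

	op[1].cmd = MMUEXT_TLB_FLUSH_LOCAL;

	/* One hypercall covers both operations, mirroring switch_mm() above. */
	BUG_ON(HYPERVISOR_mmuext_op(op, 2, NULL, DOMID_SELF));
}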
76764 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/mmu.h new/include/asm-x86_64/mach-xen/asm/mmu.h
76765 --- linux-2.6/include/asm-x86_64/mach-xen/asm/mmu.h     1970-01-01 01:00:00.000000000 +0100
76766 +++ new/include/asm-x86_64/mach-xen/asm/mmu.h   2006-06-28 14:32:14.000000000 +0200
76767 @@ -0,0 +1,37 @@
76768 +#ifndef __x86_64_MMU_H
76769 +#define __x86_64_MMU_H
76770 +
76771 +#include <linux/spinlock.h>
76772 +#include <asm/semaphore.h>
76773 +
76774 +/*
76775 + * The x86_64 doesn't have a mmu context, but
76776 + * we put the segment information here.
76777 + *
76778 + * cpu_vm_mask is used to optimize ldt flushing.
76779 + */
76780 +typedef struct { 
76781 +       void *ldt;
76782 +       rwlock_t ldtlock; 
76783 +       int size;
76784 +       struct semaphore sem; 
76785 +#ifdef CONFIG_XEN
76786 +       unsigned pinned:1;
76787 +       struct list_head unpinned;
76788 +#endif
76789 +} mm_context_t;
76790 +
76791 +#ifdef CONFIG_XEN
76792 +extern struct list_head mm_unpinned;
76793 +extern spinlock_t mm_unpinned_lock;
76794 +
76795 +/* mm/memory.c:exit_mmap hook */
76796 +extern void _arch_exit_mmap(struct mm_struct *mm);
76797 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
76798 +
76799 +/* kernel/fork.c:dup_mmap hook */
76800 +extern void _arch_dup_mmap(struct mm_struct *mm);
76801 +#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
76802 +#endif
76803 +
76804 +#endif
76805 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/msr.h new/include/asm-x86_64/mach-xen/asm/msr.h
76806 --- linux-2.6/include/asm-x86_64/mach-xen/asm/msr.h     1970-01-01 01:00:00.000000000 +0100
76807 +++ new/include/asm-x86_64/mach-xen/asm/msr.h   2006-05-09 12:35:41.000000000 +0200
76808 @@ -0,0 +1,399 @@
76809 +#ifndef X86_64_MSR_H
76810 +#define X86_64_MSR_H 1
76811 +
76812 +#ifndef __ASSEMBLY__
76813 +/*
76814 + * Access to machine-specific registers (available on 586 and better only)
76815 + * Note: the rd* operations modify the parameters directly (without using
76816 + * pointer indirection), this allows gcc to optimize better
76817 + */
76818 +
76819 +#define rdmsr(msr,val1,val2) \
76820 +       __asm__ __volatile__("rdmsr" \
76821 +                           : "=a" (val1), "=d" (val2) \
76822 +                           : "c" (msr))
76823 +
76824 +
76825 +#define rdmsrl(msr,val) do { unsigned long a__,b__; \
76826 +       __asm__ __volatile__("rdmsr" \
76827 +                           : "=a" (a__), "=d" (b__) \
76828 +                           : "c" (msr)); \
76829 +       val = a__ | (b__<<32); \
76830 +} while(0)
76831 +
76832 +#define wrmsr(msr,val1,val2) \
76833 +     __asm__ __volatile__("wrmsr" \
76834 +                         : /* no outputs */ \
76835 +                         : "c" (msr), "a" (val1), "d" (val2))
76836 +
76837 +#define wrmsrl(msr,val) wrmsr(msr,(__u32)((__u64)(val)),((__u64)(val))>>32) 
76838 +
76839 +/* wrmsr with exception handling */
76840 +#define wrmsr_safe(msr,a,b) ({ int ret__;                      \
76841 +       asm volatile("2: wrmsr ; xorl %0,%0\n"                  \
76842 +                    "1:\n\t"                                   \
76843 +                    ".section .fixup,\"ax\"\n\t"               \
76844 +                    "3:  movl %4,%0 ; jmp 1b\n\t"              \
76845 +                    ".previous\n\t"                            \
76846 +                    ".section __ex_table,\"a\"\n"              \
76847 +                    "   .align 8\n\t"                          \
76848 +                    "   .quad  2b,3b\n\t"                      \
76849 +                    ".previous"                                \
76850 +                    : "=a" (ret__)                             \
76851 +                    : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT)); \
76852 +       ret__; })
76853 +
76854 +#define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32))
76855 +
76856 +#define rdmsr_safe(msr,a,b) \
76857 +       ({ int ret__;                                           \
76858 +         asm volatile ("1:       rdmsr\n"                      \
76859 +                      "2:\n"                                   \
76860 +                      ".section .fixup,\"ax\"\n"               \
76861 +                      "3:       movl %4,%0\n"                  \
76862 +                      " jmp 2b\n"                              \
76863 +                      ".previous\n"                            \
76864 +                      ".section __ex_table,\"a\"\n"            \
76865 +                      " .align 8\n"                            \
76866 +                      " .quad 1b,3b\n"                         \
76867 +                      ".previous":"=&bDS" (ret__), "=a"(*(a)), "=d"(*(b))\
76868 +                      :"c"(msr), "i"(-EIO), "0"(0));           \
76869 +         ret__; })             
76870 +
76871 +#define rdtsc(low,high) \
76872 +     __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
76873 +
76874 +#define rdtscl(low) \
76875 +     __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
76876 +
76877 +#define rdtscll(val) do { \
76878 +     unsigned int __a,__d; \
76879 +     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
76880 +     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
76881 +} while(0)
76882 +
76883 +#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
76884 +
76885 +#define rdpmc(counter,low,high) \
76886 +     __asm__ __volatile__("rdpmc" \
76887 +                         : "=a" (low), "=d" (high) \
76888 +                         : "c" (counter))
76889 +
76890 +static inline void cpuid(int op, unsigned int *eax, unsigned int *ebx,
76891 +                        unsigned int *ecx, unsigned int *edx)
76892 +{
76893 +       __asm__(XEN_CPUID
76894 +               : "=a" (*eax),
76895 +                 "=b" (*ebx),
76896 +                 "=c" (*ecx),
76897 +                 "=d" (*edx)
76898 +               : "0" (op));
76899 +}
76900 +
76901 +/* Some CPUID calls want 'count' to be placed in ecx */
76902 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
76903 +               int *edx)
76904 +{
76905 +       __asm__(XEN_CPUID
76906 +               : "=a" (*eax),
76907 +                 "=b" (*ebx),
76908 +                 "=c" (*ecx),
76909 +                 "=d" (*edx)
76910 +               : "0" (op), "c" (count));
76911 +}
76912 +
76913 +/*
76914 + * CPUID functions returning a single datum
76915 + */
76916 +static inline unsigned int cpuid_eax(unsigned int op)
76917 +{
76918 +       unsigned int eax;
76919 +
76920 +       __asm__(XEN_CPUID
76921 +               : "=a" (eax)
76922 +               : "0" (op)
76923 +               : "bx", "cx", "dx");
76924 +       return eax;
76925 +}
76926 +static inline unsigned int cpuid_ebx(unsigned int op)
76927 +{
76928 +       unsigned int eax, ebx;
76929 +
76930 +       __asm__(XEN_CPUID
76931 +               : "=a" (eax), "=b" (ebx)
76932 +               : "0" (op)
76933 +               : "cx", "dx" );
76934 +       return ebx;
76935 +}
76936 +static inline unsigned int cpuid_ecx(unsigned int op)
76937 +{
76938 +       unsigned int eax, ecx;
76939 +
76940 +       __asm__(XEN_CPUID
76941 +               : "=a" (eax), "=c" (ecx)
76942 +               : "0" (op)
76943 +               : "bx", "dx" );
76944 +       return ecx;
76945 +}
76946 +static inline unsigned int cpuid_edx(unsigned int op)
76947 +{
76948 +       unsigned int eax, edx;
76949 +
76950 +       __asm__(XEN_CPUID
76951 +               : "=a" (eax), "=d" (edx)
76952 +               : "0" (op)
76953 +               : "bx", "cx");
76954 +       return edx;
76955 +}
76956 +
76957 +#define MSR_IA32_UCODE_WRITE           0x79
76958 +#define MSR_IA32_UCODE_REV             0x8b
76959 +
76960 +
76961 +#endif
76962 +
76963 +/* AMD/K8 specific MSRs */ 
76964 +#define MSR_EFER 0xc0000080            /* extended feature register */
76965 +#define MSR_STAR 0xc0000081            /* legacy mode SYSCALL target */
76966 +#define MSR_LSTAR 0xc0000082           /* long mode SYSCALL target */
76967 +#define MSR_CSTAR 0xc0000083           /* compatibility mode SYSCALL target */
76968 +#define MSR_SYSCALL_MASK 0xc0000084    /* EFLAGS mask for syscall */
76969 +#define MSR_FS_BASE 0xc0000100         /* 64bit FS base */
76970 +#define MSR_GS_BASE 0xc0000101         /* 64bit GS base */
76971 +#define MSR_KERNEL_GS_BASE  0xc0000102 /* SwapGS GS shadow (or USER_GS from kernel) */ 
76972 +/* EFER bits: */ 
76973 +#define _EFER_SCE 0  /* SYSCALL/SYSRET */
76974 +#define _EFER_LME 8  /* Long mode enable */
76975 +#define _EFER_LMA 10 /* Long mode active (read-only) */
76976 +#define _EFER_NX 11  /* No execute enable */
76977 +
76978 +#define EFER_SCE (1<<_EFER_SCE)
76979 +#define EFER_LME (1<<_EFER_LME)
76980 +#define EFER_LMA (1<<_EFER_LMA)
76981 +#define EFER_NX (1<<_EFER_NX)
76982 +
76983 +/* Intel MSRs. Some also available on other CPUs */
76984 +#define MSR_IA32_TSC           0x10
76985 +#define MSR_IA32_PLATFORM_ID   0x17
76986 +
76987 +#define MSR_IA32_PERFCTR0      0xc1
76988 +#define MSR_IA32_PERFCTR1      0xc2
76989 +
76990 +#define MSR_MTRRcap            0x0fe
76991 +#define MSR_IA32_BBL_CR_CTL        0x119
76992 +
76993 +#define MSR_IA32_SYSENTER_CS   0x174
76994 +#define MSR_IA32_SYSENTER_ESP  0x175
76995 +#define MSR_IA32_SYSENTER_EIP  0x176
76996 +
76997 +#define MSR_IA32_MCG_CAP       0x179
76998 +#define MSR_IA32_MCG_STATUS        0x17a
76999 +#define MSR_IA32_MCG_CTL       0x17b
77000 +
77001 +#define MSR_IA32_EVNTSEL0      0x186
77002 +#define MSR_IA32_EVNTSEL1      0x187
77003 +
77004 +#define MSR_IA32_DEBUGCTLMSR       0x1d9
77005 +#define MSR_IA32_LASTBRANCHFROMIP  0x1db
77006 +#define MSR_IA32_LASTBRANCHTOIP        0x1dc
77007 +#define MSR_IA32_LASTINTFROMIP     0x1dd
77008 +#define MSR_IA32_LASTINTTOIP       0x1de
77009 +
77010 +#define MSR_MTRRfix64K_00000   0x250
77011 +#define MSR_MTRRfix16K_80000   0x258
77012 +#define MSR_MTRRfix16K_A0000   0x259
77013 +#define MSR_MTRRfix4K_C0000    0x268
77014 +#define MSR_MTRRfix4K_C8000    0x269
77015 +#define MSR_MTRRfix4K_D0000    0x26a
77016 +#define MSR_MTRRfix4K_D8000    0x26b
77017 +#define MSR_MTRRfix4K_E0000    0x26c
77018 +#define MSR_MTRRfix4K_E8000    0x26d
77019 +#define MSR_MTRRfix4K_F0000    0x26e
77020 +#define MSR_MTRRfix4K_F8000    0x26f
77021 +#define MSR_MTRRdefType                0x2ff
77022 +
77023 +#define MSR_IA32_MC0_CTL       0x400
77024 +#define MSR_IA32_MC0_STATUS        0x401
77025 +#define MSR_IA32_MC0_ADDR      0x402
77026 +#define MSR_IA32_MC0_MISC      0x403
77027 +
77028 +#define MSR_P6_PERFCTR0                        0xc1
77029 +#define MSR_P6_PERFCTR1                        0xc2
77030 +#define MSR_P6_EVNTSEL0                        0x186
77031 +#define MSR_P6_EVNTSEL1                        0x187
77032 +
77033 +/* K7/K8 MSRs. Not complete. See the architecture manual for a more complete list. */
77034 +#define MSR_K7_EVNTSEL0            0xC0010000
77035 +#define MSR_K7_PERFCTR0            0xC0010004
77036 +#define MSR_K7_EVNTSEL1            0xC0010001
77037 +#define MSR_K7_PERFCTR1            0xC0010005
77038 +#define MSR_K7_EVNTSEL2            0xC0010002
77039 +#define MSR_K7_PERFCTR2            0xC0010006
77040 +#define MSR_K7_EVNTSEL3            0xC0010003
77041 +#define MSR_K7_PERFCTR3            0xC0010007
77042 +#define MSR_K8_TOP_MEM1                   0xC001001A
77043 +#define MSR_K8_TOP_MEM2                   0xC001001D
77044 +#define MSR_K8_SYSCFG             0xC0010010
77045 +#define MSR_K8_HWCR               0xC0010015
77046 +
77047 +/* K6 MSRs */
77048 +#define MSR_K6_EFER                    0xC0000080
77049 +#define MSR_K6_STAR                    0xC0000081
77050 +#define MSR_K6_WHCR                    0xC0000082
77051 +#define MSR_K6_UWCCR                   0xC0000085
77052 +#define MSR_K6_PSOR                    0xC0000087
77053 +#define MSR_K6_PFIR                    0xC0000088
77054 +
77055 +/* Centaur-Hauls/IDT defined MSRs. */
77056 +#define MSR_IDT_FCR1                   0x107
77057 +#define MSR_IDT_FCR2                   0x108
77058 +#define MSR_IDT_FCR3                   0x109
77059 +#define MSR_IDT_FCR4                   0x10a
77060 +
77061 +#define MSR_IDT_MCR0                   0x110
77062 +#define MSR_IDT_MCR1                   0x111
77063 +#define MSR_IDT_MCR2                   0x112
77064 +#define MSR_IDT_MCR3                   0x113
77065 +#define MSR_IDT_MCR4                   0x114
77066 +#define MSR_IDT_MCR5                   0x115
77067 +#define MSR_IDT_MCR6                   0x116
77068 +#define MSR_IDT_MCR7                   0x117
77069 +#define MSR_IDT_MCR_CTRL               0x120
77070 +
77071 +/* VIA Cyrix defined MSRs*/
77072 +#define MSR_VIA_FCR                    0x1107
77073 +#define MSR_VIA_LONGHAUL               0x110a
77074 +#define MSR_VIA_RNG                    0x110b
77075 +#define MSR_VIA_BCR2                   0x1147
77076 +
77077 +/* Intel defined MSRs. */
77078 +#define MSR_IA32_P5_MC_ADDR            0
77079 +#define MSR_IA32_P5_MC_TYPE            1
77080 +#define MSR_IA32_PLATFORM_ID           0x17
77081 +#define MSR_IA32_EBL_CR_POWERON                0x2a
77082 +
77083 +#define MSR_IA32_APICBASE               0x1b
77084 +#define MSR_IA32_APICBASE_BSP           (1<<8)
77085 +#define MSR_IA32_APICBASE_ENABLE        (1<<11)
77086 +#define MSR_IA32_APICBASE_BASE          (0xfffff<<12)
77087 +
77088 +/* P4/Xeon+ specific */
77089 +#define MSR_IA32_MCG_EAX               0x180
77090 +#define MSR_IA32_MCG_EBX               0x181
77091 +#define MSR_IA32_MCG_ECX               0x182
77092 +#define MSR_IA32_MCG_EDX               0x183
77093 +#define MSR_IA32_MCG_ESI               0x184
77094 +#define MSR_IA32_MCG_EDI               0x185
77095 +#define MSR_IA32_MCG_EBP               0x186
77096 +#define MSR_IA32_MCG_ESP               0x187
77097 +#define MSR_IA32_MCG_EFLAGS            0x188
77098 +#define MSR_IA32_MCG_EIP               0x189
77099 +#define MSR_IA32_MCG_RESERVED          0x18A
77100 +
77101 +#define MSR_P6_EVNTSEL0                        0x186
77102 +#define MSR_P6_EVNTSEL1                        0x187
77103 +
77104 +#define MSR_IA32_PERF_STATUS           0x198
77105 +#define MSR_IA32_PERF_CTL              0x199
77106 +
77107 +#define MSR_IA32_THERM_CONTROL         0x19a
77108 +#define MSR_IA32_THERM_INTERRUPT       0x19b
77109 +#define MSR_IA32_THERM_STATUS          0x19c
77110 +#define MSR_IA32_MISC_ENABLE           0x1a0
77111 +
77112 +#define MSR_IA32_DEBUGCTLMSR           0x1d9
77113 +#define MSR_IA32_LASTBRANCHFROMIP      0x1db
77114 +#define MSR_IA32_LASTBRANCHTOIP                0x1dc
77115 +#define MSR_IA32_LASTINTFROMIP         0x1dd
77116 +#define MSR_IA32_LASTINTTOIP           0x1de
77117 +
77118 +#define MSR_IA32_MC0_CTL               0x400
77119 +#define MSR_IA32_MC0_STATUS            0x401
77120 +#define MSR_IA32_MC0_ADDR              0x402
77121 +#define MSR_IA32_MC0_MISC              0x403
77122 +
77123 +/* Pentium IV performance counter MSRs */
77124 +#define MSR_P4_BPU_PERFCTR0            0x300
77125 +#define MSR_P4_BPU_PERFCTR1            0x301
77126 +#define MSR_P4_BPU_PERFCTR2            0x302
77127 +#define MSR_P4_BPU_PERFCTR3            0x303
77128 +#define MSR_P4_MS_PERFCTR0             0x304
77129 +#define MSR_P4_MS_PERFCTR1             0x305
77130 +#define MSR_P4_MS_PERFCTR2             0x306
77131 +#define MSR_P4_MS_PERFCTR3             0x307
77132 +#define MSR_P4_FLAME_PERFCTR0          0x308
77133 +#define MSR_P4_FLAME_PERFCTR1          0x309
77134 +#define MSR_P4_FLAME_PERFCTR2          0x30a
77135 +#define MSR_P4_FLAME_PERFCTR3          0x30b
77136 +#define MSR_P4_IQ_PERFCTR0             0x30c
77137 +#define MSR_P4_IQ_PERFCTR1             0x30d
77138 +#define MSR_P4_IQ_PERFCTR2             0x30e
77139 +#define MSR_P4_IQ_PERFCTR3             0x30f
77140 +#define MSR_P4_IQ_PERFCTR4             0x310
77141 +#define MSR_P4_IQ_PERFCTR5             0x311
77142 +#define MSR_P4_BPU_CCCR0               0x360
77143 +#define MSR_P4_BPU_CCCR1               0x361
77144 +#define MSR_P4_BPU_CCCR2               0x362
77145 +#define MSR_P4_BPU_CCCR3               0x363
77146 +#define MSR_P4_MS_CCCR0                0x364
77147 +#define MSR_P4_MS_CCCR1                0x365
77148 +#define MSR_P4_MS_CCCR2                0x366
77149 +#define MSR_P4_MS_CCCR3                0x367
77150 +#define MSR_P4_FLAME_CCCR0             0x368
77151 +#define MSR_P4_FLAME_CCCR1             0x369
77152 +#define MSR_P4_FLAME_CCCR2             0x36a
77153 +#define MSR_P4_FLAME_CCCR3             0x36b
77154 +#define MSR_P4_IQ_CCCR0                0x36c
77155 +#define MSR_P4_IQ_CCCR1                0x36d
77156 +#define MSR_P4_IQ_CCCR2                0x36e
77157 +#define MSR_P4_IQ_CCCR3                0x36f
77158 +#define MSR_P4_IQ_CCCR4                0x370
77159 +#define MSR_P4_IQ_CCCR5                0x371
77160 +#define MSR_P4_ALF_ESCR0               0x3ca
77161 +#define MSR_P4_ALF_ESCR1               0x3cb
77162 +#define MSR_P4_BPU_ESCR0               0x3b2
77163 +#define MSR_P4_BPU_ESCR1               0x3b3
77164 +#define MSR_P4_BSU_ESCR0               0x3a0
77165 +#define MSR_P4_BSU_ESCR1               0x3a1
77166 +#define MSR_P4_CRU_ESCR0               0x3b8
77167 +#define MSR_P4_CRU_ESCR1               0x3b9
77168 +#define MSR_P4_CRU_ESCR2               0x3cc
77169 +#define MSR_P4_CRU_ESCR3               0x3cd
77170 +#define MSR_P4_CRU_ESCR4               0x3e0
77171 +#define MSR_P4_CRU_ESCR5               0x3e1
77172 +#define MSR_P4_DAC_ESCR0               0x3a8
77173 +#define MSR_P4_DAC_ESCR1               0x3a9
77174 +#define MSR_P4_FIRM_ESCR0              0x3a4
77175 +#define MSR_P4_FIRM_ESCR1              0x3a5
77176 +#define MSR_P4_FLAME_ESCR0             0x3a6
77177 +#define MSR_P4_FLAME_ESCR1             0x3a7
77178 +#define MSR_P4_FSB_ESCR0               0x3a2
77179 +#define MSR_P4_FSB_ESCR1               0x3a3
77180 +#define MSR_P4_IQ_ESCR0                0x3ba
77181 +#define MSR_P4_IQ_ESCR1                0x3bb
77182 +#define MSR_P4_IS_ESCR0                0x3b4
77183 +#define MSR_P4_IS_ESCR1                0x3b5
77184 +#define MSR_P4_ITLB_ESCR0              0x3b6
77185 +#define MSR_P4_ITLB_ESCR1              0x3b7
77186 +#define MSR_P4_IX_ESCR0                0x3c8
77187 +#define MSR_P4_IX_ESCR1                0x3c9
77188 +#define MSR_P4_MOB_ESCR0               0x3aa
77189 +#define MSR_P4_MOB_ESCR1               0x3ab
77190 +#define MSR_P4_MS_ESCR0                0x3c0
77191 +#define MSR_P4_MS_ESCR1                0x3c1
77192 +#define MSR_P4_PMH_ESCR0               0x3ac
77193 +#define MSR_P4_PMH_ESCR1               0x3ad
77194 +#define MSR_P4_RAT_ESCR0               0x3bc
77195 +#define MSR_P4_RAT_ESCR1               0x3bd
77196 +#define MSR_P4_SAAT_ESCR0              0x3ae
77197 +#define MSR_P4_SAAT_ESCR1              0x3af
77198 +#define MSR_P4_SSU_ESCR0               0x3be
77199 +#define MSR_P4_SSU_ESCR1               0x3bf    /* guess: not defined in manual */
77200 +#define MSR_P4_TBPU_ESCR0              0x3c2
77201 +#define MSR_P4_TBPU_ESCR1              0x3c3
77202 +#define MSR_P4_TC_ESCR0                0x3c4
77203 +#define MSR_P4_TC_ESCR1                0x3c5
77204 +#define MSR_P4_U2L_ESCR0               0x3b0
77205 +#define MSR_P4_U2L_ESCR1               0x3b1
77206 +
77207 +#endif
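For illustration only (not part of the patch): as the note at the top of this header says, the rd*/wr* macros take lvalues rather than pointers, so the value is assigned directly. A hypothetical reader/writer pair using the definitions above:

/* Sketch: test a bit in EFER; rdmsrl() writes 'efer' directly. */
static int example_efer_nx_enabled(void)
{
	unsigned long efer;

	rdmsrl(MSR_EFER, efer);
	return (efer & EFER_NX) != 0;
}

/* Sketch: the *_safe variants report a faulting access instead of oopsing. */
static int example_rewrite_efer(unsigned long efer)
{
	return checking_wrmsrl(MSR_EFER, efer);	/* non-zero if the wrmsr trapped */
}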
77208 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/nmi.h new/include/asm-x86_64/mach-xen/asm/nmi.h
77209 --- linux-2.6/include/asm-x86_64/mach-xen/asm/nmi.h     1970-01-01 01:00:00.000000000 +0100
77210 +++ new/include/asm-x86_64/mach-xen/asm/nmi.h   2006-05-09 12:35:41.000000000 +0200
77211 @@ -0,0 +1,75 @@
77212 +/*
77213 + *  linux/include/asm-i386/nmi.h
77214 + */
77215 +#ifndef ASM_NMI_H
77216 +#define ASM_NMI_H
77217 +
77218 +#include <linux/pm.h>
77219 +
77220 +#include <xen/interface/nmi.h>
77221 +
77222 +struct pt_regs;
77223 +
77224 +typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
77225 +
77226 +/** 
77227 + * set_nmi_callback
77228 + *
77229 + * Set a handler for an NMI. Only one handler may be
77230 + * set. Return 1 if the NMI was handled.
77231 + */
77232 +void set_nmi_callback(nmi_callback_t callback);
77233 +
77234 +/** 
77235 + * unset_nmi_callback
77236 + *
77237 + * Remove the handler previously set.
77238 + */
77239 +void unset_nmi_callback(void);
77240 +
77241 +#ifdef CONFIG_PM
77242 +
77243 +/** Replace the PM callback routine for NMI. */
77244 +struct pm_dev * set_nmi_pm_callback(pm_callback callback);
77245 +
77246 +/** Unset the PM callback routine back to the default. */
77247 +void unset_nmi_pm_callback(struct pm_dev * dev);
77248 +
77249 +#else
77250 +
77251 +static inline struct pm_dev * set_nmi_pm_callback(pm_callback callback)
77252 +{
77253 +       return 0;
77254 +} 
77255 +
77256 +static inline void unset_nmi_pm_callback(struct pm_dev * dev)
77257 +{
77258 +}
77259 +
77260 +#endif /* CONFIG_PM */
77261 +
77262 +extern void default_do_nmi(struct pt_regs *);
77263 +extern void die_nmi(char *str, struct pt_regs *regs);
77264 +
77265 +static inline unsigned char get_nmi_reason(void)
77266 +{
77267 +        shared_info_t *s = HYPERVISOR_shared_info;
77268 +        unsigned char reason = 0;
77269 +
77270 +        /* construct a value which looks like it came from
77271 +         * port 0x61.
77272 +         */
77273 +        if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
77274 +                reason |= 0x40;
77275 +        if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
77276 +                reason |= 0x80;
77277 +
77278 +        return reason;
77279 +}
77280 +
77281 +extern int panic_on_timeout;
77282 +extern int unknown_nmi_panic;
77283 +
77284 +extern int check_nmi_watchdog(void);
77285 +
77286 +#endif /* ASM_NMI_H */
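For illustration only (not part of the patch): the kerneldoc above defines the callback contract (a single handler, returning 1 when the NMI was consumed). A hypothetical handler built on get_nmi_reason(), together with its registration:

/* Sketch: claim the NMI only when Xen reports an I/O check error. */
static int example_nmi_handler(struct pt_regs *regs, int cpu)
{
	if (get_nmi_reason() & 0x40)	/* bit synthesised from _XEN_NMIREASON_io_error */
		return 1;		/* handled */
	return 0;			/* fall through to default_do_nmi() */
}

static void example_install(void) { set_nmi_callback(example_nmi_handler); }
static void example_remove(void)  { unset_nmi_callback(); }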
77287 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/page.h new/include/asm-x86_64/mach-xen/asm/page.h
77288 --- linux-2.6/include/asm-x86_64/mach-xen/asm/page.h    1970-01-01 01:00:00.000000000 +0100
77289 +++ new/include/asm-x86_64/mach-xen/asm/page.h  2006-06-28 14:32:14.000000000 +0200
77290 @@ -0,0 +1,327 @@
77291 +#ifndef _X86_64_PAGE_H
77292 +#define _X86_64_PAGE_H
77293 +
77294 +#include <linux/config.h>
77295 +/* #include <linux/string.h> */
77296 +#ifndef __ASSEMBLY__
77297 +#include <linux/kernel.h>
77298 +#include <linux/types.h>
77299 +#include <asm/bug.h>
77300 +#include <xen/features.h>
77301 +#endif
77302 +#include <xen/interface/xen.h> 
77303 +#include <xen/foreign_page.h>
77304 +
77305 +#define arch_free_page(_page,_order)                   \
77306 +({     int foreign = PageForeign(_page);               \
77307 +       if (foreign)                                    \
77308 +               (PageForeignDestructor(_page))(_page);  \
77309 +       foreign;                                        \
77310 +})
77311 +#define HAVE_ARCH_FREE_PAGE
77312 +
77313 +#ifdef CONFIG_XEN_SCRUB_PAGES
77314 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
77315 +#else
77316 +#define scrub_pages(_p,_n) ((void)0)
77317 +#endif
77318 +
77319 +/* PAGE_SHIFT determines the page size */
77320 +#define PAGE_SHIFT     12
77321 +#ifdef __ASSEMBLY__
77322 +#define PAGE_SIZE      (0x1 << PAGE_SHIFT)
77323 +#else
77324 +#define PAGE_SIZE      (1UL << PAGE_SHIFT)
77325 +#endif
77326 +#define PAGE_MASK      (~(PAGE_SIZE-1))
77327 +#define PHYSICAL_PAGE_MASK     (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
77328 +
77329 +#define THREAD_ORDER 1 
77330 +#define THREAD_SIZE  (PAGE_SIZE << THREAD_ORDER)
77331 +#define CURRENT_MASK (~(THREAD_SIZE-1))
77332 +
77333 +#define EXCEPTION_STACK_ORDER 0
77334 +#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
77335 +
77336 +#define DEBUG_STACK_ORDER EXCEPTION_STACK_ORDER
77337 +#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
77338 +
77339 +#define IRQSTACK_ORDER 2
77340 +#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
77341 +
77342 +#define STACKFAULT_STACK 1
77343 +#define DOUBLEFAULT_STACK 2
77344 +#define NMI_STACK 3
77345 +#define DEBUG_STACK 4
77346 +#define MCE_STACK 5
77347 +#define N_EXCEPTION_STACKS 5  /* hw limit: 7 */
77348 +
77349 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
77350 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
77351 +
77352 +#define HPAGE_SHIFT PMD_SHIFT
77353 +#define HPAGE_SIZE     ((1UL) << HPAGE_SHIFT)
77354 +#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
77355 +#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
77356 +
77357 +#ifdef __KERNEL__
77358 +#ifndef __ASSEMBLY__
77359 +
77360 +extern unsigned long end_pfn;
77361 +
77362 +void clear_page(void *);
77363 +void copy_page(void *, void *);
77364 +
77365 +#define clear_user_page(page, vaddr, pg)       clear_page(page)
77366 +#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
77367 +
77368 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
77369 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
77370 +
77371 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
77372 +#define INVALID_P2M_ENTRY      (~0UL)
77373 +#define FOREIGN_FRAME_BIT      (1UL<<63)
77374 +#define FOREIGN_FRAME(m)       ((m) | FOREIGN_FRAME_BIT)
77375 +
77376 +extern unsigned long *phys_to_machine_mapping;
77377 +
77378 +#undef machine_to_phys_mapping
77379 +extern unsigned long *machine_to_phys_mapping;
77380 +extern unsigned int   machine_to_phys_order;
77381 +
77382 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
77383 +{
77384 +       if (xen_feature(XENFEAT_auto_translated_physmap))
77385 +               return pfn;
77386 +       return phys_to_machine_mapping[(unsigned int)(pfn)] &
77387 +               ~FOREIGN_FRAME_BIT;
77388 +}
77389 +
77390 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
77391 +{
77392 +       if (xen_feature(XENFEAT_auto_translated_physmap))
77393 +               return 1;
77394 +       return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
77395 +}
77396 +
77397 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
77398 +{
77399 +       unsigned long pfn;
77400 +
77401 +       if (xen_feature(XENFEAT_auto_translated_physmap))
77402 +               return mfn;
77403 +
77404 +       if (unlikely((mfn >> machine_to_phys_order) != 0))
77405 +               return end_pfn;
77406 +
77407 +       /* The array access can fail (e.g., device space beyond end of RAM). */
77408 +       asm (
77409 +               "1:     movq %1,%0\n"
77410 +               "2:\n"
77411 +               ".section .fixup,\"ax\"\n"
77412 +               "3:     movq %2,%0\n"
77413 +               "       jmp  2b\n"
77414 +               ".previous\n"
77415 +               ".section __ex_table,\"a\"\n"
77416 +               "       .align 8\n"
77417 +               "       .quad 1b,3b\n"
77418 +               ".previous"
77419 +               : "=r" (pfn)
77420 +               : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) );
77421 +
77422 +       return pfn;
77423 +}
77424 +
77425 +/*
77426 + * We detect special mappings in one of two ways:
77427 + *  1. If the MFN is an I/O page then Xen will set the m2p entry
77428 + *     to be outside our maximum possible pseudophys range.
77429 + *  2. If the MFN belongs to a different domain then we will certainly
77430 + *     not have MFN in our p2m table. Conversely, if the page is ours,
77431 + *     then we'll have p2m(m2p(MFN))==MFN.
77432 + * If we detect a special mapping then it doesn't have a 'struct page'.
77433 + * We force !pfn_valid() by returning an out-of-range pointer.
77434 + *
77435 + * NB. These checks require that, for any MFN that is not in our reservation,
77436 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
77437 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
77438 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
77439 + *
77440 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
77441 + *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
77442 + *      require. In all the cases we care about, the FOREIGN_FRAME bit is
77443 + *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
77444 + */
77445 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
77446 +{
77447 +       unsigned long pfn = mfn_to_pfn(mfn);
77448 +       if ((pfn < end_pfn)
77449 +           && !xen_feature(XENFEAT_auto_translated_physmap)
77450 +           && (phys_to_machine_mapping[pfn] != mfn))
77451 +               return end_pfn; /* force !pfn_valid() */
77452 +       return pfn;
77453 +}
77454 +
77455 +
77456 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
77457 +{
77458 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
77459 +               BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
77460 +               return;
77461 +       }
77462 +       phys_to_machine_mapping[pfn] = mfn;
77463 +}
77464 +
77465 +/* Definitions for machine and pseudophysical addresses. */
77466 +typedef unsigned long paddr_t;
77467 +typedef unsigned long maddr_t;
77468 +
77469 +static inline maddr_t phys_to_machine(paddr_t phys)
77470 +{
77471 +       maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
77472 +       machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
77473 +       return machine;
77474 +}
77475 +
77476 +static inline paddr_t machine_to_phys(maddr_t machine)
77477 +{
77478 +       paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
77479 +       phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
77480 +       return phys;
77481 +}
77482 +
77483 +/*
77484 + * These are used to make use of C type-checking..
77485 + */
77486 +typedef struct { unsigned long pte; } pte_t;
77487 +typedef struct { unsigned long pmd; } pmd_t;
77488 +typedef struct { unsigned long pud; } pud_t;
77489 +typedef struct { unsigned long pgd; } pgd_t;
77490 +#define PTE_MASK       PHYSICAL_PAGE_MASK
77491 +
77492 +typedef struct { unsigned long pgprot; } pgprot_t;
77493 +
77494 +#define pte_val(x)     (((x).pte & 1) ? machine_to_phys((x).pte) : \
77495 +                        (x).pte)
77496 +#define pte_val_ma(x)  ((x).pte)
77497 +
77498 +static inline unsigned long pmd_val(pmd_t x)
77499 +{
77500 +       unsigned long ret = x.pmd;
77501 +       if (ret) ret = machine_to_phys(ret);
77502 +       return ret;
77503 +}
77504 +
77505 +static inline unsigned long pud_val(pud_t x)
77506 +{
77507 +       unsigned long ret = x.pud;
77508 +       if (ret) ret = machine_to_phys(ret);
77509 +       return ret;
77510 +}
77511 +
77512 +static inline unsigned long pgd_val(pgd_t x)
77513 +{
77514 +       unsigned long ret = x.pgd;
77515 +       if (ret) ret = machine_to_phys(ret);
77516 +       return ret;
77517 +}
77518 +
77519 +#define pgprot_val(x)  ((x).pgprot)
77520 +
77521 +#define __pte_ma(x)     ((pte_t) { (x) } )
77522 +
77523 +static inline pte_t __pte(unsigned long x)
77524 +{
77525 +       if (x & 1) x = phys_to_machine(x);
77526 +       return ((pte_t) { (x) });
77527 +}
77528 +
77529 +static inline pmd_t __pmd(unsigned long x)
77530 +{
77531 +       if ((x & 1)) x = phys_to_machine(x);
77532 +       return ((pmd_t) { (x) });
77533 +}
77534 +
77535 +static inline pud_t __pud(unsigned long x)
77536 +{
77537 +       if ((x & 1)) x = phys_to_machine(x);
77538 +       return ((pud_t) { (x) });
77539 +}
77540 +
77541 +static inline pgd_t __pgd(unsigned long x)
77542 +{
77543 +       if ((x & 1)) x = phys_to_machine(x);
77544 +       return ((pgd_t) { (x) });
77545 +}
77546 +
77547 +#define __pgprot(x)    ((pgprot_t) { (x) } )
77548 +
77549 +#define __PHYSICAL_START       ((unsigned long)CONFIG_PHYSICAL_START)
77550 +#define __START_KERNEL         (__START_KERNEL_map + __PHYSICAL_START)
77551 +#define __START_KERNEL_map     0xffffffff80000000UL
77552 +#define __PAGE_OFFSET           0xffff880000000000UL   
77553 +
77554 +#else
77555 +#define __PHYSICAL_START       CONFIG_PHYSICAL_START
77556 +#define __START_KERNEL         (__START_KERNEL_map + __PHYSICAL_START)
77557 +#define __START_KERNEL_map     0xffffffff80000000
77558 +#define __PAGE_OFFSET           0xffff880000000000
77559 +#endif /* !__ASSEMBLY__ */
77560 +
77561 +#ifdef CONFIG_XEN_COMPAT_030002
77562 +#undef LOAD_OFFSET
77563 +#define LOAD_OFFSET            0
77564 +#endif /* CONFIG_XEN_COMPAT_030002 */
77565 +
77566 +/* to align the pointer to the (next) page boundary */
77567 +#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
77568 +
77569 +/* See Documentation/x86_64/mm.txt for a description of the memory map. */
77570 +#define __PHYSICAL_MASK_SHIFT  46
77571 +#define __PHYSICAL_MASK                ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
77572 +#define __VIRTUAL_MASK_SHIFT   48
77573 +#define __VIRTUAL_MASK         ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
77574 +
77575 +#define KERNEL_TEXT_SIZE  (40UL*1024*1024)
77576 +#define KERNEL_TEXT_START 0xffffffff80000000UL 
77577 +
77578 +#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
77579 +
77580 +/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
77581 +   Otherwise you risk miscompilation. */ 
77582 +#define __pa(x)                        (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
77583 +/* __pa_symbol should be used for C visible symbols.
77584 +   This seems to be the official gcc blessed way to do such arithmetic. */ 
77585 +#define __pa_symbol(x)         \
77586 +       ({unsigned long v;  \
77587 +         asm("" : "=r" (v) : "0" (x)); \
77588 +         __pa(v); })
77589 +
77590 +#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
77591 +#define __boot_va(x)           __va(x)
77592 +#define __boot_pa(x)           __pa(x)
77593 +#ifdef CONFIG_FLATMEM
77594 +#define pfn_valid(pfn)         ((pfn) < end_pfn)
77595 +#endif
77596 +
77597 +#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
77598 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
77599 +#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
77600 +
77601 +/* VIRT <-> MACHINE conversion */
77602 +#define virt_to_machine(v)     (phys_to_machine(__pa(v)))
77603 +#define virt_to_mfn(v)         (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
77604 +#define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
77605 +
77606 +#define VM_DATA_DEFAULT_FLAGS \
77607 +       (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
77608 +        VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
77609 +
77610 +#define __HAVE_ARCH_GATE_AREA 1        
77611 +
77612 +#endif /* __KERNEL__ */
77613 +
77614 +#include <asm-generic/memory_model.h>
77615 +#include <asm-generic/page.h>
77616 +
77617 +#endif /* _X86_64_PAGE_H */
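For illustration only (not part of the patch): the point of the machine <-> pseudophysical split above is that __pa()/__va() keep operating on the guest's contiguous pseudophysical space, while anything handed to hardware or the hypervisor is translated through the p2m table. A hypothetical round trip through the helpers defined in this header:

/* Sketch: translate a kernel buffer both ways and sanity-check the mapping. */
static void example_translate(void *buf)
{
	unsigned long pfn = __pa(buf) >> PAGE_SHIFT;	/* pseudophysical frame  */
	unsigned long mfn = pfn_to_mfn(pfn);		/* real machine frame    */
	maddr_t       ma  = virt_to_machine(buf);	/* full machine address  */

	/* For a page this domain owns, p2m and m2p must agree. */
	BUG_ON(mfn_to_pfn(mfn) != pfn);
	BUG_ON((ma >> PAGE_SHIFT) != mfn);
}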
77618 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/param.h new/include/asm-x86_64/mach-xen/asm/param.h
77619 --- linux-2.6/include/asm-x86_64/mach-xen/asm/param.h   1970-01-01 01:00:00.000000000 +0100
77620 +++ new/include/asm-x86_64/mach-xen/asm/param.h 2006-05-09 12:35:41.000000000 +0200
77621 @@ -0,0 +1,23 @@
77622 +#ifndef _ASMx86_64_PARAM_H
77623 +#define _ASMx86_64_PARAM_H
77624 +
77625 +#ifdef __KERNEL__
77626 +# include <linux/config.h>
77627 +# define HZ            CONFIG_HZ       /* Internal kernel timer frequency */
77628 +# define USER_HZ       100             /* .. some user interfaces are in "ticks" */
77629 +# define CLOCKS_PER_SEC                (USER_HZ)       /* like times() */
77630 +#endif
77631 +
77632 +#ifndef HZ
77633 +#define HZ 100
77634 +#endif
77635 +
77636 +#define EXEC_PAGESIZE  4096
77637 +
77638 +#ifndef NOGROUP
77639 +#define NOGROUP                (-1)
77640 +#endif
77641 +
77642 +#define MAXHOSTNAMELEN 64      /* max length of hostname */
77643 +
77644 +#endif
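
The HZ/USER_HZ split above exists because user-visible interfaces count in fixed 100 Hz ticks regardless of the kernel's internal timer frequency. The sketch below shows the implied conversion (illustrative only; the helper is hypothetical, and the real kernel uses jiffies_to_clock_t() for this):

/* Illustrative only: rescale a jiffies count to the USER_HZ "clock ticks"
 * that times() and friends report.
 */
static inline unsigned long example_jiffies_to_user_ticks(unsigned long j)
{
        /* exact when HZ is a multiple of USER_HZ (e.g. HZ=1000; HZ=250 is not) */
        if (HZ % USER_HZ == 0)
                return j / (HZ / USER_HZ);
        return (unsigned long)((unsigned long long)j * USER_HZ / HZ);
}
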
77645 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/pci.h new/include/asm-x86_64/mach-xen/asm/pci.h
77646 --- linux-2.6/include/asm-x86_64/mach-xen/asm/pci.h     1970-01-01 01:00:00.000000000 +0100
77647 +++ new/include/asm-x86_64/mach-xen/asm/pci.h   2006-05-09 12:35:41.000000000 +0200
77648 @@ -0,0 +1,174 @@
77649 +#ifndef __x8664_PCI_H
77650 +#define __x8664_PCI_H
77651 +
77652 +#include <linux/config.h>
77653 +#include <asm/io.h>
77654 +
77655 +#ifdef __KERNEL__
77656 +
77657 +#include <linux/mm.h> /* for struct page */
77658 +
77659 +/* Can be used to override the logic in pci_scan_bus for skipping
77660 +   already-configured bus numbers - to be used for buggy BIOSes
77661 +   or architectures with incomplete PCI setup by the loader */
77662 +
77663 +#ifdef CONFIG_PCI
77664 +extern unsigned int pcibios_assign_all_busses(void);
77665 +#else
77666 +#define pcibios_assign_all_busses()    0
77667 +#endif
77668 +#define pcibios_scan_all_fns(a, b)     0
77669 +
77670 +extern unsigned long pci_mem_start;
77671 +#define PCIBIOS_MIN_IO         0x1000
77672 +#define PCIBIOS_MIN_MEM                (pci_mem_start)
77673 +
77674 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
77675 +
77676 +void pcibios_config_init(void);
77677 +struct pci_bus * pcibios_scan_root(int bus);
77678 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
77679 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
77680 +
77681 +void pcibios_set_master(struct pci_dev *dev);
77682 +void pcibios_penalize_isa_irq(int irq, int active);
77683 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
77684 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
77685 +
77686 +#include <linux/types.h>
77687 +#include <linux/slab.h>
77688 +#include <asm/scatterlist.h>
77689 +#include <linux/string.h>
77690 +#include <asm/page.h>
77691 +#include <linux/dma-mapping.h> /* for have_iommu */
77692 +
77693 +extern int iommu_setup(char *opt);
77694 +
77695 +/* The PCI address space does equal the physical memory
77696 + * address space.  The networking and block device layers use
77697 + * this boolean for bounce buffer decisions.
77698 + *
77699 + * On AMD64 it mostly equals, but we set it to zero if a hardware
77700 + * IOMMU (gart) or software IOMMU (swiotlb) is available.
77701 + */
77702 +#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
77703 +
77704 +#ifdef CONFIG_GART_IOMMU
77705 +
77706 +/*
77707 + * x86-64 always supports DAC, but sometimes it is useful to force
77708 + * devices through the IOMMU to get automatic sg list merging.
77709 + * Optional right now.
77710 + */
77711 +extern int iommu_sac_force;
77712 +#define pci_dac_dma_supported(pci_dev, mask)   (!iommu_sac_force)
77713 +
77714 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
77715 +       dma_addr_t ADDR_NAME;
77716 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)                \
77717 +       __u32 LEN_NAME;
77718 +#define pci_unmap_addr(PTR, ADDR_NAME)                 \
77719 +       ((PTR)->ADDR_NAME)
77720 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)                \
77721 +       (((PTR)->ADDR_NAME) = (VAL))
77722 +#define pci_unmap_len(PTR, LEN_NAME)                   \
77723 +       ((PTR)->LEN_NAME)
77724 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
77725 +       (((PTR)->LEN_NAME) = (VAL))
77726 +
77727 +#elif defined(CONFIG_SWIOTLB)
77728 +
77729 +#define pci_dac_dma_supported(pci_dev, mask)    1
77730 +
77731 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
77732 +       dma_addr_t ADDR_NAME;
77733 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)                \
77734 +       __u32 LEN_NAME;
77735 +#define pci_unmap_addr(PTR, ADDR_NAME)                 \
77736 +       ((PTR)->ADDR_NAME)
77737 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)                \
77738 +       (((PTR)->ADDR_NAME) = (VAL))
77739 +#define pci_unmap_len(PTR, LEN_NAME)                   \
77740 +       ((PTR)->LEN_NAME)
77741 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
77742 +       (((PTR)->LEN_NAME) = (VAL))
77743 +
77744 +#else
77745 +/* No IOMMU */
77746 +
77747 +#define pci_dac_dma_supported(pci_dev, mask)    1
77748 +
77749 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
77750 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
77751 +#define pci_unmap_addr(PTR, ADDR_NAME)         (0)
77752 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)        do { } while (0)
77753 +#define pci_unmap_len(PTR, LEN_NAME)           (0)
77754 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)  do { } while (0)
77755 +
77756 +#endif
77757 +
77758 +#include <asm-generic/pci-dma-compat.h>
77759 +
77760 +static inline dma64_addr_t
77761 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
77762 +{
77763 +       return ((dma64_addr_t) page_to_phys(page) +
77764 +               (dma64_addr_t) offset);
77765 +}
77766 +
77767 +static inline struct page *
77768 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
77769 +{
77770 +       return virt_to_page(__va(dma_addr));    
77771 +}
77772 +
77773 +static inline unsigned long
77774 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
77775 +{
77776 +       return (dma_addr & ~PAGE_MASK);
77777 +}
77778 +
77779 +static inline void
77780 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
77781 +{
77782 +}
77783 +
77784 +static inline void
77785 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
77786 +{
77787 +       flush_write_buffers();
77788 +}
77789 +
77790 +#ifdef CONFIG_PCI
77791 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
77792 +                                       enum pci_dma_burst_strategy *strat,
77793 +                                       unsigned long *strategy_parameter)
77794 +{
77795 +       *strat = PCI_DMA_BURST_INFINITY;
77796 +       *strategy_parameter = ~0UL;
77797 +}
77798 +#endif
77799 +
77800 +#define HAVE_PCI_MMAP
77801 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
77802 +                              enum pci_mmap_state mmap_state, int write_combine);
77803 +
77804 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
77805 +{
77806 +}
77807 +
77808 +#endif /* __KERNEL__ */
77809 +
77810 +/* generic pci stuff */
77811 +#ifdef CONFIG_PCI
77812 +#include <asm-generic/pci.h>
77813 +#endif
77814 +
77815 +/* On Xen we have to scan all functions since Xen hides bridges from
77816 + * us.  If a bridge is at fn=0 and that slot has a multifunction
77817 + * device, we won't find the additional devices without scanning all
77818 + * functions. */
77819 +#undef pcibios_scan_all_fns
77820 +#define pcibios_scan_all_fns(a, b)     1
77821 +
77822 +#endif /* __x8664_PCI_H */
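
The pcibios_scan_all_fns() override at the bottom of this header matters because the generic probe normally skips functions 1-7 of a slot whose function 0 does not report the multifunction bit, which is exactly the case when Xen hides the bridge at function 0. A simplified sketch of the affected decision (illustrative only, not the actual drivers/pci code; example_scan_slot() is a hypothetical helper):

/* Simplified illustration of what pcibios_scan_all_fns() short-circuits. */
static void example_scan_slot(struct pci_bus *bus, int devfn)
{
        struct pci_dev *fn0 = pci_scan_single_device(bus, devfn);
        int fn;

        for (fn = 1; fn < 8; fn++) {
                if (!pcibios_scan_all_fns(bus, devfn) &&
                    (!fn0 || !fn0->multifunction))
                        break;  /* native behaviour: nothing past function 0 */
                pci_scan_single_device(bus, devfn + fn);
        }
}
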
77823 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/pgalloc.h new/include/asm-x86_64/mach-xen/asm/pgalloc.h
77824 --- linux-2.6/include/asm-x86_64/mach-xen/asm/pgalloc.h 1970-01-01 01:00:00.000000000 +0100
77825 +++ new/include/asm-x86_64/mach-xen/asm/pgalloc.h       2006-05-09 12:35:41.000000000 +0200
77826 @@ -0,0 +1,226 @@
77827 +#ifndef _X86_64_PGALLOC_H
77828 +#define _X86_64_PGALLOC_H
77829 +
77830 +#include <asm/fixmap.h>
77831 +#include <asm/pda.h>
77832 +#include <linux/threads.h>
77833 +#include <linux/mm.h>
77834 +#include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
77835 +
77836 +#include <xen/features.h>
77837 +void make_page_readonly(void *va, unsigned int feature);
77838 +void make_page_writable(void *va, unsigned int feature);
77839 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
77840 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
77841 +
77842 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
77843 +
77844 +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
77845 +{
77846 +       set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
77847 +}
77848 +
77849 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
77850 +{
77851 +       if (unlikely((mm)->context.pinned)) {
77852 +               BUG_ON(HYPERVISOR_update_va_mapping(
77853 +                              (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
77854 +                              pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
77855 +               set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
77856 +       } else {
77857 +               *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
77858 +       }
77859 +}
77860 +
77861 +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
77862 +{
77863 +       if (unlikely((mm)->context.pinned)) {
77864 +               BUG_ON(HYPERVISOR_update_va_mapping(
77865 +                              (unsigned long)pmd,
77866 +                              pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, 
77867 +                                      PAGE_KERNEL_RO), 0));
77868 +               set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
77869 +       } else {
77870 +               *(pud) =  __pud(_PAGE_TABLE | __pa(pmd));
77871 +       }
77872 +}
77873 +
77874 +/*
77875 + * We need to use the batch mode here, but pgd_populate() won't
77876 + * be called frequently.
77877 + */
77878 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
77879 +{
77880 +       if (unlikely((mm)->context.pinned)) {
77881 +               BUG_ON(HYPERVISOR_update_va_mapping(
77882 +                              (unsigned long)pud,
77883 +                              pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, 
77884 +                                      PAGE_KERNEL_RO), 0));
77885 +               set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
77886 +               set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
77887 +       } else {
77888 +               *(pgd) =  __pgd(_PAGE_TABLE | __pa(pud));
77889 +               *(__user_pgd(pgd)) = *(pgd);
77890 +       }
77891 +}
77892 +
77893 +static inline void pmd_free(pmd_t *pmd)
77894 +{
77895 +       pte_t *ptep = virt_to_ptep(pmd);
77896 +
77897 +       if (!pte_write(*ptep)) {
77898 +               BUG_ON(HYPERVISOR_update_va_mapping(
77899 +                       (unsigned long)pmd,
77900 +                       pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, PAGE_KERNEL),
77901 +                       0));
77902 +       }
77903 +       free_page((unsigned long)pmd);
77904 +}
77905 +
77906 +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
77907 +{
77908 +        pmd_t *pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
77909 +        return pmd;
77910 +}
77911 +
77912 +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
77913 +{
77914 +        pud_t *pud = (pud_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
77915 +        return pud;
77916 +}
77917 +
77918 +static inline void pud_free(pud_t *pud)
77919 +{
77920 +       pte_t *ptep = virt_to_ptep(pud);
77921 +
77922 +       if (!pte_write(*ptep)) {
77923 +               BUG_ON(HYPERVISOR_update_va_mapping(
77924 +                       (unsigned long)pud,
77925 +                       pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, PAGE_KERNEL),
77926 +                       0));
77927 +       }
77928 +       free_page((unsigned long)pud);
77929 +}
77930 +
77931 +static inline void pgd_list_add(pgd_t *pgd)
77932 +{
77933 +       struct page *page = virt_to_page(pgd);
77934 +
77935 +       spin_lock(&pgd_lock);
77936 +       page->index = (pgoff_t)pgd_list;
77937 +       if (pgd_list)
77938 +               pgd_list->private = (unsigned long)&page->index;
77939 +       pgd_list = page;
77940 +       page->private = (unsigned long)&pgd_list;
77941 +       spin_unlock(&pgd_lock);
77942 +}
77943 +
77944 +static inline void pgd_list_del(pgd_t *pgd)
77945 +{
77946 +       struct page *next, **pprev, *page = virt_to_page(pgd);
77947 +
77948 +       spin_lock(&pgd_lock);
77949 +       next = (struct page *)page->index;
77950 +       pprev = (struct page **)page->private;
77951 +       *pprev = next;
77952 +       if (next)
77953 +               next->private = (unsigned long)pprev;
77954 +       spin_unlock(&pgd_lock);
77955 +}
77956 +
77957 +static inline pgd_t *pgd_alloc(struct mm_struct *mm)
77958 +{
77959 +        /*
77960 +         * We allocate two contiguous pages for kernel and user.
77961 +         */
77962 +        unsigned boundary;
77963 +       pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
77964 +
77965 +       if (!pgd)
77966 +               return NULL;
77967 +       pgd_list_add(pgd);
77968 +       /*
77969 +        * Copy kernel pointers in from init.
77970 +        * Could keep a freelist or slab cache of those because the kernel
77971 +        * part never changes.
77972 +        */
77973 +       boundary = pgd_index(__PAGE_OFFSET);
77974 +       memset(pgd, 0, boundary * sizeof(pgd_t));
77975 +       memcpy(pgd + boundary,
77976 +              init_level4_pgt + boundary,
77977 +              (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
77978 +
77979 +       memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
77980 +        /*
77981 +         * Set level3_user_pgt for vsyscall area
77982 +         */
77983 +       set_pgd(__user_pgd(pgd) + pgd_index(VSYSCALL_START), 
77984 +                mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
77985 +       return pgd;
77986 +}
77987 +
77988 +static inline void pgd_free(pgd_t *pgd)
77989 +{
77990 +       pte_t *ptep = virt_to_ptep(pgd);
77991 +
77992 +       if (!pte_write(*ptep)) {
77993 +               xen_pgd_unpin(__pa(pgd));
77994 +               BUG_ON(HYPERVISOR_update_va_mapping(
77995 +                              (unsigned long)pgd,
77996 +                              pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
77997 +                              0));
77998 +       }
77999 +
78000 +       ptep = virt_to_ptep(__user_pgd(pgd));
78001 +
78002 +       if (!pte_write(*ptep)) {
78003 +               xen_pgd_unpin(__pa(__user_pgd(pgd)));
78004 +               BUG_ON(HYPERVISOR_update_va_mapping(
78005 +                              (unsigned long)__user_pgd(pgd),
78006 +                              pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT, 
78007 +                                      PAGE_KERNEL),
78008 +                              0));
78009 +       }
78010 +
78011 +       pgd_list_del(pgd);
78012 +       free_pages((unsigned long)pgd, 1);
78013 +}
78014 +
78015 +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
78016 +{
78017 +        pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
78018 +        if (pte)
78019 +               make_page_readonly(pte, XENFEAT_writable_page_tables);
78020 +
78021 +       return pte;
78022 +}
78023 +
78024 +static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
78025 +{
78026 +       struct page *pte;
78027 +
78028 +       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
78029 +       return pte;
78030 +}
78031 +
78032 +/* Should really implement gc for free page table pages. This could be
78033 +   done with a reference count in struct page. */
78034 +
78035 +static inline void pte_free_kernel(pte_t *pte)
78036 +{
78037 +       BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
78038 +        make_page_writable(pte, XENFEAT_writable_page_tables);
78039 +       free_page((unsigned long)pte); 
78040 +}
78041 +
78042 +extern void pte_free(struct page *pte);
78043 +
78044 +//#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) 
78045 +//#define __pmd_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
78046 +//#define __pud_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
78047 +
78048 +#define __pte_free_tlb(tlb,x)   pte_free((x))
78049 +#define __pmd_free_tlb(tlb,x)   pmd_free((x))
78050 +#define __pud_free_tlb(tlb,x)   pud_free((x))
78051 +
78052 +#endif /* _X86_64_PGALLOC_H */
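
pgd_alloc() above depends on __user_pgd() landing exactly one page past the kernel pgd, which is why an order-1 (two-page) allocation is used. A small illustrative check of that layout (the helper example_pgd_layout_check() is hypothetical, not part of the patch):

/* Illustrative only: __user_pgd(pgd) points PTRS_PER_PGD entries past the
 * kernel pgd, i.e. exactly one page further (512 entries * 8 bytes).
 */
static void example_pgd_layout_check(pgd_t *pgd)
{
        pgd_t *user = __user_pgd(pgd);

        BUG_ON(PTRS_PER_PGD * sizeof(pgd_t) != PAGE_SIZE);
        BUG_ON((char *)user - (char *)pgd != PAGE_SIZE);
}
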
78053 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/pgtable.h new/include/asm-x86_64/mach-xen/asm/pgtable.h
78054 --- linux-2.6/include/asm-x86_64/mach-xen/asm/pgtable.h 1970-01-01 01:00:00.000000000 +0100
78055 +++ new/include/asm-x86_64/mach-xen/asm/pgtable.h       2006-05-09 12:35:42.000000000 +0200
78056 @@ -0,0 +1,564 @@
78057 +#ifndef _X86_64_PGTABLE_H
78058 +#define _X86_64_PGTABLE_H
78059 +
78060 +/*
78061 + * This file contains the functions and defines necessary to modify and use
78062 + * the x86-64 page table tree.
78063 + */
78064 +#include <asm/processor.h>
78065 +#include <asm/fixmap.h>
78066 +#include <asm/bitops.h>
78067 +#include <linux/threads.h>
78068 +#include <linux/sched.h>
78069 +#include <asm/pda.h>
78070 +#ifdef CONFIG_XEN
78071 +#include <asm/hypervisor.h>
78072 +
78073 +extern pud_t level3_user_pgt[512];
78074 +extern pud_t init_level4_user_pgt[];
78075 +
78076 +extern void xen_init_pt(void);
78077 +
78078 +#define virt_to_ptep(__va)                                             \
78079 +({                                                                     \
78080 +       pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));             \
78081 +       pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));        \
78082 +       pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));        \
78083 +       pte_offset_kernel(__pmd, (unsigned long)(__va));                \
78084 +})
78085 +
78086 +#define arbitrary_virt_to_machine(__va)                                        \
78087 +({                                                                     \
78088 +       maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
78089 +       m | ((unsigned long)(__va) & (PAGE_SIZE-1));                    \
78090 +})
78091 +#endif
78092 +
78093 +extern pud_t level3_kernel_pgt[512];
78094 +extern pud_t level3_physmem_pgt[512];
78095 +extern pud_t level3_ident_pgt[512];
78096 +extern pmd_t level2_kernel_pgt[512];
78097 +extern pgd_t init_level4_pgt[];
78098 +extern pgd_t boot_level4_pgt[];
78099 +extern unsigned long __supported_pte_mask;
78100 +
78101 +#define swapper_pg_dir init_level4_pgt
78102 +
78103 +extern int nonx_setup(char *str);
78104 +extern void paging_init(void);
78105 +extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
78106 +
78107 +extern unsigned long pgkern_mask;
78108 +
78109 +/*
78110 + * ZERO_PAGE is a global shared page that is always zero: used
78111 + * for zero-mapped memory areas etc.
78112 + */
78113 +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
78114 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
78115 +
78116 +/*
78117 + * PGDIR_SHIFT determines what a top-level page table entry can map
78118 + */
78119 +#define PGDIR_SHIFT    39
78120 +#define PTRS_PER_PGD   512
78121 +
78122 +/*
78123 + * 3rd level page
78124 + */
78125 +#define PUD_SHIFT      30
78126 +#define PTRS_PER_PUD   512
78127 +
78128 +/*
78129 + * PMD_SHIFT determines the size of the area a middle-level
78130 + * page table can map
78131 + */
78132 +#define PMD_SHIFT      21
78133 +#define PTRS_PER_PMD   512
78134 +
78135 +/*
78136 + * entries per page directory level
78137 + */
78138 +#define PTRS_PER_PTE   512
78139 +
78140 +#define pte_ERROR(e) \
78141 +       printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), pte_val(e))
78142 +#define pmd_ERROR(e) \
78143 +       printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
78144 +#define pud_ERROR(e) \
78145 +       printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), pud_val(e))
78146 +#define pgd_ERROR(e) \
78147 +       printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
78148 +
78149 +#define pgd_none(x)    (!pgd_val(x))
78150 +#define pud_none(x)    (!pud_val(x))
78151 +
78152 +#define set_pte_batched(pteptr, pteval) \
78153 +       queue_l1_entry_update(pteptr, (pteval))
78154 +
78155 +extern inline int pud_present(pud_t pud)       { return !pud_none(pud); }
78156 +
78157 +static inline void set_pte(pte_t *dst, pte_t val)
78158 +{
78159 +       *dst = val;
78160 +}
78161 +
78162 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
78163 +#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
78164 +#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
78165 +
78166 +static inline void pud_clear (pud_t * pud)
78167 +{
78168 +       set_pud(pud, __pud(0));
78169 +}
78170 +
78171 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
78172 +
78173 +static inline void pgd_clear (pgd_t * pgd)
78174 +{
78175 +        set_pgd(pgd, __pgd(0));
78176 +        set_pgd(__user_pgd(pgd), __pgd(0));
78177 +}
78178 +
78179 +#define pud_page(pud) \
78180 +    ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
78181 +
78182 +/*
78183 + * A note on implementation of this atomic 'get-and-clear' operation.
78184 + * This is actually very simple because Xen Linux can only run on a single
78185 + * processor. Therefore, we cannot race other processors setting the 'accessed'
78186 + * or 'dirty' bits on a page-table entry.
78187 + * Even if pages are shared between domains, that is not a problem because
78188 + * each domain will have separate page tables, with their own versions of
78189 + * accessed & dirty state.
78190 + */
78191 +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte, 0))
78192 +
78193 +#if 0
78194 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
78195 +{
78196 +        pte_t pte = *xp;
78197 +        if (pte.pte)
78198 +                set_pte(xp, __pte_ma(0));
78199 +        return pte;
78200 +}
78201 +#endif
78202 +
78203 +struct mm_struct;
78204 +
78205 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
78206 +{
78207 +       pte_t pte;
78208 +       if (full) {
78209 +               pte = *ptep;
78210 +               *ptep = __pte(0);
78211 +       } else {
78212 +               pte = ptep_get_and_clear(mm, addr, ptep);
78213 +       }
78214 +       return pte;
78215 +}
78216 +
78217 +#define pte_same(a, b)         ((a).pte == (b).pte)
78218 +
78219 +#define pte_pgprot(a)  (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
78220 +
78221 +#define PMD_SIZE       (1UL << PMD_SHIFT)
78222 +#define PMD_MASK       (~(PMD_SIZE-1))
78223 +#define PUD_SIZE       (1UL << PUD_SHIFT)
78224 +#define PUD_MASK       (~(PUD_SIZE-1))
78225 +#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
78226 +#define PGDIR_MASK     (~(PGDIR_SIZE-1))
78227 +
78228 +#define USER_PTRS_PER_PGD      ((TASK_SIZE-1)/PGDIR_SIZE+1)
78229 +#define FIRST_USER_ADDRESS     0
78230 +
78231 +#ifndef __ASSEMBLY__
78232 +#define MAXMEM          0x3fffffffffffUL
78233 +#define VMALLOC_START    0xffffc20000000000UL
78234 +#define VMALLOC_END      0xffffe1ffffffffffUL
78235 +#define MODULES_VADDR    0xffffffff88000000UL
78236 +#define MODULES_END      0xfffffffffff00000UL
78237 +#define MODULES_LEN   (MODULES_END - MODULES_VADDR)
78238 +
78239 +#define _PAGE_BIT_PRESENT      0
78240 +#define _PAGE_BIT_RW           1
78241 +#define _PAGE_BIT_USER         2
78242 +#define _PAGE_BIT_PWT          3
78243 +#define _PAGE_BIT_PCD          4
78244 +#define _PAGE_BIT_ACCESSED     5
78245 +#define _PAGE_BIT_DIRTY                6
78246 +#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page */
78247 +#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
78248 +#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
78249 +
78250 +#define _PAGE_PRESENT  0x001
78251 +#define _PAGE_RW       0x002
78252 +#define _PAGE_USER     0x004
78253 +#define _PAGE_PWT      0x008
78254 +#define _PAGE_PCD      0x010
78255 +#define _PAGE_ACCESSED 0x020
78256 +#define _PAGE_DIRTY    0x040
78257 +#define _PAGE_PSE      0x080   /* 2MB page */
78258 +#define _PAGE_FILE     0x040   /* nonlinear file mapping, saved PTE; unset:swap */
78259 +#define _PAGE_GLOBAL   0x100   /* Global TLB entry */
78260 +
78261 +#define _PAGE_PROTNONE 0x080   /* If not present */
78262 +#define _PAGE_NX        (1UL<<_PAGE_BIT_NX)
78263 +
78264 +#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
78265 +#define _KERNPG_TABLE  _PAGE_TABLE
78266 +
78267 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
78268 +
78269 +#define PAGE_NONE      __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
78270 +#define PAGE_SHARED    __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
78271 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
78272 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
78273 +#define PAGE_COPY PAGE_COPY_NOEXEC
78274 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
78275 +#define PAGE_READONLY  __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
78276 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
78277 +#define __PAGE_KERNEL \
78278 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER )
78279 +#define __PAGE_KERNEL_EXEC \
78280 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER )
78281 +#define __PAGE_KERNEL_NOCACHE \
78282 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER )
78283 +#define __PAGE_KERNEL_RO \
78284 +       (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER )
78285 +#define __PAGE_KERNEL_VSYSCALL \
78286 +       (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_USER )
78287 +#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
78288 +       (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD | _PAGE_USER )
78289 +#define __PAGE_KERNEL_LARGE \
78290 +       (__PAGE_KERNEL | _PAGE_PSE | _PAGE_USER )
78291 +#define __PAGE_KERNEL_LARGE_EXEC \
78292 +       (__PAGE_KERNEL_EXEC | _PAGE_PSE | _PAGE_USER )
78293 +
78294 +
78295 +/*
78296 + * We don't support GLOBAL pages in xenolinux64
78297 + */
78298 +#define MAKE_GLOBAL(x) __pgprot((x))
78299 +
78300 +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
78301 +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
78302 +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
78303 +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
78304 +#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
78305 +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
78306 +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
78307 +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
78308 +
78309 +/*         xwr */
78310 +#define __P000 PAGE_NONE
78311 +#define __P001 PAGE_READONLY
78312 +#define __P010 PAGE_COPY
78313 +#define __P011 PAGE_COPY
78314 +#define __P100 PAGE_READONLY_EXEC
78315 +#define __P101 PAGE_READONLY_EXEC
78316 +#define __P110 PAGE_COPY_EXEC
78317 +#define __P111 PAGE_COPY_EXEC
78318 +
78319 +#define __S000 PAGE_NONE
78320 +#define __S001 PAGE_READONLY
78321 +#define __S010 PAGE_SHARED
78322 +#define __S011 PAGE_SHARED
78323 +#define __S100 PAGE_READONLY_EXEC
78324 +#define __S101 PAGE_READONLY_EXEC
78325 +#define __S110 PAGE_SHARED_EXEC
78326 +#define __S111 PAGE_SHARED_EXEC
78327 +
78328 +static inline unsigned long pgd_bad(pgd_t pgd)
78329 +{
78330 +       unsigned long val = pgd_val(pgd);
78331 +       val &= ~PTE_MASK;
78332 +       val &= ~(_PAGE_USER | _PAGE_DIRTY);
78333 +       return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
78334 +}
78335 +
78336 +static inline unsigned long pud_bad(pud_t pud) 
78337 +{ 
78338 +       unsigned long val = pud_val(pud);
78339 +       val &= ~PTE_MASK; 
78340 +       val &= ~(_PAGE_USER | _PAGE_DIRTY); 
78341 +       return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);      
78342 +} 
78343 +
78344 +#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
78345 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
78346 +           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
78347 +               set_pte((ptep), (pteval));                              \
78348 +} while (0)
78349 +
78350 +#define pte_none(x)    (!(x).pte)
78351 +#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
78352 +#define pte_clear(mm,addr,xp)  do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
78353 +
78354 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
78355 +
78356 +#define pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
78357 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
78358 +
78359 +#define pte_page(x)    pfn_to_page(pte_pfn(x))
78360 +
78361 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
78362 +{
78363 +       pte_t pte;
78364 +        
78365 +       (pte).pte = (pfn_to_mfn(page_nr) << PAGE_SHIFT);
78366 +       (pte).pte |= pgprot_val(pgprot);
78367 +       (pte).pte &= __supported_pte_mask;
78368 +       return pte;
78369 +}
78370 +
78371 +#define pfn_pte_ma(pfn, prot)  __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
78372 +/*
78373 + * The following only work if pte_present() is true.
78374 + * Undefined behaviour if not..
78375 + */
78376 +#define __pte_val(x)   ((x).pte)
78377 +
78378 +#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
78379 +static inline int pte_user(pte_t pte)          { return __pte_val(pte) & _PAGE_USER; }
78380 +static inline int pte_read(pte_t pte)          { return __pte_val(pte) & _PAGE_USER; }
78381 +static inline int pte_exec(pte_t pte)          { return __pte_val(pte) & _PAGE_USER; }
78382 +static inline int pte_dirty(pte_t pte)         { return __pte_val(pte) & _PAGE_DIRTY; }
78383 +static inline int pte_young(pte_t pte)         { return __pte_val(pte) & _PAGE_ACCESSED; }
78384 +static inline int pte_write(pte_t pte)         { return __pte_val(pte) & _PAGE_RW; }
78385 +static inline int pte_file(pte_t pte)          { return __pte_val(pte) & _PAGE_FILE; }
78386 +static inline int pte_huge(pte_t pte)          { return __pte_val(pte) & _PAGE_PSE; }
78387 +
78388 +static inline pte_t pte_rdprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_USER; return pte; }
78389 +static inline pte_t pte_exprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_USER; return pte; }
78390 +static inline pte_t pte_mkclean(pte_t pte)     { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
78391 +static inline pte_t pte_mkold(pte_t pte)       { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
78392 +static inline pte_t pte_wrprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_RW; return pte; }
78393 +static inline pte_t pte_mkread(pte_t pte)      { __pte_val(pte) |= _PAGE_USER; return pte; }
78394 +static inline pte_t pte_mkexec(pte_t pte)      { __pte_val(pte) |= _PAGE_USER; return pte; }
78395 +static inline pte_t pte_mkdirty(pte_t pte)     { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
78396 +static inline pte_t pte_mkyoung(pte_t pte)     { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
78397 +static inline pte_t pte_mkwrite(pte_t pte)     { __pte_val(pte) |= _PAGE_RW; return pte; }
78398 +static inline pte_t pte_mkhuge(pte_t pte)      { __pte_val(pte) |= _PAGE_PSE; return pte; }
78399 +
78400 +struct vm_area_struct;
78401 +
78402 +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
78403 +{
78404 +       pte_t pte = *ptep;
78405 +       int ret = pte_dirty(pte);
78406 +       if (ret)
78407 +               set_pte(ptep, pte_mkclean(pte));
78408 +       return ret;
78409 +}
78410 +
78411 +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
78412 +{
78413 +       pte_t pte = *ptep;
78414 +       int ret = pte_young(pte);
78415 +       if (ret)
78416 +               set_pte(ptep, pte_mkold(pte));
78417 +       return ret;
78418 +}
78419 +
78420 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
78421 +{
78422 +       pte_t pte = *ptep;
78423 +       if (pte_write(pte))
78424 +               set_pte(ptep, pte_wrprotect(pte));
78425 +}
78426 +
78427 +/*
78428 + * Macro to mark a page protection value as "uncacheable".
78429 + */
78430 +#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
78431 +
78432 +static inline int pmd_large(pmd_t pte) { 
78433 +       return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; 
78434 +}      
78435 +
78436 +
78437 +/*
78438 + * Conversion functions: convert a page and protection to a page entry,
78439 + * and a page entry and page directory to the page they refer to.
78440 + */
78441 +
78442 +/*
78443 + * Level 4 access.
78444 + * Never use these in the common code.
78445 + */
78446 +#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
78447 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
78448 +#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
78449 +#define pgd_offset_k(address) (pgd_t *)(init_level4_pgt + pgd_index(address))
78450 +#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
78451 +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
78452 +
78453 +/* PUD - Level3 access */
78454 +/* to find an entry in a page-table-directory. */
78455 +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
78456 +#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
78457 +static inline pud_t *__pud_offset_k(pud_t *pud, unsigned long address)
78458 +{ 
78459 +       return pud + pud_index(address);
78460 +} 
78461 +
78462 +/* Find correct pud via the hidden fourth level page level: */
78463 +
78464 +/* This accesses the reference page table of the boot cpu. 
78465 +   Other CPUs get synced lazily via the page fault handler. */
78466 +static inline pud_t *pud_offset_k(pgd_t *pgd, unsigned long address)
78467 +{
78468 +       return pud_offset(pgd_offset_k(address), address);
78469 +}
78470 +
78471 +/* PMD  - Level 2 access */
78472 +#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
78473 +#define pmd_page(pmd)          (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
78474 +
78475 +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
78476 +#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
78477 +                                  pmd_index(address))
78478 +#define pmd_none(x)    (!pmd_val(x))
78479 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
78480 +   can temporarily clear it. */
78481 +#define pmd_present(x) (pmd_val(x))
78482 +#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
78483 +#define        pmd_bad(x)      ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
78484 +#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
78485 +#define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
78486 +
78487 +#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
78488 +#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
78489 +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
78490 +
78491 +/* PTE - Level 1 access. */
78492 +
78493 +/* page, protection -> pte */
78494 +#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
78495 +#define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
78496 +
78497 +/* physical address -> PTE */
78498 +static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
78499 +{ 
78500 +       pte_t pte;
78501 +       (pte).pte = physpage | pgprot_val(pgprot); 
78502 +       return pte; 
78503 +}
78504 +
78505 +/* Change flags of a PTE */
78506 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
78507 +{ 
78508 +        (pte).pte &= _PAGE_CHG_MASK;
78509 +       (pte).pte |= pgprot_val(newprot);
78510 +       (pte).pte &= __supported_pte_mask;
78511 +       return pte; 
78512 +}
78513 +
78514 +#define pte_index(address) \
78515 +               (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
78516 +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
78517 +                       pte_index(address))
78518 +
78519 +/* x86-64 always has all page tables mapped. */
78520 +#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
78521 +#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
78522 +#define pte_unmap(pte) /* NOP */
78523 +#define pte_unmap_nested(pte) /* NOP */ 
78524 +
78525 +#define update_mmu_cache(vma,address,pte) do { } while (0)
78526 +
78527 +/* We only update the dirty/accessed state if we set
78528 + * the dirty bit by hand in the kernel, since the hardware
78529 + * will do the accessed bit for us, and we don't want to
78530 + * race with other CPUs that might be updating the dirty
78531 + * bit at the same time. */
78532 +#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
78533 +#if 0
78534 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
78535 +       do {                                                              \
78536 +               if (__dirty) {                                            \
78537 +                       set_pte(__ptep, __entry);                         \
78538 +                       flush_tlb_page(__vma, __address);                 \
78539 +               }                                                         \
78540 +       } while (0)
78541 +#endif
78542 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
78543 +       do {                                                              \
78544 +               if (__dirty) {                                            \
78545 +                       if ( likely((__vma)->vm_mm == current->mm) ) {    \
78546 +                           BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \
78547 +                       } else {                                          \
78548 +                            xen_l1_entry_update((__ptep), (__entry)); \
78549 +                           flush_tlb_page((__vma), (__address));         \
78550 +                       }                                                 \
78551 +               }                                                         \
78552 +       } while (0)
78553 +
78554 +/* Encode and de-code a swap entry */
78555 +#define __swp_type(x)                  (((x).val >> 1) & 0x3f)
78556 +#define __swp_offset(x)                        ((x).val >> 8)
78557 +#define __swp_entry(type, offset)      ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
78558 +#define __pte_to_swp_entry(pte)                ((swp_entry_t) { pte_val(pte) })
78559 +#define __swp_entry_to_pte(x)          ((pte_t) { (x).val })
78560 +
78561 +extern spinlock_t pgd_lock;
78562 +extern struct page *pgd_list;
78563 +void vmalloc_sync_all(void);
78564 +
78565 +#endif /* !__ASSEMBLY__ */
78566 +
78567 +extern int kern_addr_valid(unsigned long addr); 
78568 +
78569 +#define DOMID_LOCAL (0xFFFFU)
78570 +
78571 +int direct_remap_pfn_range(struct vm_area_struct *vma,
78572 +                            unsigned long address,
78573 +                            unsigned long mfn,
78574 +                            unsigned long size,
78575 +                            pgprot_t prot,
78576 +                            domid_t  domid);
78577 +
78578 +int direct_kernel_remap_pfn_range(unsigned long address, 
78579 +                                 unsigned long mfn,
78580 +                                 unsigned long size, 
78581 +                                 pgprot_t prot,
78582 +                                 domid_t  domid);
78583 +
78584 +int create_lookup_pte_addr(struct mm_struct *mm,
78585 +                           unsigned long address,
78586 +                           uint64_t *ptep);
78587 +
78588 +int touch_pte_range(struct mm_struct *mm,
78589 +                    unsigned long address,
78590 +                    unsigned long size);
78591 +
78592 +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)                \
78593 +               direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
78594 +
78595 +#define MK_IOSPACE_PFN(space, pfn)     (pfn)
78596 +#define GET_IOSPACE(pfn)               0
78597 +#define GET_PFN(pfn)                   (pfn)
78598 +
78599 +#define HAVE_ARCH_UNMAPPED_AREA
78600 +
78601 +#define pgtable_cache_init()   do { } while (0)
78602 +#define check_pgt_cache()      do { } while (0)
78603 +
78604 +#define PAGE_AGP    PAGE_KERNEL_NOCACHE
78605 +#define HAVE_PAGE_AGP 1
78606 +
78607 +/* fs/proc/kcore.c */
78608 +#define        kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
78609 +#define        kc_offset_to_vaddr(o) \
78610 +   (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
78611 +
78612 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
78613 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
78614 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
78615 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
78616 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
78617 +#define __HAVE_ARCH_PTE_SAME
78618 +#include <asm-generic/pgtable.h>
78619 +
78620 +#endif /* _X86_64_PGTABLE_H */
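
The *_SHIFT constants and index macros above split a 48-bit virtual address into four 9-bit table indices, which is exactly the walk virt_to_ptep() performs. A short worked sketch (illustrative only; the helper example_decompose() is hypothetical):

/* Illustrative only: for 0xffffffff80200000 (KERNEL_TEXT_START + 2MB)
 * this prints pgd 511, pud 510, pmd 1, pte 0.
 */
static void example_decompose(unsigned long address)
{
        unsigned long pgd = pgd_index(address);   /* bits 39..47 */
        unsigned long pud = pud_index(address);   /* bits 30..38 */
        unsigned long pmd = pmd_index(address);   /* bits 21..29 */
        unsigned long pte = pte_index(address);   /* bits 12..20 */

        printk(KERN_DEBUG "%lx -> pgd %lu pud %lu pmd %lu pte %lu\n",
               address, pgd, pud, pmd, pte);
}
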
78621 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/processor.h new/include/asm-x86_64/mach-xen/asm/processor.h
78622 --- linux-2.6/include/asm-x86_64/mach-xen/asm/processor.h       1970-01-01 01:00:00.000000000 +0100
78623 +++ new/include/asm-x86_64/mach-xen/asm/processor.h     2006-05-09 12:35:42.000000000 +0200
78624 @@ -0,0 +1,494 @@
78625 +/*
78626 + * include/asm-x86_64/processor.h
78627 + *
78628 + * Copyright (C) 1994 Linus Torvalds
78629 + */
78630 +
78631 +#ifndef __ASM_X86_64_PROCESSOR_H
78632 +#define __ASM_X86_64_PROCESSOR_H
78633 +
78634 +#include <asm/segment.h>
78635 +#include <asm/page.h>
78636 +#include <asm/types.h>
78637 +#include <asm/sigcontext.h>
78638 +#include <asm/cpufeature.h>
78639 +#include <linux/config.h>
78640 +#include <linux/threads.h>
78641 +#include <asm/msr.h>
78642 +#include <asm/current.h>
78643 +#include <asm/system.h>
78644 +#include <asm/mmsegment.h>
78645 +#include <asm/percpu.h>
78646 +#include <linux/personality.h>
78647 +#include <linux/cpumask.h>
78648 +
78649 +#define TF_MASK                0x00000100
78650 +#define IF_MASK                0x00000200
78651 +#define IOPL_MASK      0x00003000
78652 +#define NT_MASK                0x00004000
78653 +#define VM_MASK                0x00020000
78654 +#define AC_MASK                0x00040000
78655 +#define VIF_MASK       0x00080000      /* virtual interrupt flag */
78656 +#define VIP_MASK       0x00100000      /* virtual interrupt pending */
78657 +#define ID_MASK                0x00200000
78658 +
78659 +#define desc_empty(desc) \
78660 +               (!((desc)->a | (desc)->b))
78661 +
78662 +#define desc_equal(desc1, desc2) \
78663 +               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
78664 +
78665 +/*
78666 + * Default implementation of macro that returns current
78667 + * instruction pointer ("program counter").
78668 + */
78669 +#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
78670 +
78671 +/*
78672 + *  CPU type and hardware bug flags. Kept separately for each CPU.
78673 + */
78674 +
78675 +struct cpuinfo_x86 {
78676 +       __u8    x86;            /* CPU family */
78677 +       __u8    x86_vendor;     /* CPU vendor */
78678 +       __u8    x86_model;
78679 +       __u8    x86_mask;
78680 +       int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
78681 +       __u32   x86_capability[NCAPINTS];
78682 +       char    x86_vendor_id[16];
78683 +       char    x86_model_id[64];
78684 +       int     x86_cache_size;  /* in KB */
78685 +       int     x86_clflush_size;
78686 +       int     x86_cache_alignment;
78687 +       int     x86_tlbsize;    /* number of 4K pages in DTLB/ITLB combined(in pages)*/
78688 +        __u8    x86_virt_bits, x86_phys_bits;
78689 +       __u8    x86_max_cores;  /* cpuid returned max cores value */
78690 +        __u32   x86_power;     
78691 +       __u32   extended_cpuid_level;   /* Max extended CPUID function supported */
78692 +       unsigned long loops_per_jiffy;
78693 +#ifdef CONFIG_SMP
78694 +       cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
78695 +#endif
78696 +       __u8    apicid;
78697 +       __u8    booted_cores;   /* number of cores as seen by OS */
78698 +} ____cacheline_aligned;
78699 +
78700 +#define X86_VENDOR_INTEL 0
78701 +#define X86_VENDOR_CYRIX 1
78702 +#define X86_VENDOR_AMD 2
78703 +#define X86_VENDOR_UMC 3
78704 +#define X86_VENDOR_NEXGEN 4
78705 +#define X86_VENDOR_CENTAUR 5
78706 +#define X86_VENDOR_RISE 6
78707 +#define X86_VENDOR_TRANSMETA 7
78708 +#define X86_VENDOR_NUM 8
78709 +#define X86_VENDOR_UNKNOWN 0xff
78710 +
78711 +#ifdef CONFIG_SMP
78712 +extern struct cpuinfo_x86 cpu_data[];
78713 +#define current_cpu_data cpu_data[smp_processor_id()]
78714 +#else
78715 +#define cpu_data (&boot_cpu_data)
78716 +#define current_cpu_data boot_cpu_data
78717 +#endif
78718 +
78719 +extern char ignore_irq13;
78720 +
78721 +extern void identify_cpu(struct cpuinfo_x86 *);
78722 +extern void print_cpu_info(struct cpuinfo_x86 *);
78723 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
78724 +
78725 +/*
78726 + * EFLAGS bits
78727 + */
78728 +#define X86_EFLAGS_CF  0x00000001 /* Carry Flag */
78729 +#define X86_EFLAGS_PF  0x00000004 /* Parity Flag */
78730 +#define X86_EFLAGS_AF  0x00000010 /* Auxiliary carry Flag */
78731 +#define X86_EFLAGS_ZF  0x00000040 /* Zero Flag */
78732 +#define X86_EFLAGS_SF  0x00000080 /* Sign Flag */
78733 +#define X86_EFLAGS_TF  0x00000100 /* Trap Flag */
78734 +#define X86_EFLAGS_IF  0x00000200 /* Interrupt Flag */
78735 +#define X86_EFLAGS_DF  0x00000400 /* Direction Flag */
78736 +#define X86_EFLAGS_OF  0x00000800 /* Overflow Flag */
78737 +#define X86_EFLAGS_IOPL        0x00003000 /* IOPL mask */
78738 +#define X86_EFLAGS_NT  0x00004000 /* Nested Task */
78739 +#define X86_EFLAGS_RF  0x00010000 /* Resume Flag */
78740 +#define X86_EFLAGS_VM  0x00020000 /* Virtual Mode */
78741 +#define X86_EFLAGS_AC  0x00040000 /* Alignment Check */
78742 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
78743 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
78744 +#define X86_EFLAGS_ID  0x00200000 /* CPUID detection flag */
78745 +
78746 +/*
78747 + * Intel CPU features in CR4
78748 + */
78749 +#define X86_CR4_VME            0x0001  /* enable vm86 extensions */
78750 +#define X86_CR4_PVI            0x0002  /* virtual interrupts flag enable */
78751 +#define X86_CR4_TSD            0x0004  /* disable time stamp at ipl 3 */
78752 +#define X86_CR4_DE             0x0008  /* enable debugging extensions */
78753 +#define X86_CR4_PSE            0x0010  /* enable page size extensions */
78754 +#define X86_CR4_PAE            0x0020  /* enable physical address extensions */
78755 +#define X86_CR4_MCE            0x0040  /* Machine check enable */
78756 +#define X86_CR4_PGE            0x0080  /* enable global pages */
78757 +#define X86_CR4_PCE            0x0100  /* enable performance counters at ipl 3 */
78758 +#define X86_CR4_OSFXSR         0x0200  /* enable fast FPU save and restore */
78759 +#define X86_CR4_OSXMMEXCPT     0x0400  /* enable unmasked SSE exceptions */
78760 +
78761 +/*
78762 + * Save the cr4 feature set we're using (i.e.
78763 + * Pentium 4MB enable and PPro Global page
78764 + * enable), so that any CPUs that boot up
78765 + * after us can get the correct flags.
78766 + */
78767 +extern unsigned long mmu_cr4_features;
78768 +
78769 +static inline void set_in_cr4 (unsigned long mask)
78770 +{
78771 +       mmu_cr4_features |= mask;
78772 +       __asm__("movq %%cr4,%%rax\n\t"
78773 +               "orq %0,%%rax\n\t"
78774 +               "movq %%rax,%%cr4\n"
78775 +               : : "irg" (mask)
78776 +               :"ax");
78777 +}
78778 +
78779 +static inline void clear_in_cr4 (unsigned long mask)
78780 +{
78781 +       mmu_cr4_features &= ~mask;
78782 +       __asm__("movq %%cr4,%%rax\n\t"
78783 +               "andq %0,%%rax\n\t"
78784 +               "movq %%rax,%%cr4\n"
78785 +               : : "irg" (~mask)
78786 +               :"ax");
78787 +}
78788 +
78789 +
78790 +/*
78791 + * Bus types
78792 + */
78793 +#define MCA_bus 0
78794 +#define MCA_bus__is_a_macro
78795 +
78796 +/*
78797 + * User space process size. 47bits minus one guard page.
78798 + */
78799 +#define TASK_SIZE64    (0x800000000000UL - 4096)
78800 +
78801 +/* This decides where the kernel will search for a free chunk of vm
78802 + * space during mmap's.
78803 + */
78804 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
78805 +
78806 +#define TASK_SIZE              (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
78807 +#define TASK_SIZE_OF(child)    ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
78808 +
78809 +#define TASK_UNMAPPED_BASE     PAGE_ALIGN(TASK_SIZE/3)
78810 +
78811 +/*
78812 + * Size of io_bitmap.
78813 + */
78814 +#define IO_BITMAP_BITS  65536
78815 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
78816 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
78817 +#ifndef CONFIG_X86_NO_TSS
78818 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
78819 +#endif
78820 +#define INVALID_IO_BITMAP_OFFSET 0x8000
78821 +
78822 +struct i387_fxsave_struct {
78823 +       u16     cwd;
78824 +       u16     swd;
78825 +       u16     twd;
78826 +       u16     fop;
78827 +       u64     rip;
78828 +       u64     rdp; 
78829 +       u32     mxcsr;
78830 +       u32     mxcsr_mask;
78831 +       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
78832 +       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 128 bytes */
78833 +       u32     padding[24];
78834 +} __attribute__ ((aligned (16)));
78835 +
78836 +union i387_union {
78837 +       struct i387_fxsave_struct       fxsave;
78838 +};
78839 +
78840 +#ifndef CONFIG_X86_NO_TSS
78841 +struct tss_struct {
78842 +       u32 reserved1;
78843 +       u64 rsp0;       
78844 +       u64 rsp1;
78845 +       u64 rsp2;
78846 +       u64 reserved2;
78847 +       u64 ist[7];
78848 +       u32 reserved3;
78849 +       u32 reserved4;
78850 +       u16 reserved5;
78851 +       u16 io_bitmap_base;
78852 +       /*
78853 +        * The extra 1 is there because the CPU will access an
78854 +        * additional byte beyond the end of the IO permission
78855 +        * bitmap. The extra byte must be all 1 bits, and must
78856 +        * be within the limit. Thus we have:
78857 +        *
78858 +        * 128 bytes, the bitmap itself, for ports 0..0x3ff
78859 +        * 8 bytes, for an extra "long" of ~0UL
78860 +        */
78861 +       unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
78862 +} __attribute__((packed)) ____cacheline_aligned;
78863 +
78864 +DECLARE_PER_CPU(struct tss_struct,init_tss);
78865 +#endif
78866 +
78867 +extern struct cpuinfo_x86 boot_cpu_data;
78868 +
78869 +#ifdef CONFIG_X86_VSMP
78870 +#define ARCH_MIN_TASKALIGN     (1 << INTERNODE_CACHE_SHIFT)
78871 +#define ARCH_MIN_MMSTRUCT_ALIGN        (1 << INTERNODE_CACHE_SHIFT)
78872 +#else
78873 +#define ARCH_MIN_TASKALIGN     16
78874 +#define ARCH_MIN_MMSTRUCT_ALIGN        0
78875 +#endif
78876 +
78877 +struct thread_struct {
78878 +       unsigned long   rsp0;
78879 +       unsigned long   rsp;
78880 +       unsigned long   userrsp;        /* Copy from PDA */ 
78881 +       unsigned long   fs;
78882 +       unsigned long   gs;
78883 +       unsigned short  es, ds, fsindex, gsindex;       
78884 +/* Hardware debugging registers */
78885 +       unsigned long   debugreg0;  
78886 +       unsigned long   debugreg1;  
78887 +       unsigned long   debugreg2;  
78888 +       unsigned long   debugreg3;  
78889 +       unsigned long   debugreg6;  
78890 +       unsigned long   debugreg7;  
78891 +/* fault info */
78892 +       unsigned long   cr2, trap_no, error_code;
78893 +/* floating point info */
78894 +       union i387_union        i387  __attribute__((aligned(16)));
78895 +/* IO permissions. The bitmap could be moved into the GDT; that would make
78896 +   switching faster for a limited number of ioperm-using tasks. -AK */
78897 +       int             ioperm;
78898 +       unsigned long   *io_bitmap_ptr;
78899 +       unsigned io_bitmap_max;
78900 +/* cached TLS descriptors. */
78901 +       u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
78902 +       unsigned int    iopl;
78903 +} __attribute__((aligned(16)));
78904 +
78905 +#define INIT_THREAD  { \
78906 +       .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
78907 +}
78908 +
78909 +#ifndef CONFIG_X86_NO_TSS
78910 +#define INIT_TSS  { \
78911 +       .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
78912 +}
78913 +#endif
78914 +
78915 +#define INIT_MMAP \
78916 +{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
78917 +
78918 +#define start_thread(regs,new_rip,new_rsp) do { \
78919 +       asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0));      \
78920 +       load_gs_index(0);                                                       \
78921 +       (regs)->rip = (new_rip);                                                 \
78922 +       (regs)->rsp = (new_rsp);                                                 \
78923 +       write_pda(oldrsp, (new_rsp));                                            \
78924 +       (regs)->cs = __USER_CS;                                                  \
78925 +       (regs)->ss = __USER_DS;                                                  \
78926 +       (regs)->eflags = 0x200;                                                  \
78927 +       set_fs(USER_DS);                                                         \
78928 +} while(0) 
78929 +
78930 +#define get_debugreg(var, register)                            \
78931 +       var = HYPERVISOR_get_debugreg(register)
78932 +#define set_debugreg(value, register)                  \
78933 +       HYPERVISOR_set_debugreg(register, value)
78934 +
78935 +struct task_struct;
78936 +struct mm_struct;
78937 +
78938 +/* Free all resources held by a thread. */
78939 +extern void release_thread(struct task_struct *);
78940 +
78941 +/* Prepare to copy thread state - unlazy all lazy status */
78942 +extern void prepare_to_copy(struct task_struct *tsk);
78943 +
78944 +/*
78945 + * create a kernel thread without removing it from tasklists
78946 + */
78947 +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
78948 +
78949 +/*
78950 + * Return saved PC of a blocked thread.
78951 + * What is this good for? it will be always the scheduler or ret_from_fork.
78952 + */
78953 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
78954 +
78955 +extern unsigned long get_wchan(struct task_struct *p);
78956 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
78957 +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
78958 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
78959 +
78960 +
78961 +struct microcode_header {
78962 +       unsigned int hdrver;
78963 +       unsigned int rev;
78964 +       unsigned int date;
78965 +       unsigned int sig;
78966 +       unsigned int cksum;
78967 +       unsigned int ldrver;
78968 +       unsigned int pf;
78969 +       unsigned int datasize;
78970 +       unsigned int totalsize;
78971 +       unsigned int reserved[3];
78972 +};
78973 +
78974 +struct microcode {
78975 +       struct microcode_header hdr;
78976 +       unsigned int bits[0];
78977 +};
78978 +
78979 +typedef struct microcode microcode_t;
78980 +typedef struct microcode_header microcode_header_t;
78981 +
78982 +/* microcode format is extended from Prescott processors */
78983 +struct extended_signature {
78984 +       unsigned int sig;
78985 +       unsigned int pf;
78986 +       unsigned int cksum;
78987 +};
78988 +
78989 +struct extended_sigtable {
78990 +       unsigned int count;
78991 +       unsigned int cksum;
78992 +       unsigned int reserved[3];
78993 +       struct extended_signature sigs[0];
78994 +};
78995 +
78996 +
78997 +#define ASM_NOP1 K8_NOP1
78998 +#define ASM_NOP2 K8_NOP2
78999 +#define ASM_NOP3 K8_NOP3
79000 +#define ASM_NOP4 K8_NOP4
79001 +#define ASM_NOP5 K8_NOP5
79002 +#define ASM_NOP6 K8_NOP6
79003 +#define ASM_NOP7 K8_NOP7
79004 +#define ASM_NOP8 K8_NOP8
79005 +
79006 +/* Opteron nops */
79007 +#define K8_NOP1 ".byte 0x90\n"
79008 +#define K8_NOP2        ".byte 0x66,0x90\n" 
79009 +#define K8_NOP3        ".byte 0x66,0x66,0x90\n" 
79010 +#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n" 
79011 +#define K8_NOP5        K8_NOP3 K8_NOP2 
79012 +#define K8_NOP6        K8_NOP3 K8_NOP3
79013 +#define K8_NOP7        K8_NOP4 K8_NOP3
79014 +#define K8_NOP8        K8_NOP4 K8_NOP4
79015 +
79016 +#define ASM_NOP_MAX 8
79017 +
79018 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
79019 +static inline void rep_nop(void)
79020 +{
79021 +       __asm__ __volatile__("rep;nop": : :"memory");
79022 +}
79023 +
79024 +/* Stop speculative execution */
79025 +static inline void sync_core(void)
79026 +{ 
79027 +       int tmp;
79028 +       asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
79029 +} 
79030 +
79031 +#define cpu_has_fpu 1
79032 +
79033 +#define ARCH_HAS_PREFETCH
79034 +static inline void prefetch(void *x) 
79035 +{ 
79036 +       asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
79037 +} 
79038 +
79039 +#define ARCH_HAS_PREFETCHW 1
79040 +static inline void prefetchw(void *x) 
79041 +{ 
79042 +       alternative_input("prefetcht0 (%1)",
79043 +                         "prefetchw (%1)",
79044 +                         X86_FEATURE_3DNOW,
79045 +                         "r" (x));
79046 +} 
79047 +
79048 +#define ARCH_HAS_SPINLOCK_PREFETCH 1
79049 +
79050 +#define spin_lock_prefetch(x)  prefetchw(x)
79051 +
79052 +#define cpu_relax()   rep_nop()
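cpu_relax() is the usual way to emit the PAUSE hint in a busy-wait loop, as the rep_nop() comment above suggests. A minimal sketch of the intended use (the flag variable is hypothetical):

/* Illustrative sketch only. */
static void example_spin_until_set(volatile int *flag)
{
        while (!*flag)
                cpu_relax();    /* rep;nop eases the sibling thread and the bus */
}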
79053 +
79054 +/*
79055 + *      NSC/Cyrix CPU configuration register indexes
79056 + */
79057 +#define CX86_CCR0 0xc0
79058 +#define CX86_CCR1 0xc1
79059 +#define CX86_CCR2 0xc2
79060 +#define CX86_CCR3 0xc3
79061 +#define CX86_CCR4 0xe8
79062 +#define CX86_CCR5 0xe9
79063 +#define CX86_CCR6 0xea
79064 +#define CX86_CCR7 0xeb
79065 +#define CX86_DIR0 0xfe
79066 +#define CX86_DIR1 0xff
79067 +#define CX86_ARR_BASE 0xc4
79068 +#define CX86_RCR_BASE 0xdc
79069 +
79070 +/*
79071 + *      NSC/Cyrix CPU indexed register access macros
79072 + */
79073 +
79074 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
79075 +
79076 +#define setCx86(reg, data) do { \
79077 +       outb((reg), 0x22); \
79078 +       outb((data), 0x23); \
79079 +} while (0)
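The Cyrix/NSC configuration registers are reached indirectly: the index goes to port 0x22 and the data is read or written through port 0x23, which is exactly what getCx86()/setCx86() encode. A small sketch of a read-modify-write on CCR3 (purely illustrative; the bit value is an assumption):

/* Illustrative sketch only. */
static void example_toggle_ccr3_bit(void)
{
        unsigned char ccr3 = getCx86(CX86_CCR3);

        setCx86(CX86_CCR3, ccr3 | 0x10);        /* hypothetical bit */
        /* ... program other indexed registers ... */
        setCx86(CX86_CCR3, ccr3);               /* restore */
}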
79080 +
79081 +static inline void serialize_cpu(void)
79082 +{
79083 +       __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
79084 +}
79085 +
79086 +static inline void __monitor(const void *eax, unsigned long ecx,
79087 +               unsigned long edx)
79088 +{
79089 +       /* "monitor %eax,%ecx,%edx;" */
79090 +       asm volatile(
79091 +               ".byte 0x0f,0x01,0xc8;"
79092 +               : :"a" (eax), "c" (ecx), "d"(edx));
79093 +}
79094 +
79095 +static inline void __mwait(unsigned long eax, unsigned long ecx)
79096 +{
79097 +       /* "mwait %eax,%ecx;" */
79098 +       asm volatile(
79099 +               ".byte 0x0f,0x01,0xc9;"
79100 +               : :"a" (eax), "c" (ecx));
79101 +}
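__monitor()/__mwait() are normally paired: the CPU is armed to watch one cache line and then put to sleep until that line is written or an interrupt arrives. A hedged sketch of the usual pairing (the watched word is only an example trigger):

/* Illustrative sketch only. */
static void example_mwait_on(volatile unsigned long *word)
{
        __monitor((const void *)word, 0, 0);    /* arm monitoring of this line */
        if (!*word)
                __mwait(0, 0);                  /* sleep until the line is written */
}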
79102 +
79103 +#define stack_current() \
79104 +({                                                             \
79105 +       struct thread_info *ti;                                 \
79106 +       asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));  \
79107 +       ti->task;                                       \
79108 +})
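stack_current() relies on struct thread_info living at the bottom of the power-of-two sized kernel stack, so masking %rsp with CURRENT_MASK lands on it. Roughly, the C equivalent looks like this (assuming CURRENT_MASK == ~(THREAD_SIZE - 1)):

/* Illustrative sketch only. */
static inline struct task_struct *example_stack_current(unsigned long rsp)
{
        struct thread_info *ti = (struct thread_info *)(rsp & ~(THREAD_SIZE - 1));

        return ti->task;
}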
79109 +
79110 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
79111 +
79112 +extern unsigned long boot_option_idle_override;
79113 +/* Boot loader type from the setup header */
79114 +extern int bootloader_type;
79115 +
79116 +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
79117 +
79118 +#endif /* __ASM_X86_64_PROCESSOR_H */
79119 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/ptrace.h new/include/asm-x86_64/mach-xen/asm/ptrace.h
79120 --- linux-2.6/include/asm-x86_64/mach-xen/asm/ptrace.h  1970-01-01 01:00:00.000000000 +0100
79121 +++ new/include/asm-x86_64/mach-xen/asm/ptrace.h        2006-05-09 12:35:42.000000000 +0200
79122 @@ -0,0 +1,125 @@
79123 +#ifndef _X86_64_PTRACE_H
79124 +#define _X86_64_PTRACE_H
79125 +
79126 +#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) 
79127 +#define R15 0
79128 +#define R14 8
79129 +#define R13 16
79130 +#define R12 24
79131 +#define RBP 32
79132 +#define RBX 40
79133 +/* arguments: interrupts/non-tracing syscalls only save up to here */
79134 +#define R11 48
79135 +#define R10 56 
79136 +#define R9 64
79137 +#define R8 72
79138 +#define RAX 80
79139 +#define RCX 88
79140 +#define RDX 96
79141 +#define RSI 104
79142 +#define RDI 112
79143 +#define ORIG_RAX 120       /* = ERROR */ 
79144 +/* end of arguments */         
79145 +/* cpu exception frame or undefined in case of fast syscall. */
79146 +#define RIP 128
79147 +#define CS 136
79148 +#define EFLAGS 144
79149 +#define RSP 152
79150 +#define SS 160
79151 +#define ARGOFFSET R11
79152 +#endif /* __ASSEMBLY__ || __FRAME_OFFSETS */
79153 +
79154 +/* top of stack page */ 
79155 +#define FRAME_SIZE 168
79156 +
79157 +#define PTRACE_OLDSETOPTIONS         21
79158 +
79159 +#ifndef __ASSEMBLY__ 
79160 +
79161 +struct pt_regs {
79162 +       unsigned long r15;
79163 +       unsigned long r14;
79164 +       unsigned long r13;
79165 +       unsigned long r12;
79166 +       unsigned long rbp;
79167 +       unsigned long rbx;
79168 +/* arguments: non-interrupt/non-tracing syscalls only save up to here */
79169 +       unsigned long r11;
79170 +       unsigned long r10;      
79171 +       unsigned long r9;
79172 +       unsigned long r8;
79173 +       unsigned long rax;
79174 +       unsigned long rcx;
79175 +       unsigned long rdx;
79176 +       unsigned long rsi;
79177 +       unsigned long rdi;
79178 +       unsigned long orig_rax;
79179 +/* end of arguments */         
79180 +/* cpu exception frame or undefined */
79181 +       unsigned long rip;
79182 +       unsigned long cs;
79183 +       unsigned long eflags; 
79184 +       unsigned long rsp; 
79185 +       unsigned long ss;
79186 +/* top of stack page */ 
79187 +};
79188 +
79189 +#endif
79190 +
79191 +/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
79192 +#define PTRACE_GETREGS            12
79193 +#define PTRACE_SETREGS            13
79194 +#define PTRACE_GETFPREGS          14
79195 +#define PTRACE_SETFPREGS          15
79196 +#define PTRACE_GETFPXREGS         18
79197 +#define PTRACE_SETFPXREGS         19
79198 +
79199 +/* only useful for accessing 32-bit programs */
79200 +#define PTRACE_GET_THREAD_AREA    25
79201 +#define PTRACE_SET_THREAD_AREA    26
79202 +
79203 +#define PTRACE_ARCH_PRCTL        30    /* arch_prctl for child */
79204 +
79205 +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 
79206 +#define user_mode(regs) (!!((regs)->cs & 3))
79207 +#define user_mode_vm(regs) user_mode(regs)
79208 +#define instruction_pointer(regs) ((regs)->rip)
79209 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
79210 +extern unsigned long profile_pc(struct pt_regs *regs);
79211 +#else
79212 +#define profile_pc(regs) instruction_pointer(regs)
79213 +#endif
79214 +
79215 +void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
79216 +
79217 +struct task_struct;
79218 +
79219 +extern unsigned long
79220 +convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs);
79221 +
79222 +enum {
79223 +        EF_CF   = 0x00000001,
79224 +        EF_PF   = 0x00000004,
79225 +        EF_AF   = 0x00000010,
79226 +        EF_ZF   = 0x00000040,
79227 +        EF_SF   = 0x00000080,
79228 +        EF_TF   = 0x00000100,
79229 +        EF_IE   = 0x00000200,
79230 +        EF_DF   = 0x00000400,
79231 +        EF_OF   = 0x00000800,
79232 +        EF_IOPL = 0x00003000,
79233 +        EF_IOPL_RING0 = 0x00000000,
79234 +        EF_IOPL_RING1 = 0x00001000,
79235 +        EF_IOPL_RING2 = 0x00002000,
79236 +        EF_NT   = 0x00004000,   /* nested task */
79237 +        EF_RF   = 0x00010000,   /* resume */
79238 +        EF_VM   = 0x00020000,   /* virtual mode */
79239 +        EF_AC   = 0x00040000,   /* alignment */
79240 +        EF_VIF  = 0x00080000,   /* virtual interrupt */
79241 +        EF_VIP  = 0x00100000,   /* virtual interrupt pending */
79242 +        EF_ID   = 0x00200000,   /* id */
79243 +};
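Together, user_mode() and the EF_* flag bits above let an exception handler classify the trapped context. A brief sketch (the messages are illustrative only):

/* Illustrative sketch only. */
static void example_classify_trap(struct pt_regs *regs)
{
        if (user_mode(regs))
                printk("user fault at %016lx\n", instruction_pointer(regs));
        if (!(regs->eflags & EF_IE))
                printk("trapped with interrupts disabled\n");
}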
79244 +
79245 +#endif
79246 +
79247 +#endif
79248 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/smp.h new/include/asm-x86_64/mach-xen/asm/smp.h
79249 --- linux-2.6/include/asm-x86_64/mach-xen/asm/smp.h     1970-01-01 01:00:00.000000000 +0100
79250 +++ new/include/asm-x86_64/mach-xen/asm/smp.h   2006-05-09 12:35:42.000000000 +0200
79251 @@ -0,0 +1,153 @@
79252 +#ifndef __ASM_SMP_H
79253 +#define __ASM_SMP_H
79254 +
79255 +/*
79256 + * We need the APIC definitions automatically as part of 'smp.h'
79257 + */
79258 +#ifndef __ASSEMBLY__
79259 +#include <linux/config.h>
79260 +#include <linux/threads.h>
79261 +#include <linux/cpumask.h>
79262 +#include <linux/bitops.h>
79263 +extern int disable_apic;
79264 +#endif
79265 +
79266 +#ifdef CONFIG_X86_LOCAL_APIC
79267 +#ifndef __ASSEMBLY__
79268 +#include <asm/fixmap.h>
79269 +#include <asm/mpspec.h>
79270 +#ifdef CONFIG_X86_IO_APIC
79271 +#include <asm/io_apic.h>
79272 +#endif
79273 +#include <asm/apic.h>
79274 +#include <asm/thread_info.h>
79275 +#endif
79276 +#endif
79277 +
79278 +#ifdef CONFIG_SMP
79279 +#ifndef ASSEMBLY
79280 +
79281 +#include <asm/pda.h>
79282 +
79283 +struct pt_regs;
79284 +
79285 +extern cpumask_t cpu_present_mask;
79286 +extern cpumask_t cpu_possible_map;
79287 +extern cpumask_t cpu_online_map;
79288 +extern cpumask_t cpu_initialized;
79289 +
79290 +/*
79291 + * Private routines/data
79292 + */
79293 +
79294 +extern void smp_alloc_memory(void);
79295 +extern volatile unsigned long smp_invalidate_needed;
79296 +extern int pic_mode;
79297 +extern void lock_ipi_call_lock(void);
79298 +extern void unlock_ipi_call_lock(void);
79299 +extern int smp_num_siblings;
79300 +extern void smp_send_reschedule(int cpu);
79301 +void smp_stop_cpu(void);
79302 +extern int smp_call_function_single(int cpuid, void (*func) (void *info),
79303 +                               void *info, int retry, int wait);
79304 +
79305 +extern cpumask_t cpu_sibling_map[NR_CPUS];
79306 +extern cpumask_t cpu_core_map[NR_CPUS];
79307 +extern int phys_proc_id[NR_CPUS];
79308 +extern int cpu_core_id[NR_CPUS];
79309 +extern u8 cpu_llc_id[NR_CPUS];
79310 +
79311 +#define SMP_TRAMPOLINE_BASE 0x6000
79312 +
79313 +/*
79314 + * On x86 all CPUs are mapped 1:1 to the APIC space.
79315 + * This simplifies scheduling and IPI sending and
79316 + * compresses data structures.
79317 + */
79318 +
79319 +static inline int num_booting_cpus(void)
79320 +{
79321 +       return cpus_weight(cpu_possible_map);
79322 +}
79323 +
79324 +#define raw_smp_processor_id() read_pda(cpunumber)
79325 +
79326 +#ifdef CONFIG_X86_LOCAL_APIC
79327 +static inline int hard_smp_processor_id(void)
79328 +{
79329 +       /* we don't want to mark this access volatile - bad code generation */
79330 +       return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
79331 +}
79332 +#endif
79333 +
79334 +extern int safe_smp_processor_id(void);
79335 +extern int __cpu_disable(void);
79336 +extern void __cpu_die(unsigned int cpu);
79337 +extern void prefill_possible_map(void);
79338 +extern unsigned num_processors;
79339 +extern unsigned disabled_cpus;
79340 +
79341 +#endif /* !ASSEMBLY */
79342 +
79343 +#define NO_PROC_ID             0xFF            /* No processor magic marker */
79344 +
79345 +#endif
79346 +
79347 +#ifndef ASSEMBLY
79348 +/*
79349 + * Some lowlevel functions might want to know about
79350 + * the real APIC ID <-> CPU # mapping.
79351 + */
79352 +extern u8 x86_cpu_to_apicid[NR_CPUS];  /* physical ID */
79353 +extern u8 x86_cpu_to_log_apicid[NR_CPUS];
79354 +extern u8 bios_cpu_apicid[];
79355 +
79356 +#ifdef CONFIG_X86_LOCAL_APIC
79357 +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
79358 +{
79359 +       return cpus_addr(cpumask)[0];
79360 +}
79361 +
79362 +static inline int cpu_present_to_apicid(int mps_cpu)
79363 +{
79364 +       if (mps_cpu < NR_CPUS)
79365 +               return (int)bios_cpu_apicid[mps_cpu];
79366 +       else
79367 +               return BAD_APICID;
79368 +}
79369 +#endif
79370 +
79371 +#endif /* !ASSEMBLY */
79372 +
79373 +#ifndef CONFIG_SMP
79374 +#define stack_smp_processor_id() 0
79375 +#define safe_smp_processor_id() 0
79376 +#define cpu_logical_map(x) (x)
79377 +#else
79378 +#include <asm/thread_info.h>
79379 +#define stack_smp_processor_id() \
79380 +({                                                             \
79381 +       struct thread_info *ti;                                 \
79382 +       __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));      \
79383 +       ti->cpu;                                                \
79384 +})
79385 +#endif
79386 +
79387 +#ifndef __ASSEMBLY__
79388 +#ifdef CONFIG_X86_LOCAL_APIC
79389 +static __inline int logical_smp_processor_id(void)
79390 +{
79391 +       /* we don't want to mark this access volatile - bad code generation */
79392 +       return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
79393 +}
79394 +#endif
79395 +#endif
79396 +
79397 +#ifdef CONFIG_SMP
79398 +#define cpu_physical_id(cpu)           x86_cpu_to_apicid[cpu]
79399 +#else
79400 +#define cpu_physical_id(cpu)           boot_cpu_id
79401 +#endif
79402 +
79403 +#endif
79404 +
79405 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/synch_bitops.h new/include/asm-x86_64/mach-xen/asm/synch_bitops.h
79406 --- linux-2.6/include/asm-x86_64/mach-xen/asm/synch_bitops.h    1970-01-01 01:00:00.000000000 +0100
79407 +++ new/include/asm-x86_64/mach-xen/asm/synch_bitops.h  2006-05-09 12:35:42.000000000 +0200
79408 @@ -0,0 +1,2 @@
79409 +
79410 +#include <asm-i386/mach-xen/asm/synch_bitops.h>
79411 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/system.h new/include/asm-x86_64/mach-xen/asm/system.h
79412 --- linux-2.6/include/asm-x86_64/mach-xen/asm/system.h  1970-01-01 01:00:00.000000000 +0100
79413 +++ new/include/asm-x86_64/mach-xen/asm/system.h        2006-06-07 13:15:16.000000000 +0200
79414 @@ -0,0 +1,428 @@
79415 +#ifndef __ASM_SYSTEM_H
79416 +#define __ASM_SYSTEM_H
79417 +
79418 +#include <linux/config.h>
79419 +#include <linux/kernel.h>
79420 +#include <asm/segment.h>
79421 +#include <asm/synch_bitops.h>
79422 +#include <asm/hypervisor.h>
79423 +#include <xen/interface/arch-x86_64.h>
79424 +
79425 +#ifdef __KERNEL__
79426 +
79427 +#ifdef CONFIG_SMP
79428 +#define __vcpu_id smp_processor_id()
79429 +#else
79430 +#define __vcpu_id 0
79431 +#endif
79432 +
79433 +#ifdef CONFIG_SMP
79434 +#define LOCK_PREFIX "lock ; "
79435 +#else
79436 +#define LOCK_PREFIX ""
79437 +#endif
79438 +
79439 +#define __STR(x) #x
79440 +#define STR(x) __STR(x)
79441 +
79442 +#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
79443 +#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
79444 +
79445 +/* frame pointer must be last for get_wchan */
79446 +#define SAVE_CONTEXT    "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
79447 +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\n\t"
79448 +
79449 +#define __EXTRA_CLOBBER  \
79450 +       ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
79451 +
79452 +#define switch_to(prev,next,last) \
79453 +       asm volatile(SAVE_CONTEXT                                                   \
79454 +                    "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
79455 +                    "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
79456 +                    "call __switch_to\n\t"                                       \
79457 +                    ".globl thread_return\n"                                   \
79458 +                    "thread_return:\n\t"                                           \
79459 +                    "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"                       \
79460 +                    "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
79461 +                    LOCK "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"               \
79462 +                    "movq %%rax,%%rdi\n\t"                                       \
79463 +                    "jc   ret_from_fork\n\t"                                     \
79464 +                    RESTORE_CONTEXT                                                \
79465 +                    : "=a" (last)                                                \
79466 +                    : [next] "S" (next), [prev] "D" (prev),                      \
79467 +                      [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
79468 +                      [ti_flags] "i" (offsetof(struct thread_info, flags)),\
79469 +                      [tif_fork] "i" (TIF_FORK),                         \
79470 +                      [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
79471 +                      [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))   \
79472 +                    : "memory", "cc" __EXTRA_CLOBBER)
79473 +    
79474 +
79475 +extern void load_gs_index(unsigned);
79476 +
79477 +/*
79478 + * Load a segment. Fall back on loading the zero
79479 + * segment if something goes wrong..
79480 + */
79481 +#define loadsegment(seg,value) \
79482 +       asm volatile("\n"                       \
79483 +               "1:\t"                          \
79484 +               "movl %k0,%%" #seg "\n"         \
79485 +               "2:\n"                          \
79486 +               ".section .fixup,\"ax\"\n"      \
79487 +               "3:\t"                          \
79488 +               "movl %1,%%" #seg "\n\t"        \
79489 +               "jmp 2b\n"                      \
79490 +               ".previous\n"                   \
79491 +               ".section __ex_table,\"a\"\n\t" \
79492 +               ".align 8\n\t"                  \
79493 +               ".quad 1b,3b\n"                 \
79494 +               ".previous"                     \
79495 +               : :"r" (value), "r" (0))
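loadsegment() installs an exception-table fixup so that a faulting selector load falls back to the null selector instead of oopsing. Usage is simply, for example:

/* Illustrative sketch only; 'sel' is a hypothetical selector value. */
static void example_load_fs(unsigned short sel)
{
        loadsegment(fs, sel);   /* a bad selector silently becomes 0 */
}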
79496 +
79497 +#ifdef __KERNEL__
79498 +struct alt_instr { 
79499 +       __u8 *instr;            /* original instruction */
79500 +       __u8 *replacement;
79501 +       __u8  cpuid;            /* cpuid bit set for replacement */
79502 +       __u8  instrlen;         /* length of original instruction */
79503 +       __u8  replacementlen;   /* length of new instruction, <= instrlen */ 
79504 +       __u8  pad[5];
79505 +}; 
79506 +#endif
79507 +
79508 +/*
79509 + * Alternative instructions for different CPU types or capabilities.
79510 + * 
79511 + * This allows the use of optimized instructions even on generic binary
79512 + * kernels.
79513 + * 
79514 + * The length of oldinstr must be greater than or equal to the length of
79515 + * newinstr; oldinstr can be padded with nops as needed.
79516 + * 
79517 + * For non-barrier-like inlines please define new variants
79518 + * without volatile and memory clobber.
79519 + */
79520 +#define alternative(oldinstr, newinstr, feature)       \
79521 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                 \
79522 +                     ".section .altinstructions,\"a\"\n"            \
79523 +                     "  .align 8\n"                                   \
79524 +                     "  .quad 661b\n"            /* label */          \
79525 +                     "  .quad 663f\n"            /* new instruction */ \
79526 +                     "  .byte %c0\n"             /* feature bit */    \
79527 +                     "  .byte 662b-661b\n"       /* sourcelen */      \
79528 +                     "  .byte 664f-663f\n"       /* replacementlen */ \
79529 +                     ".previous\n"                                     \
79530 +                     ".section .altinstr_replacement,\"ax\"\n"         \
79531 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
79532 +                     ".previous" :: "i" (feature) : "memory")  
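A caller of alternative() supplies an oldinstr at least as long as newinstr, padding with the ASM_NOPx strings defined earlier if necessary. A hedged usage sketch (the nop choice and feature bit are only an example, not taken from this patch):

/* Illustrative sketch only. */
static inline void example_fence(void)
{
        /* plain nops on old CPUs, patched to sfence where SSE exists */
        alternative(ASM_NOP3, "sfence", X86_FEATURE_XMM);
}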
79533 +
79534 +/*
79535 + * Alternative inline assembly with input.
79536 + * 
79537 + * Peculiarities:
79538 + * No memory clobber here. 
79539 + * Argument numbers start with 1.
79540 + * It is best to use constraints that are fixed size (like (%1) ... "r").
79541 + * If you use variable-sized constraints like "m" or "g" in the
79542 + * replacement, make sure to pad to the worst-case length.
79543 + */
79544 +#define alternative_input(oldinstr, newinstr, feature, input...)       \
79545 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
79546 +                     ".section .altinstructions,\"a\"\n"               \
79547 +                     "  .align 8\n"                                    \
79548 +                     "  .quad 661b\n"            /* label */           \
79549 +                     "  .quad 663f\n"            /* new instruction */ \
79550 +                     "  .byte %c0\n"             /* feature bit */     \
79551 +                     "  .byte 662b-661b\n"       /* sourcelen */       \
79552 +                     "  .byte 664f-663f\n"       /* replacementlen */  \
79553 +                     ".previous\n"                                     \
79554 +                     ".section .altinstr_replacement,\"ax\"\n"         \
79555 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
79556 +                     ".previous" :: "i" (feature), ##input)
79557 +
79558 +/* Like alternative_input, but with a single output argument */
79559 +#define alternative_io(oldinstr, newinstr, feature, output, input...) \
79560 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
79561 +                     ".section .altinstructions,\"a\"\n"               \
79562 +                     "  .align 8\n"                                    \
79563 +                     "  .quad 661b\n"            /* label */           \
79564 +                     "  .quad 663f\n"            /* new instruction */ \
79565 +                     "  .byte %c[feat]\n"        /* feature bit */     \
79566 +                     "  .byte 662b-661b\n"       /* sourcelen */       \
79567 +                     "  .byte 664f-663f\n"       /* replacementlen */  \
79568 +                     ".previous\n"                                     \
79569 +                     ".section .altinstr_replacement,\"ax\"\n"         \
79570 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
79571 +                     ".previous" : output : [feat] "i" (feature), ##input)
79572 +
79573 +/*
79574 + * Clear and set 'TS' bit respectively
79575 + */
79576 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
79577 +
79578 +static inline unsigned long read_cr0(void)
79579 +{ 
79580 +       unsigned long cr0;
79581 +       asm volatile("movq %%cr0,%0" : "=r" (cr0));
79582 +       return cr0;
79583 +} 
79584 +
79585 +static inline void write_cr0(unsigned long val) 
79586 +{ 
79587 +       asm volatile("movq %0,%%cr0" :: "r" (val));
79588 +} 
79589 +
79590 +#define read_cr3() ({ \
79591 +       unsigned long __dummy; \
79592 +       asm("movq %%cr3,%0" : "=r" (__dummy)); \
79593 +       machine_to_phys(__dummy); \
79594 +})
79595 +
79596 +static inline unsigned long read_cr4(void)
79597 +{ 
79598 +       unsigned long cr4;
79599 +       asm("movq %%cr4,%0" : "=r" (cr4));
79600 +       return cr4;
79601 +} 
79602 +
79603 +static inline void write_cr4(unsigned long val)
79604 +{ 
79605 +       asm volatile("movq %0,%%cr4" :: "r" (val));
79606 +} 
79607 +
79608 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
79609 +
79610 +#define wbinvd() \
79611 +       __asm__ __volatile__ ("wbinvd": : :"memory");
79612 +
79613 +/*
79614 + * On SMP systems, when the scheduler does migration-cost autodetection,
79615 + * it needs a way to flush as much of the CPU's caches as possible.
79616 + */
79617 +static inline void sched_cacheflush(void)
79618 +{
79619 +       wbinvd();
79620 +}
79621 +
79622 +#endif /* __KERNEL__ */
79623 +
79624 +#define nop() __asm__ __volatile__ ("nop")
79625 +
79626 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
79627 +
79628 +#define tas(ptr) (xchg((ptr),1))
79629 +
79630 +#define __xg(x) ((volatile long *)(x))
79631 +
79632 +static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
79633 +{
79634 +       *ptr = val;
79635 +}
79636 +
79637 +#define _set_64bit set_64bit
79638 +
79639 +/*
79640 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
79641 + * Note 2: xchg has side effect, so that attribute volatile is necessary,
79642 + *       but generally the primitive is invalid, *ptr is output argument. --ANK
79643 + */
79644 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
79645 +{
79646 +       switch (size) {
79647 +               case 1:
79648 +                       __asm__ __volatile__("xchgb %b0,%1"
79649 +                               :"=q" (x)
79650 +                               :"m" (*__xg(ptr)), "0" (x)
79651 +                               :"memory");
79652 +                       break;
79653 +               case 2:
79654 +                       __asm__ __volatile__("xchgw %w0,%1"
79655 +                               :"=r" (x)
79656 +                               :"m" (*__xg(ptr)), "0" (x)
79657 +                               :"memory");
79658 +                       break;
79659 +               case 4:
79660 +                       __asm__ __volatile__("xchgl %k0,%1"
79661 +                               :"=r" (x)
79662 +                               :"m" (*__xg(ptr)), "0" (x)
79663 +                               :"memory");
79664 +                       break;
79665 +               case 8:
79666 +                       __asm__ __volatile__("xchgq %0,%1"
79667 +                               :"=r" (x)
79668 +                               :"m" (*__xg(ptr)), "0" (x)
79669 +                               :"memory");
79670 +                       break;
79671 +       }
79672 +       return x;
79673 +}
79674 +
79675 +/*
79676 + * Atomic compare and exchange.  Compare OLD with MEM, if identical,
79677 + * store NEW in MEM.  Return the initial value in MEM.  Success is
79678 + * indicated by comparing RETURN with OLD.
79679 + */
79680 +
79681 +#define __HAVE_ARCH_CMPXCHG 1
79682 +
79683 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
79684 +                                     unsigned long new, int size)
79685 +{
79686 +       unsigned long prev;
79687 +       switch (size) {
79688 +       case 1:
79689 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
79690 +                                    : "=a"(prev)
79691 +                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
79692 +                                    : "memory");
79693 +               return prev;
79694 +       case 2:
79695 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
79696 +                                    : "=a"(prev)
79697 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
79698 +                                    : "memory");
79699 +               return prev;
79700 +       case 4:
79701 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
79702 +                                    : "=a"(prev)
79703 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
79704 +                                    : "memory");
79705 +               return prev;
79706 +       case 8:
79707 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
79708 +                                    : "=a"(prev)
79709 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
79710 +                                    : "memory");
79711 +               return prev;
79712 +       }
79713 +       return old;
79714 +}
79715 +
79716 +#define cmpxchg(ptr,o,n)\
79717 +       ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
79718 +                                       (unsigned long)(n),sizeof(*(ptr))))
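cmpxchg() returns the value found in memory, so a successful update is detected by comparing that return value with the old value passed in. A classic compare-and-swap retry loop, sketched with illustrative names:

/* Illustrative sketch only. */
static void example_atomic_add(volatile unsigned long *ctr, unsigned long n)
{
        unsigned long old, seen;

        do {
                old = *ctr;
                seen = cmpxchg(ctr, old, old + n);
        } while (seen != old);          /* retry if another CPU raced us */
}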
79719 +
79720 +#ifdef CONFIG_SMP
79721 +#define smp_mb()       mb()
79722 +#define smp_rmb()      rmb()
79723 +#define smp_wmb()      wmb()
79724 +#define smp_read_barrier_depends()     do {} while(0)
79725 +#else
79726 +#define smp_mb()       barrier()
79727 +#define smp_rmb()      barrier()
79728 +#define smp_wmb()      barrier()
79729 +#define smp_read_barrier_depends()     do {} while(0)
79730 +#endif
79731 +
79732 +    
79733 +/*
79734 + * Force strict CPU ordering.
79735 + * And yes, this is required on UP too when we're talking
79736 + * to devices.
79737 + */
79738 +#define mb()   asm volatile("mfence":::"memory")
79739 +#define rmb()  asm volatile("lfence":::"memory")
79740 +
79741 +#ifdef CONFIG_UNORDERED_IO
79742 +#define wmb()  asm volatile("sfence" ::: "memory")
79743 +#else
79744 +#define wmb()  asm volatile("" ::: "memory")
79745 +#endif
79746 +#define read_barrier_depends() do {} while(0)
79747 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
79748 +#define set_wmb(var, value) do { var = value; wmb(); } while (0)
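The classic use of these barriers is a producer that publishes data before setting a flag (wmb()), paired with a consumer that orders the flag read before the data read (rmb()). A minimal sketch with illustrative shared variables:

/* Illustrative sketch only. */
static int data, ready;

static void example_producer(int v)
{
        data = v;
        wmb();          /* make the data visible before the flag */
        ready = 1;
}

static int example_consumer(void)
{
        while (!ready)
                cpu_relax();
        rmb();          /* order the flag read before the data read */
        return data;
}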
79749 +
79750 +#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
79751 +
79752 +
79753 +/* 
79754 + * The use of 'barrier' in the following reflects their use as local-lock
79755 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
79756 + * critical operations are executed. All critical operations must complete
79757 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
79758 + * includes these barriers, for example.
79759 + */
79760 +
79761 +#define __cli()                                                                \
79762 +do {                                                                   \
79763 +       vcpu_info_t *_vcpu;                                             \
79764 +       preempt_disable();                                              \
79765 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
79766 +       _vcpu->evtchn_upcall_mask = 1;                                  \
79767 +       preempt_enable_no_resched();                                    \
79768 +       barrier();                                                      \
79769 +} while (0)
79770 +
79771 +#define __sti()                                                                \
79772 +do {                                                                   \
79773 +       vcpu_info_t *_vcpu;                                             \
79774 +       barrier();                                                      \
79775 +       preempt_disable();                                              \
79776 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
79777 +       _vcpu->evtchn_upcall_mask = 0;                                  \
79778 +       barrier(); /* unmask then check (avoid races) */                \
79779 +       if ( unlikely(_vcpu->evtchn_upcall_pending) )                   \
79780 +               force_evtchn_callback();                                \
79781 +       preempt_enable();                                               \
79782 +} while (0)
79783 +
79784 +#define __save_flags(x)                                                        \
79785 +do {                                                                   \
79786 +       vcpu_info_t *_vcpu;                                             \
79787 +       preempt_disable();                                              \
79788 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
79789 +       (x) = _vcpu->evtchn_upcall_mask;                                \
79790 +       preempt_enable();                                               \
79791 +} while (0)
79792 +
79793 +#define __restore_flags(x)                                             \
79794 +do {                                                                   \
79795 +       vcpu_info_t *_vcpu;                                             \
79796 +       barrier();                                                      \
79797 +       preempt_disable();                                              \
79798 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
79799 +       if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {                   \
79800 +               barrier(); /* unmask then check (avoid races) */        \
79801 +               if ( unlikely(_vcpu->evtchn_upcall_pending) )           \
79802 +                       force_evtchn_callback();                        \
79803 +               preempt_enable();                                       \
79804 +       } else                                                          \
79805 +               preempt_enable_no_resched();                            \
79806 +} while (0)
79807 +
79808 +#define __save_and_cli(x)                                              \
79809 +do {                                                                   \
79810 +       vcpu_info_t *_vcpu;                                             \
79811 +       preempt_disable();                                              \
79812 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
79813 +       (x) = _vcpu->evtchn_upcall_mask;                                \
79814 +       _vcpu->evtchn_upcall_mask = 1;                                  \
79815 +       preempt_enable_no_resched();                                    \
79816 +       barrier();                                                      \
79817 +} while (0)
79818 +
79819 +#define local_irq_save(x)      __save_and_cli(x)
79820 +#define local_irq_restore(x)   __restore_flags(x)
79821 +#define local_save_flags(x)    __save_flags(x)
79822 +#define local_irq_disable()    __cli()
79823 +#define local_irq_enable()     __sti()
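Under Xen these macros never touch EFLAGS.IF; they only toggle the per-VCPU evtchn_upcall_mask in the shared-info page, which is why they are built from the __cli()/__sti() sequences above. Callers use them exactly like the native versions; a minimal sketch:

/* Illustrative sketch only. */
static void example_critical_section(void)
{
        unsigned long flags;

        local_irq_save(flags);          /* mask event-channel upcalls */
        /* ... touch per-CPU state ... */
        local_irq_restore(flags);       /* unmask and replay pending events */
}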
79824 +
79825 +/* Cannot use preempt_enable() here as we would recurse in preempt_sched(). */
79826 +#define irqs_disabled()                                                        \
79827 +({     int ___x;                                                       \
79828 +       vcpu_info_t *_vcpu;                                             \
79829 +       preempt_disable();                                              \
79830 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
79831 +       ___x = (_vcpu->evtchn_upcall_mask != 0);                        \
79832 +       preempt_enable_no_resched();                                    \
79833 +       ___x; })
79834 +
79835 +void safe_halt(void);
79836 +void halt(void);
79837 +
79838 +void cpu_idle_wait(void);
79839 +
79840 +extern unsigned long arch_align_stack(unsigned long sp);
79841 +
79842 +#endif
79843 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/timer.h new/include/asm-x86_64/mach-xen/asm/timer.h
79844 --- linux-2.6/include/asm-x86_64/mach-xen/asm/timer.h   1970-01-01 01:00:00.000000000 +0100
79845 +++ new/include/asm-x86_64/mach-xen/asm/timer.h 2006-05-09 12:35:42.000000000 +0200
79846 @@ -0,0 +1,67 @@
79847 +#ifndef _ASMi386_TIMER_H
79848 +#define _ASMi386_TIMER_H
79849 +#include <linux/init.h>
79850 +
79851 +/**
79852 + * struct timer_opts - used to define a timer source
79853 + *
79854 + * @name: name of the timer.
79855 + * @init: Probes and initializes the timer. Takes clock= override 
79856 + *        string as an argument. Returns 0 on success, anything else
79857 + *        on failure.
79858 + * @mark_offset: called by the timer interrupt.
79859 + * @get_offset:  called by gettimeofday(). Returns the number of microseconds
79860 + *               since the last timer interrupt.
79861 + * @monotonic_clock: returns the number of nanoseconds since the init of the
79862 + *                   timer.
79863 + * @delay: delays this many clock cycles.
79864 + */
79865 +struct timer_opts {
79866 +       char* name;
79867 +       void (*mark_offset)(void);
79868 +       unsigned long (*get_offset)(void);
79869 +       unsigned long long (*monotonic_clock)(void);
79870 +       void (*delay)(unsigned long);
79871 +       unsigned long (*read_timer)(void);
79872 +       int (*suspend)(pm_message_t state);
79873 +       int (*resume)(void);
79874 +};
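Concrete timer sources each fill in one struct timer_opts and expose it through a struct init_timer_opts so select_timer() can probe them. A purely hypothetical instance, to show the shape (none of these functions exist in this patch):

/* Illustrative sketch only; all names are hypothetical. */
static unsigned long example_get_offset(void) { return 0; }
static unsigned long long example_monotonic_clock(void) { return 0; }

static struct timer_opts timer_example = {
        .name            = "example",
        .get_offset      = example_get_offset,
        .monotonic_clock = example_monotonic_clock,
        .delay           = NULL,        /* a real source must provide this */
};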
79875 +
79876 +struct init_timer_opts {
79877 +       int (*init)(char *override);
79878 +       struct timer_opts *opts;
79879 +};
79880 +
79881 +#define TICK_SIZE (tick_nsec / 1000)
79882 +
79883 +extern struct timer_opts* __init select_timer(void);
79884 +extern void clock_fallback(void);
79885 +void setup_pit_timer(void);
79886 +
79887 +/* Modifiers for buggy PIT handling */
79888 +
79889 +extern int pit_latch_buggy;
79890 +
79891 +extern struct timer_opts *cur_timer;
79892 +extern int timer_ack;
79893 +
79894 +/* list of externed timers */
79895 +extern struct timer_opts timer_none;
79896 +extern struct timer_opts timer_pit;
79897 +extern struct init_timer_opts timer_pit_init;
79898 +extern struct init_timer_opts timer_tsc_init;
79899 +#ifdef CONFIG_X86_CYCLONE_TIMER
79900 +extern struct init_timer_opts timer_cyclone_init;
79901 +#endif
79902 +
79903 +extern unsigned long calibrate_tsc(void);
79904 +extern void init_cpu_khz(void);
79905 +#ifdef CONFIG_HPET_TIMER
79906 +extern struct init_timer_opts timer_hpet_init;
79907 +extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr);
79908 +#endif
79909 +
79910 +#ifdef CONFIG_X86_PM_TIMER
79911 +extern struct init_timer_opts timer_pmtmr_init;
79912 +#endif
79913 +#endif
79914 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/tlbflush.h new/include/asm-x86_64/mach-xen/asm/tlbflush.h
79915 --- linux-2.6/include/asm-x86_64/mach-xen/asm/tlbflush.h        1970-01-01 01:00:00.000000000 +0100
79916 +++ new/include/asm-x86_64/mach-xen/asm/tlbflush.h      2006-05-09 12:35:42.000000000 +0200
79917 @@ -0,0 +1,104 @@
79918 +#ifndef _X8664_TLBFLUSH_H
79919 +#define _X8664_TLBFLUSH_H
79920 +
79921 +#include <linux/config.h>
79922 +#include <linux/mm.h>
79923 +#include <asm/processor.h>
79924 +
79925 +#define __flush_tlb()  xen_tlb_flush()
79926 +
79927 +/*
79928 + * Global pages have to be flushed a bit differently. Not a real
79929 + * performance problem because this does not happen often.
79930 + */
79931 +#define __flush_tlb_global()   xen_tlb_flush()
79932 +
79933 +
79934 +extern unsigned long pgkern_mask;
79935 +
79936 +#define __flush_tlb_all() __flush_tlb_global()
79937 +
79938 +#define __flush_tlb_one(addr)  xen_invlpg((unsigned long)addr)
79939 +
79940 +
79941 +/*
79942 + * TLB flushing:
79943 + *
79944 + *  - flush_tlb() flushes the current mm struct TLBs
79945 + *  - flush_tlb_all() flushes all processes TLBs
79946 + *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
79947 + *  - flush_tlb_page(vma, vmaddr) flushes one page
79948 + *  - flush_tlb_range(vma, start, end) flushes a range of pages
79949 + *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
79950 + *  - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
79951 + *
79952 + * x86-64 can only flush individual pages or full VMs. For a range flush
79953 + * we always do the full VM.  It might be worth checking whether a few
79954 + * INVLPGs in a row are a win for a small range.
79955 + */
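The rule of thumb that follows from the list above: use flush_tlb_page() after changing a single PTE of a live mapping, flush_tlb_range()/flush_tlb_mm() for anything larger, and flush_tlb_kernel_range() for kernel mappings. A hedged sketch (the helper and its arguments are illustrative):

/* Illustrative sketch only. */
static void example_after_pte_update(struct vm_area_struct *vma,
                                     unsigned long addr)
{
        /* one page changed in a user mapping */
        flush_tlb_page(vma, addr);
}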
79956 +
79957 +#ifndef CONFIG_SMP
79958 +
79959 +#define flush_tlb() __flush_tlb()
79960 +#define flush_tlb_all() __flush_tlb_all()
79961 +#define local_flush_tlb() __flush_tlb()
79962 +
79963 +static inline void flush_tlb_mm(struct mm_struct *mm)
79964 +{
79965 +       if (mm == current->active_mm)
79966 +               __flush_tlb();
79967 +}
79968 +
79969 +static inline void flush_tlb_page(struct vm_area_struct *vma,
79970 +       unsigned long addr)
79971 +{
79972 +       if (vma->vm_mm == current->active_mm)
79973 +               __flush_tlb_one(addr);
79974 +}
79975 +
79976 +static inline void flush_tlb_range(struct vm_area_struct *vma,
79977 +       unsigned long start, unsigned long end)
79978 +{
79979 +       if (vma->vm_mm == current->active_mm)
79980 +               __flush_tlb();
79981 +}
79982 +
79983 +#else
79984 +
79985 +#include <asm/smp.h>
79986 +
79987 +#define local_flush_tlb() \
79988 +       __flush_tlb()
79989 +
79990 +extern void flush_tlb_all(void);
79991 +extern void flush_tlb_current_task(void);
79992 +extern void flush_tlb_mm(struct mm_struct *);
79993 +extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
79994 +
79995 +#define flush_tlb()    flush_tlb_current_task()
79996 +
79997 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
79998 +{
79999 +       flush_tlb_mm(vma->vm_mm);
80000 +}
80001 +
80002 +#define TLBSTATE_OK    1
80003 +#define TLBSTATE_LAZY  2
80004 +
80005 +/* Roughly an IPI every 20MB with 4k pages for freeing page table
80006 +   ranges. Cost is about 42k of memory for each CPU. */
80007 +#define ARCH_FREE_PTE_NR 5350  
80008 +
80009 +#endif
80010 +
80011 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
80012 +
80013 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
80014 +                                     unsigned long start, unsigned long end)
80015 +{
80016 +       /* x86_64 does not keep any page table caches in a software TLB.
80017 +          The CPUs do in their hardware TLBs, but they are handled
80018 +          by the normal TLB flushing algorithms. */
80019 +}
80020 +
80021 +#endif /* _X8664_TLBFLUSH_H */
80022 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/vga.h new/include/asm-x86_64/mach-xen/asm/vga.h
80023 --- linux-2.6/include/asm-x86_64/mach-xen/asm/vga.h     1970-01-01 01:00:00.000000000 +0100
80024 +++ new/include/asm-x86_64/mach-xen/asm/vga.h   2006-05-09 12:35:42.000000000 +0200
80025 @@ -0,0 +1,20 @@
80026 +/*
80027 + *     Access to VGA videoram
80028 + *
80029 + *     (c) 1998 Martin Mares <mj@ucw.cz>
80030 + */
80031 +
80032 +#ifndef _LINUX_ASM_VGA_H_
80033 +#define _LINUX_ASM_VGA_H_
80034 +
80035 +/*
80036 + *     On the PC, we can just recalculate addresses and then
80037 + *     access the videoram directly without any black magic.
80038 + */
80039 +
80040 +#define VGA_MAP_MEM(x) (unsigned long)isa_bus_to_virt(x)
80041 +
80042 +#define vga_readb(x) (*(x))
80043 +#define vga_writeb(x,y) (*(y) = (x))
80044 +
80045 +#endif
80046 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/asm/xor.h new/include/asm-x86_64/mach-xen/asm/xor.h
80047 --- linux-2.6/include/asm-x86_64/mach-xen/asm/xor.h     1970-01-01 01:00:00.000000000 +0100
80048 +++ new/include/asm-x86_64/mach-xen/asm/xor.h   2006-05-09 12:35:42.000000000 +0200
80049 @@ -0,0 +1,328 @@
80050 +/*
80051 + * x86-64 changes / gcc fixes from Andi Kleen. 
80052 + * Copyright 2002 Andi Kleen, SuSE Labs.
80053 + *
80054 + * This hasn't been optimized for the hammer yet, but there are likely
80052 + * no advantages to be gained from x86-64 here anyway.
80056 + */
80057 +
80058 +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
80059 +
80060 +/* Doesn't use gcc to save the XMM registers, because there is no easy way to 
80061 +   tell it to do a clts before the register saving. */
80062 +#define XMMS_SAVE do {                         \
80063 +       preempt_disable();                      \
80064 +       if (!(current_thread_info()->status & TS_USEDFPU))      \
80065 +               clts();                         \
80066 +       __asm__ __volatile__ (                  \
80067 +               "movups %%xmm0,(%1)     ;\n\t"  \
80068 +               "movups %%xmm1,0x10(%1) ;\n\t"  \
80069 +               "movups %%xmm2,0x20(%1) ;\n\t"  \
80070 +               "movups %%xmm3,0x30(%1) ;\n\t"  \
80071 +               : "=&r" (cr0)                   \
80072 +               : "r" (xmm_save)                \
80073 +               : "memory");                    \
80074 +} while(0)
80075 +
80076 +#define XMMS_RESTORE do {                      \
80077 +       asm volatile (                          \
80078 +               "sfence                 ;\n\t"  \
80079 +               "movups (%1),%%xmm0     ;\n\t"  \
80080 +               "movups 0x10(%1),%%xmm1 ;\n\t"  \
80081 +               "movups 0x20(%1),%%xmm2 ;\n\t"  \
80082 +               "movups 0x30(%1),%%xmm3 ;\n\t"  \
80083 +               :                               \
80084 +               : "r" (cr0), "r" (xmm_save)     \
80085 +               : "memory");                    \
80086 +       if (!(current_thread_info()->status & TS_USEDFPU))      \
80087 +               stts();                         \
80088 +       preempt_enable();                       \
80089 +} while(0)
80090 +
80091 +#define OFFS(x)                "16*("#x")"
80092 +#define PF_OFFS(x)     "256+16*("#x")"
80093 +#define        PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
80094 +#define LD(x,y)                "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
80095 +#define ST(x,y)                "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
80096 +#define PF1(x)         "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
80097 +#define PF2(x)         "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
80098 +#define PF3(x)         "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
80099 +#define PF4(x)         "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
80100 +#define PF5(x)         "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
80101 +#define XO1(x,y)       "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
80102 +#define XO2(x,y)       "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
80103 +#define XO3(x,y)       "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
80104 +#define XO4(x,y)       "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
80105 +#define XO5(x,y)       "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
80106 +
80107 +
80108 +static void
80109 +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
80110 +{
80111 +        unsigned int lines = bytes >> 8;
80112 +       unsigned long cr0;
80113 +       xmm_store_t xmm_save[4];
80114 +
80115 +       XMMS_SAVE;
80116 +
80117 +        asm volatile (
80118 +#undef BLOCK
80119 +#define BLOCK(i) \
80120 +               LD(i,0)                                 \
80121 +                       LD(i+1,1)                       \
80122 +               PF1(i)                                  \
80123 +                               PF1(i+2)                \
80124 +                               LD(i+2,2)               \
80125 +                                       LD(i+3,3)       \
80126 +               PF0(i+4)                                \
80127 +                               PF0(i+6)                \
80128 +               XO1(i,0)                                \
80129 +                       XO1(i+1,1)                      \
80130 +                               XO1(i+2,2)              \
80131 +                                       XO1(i+3,3)      \
80132 +               ST(i,0)                                 \
80133 +                       ST(i+1,1)                       \
80134 +                               ST(i+2,2)               \
80135 +                                       ST(i+3,3)       \
80136 +
80137 +
80138 +               PF0(0)
80139 +                               PF0(2)
80140 +
80141 +       " .align 32                     ;\n"
80142 +        " 1:                            ;\n"
80143 +
80144 +               BLOCK(0)
80145 +               BLOCK(4)
80146 +               BLOCK(8)
80147 +               BLOCK(12)
80148 +
80149 +        "       addq %[inc], %[p1]           ;\n"
80150 +        "       addq %[inc], %[p2]           ;\n"
80151 +               "               decl %[cnt] ; jnz 1b"
80152 +       : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
80153 +       : [inc] "r" (256UL) 
80154 +        : "memory");
80155 +
80156 +       XMMS_RESTORE;
80157 +}
80158 +
80159 +static void
80160 +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
80161 +         unsigned long *p3)
80162 +{
80163 +       unsigned int lines = bytes >> 8;
80164 +       xmm_store_t xmm_save[4];
80165 +       unsigned long cr0;
80166 +
80167 +       XMMS_SAVE;
80168 +
80169 +        __asm__ __volatile__ (
80170 +#undef BLOCK
80171 +#define BLOCK(i) \
80172 +               PF1(i)                                  \
80173 +                               PF1(i+2)                \
80174 +               LD(i,0)                                 \
80175 +                       LD(i+1,1)                       \
80176 +                               LD(i+2,2)               \
80177 +                                       LD(i+3,3)       \
80178 +               PF2(i)                                  \
80179 +                               PF2(i+2)                \
80180 +               PF0(i+4)                                \
80181 +                               PF0(i+6)                \
80182 +               XO1(i,0)                                \
80183 +                       XO1(i+1,1)                      \
80184 +                               XO1(i+2,2)              \
80185 +                                       XO1(i+3,3)      \
80186 +               XO2(i,0)                                \
80187 +                       XO2(i+1,1)                      \
80188 +                               XO2(i+2,2)              \
80189 +                                       XO2(i+3,3)      \
80190 +               ST(i,0)                                 \
80191 +                       ST(i+1,1)                       \
80192 +                               ST(i+2,2)               \
80193 +                                       ST(i+3,3)       \
80194 +
80195 +
80196 +               PF0(0)
80197 +                               PF0(2)
80198 +
80199 +       " .align 32                     ;\n"
80200 +        " 1:                            ;\n"
80201 +
80202 +               BLOCK(0)
80203 +               BLOCK(4)
80204 +               BLOCK(8)
80205 +               BLOCK(12)
80206 +
80207 +        "       addq %[inc], %[p1]           ;\n"
80208 +        "       addq %[inc], %[p2]          ;\n"
80209 +        "       addq %[inc], %[p3]           ;\n"
80210 +               "               decl %[cnt] ; jnz 1b"
80211 +       : [cnt] "+r" (lines),
80212 +         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
80213 +       : [inc] "r" (256UL)
80214 +       : "memory"); 
80215 +       XMMS_RESTORE;
80216 +}
80217 +
80218 +static void
80219 +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
80220 +         unsigned long *p3, unsigned long *p4)
80221 +{
80222 +       unsigned int lines = bytes >> 8;
80223 +       xmm_store_t xmm_save[4]; 
80224 +       unsigned long cr0;
80225 +
80226 +       XMMS_SAVE;
80227 +
80228 +        __asm__ __volatile__ (
80229 +#undef BLOCK
80230 +#define BLOCK(i) \
80231 +               PF1(i)                                  \
80232 +                               PF1(i+2)                \
80233 +               LD(i,0)                                 \
80234 +                       LD(i+1,1)                       \
80235 +                               LD(i+2,2)               \
80236 +                                       LD(i+3,3)       \
80237 +               PF2(i)                                  \
80238 +                               PF2(i+2)                \
80239 +               XO1(i,0)                                \
80240 +                       XO1(i+1,1)                      \
80241 +                               XO1(i+2,2)              \
80242 +                                       XO1(i+3,3)      \
80243 +               PF3(i)                                  \
80244 +                               PF3(i+2)                \
80245 +               PF0(i+4)                                \
80246 +                               PF0(i+6)                \
80247 +               XO2(i,0)                                \
80248 +                       XO2(i+1,1)                      \
80249 +                               XO2(i+2,2)              \
80250 +                                       XO2(i+3,3)      \
80251 +               XO3(i,0)                                \
80252 +                       XO3(i+1,1)                      \
80253 +                               XO3(i+2,2)              \
80254 +                                       XO3(i+3,3)      \
80255 +               ST(i,0)                                 \
80256 +                       ST(i+1,1)                       \
80257 +                               ST(i+2,2)               \
80258 +                                       ST(i+3,3)       \
80259 +
80260 +
80261 +               PF0(0)
80262 +                               PF0(2)
80263 +
80264 +       " .align 32                     ;\n"
80265 +        " 1:                            ;\n"
80266 +
80267 +               BLOCK(0)
80268 +               BLOCK(4)
80269 +               BLOCK(8)
80270 +               BLOCK(12)
80271 +
80272 +        "       addq %[inc], %[p1]           ;\n"
80273 +        "       addq %[inc], %[p2]           ;\n"
80274 +        "       addq %[inc], %[p3]           ;\n"
80275 +        "       addq %[inc], %[p4]           ;\n"
80276 +       "       decl %[cnt] ; jnz 1b"
80277 +       : [cnt] "+c" (lines),
80278 +         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
80279 +       : [inc] "r" (256UL)
80280 +        : "memory" );
80281 +
80282 +       XMMS_RESTORE;
80283 +}
80284 +
80285 +static void
80286 +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
80287 +         unsigned long *p3, unsigned long *p4, unsigned long *p5)
80288 +{
80289 +        unsigned int lines = bytes >> 8;
80290 +       xmm_store_t xmm_save[4];
80291 +       unsigned long cr0;
80292 +
80293 +       XMMS_SAVE;
80294 +
80295 +        __asm__ __volatile__ (
80296 +#undef BLOCK
80297 +#define BLOCK(i) \
80298 +               PF1(i)                                  \
80299 +                               PF1(i+2)                \
80300 +               LD(i,0)                                 \
80301 +                       LD(i+1,1)                       \
80302 +                               LD(i+2,2)               \
80303 +                                       LD(i+3,3)       \
80304 +               PF2(i)                                  \
80305 +                               PF2(i+2)                \
80306 +               XO1(i,0)                                \
80307 +                       XO1(i+1,1)                      \
80308 +                               XO1(i+2,2)              \
80309 +                                       XO1(i+3,3)      \
80310 +               PF3(i)                                  \
80311 +                               PF3(i+2)                \
80312 +               XO2(i,0)                                \
80313 +                       XO2(i+1,1)                      \
80314 +                               XO2(i+2,2)              \
80315 +                                       XO2(i+3,3)      \
80316 +               PF4(i)                                  \
80317 +                               PF4(i+2)                \
80318 +               PF0(i+4)                                \
80319 +                               PF0(i+6)                \
80320 +               XO3(i,0)                                \
80321 +                       XO3(i+1,1)                      \
80322 +                               XO3(i+2,2)              \
80323 +                                       XO3(i+3,3)      \
80324 +               XO4(i,0)                                \
80325 +                       XO4(i+1,1)                      \
80326 +                               XO4(i+2,2)              \
80327 +                                       XO4(i+3,3)      \
80328 +               ST(i,0)                                 \
80329 +                       ST(i+1,1)                       \
80330 +                               ST(i+2,2)               \
80331 +                                       ST(i+3,3)       \
80332 +
80333 +
80334 +               PF0(0)
80335 +                               PF0(2)
80336 +
80337 +       " .align 32                     ;\n"
80338 +        " 1:                            ;\n"
80339 +
80340 +               BLOCK(0)
80341 +               BLOCK(4)
80342 +               BLOCK(8)
80343 +               BLOCK(12)
80344 +
80345 +        "       addq %[inc], %[p1]           ;\n"
80346 +        "       addq %[inc], %[p2]           ;\n"
80347 +        "       addq %[inc], %[p3]           ;\n"
80348 +        "       addq %[inc], %[p4]           ;\n"
80349 +        "       addq %[inc], %[p5]           ;\n"
80350 +       "       decl %[cnt] ; jnz 1b"
80351 +       : [cnt] "+c" (lines),
80352 +         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), 
80353 +         [p5] "+r" (p5)
80354 +       : [inc] "r" (256UL)
80355 +       : "memory");
80356 +
80357 +       XMMS_RESTORE;
80358 +}
80359 +
80360 +static struct xor_block_template xor_block_sse = {
80361 +        .name = "generic_sse",
80362 +        .do_2 = xor_sse_2,
80363 +        .do_3 = xor_sse_3,
80364 +        .do_4 = xor_sse_4,
80365 +        .do_5 = xor_sse_5,
80366 +};
80367 +
80368 +#undef XOR_TRY_TEMPLATES
80369 +#define XOR_TRY_TEMPLATES                              \
80370 +       do {                                            \
80371 +               xor_speed(&xor_block_sse);      \
80372 +       } while (0)
80373 +
80374 +/* We force the use of the SSE xor block because it can write around L2.
80375 +   We may also be able to load into the L1 only depending on how the cpu
80376 +   deals with a load to a line that is being prefetched.  */
80377 +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
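
The xor_block_template above plugs into the kernel's generic RAID xor machinery: each do_N entry point xors N-1 source buffers into the first argument, 256 bytes per unrolled loop iteration, so the byte count must be a multiple of 256. A minimal sketch of a caller that goes through XOR_SELECT_TEMPLATE the same way the generic code does (the function name is illustrative, not part of the patch):

        /* dest ^= s1 ^ s2 ^ s3; bytes must be a multiple of 256 */
        static void xor_three_sources(unsigned long bytes, unsigned long *dest,
                                      unsigned long *s1, unsigned long *s2,
                                      unsigned long *s3)
        {
                struct xor_block_template *t = XOR_SELECT_TEMPLATE(NULL);

                t->do_4(bytes, dest, s1, s2, s3);
        }
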
80378 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/irq_vectors.h new/include/asm-x86_64/mach-xen/irq_vectors.h
80379 --- linux-2.6/include/asm-x86_64/mach-xen/irq_vectors.h 1970-01-01 01:00:00.000000000 +0100
80380 +++ new/include/asm-x86_64/mach-xen/irq_vectors.h       2006-05-09 12:35:42.000000000 +0200
80381 @@ -0,0 +1,123 @@
80382 +/*
80383 + * This file should contain #defines for all of the interrupt vector
80384 + * numbers used by this architecture.
80385 + *
80386 + * In addition, there are some standard defines:
80387 + *
80388 + *     FIRST_EXTERNAL_VECTOR:
80389 + *             The first free place for external interrupts
80390 + *
80391 + *     SYSCALL_VECTOR:
80392 + *             The IRQ vector a syscall makes the user to kernel transition
80393 + *             under.
80394 + *
80395 + *     TIMER_IRQ:
80396 + *             The IRQ number the timer interrupt comes in at.
80397 + *
80398 + *     NR_IRQS:
80399 + *             The total number of interrupt vectors (including all the
80400 + *             architecture specific interrupts) needed.
80401 + *
80402 + */                    
80403 +#ifndef _ASM_IRQ_VECTORS_H
80404 +#define _ASM_IRQ_VECTORS_H
80405 +
80406 +/*
80407 + * IDT vectors usable for external interrupt sources start
80408 + * at 0x20:
80409 + */
80410 +#define FIRST_EXTERNAL_VECTOR  0x20
80411 +
80412 +#define SYSCALL_VECTOR         0x80
80413 +
80414 +/*
80415 + * Vectors 0x20-0x2f are used for ISA interrupts.
80416 + */
80417 +
80418 +#if 0
80419 +/*
80420 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
80421 + *
80422 + *  some of the following vectors are 'rare', they are merged
80423 + *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
80424 + *  TLB, reschedule and local APIC vectors are performance-critical.
80425 + *
80426 + *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
80427 + */
80428 +#define INVALIDATE_TLB_VECTOR  0xfd
80429 +#define RESCHEDULE_VECTOR      0xfc
80430 +#define CALL_FUNCTION_VECTOR   0xfb
80431 +
80432 +#define THERMAL_APIC_VECTOR    0xf0
80433 +/*
80434 + * Local APIC timer IRQ vector is on a different priority level,
80435 + * to work around the 'lost local interrupt if more than 2 IRQ
80436 + * sources per level' errata.
80437 + */
80438 +#define LOCAL_TIMER_VECTOR     0xef
80439 +#endif
80440 +
80441 +#define SPURIOUS_APIC_VECTOR   0xff
80442 +#define ERROR_APIC_VECTOR      0xfe
80443 +
80444 +/*
80445 + * First APIC vector available to drivers: (vectors 0x30-0xee)
80446 + * we start at 0x31 to spread out vectors evenly between priority
80447 + * levels. (0x80 is the syscall vector)
80448 + */
80449 +#define FIRST_DEVICE_VECTOR    0x31
80450 +#define FIRST_SYSTEM_VECTOR    0xef
80451 +
80452 +/*
80453 + * 16 8259A IRQ's, 208 potential APIC interrupt sources.
80454 + * Right now the APIC is mostly only used for SMP.
80455 + * 256 vectors is an architectural limit. (we can have
80456 + * more than 256 devices theoretically, but they will
80457 + * have to use shared interrupts)
80458 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
80459 + * the usable vector space is 0x20-0xff (224 vectors)
80460 + */
80461 +
80462 +#define RESCHEDULE_VECTOR      0
80463 +#define CALL_FUNCTION_VECTOR   1
80464 +#define NR_IPIS                        2
80465 +
80466 +/*
80467 + * The maximum number of vectors supported by i386 processors
80468 + * is limited to 256. For processors other than i386, NR_VECTORS
80469 + * should be changed accordingly.
80470 + */
80471 +#define NR_VECTORS 256
80472 +
80473 +#define FPU_IRQ                        13
80474 +
80475 +#define        FIRST_VM86_IRQ          3
80476 +#define LAST_VM86_IRQ          15
80477 +#define invalid_vm86_irq(irq)  ((irq) < 3 || (irq) > 15)
80478 +
80479 +/*
80480 + * The flat IRQ space is divided into two regions:
80481 + *  1. A one-to-one mapping of real physical IRQs. This space is only used
80482 + *     if we have physical device-access privilege. This region is at the 
80483 + *     start of the IRQ space so that existing device drivers do not need
80484 + *     to be modified to translate physical IRQ numbers into our IRQ space.
80485 + *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
80486 + *     are bound using the provided bind/unbind functions.
80487 + */
80488 +
80489 +#define PIRQ_BASE              0
80490 +#define NR_PIRQS               256
80491 +
80492 +#define DYNIRQ_BASE            (PIRQ_BASE + NR_PIRQS)
80493 +#define NR_DYNIRQS             256
80494 +
80495 +#define NR_IRQS                        (NR_PIRQS + NR_DYNIRQS)
80496 +#define NR_IRQ_VECTORS         NR_IRQS
80497 +
80498 +#define pirq_to_irq(_x)                ((_x) + PIRQ_BASE)
80499 +#define irq_to_pirq(_x)                ((_x) - PIRQ_BASE)
80500 +
80501 +#define dynirq_to_irq(_x)      ((_x) + DYNIRQ_BASE)
80502 +#define irq_to_dynirq(_x)      ((_x) - DYNIRQ_BASE)
80503 +
80504 +#endif /* _ASM_IRQ_VECTORS_H */
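
With the values above the flat IRQ space is 512 entries wide: physical IRQs (PIRQs) occupy 0-255 and dynamically bound event-channel IRQs occupy 256-511, so the translation macros reduce to simple offsets:

        /* PIRQ_BASE = 0, NR_PIRQS = 256, DYNIRQ_BASE = 256, NR_DYNIRQS = 256 */
        pirq_to_irq(3);         /* == 3:   physical IRQ 3 keeps its number        */
        dynirq_to_irq(0);       /* == 256: first dynamically bound IRQ            */
        irq_to_dynirq(300);     /* == 44:  back-translation for event-channel code */
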
80505 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/mach_time.h new/include/asm-x86_64/mach-xen/mach_time.h
80506 --- linux-2.6/include/asm-x86_64/mach-xen/mach_time.h   1970-01-01 01:00:00.000000000 +0100
80507 +++ new/include/asm-x86_64/mach-xen/mach_time.h 2006-05-09 12:35:42.000000000 +0200
80508 @@ -0,0 +1,111 @@
80509 +/*
80510 + *  include/asm-i386/mach-default/mach_time.h
80511 + *
80512 + *  Machine specific set RTC function for generic.
80513 + *  Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
80514 + */
80515 +#ifndef _MACH_TIME_H
80516 +#define _MACH_TIME_H
80517 +
80518 +#include <asm-i386/mc146818rtc.h>
80519 +
80520 +/* for check timing call set_rtc_mmss() 500ms     */
80521 +/* used in arch/i386/time.c::do_timer_interrupt() */
80522 +#define USEC_AFTER     500000
80523 +#define USEC_BEFORE    500000
80524 +
80525 +/*
80526 + * In order to set the CMOS clock precisely, set_rtc_mmss has to be
80527 + * called 500 ms after the second nowtime has started, because when
80528 + * nowtime is written into the registers of the CMOS clock, it will
80529 + * jump to the next second precisely 500 ms later. Check the Motorola
80530 + * MC146818A or Dallas DS12887 data sheet for details.
80531 + *
80532 + * BUG: This routine does not handle hour overflow properly; it just
80533 + *      sets the minutes. Usually you'll only notice that after reboot!
80534 + */
80535 +static inline int mach_set_rtc_mmss(unsigned long nowtime)
80536 +{
80537 +       int retval = 0;
80538 +       int real_seconds, real_minutes, cmos_minutes;
80539 +       unsigned char save_control, save_freq_select;
80540 +
80541 +       save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
80542 +       CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
80543 +
80544 +       save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
80545 +       CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
80546 +
80547 +       cmos_minutes = CMOS_READ(RTC_MINUTES);
80548 +       if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
80549 +               BCD_TO_BIN(cmos_minutes);
80550 +
80551 +       /*
80552 +        * since we're only adjusting minutes and seconds,
80553 +        * don't interfere with hour overflow. This avoids
80554 +        * messing with unknown time zones but requires your
80555 +        * RTC not to be off by more than 15 minutes
80556 +        */
80557 +       real_seconds = nowtime % 60;
80558 +       real_minutes = nowtime / 60;
80559 +       if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
80560 +               real_minutes += 30;             /* correct for half hour time zone */
80561 +       real_minutes %= 60;
80562 +
80563 +       if (abs(real_minutes - cmos_minutes) < 30) {
80564 +               if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
80565 +                       BIN_TO_BCD(real_seconds);
80566 +                       BIN_TO_BCD(real_minutes);
80567 +               }
80568 +               CMOS_WRITE(real_seconds,RTC_SECONDS);
80569 +               CMOS_WRITE(real_minutes,RTC_MINUTES);
80570 +       } else {
80571 +               printk(KERN_WARNING
80572 +                      "set_rtc_mmss: can't update from %d to %d\n",
80573 +                      cmos_minutes, real_minutes);
80574 +               retval = -1;
80575 +       }
80576 +
80577 +       /* The following flags have to be released exactly in this order,
80578 +        * otherwise the DS12887 (popular MC146818A clone with integrated
80579 +        * battery and quartz) will not reset the oscillator and will not
80580 +        * update precisely 500 ms later. You won't find this mentioned in
80581 +        * the Dallas Semiconductor data sheets, but who believes data
80582 +        * sheets anyway ...                           -- Markus Kuhn
80583 +        */
80584 +       CMOS_WRITE(save_control, RTC_CONTROL);
80585 +       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
80586 +
80587 +       return retval;
80588 +}
80589 +
80590 +static inline unsigned long mach_get_cmos_time(void)
80591 +{
80592 +       unsigned int year, mon, day, hour, min, sec;
80593 +
80594 +       do {
80595 +               sec = CMOS_READ(RTC_SECONDS);
80596 +               min = CMOS_READ(RTC_MINUTES);
80597 +               hour = CMOS_READ(RTC_HOURS);
80598 +               day = CMOS_READ(RTC_DAY_OF_MONTH);
80599 +               mon = CMOS_READ(RTC_MONTH);
80600 +               year = CMOS_READ(RTC_YEAR);
80601 +       } while (sec != CMOS_READ(RTC_SECONDS));
80602 +
80603 +       if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
80604 +               BCD_TO_BIN(sec);
80605 +               BCD_TO_BIN(min);
80606 +               BCD_TO_BIN(hour);
80607 +               BCD_TO_BIN(day);
80608 +               BCD_TO_BIN(mon);
80609 +               BCD_TO_BIN(year);
80610 +       }
80611 +
80612 +       year += 1900;
80613 +       if (year < 1970)
80614 +               year += 100;
80615 +
80616 +       return mktime(year, mon, day, hour, min, sec);
80617 +}
80618 +
80619 +#endif /* !_MACH_TIME_H */
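
Both helpers assume the caller already serializes access to the CMOS registers; a minimal sketch of the usual calling pattern (the wrapper name is illustrative; rtc_lock is the spinlock declared in <linux/mc146818rtc.h>):

        #include <linux/mc146818rtc.h>          /* rtc_lock, CMOS_READ/CMOS_WRITE */

        static unsigned long read_persistent_clock_sketch(void)
        {
                unsigned long seconds;

                spin_lock_irq(&rtc_lock);
                seconds = mach_get_cmos_time(); /* seconds since 1970-01-01 */
                spin_unlock_irq(&rtc_lock);
                return seconds;
        }
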
80620 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/mach_timer.h new/include/asm-x86_64/mach-xen/mach_timer.h
80621 --- linux-2.6/include/asm-x86_64/mach-xen/mach_timer.h  1970-01-01 01:00:00.000000000 +0100
80622 +++ new/include/asm-x86_64/mach-xen/mach_timer.h        2006-05-09 12:35:42.000000000 +0200
80623 @@ -0,0 +1,48 @@
80624 +/*
80625 + *  include/asm-i386/mach-default/mach_timer.h
80626 + *
80627 + *  Machine specific calibrate_tsc() for generic.
80628 + *  Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
80629 + */
80630 +/* ------ Calibrate the TSC ------- 
80631 + * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
80632 + * Too much 64-bit arithmetic here to do this cleanly in C, and for
80633 + * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
80634 + * output busy loop as low as possible. We avoid reading the CTC registers
80635 + * directly because of the awkward 8-bit access mechanism of the 82C54
80636 + * device.
80637 + */
80638 +#ifndef _MACH_TIMER_H
80639 +#define _MACH_TIMER_H
80640 +
80641 +#define CALIBRATE_LATCH        (5 * LATCH)
80642 +
80643 +static inline void mach_prepare_counter(void)
80644 +{
80645 +       /* Set the Gate high, disable speaker */
80646 +       outb((inb(0x61) & ~0x02) | 0x01, 0x61);
80647 +
80648 +       /*
80649 +        * Now let's take care of CTC channel 2
80650 +        *
80651 +        * Set the Gate high, program CTC channel 2 for mode 0,
80652 +        * (interrupt on terminal count mode), binary count,
80653 +        * load 5 * LATCH count, (LSB and MSB) to begin countdown.
80654 +        *
80655 +        * Some devices need a delay here.
80656 +        */
80657 +       outb(0xb0, 0x43);                       /* binary, mode 0, LSB/MSB, Ch 2 */
80658 +       outb_p(CALIBRATE_LATCH & 0xff, 0x42);   /* LSB of count */
80659 +       outb_p(CALIBRATE_LATCH >> 8, 0x42);       /* MSB of count */
80660 +}
80661 +
80662 +static inline void mach_countup(unsigned long *count_p)
80663 +{
80664 +       unsigned long count = 0;
80665 +       do {
80666 +               count++;
80667 +       } while ((inb_p(0x61) & 0x20) == 0);
80668 +       *count_p = count;
80669 +}
80670 +
80671 +#endif /* !_MACH_TIMER_H */
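
Together the two helpers implement the classic PIT channel-2 calibration: load a known count (5 * LATCH, roughly 50 ms at the 1.193182 MHz PIT clock) and measure how many TSC ticks elapse before the terminal count is reached. A sketch of that pattern, assuming rdtscll() from <asm/msr.h>:

        #include <asm/msr.h>                    /* rdtscll() */

        static unsigned long long tsc_ticks_per_calibration_window(void)
        {
                unsigned long pit_loops;
                unsigned long long start, end;

                mach_prepare_counter();         /* gate channel 2, load 5*LATCH */
                rdtscll(start);
                mach_countup(&pit_loops);       /* spin until OUT2 goes high */
                rdtscll(end);

                return end - start;             /* TSC ticks per ~50 ms window */
        }
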
80672 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/setup_arch_post.h new/include/asm-x86_64/mach-xen/setup_arch_post.h
80673 --- linux-2.6/include/asm-x86_64/mach-xen/setup_arch_post.h     1970-01-01 01:00:00.000000000 +0100
80674 +++ new/include/asm-x86_64/mach-xen/setup_arch_post.h   2006-05-09 12:35:42.000000000 +0200
80675 @@ -0,0 +1,58 @@
80676 +/**
80677 + * machine_specific_* - Hooks for machine specific setup.
80678 + *
80679 + * Description:
80680 + *     This is included late in kernel/setup.c so that it can make
80681 + *     use of all of the static functions.
80682 + **/
80683 +
80684 +#include <xen/interface/callback.h>
80685 +
80686 +extern void hypervisor_callback(void);
80687 +extern void failsafe_callback(void);
80688 +extern void nmi(void);
80689 +
80690 +static void __init machine_specific_arch_setup(void)
80691 +{
80692 +       int ret;
80693 +       struct callback_register event = {
80694 +               .type = CALLBACKTYPE_event,
80695 +               .address = (unsigned long) hypervisor_callback,
80696 +       };
80697 +       struct callback_register failsafe = {
80698 +               .type = CALLBACKTYPE_failsafe,
80699 +               .address = (unsigned long)failsafe_callback,
80700 +       };
80701 +       struct callback_register syscall = {
80702 +               .type = CALLBACKTYPE_syscall,
80703 +               .address = (unsigned long)system_call,
80704 +       };
80705 +#ifdef CONFIG_X86_LOCAL_APIC
80706 +       struct callback_register nmi_cb = {
80707 +               .type = CALLBACKTYPE_nmi,
80708 +               .address = (unsigned long)nmi,
80709 +       };
80710 +#endif
80711 +
80712 +       ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
80713 +       if (ret == 0)
80714 +               ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
80715 +       if (ret == 0)
80716 +               ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
80717 +       if (ret == -ENOSYS)
80718 +               ret = HYPERVISOR_set_callbacks(
80719 +                       event.address,
80720 +                       failsafe.address,
80721 +                       syscall.address);
80722 +       BUG_ON(ret);
80723 +
80724 +#ifdef CONFIG_X86_LOCAL_APIC
80725 +       ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
80726 +       if (ret == -ENOSYS) {
80727 +               struct xennmi_callback cb;
80728 +
80729 +               cb.handler_address = nmi_cb.address;
80730 +               HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
80731 +       }
80732 +#endif
80733 +}
80734 diff -urNp linux-2.6/include/asm-x86_64/mach-xen/setup_arch_pre.h new/include/asm-x86_64/mach-xen/setup_arch_pre.h
80735 --- linux-2.6/include/asm-x86_64/mach-xen/setup_arch_pre.h      1970-01-01 01:00:00.000000000 +0100
80736 +++ new/include/asm-x86_64/mach-xen/setup_arch_pre.h    2006-05-09 12:35:42.000000000 +0200
80737 @@ -0,0 +1,5 @@
80738 +/* Hook to call BIOS initialisation function */
80739 +
80740 +#define ARCH_SETUP machine_specific_arch_setup();
80741 +
80742 +static void __init machine_specific_arch_setup(void);
80743 diff -urNp linux-2.6/include/linux/ethtool.h new/include/linux/ethtool.h
80744 --- linux-2.6/include/linux/ethtool.h   2006-07-03 14:15:16.000000000 +0200
80745 +++ new/include/linux/ethtool.h 2006-07-07 15:10:03.000000000 +0200
80746 @@ -408,6 +408,8 @@ struct ethtool_ops {
80747  #define ETHTOOL_GPERMADDR      0x00000020 /* Get permanent hardware address */
80748  #define ETHTOOL_GUFO           0x00000021 /* Get UFO enable (ethtool_value) */
80749  #define ETHTOOL_SUFO           0x00000022 /* Set UFO enable (ethtool_value) */
80750 +#define ETHTOOL_GGSO           0x00000023 /* Get GSO enable (ethtool_value) */
80751 +#define ETHTOOL_SGSO           0x00000024 /* Set GSO enable (ethtool_value) */
80752  
80753  /* compatibility with older code */
80754  #define SPARC_ETH_GSET         ETHTOOL_GSET
80755 diff -urNp linux-2.6/include/linux/gfp.h new/include/linux/gfp.h
80756 --- linux-2.6/include/linux/gfp.h       2006-07-03 14:15:16.000000000 +0200
80757 +++ new/include/linux/gfp.h     2006-05-09 12:35:45.000000000 +0200
80758 @@ -100,7 +100,11 @@ static inline int gfp_zone(gfp_t gfp)
80759   */
80760  
80761  #ifndef HAVE_ARCH_FREE_PAGE
80762 -static inline void arch_free_page(struct page *page, int order) { }
80763 +/*
80764 + * If arch_free_page returns non-zero then the generic free_page code can
80765 + * immediately bail: the arch-specific function has done all the work.
80766 + */
80767 +static inline int arch_free_page(struct page *page, int order) { return 0; }
80768  #endif
80769  
80770  extern struct page *
80771 diff -urNp linux-2.6/include/linux/highmem.h new/include/linux/highmem.h
80772 --- linux-2.6/include/linux/highmem.h   2006-07-03 14:15:16.000000000 +0200
80773 +++ new/include/linux/highmem.h 2006-05-09 12:35:45.000000000 +0200
80774 @@ -25,10 +25,16 @@ static inline void flush_kernel_dcache_p
80775  
80776  /* declarations for linux/mm/highmem.c */
80777  unsigned int nr_free_highpages(void);
80778 +#ifdef CONFIG_XEN
80779 +void kmap_flush_unused(void);
80780 +#endif
80781  
80782  #else /* CONFIG_HIGHMEM */
80783  
80784  static inline unsigned int nr_free_highpages(void) { return 0; }
80785 +#ifdef CONFIG_XEN
80786 +static inline void kmap_flush_unused(void) { }
80787 +#endif
80788  
80789  static inline void *kmap(struct page *page)
80790  {
80791 diff -urNp linux-2.6/include/linux/interrupt.h new/include/linux/interrupt.h
80792 --- linux-2.6/include/linux/interrupt.h 2006-07-03 14:15:17.000000000 +0200
80793 +++ new/include/linux/interrupt.h       2006-06-28 14:32:14.000000000 +0200
80794 @@ -58,6 +58,12 @@ extern void disable_irq(unsigned int irq
80795  extern void enable_irq(unsigned int irq);
80796  #endif
80797  
80798 +#ifdef CONFIG_HAVE_IRQ_IGNORE_UNHANDLED
80799 +int irq_ignore_unhandled(unsigned int irq);
80800 +#else
80801 +#define irq_ignore_unhandled(irq) 0
80802 +#endif
80803 +
80804  #ifndef __ARCH_SET_SOFTIRQ_PENDING
80805  #define set_softirq_pending(x) (local_softirq_pending() = (x))
80806  #define or_softirq_pending(x)  (local_softirq_pending() |= (x))
80807 diff -urNp linux-2.6/include/linux/mm.h new/include/linux/mm.h
80808 --- linux-2.6/include/linux/mm.h        2006-07-03 14:15:17.000000000 +0200
80809 +++ new/include/linux/mm.h      2006-05-09 12:40:16.000000000 +0200
80810 @@ -166,6 +166,9 @@ extern unsigned int kobjsize(const void 
80811  #define VM_NONLINEAR   0x00800000      /* Is non-linear (remap_file_pages) */
80812  #define VM_MAPPED_COPY 0x01000000      /* T if mapped copy of data (nommu mmap) */
80813  #define VM_INSERTPAGE  0x02000000      /* The vma has had "vm_insert_page()" done on it */
80814 +#ifdef CONFIG_XEN
80815 +#define VM_FOREIGN     0x04000000      /* Has pages belonging to another VM */
80816 +#endif
80817  
80818  #ifndef VM_STACK_DEFAULT_FLAGS         /* arch can override this */
80819  #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
80820 @@ -1014,6 +1017,13 @@ struct page *follow_page(struct vm_area_
80821  #define FOLL_GET       0x04    /* do get_page on page */
80822  #define FOLL_ANON      0x08    /* give ZERO_PAGE if no pgtable */
80823  
80824 +#ifdef CONFIG_XEN
80825 +typedef int (*pte_fn_t)(pte_t *pte, struct page *pmd_page, unsigned long addr,
80826 +                       void *data);
80827 +extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
80828 +                              unsigned long size, pte_fn_t fn, void *data);
80829 +#endif
80830 +
80831  #ifdef CONFIG_PROC_FS
80832  void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
80833  #else
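
apply_to_page_range(), declared in the hunk above, walks (and if necessary allocates) the page tables covering the range and calls the supplied pte_fn_t on every PTE, so callers need not open-code the four-level walk. A minimal sketch of a callback; all names here are purely illustrative:

        static int clear_one_pte(pte_t *pte, struct page *pmd_page,
                                 unsigned long addr, void *data)
        {
                pte_clear(&init_mm, addr, pte);
                return 0;               /* a non-zero return aborts the walk */
        }

        /* apply_to_page_range(&init_mm, vaddr, nr_pages << PAGE_SHIFT,
         *                     clear_one_pte, NULL); */
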
80834 diff -urNp linux-2.6/include/linux/netdevice.h new/include/linux/netdevice.h
80835 --- linux-2.6/include/linux/netdevice.h 2006-07-03 14:15:17.000000000 +0200
80836 +++ new/include/linux/netdevice.h       2006-07-07 16:50:50.000000000 +0200
80837 @@ -232,6 +232,7 @@ enum netdev_state_t
80838         __LINK_STATE_RX_SCHED,
80839         __LINK_STATE_LINKWATCH_PENDING,
80840         __LINK_STATE_DORMANT,
80841 +       __LINK_STATE_QDISC_RUNNING,
80842  };
80843  
80844  
80845 @@ -307,9 +308,17 @@ struct net_device
80846  #define NETIF_F_HW_VLAN_RX     256     /* Receive VLAN hw acceleration */
80847  #define NETIF_F_HW_VLAN_FILTER 512     /* Receive filtering on VLAN */
80848  #define NETIF_F_VLAN_CHALLENGED        1024    /* Device cannot handle VLAN packets */
80849 -#define NETIF_F_TSO            2048    /* Can offload TCP/IP segmentation */
80850 +#define NETIF_F_GSO            2048    /* Enable software GSO. */
80851  #define NETIF_F_LLTX           4096    /* LockLess TX */
80852 -#define NETIF_F_UFO             8192    /* Can offload UDP Large Send*/
80853 +
80854 +       /* Segmentation offload features */
80855 +#define NETIF_F_GSO_SHIFT      16
80856 +#define NETIF_F_TSO            (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
80857 +#define NETIF_F_UFO            (SKB_GSO_UDPV4 << NETIF_F_GSO_SHIFT)
80858 +#define NETIF_F_GSO_ROBUST     (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
80859 +
80860 +#define NETIF_F_GEN_CSUM       (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
80861 +#define NETIF_F_ALL_CSUM       (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
80862  
80863         struct net_device       *next_sched;
80864  
80865 @@ -398,6 +407,9 @@ struct net_device
80866         struct list_head        qdisc_list;
80867         unsigned long           tx_queue_len;   /* Max frames per queue allowed */
80868  
80869 +       /* Partially transmitted GSO packet. */
80870 +       struct sk_buff          *gso_skb;
80871 +
80872         /* ingress path synchronizer */
80873         spinlock_t              ingress_lock;
80874         struct Qdisc            *qdisc_ingress;
80875 @@ -406,7 +418,7 @@ struct net_device
80876   * One part is mostly used on xmit path (device)
80877   */
80878         /* hard_start_xmit synchronizer */
80879 -       spinlock_t              xmit_lock ____cacheline_aligned_in_smp;
80880 +       spinlock_t              _xmit_lock ____cacheline_aligned_in_smp;
80881         /* cpu id of processor entered to hard_start_xmit or -1,
80882            if nobody entered there.
80883          */
80884 @@ -532,6 +544,8 @@ struct packet_type {
80885                                          struct net_device *,
80886                                          struct packet_type *,
80887                                          struct net_device *);
80888 +       struct sk_buff          *(*gso_segment)(struct sk_buff *skb,
80889 +                                               int features);
80890         void                    *af_packet_priv;
80891         struct list_head        list;
80892  };
80893 @@ -679,7 +693,8 @@ extern int          dev_change_name(struct net_d
80894  extern int             dev_set_mtu(struct net_device *, int);
80895  extern int             dev_set_mac_address(struct net_device *,
80896                                             struct sockaddr *);
80897 -extern void            dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
80898 +extern int             dev_hard_start_xmit(struct sk_buff *skb,
80899 +                                           struct net_device *dev);
80900  
80901  extern void            dev_init(void);
80902  
80903 @@ -889,11 +904,43 @@ static inline void __netif_rx_complete(s
80904         clear_bit(__LINK_STATE_RX_SCHED, &dev->state);
80905  }
80906  
80907 +static inline void netif_tx_lock(struct net_device *dev)
80908 +{
80909 +       spin_lock(&dev->_xmit_lock);
80910 +       dev->xmit_lock_owner = smp_processor_id();
80911 +}
80912 +
80913 +static inline void netif_tx_lock_bh(struct net_device *dev)
80914 +{
80915 +       spin_lock_bh(&dev->_xmit_lock);
80916 +       dev->xmit_lock_owner = smp_processor_id();
80917 +}
80918 +
80919 +static inline int netif_tx_trylock(struct net_device *dev)
80920 +{
80921 +       int err = spin_trylock(&dev->_xmit_lock);
80922 +       if (err)
80923 +               dev->xmit_lock_owner = smp_processor_id();
80924 +       return err;
80925 +}
80926 +
80927 +static inline void netif_tx_unlock(struct net_device *dev)
80928 +{
80929 +       dev->xmit_lock_owner = -1;
80930 +       spin_unlock(&dev->_xmit_lock);
80931 +}
80932 +
80933 +static inline void netif_tx_unlock_bh(struct net_device *dev)
80934 +{
80935 +       dev->xmit_lock_owner = -1;
80936 +       spin_unlock_bh(&dev->_xmit_lock);
80937 +}
80938 +
80939  static inline void netif_tx_disable(struct net_device *dev)
80940  {
80941 -       spin_lock_bh(&dev->xmit_lock);
80942 +       netif_tx_lock_bh(dev);
80943         netif_stop_queue(dev);
80944 -       spin_unlock_bh(&dev->xmit_lock);
80945 +       netif_tx_unlock_bh(dev);
80946  }
80947  
80948  /* These functions live elsewhere (drivers/net/net_init.c, but related) */
80949 @@ -921,6 +968,7 @@ extern int          netdev_max_backlog;
80950  extern int             weight_p;
80951  extern int             netdev_set_master(struct net_device *dev, struct net_device *master);
80952  extern int skb_checksum_help(struct sk_buff *skb, int inward);
80953 +extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features);
80954  #ifdef CONFIG_BUG
80955  extern void netdev_rx_csum_fault(struct net_device *dev);
80956  #else
80957 @@ -940,6 +988,18 @@ extern void dev_seq_stop(struct seq_file
80958  
80959  extern void linkwatch_run_queue(void);
80960  
80961 +static inline int skb_gso_ok(struct sk_buff *skb, int features)
80962 +{
80963 +       int feature = skb_shinfo(skb)->gso_size ?
80964 +                     skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT : 0;
80965 +       return (features & feature) == feature;
80966 +}
80967 +
80968 +static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
80969 +{
80970 +       return !skb_gso_ok(skb, dev->features);
80971 +}
80972 +
80973  #endif /* __KERNEL__ */
80974  
80975  #endif /* _LINUX_DEV_H */
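
The netif_tx_lock*() helpers above replace direct use of the old xmit_lock, and skb_gso_ok()/netif_needs_gso() let the core decide whether a packet must be software-segmented before it reaches a device that lacks the matching NETIF_F_* capability. A rough sketch of how a transmit path combines them (the function and flow are illustrative, not the actual dev_queue_xmit() code):

        static int example_xmit_one(struct sk_buff *skb, struct net_device *dev)
        {
                int rc = NETDEV_TX_BUSY;

                if (netif_needs_gso(dev, skb))
                        /* hardware can't segment this skb: the real caller
                         * falls back to skb_gso_segment() and transmits the
                         * resulting list one segment at a time */
                        return rc;

                if (netif_tx_trylock(dev)) {    /* takes dev->_xmit_lock */
                        if (!netif_queue_stopped(dev))
                                rc = dev->hard_start_xmit(skb, dev);
                        netif_tx_unlock(dev);
                }
                return rc;
        }
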
80976 diff -urNp linux-2.6/include/linux/oprofile.h new/include/linux/oprofile.h
80977 --- linux-2.6/include/linux/oprofile.h  2006-07-03 14:15:18.000000000 +0200
80978 +++ new/include/linux/oprofile.h        2006-07-07 16:08:22.000000000 +0200
80979 @@ -16,6 +16,10 @@
80980  #include <linux/types.h>
80981  #include <linux/spinlock.h>
80982  #include <asm/atomic.h>
80983 +
80984 +#ifdef CONFIG_XEN
80985 +#include <xen/interface/xenoprof.h>
80986 +#endif
80987   
80988  struct super_block;
80989  struct dentry;
80990 @@ -27,6 +31,11 @@ struct oprofile_operations {
80991         /* create any necessary configuration files in the oprofile fs.
80992          * Optional. */
80993         int (*create_files)(struct super_block * sb, struct dentry * root);
80994 +       /* setup active domains with Xen */
80995 +       int (*set_active)(int *active_domains, unsigned int adomains);
80996 +        /* setup passive domains with Xen */
80997 +        int (*set_passive)(int *passive_domains, unsigned int pdomains);
80998 +       
80999         /* Do any necessary interrupt setup. Optional. */
81000         int (*setup)(void);
81001         /* Do any necessary interrupt shutdown. Optional. */
81002 diff -urNp linux-2.6/include/linux/pfn.h new/include/linux/pfn.h
81003 --- linux-2.6/include/linux/pfn.h       2006-07-03 14:15:18.000000000 +0200
81004 +++ new/include/linux/pfn.h     2006-05-09 12:35:49.000000000 +0200
81005 @@ -4,6 +4,6 @@
81006  #define PFN_ALIGN(x)   (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
81007  #define PFN_UP(x)      (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
81008  #define PFN_DOWN(x)    ((x) >> PAGE_SHIFT)
81009 -#define PFN_PHYS(x)    ((x) << PAGE_SHIFT)
81010 +#define PFN_PHYS(x)    ((unsigned long long)(x) << PAGE_SHIFT)
81011  
81012  #endif
81013 diff -urNp linux-2.6/include/linux/skbuff.h new/include/linux/skbuff.h
81014 --- linux-2.6/include/linux/skbuff.h    2006-07-03 14:15:18.000000000 +0200
81015 +++ new/include/linux/skbuff.h  2006-07-07 16:12:19.000000000 +0200
81016 @@ -134,9 +134,10 @@ struct skb_frag_struct {
81017  struct skb_shared_info {
81018         atomic_t        dataref;
81019         unsigned short  nr_frags;
81020 -       unsigned short  tso_size;
81021 -       unsigned short  tso_segs;
81022 -       unsigned short  ufo_size;
81023 +       unsigned short  gso_size;
81024 +       /* Warning: this field is not always filled in (UFO)! */
81025 +       unsigned short  gso_segs;
81026 +       unsigned short  gso_type;
81027         unsigned int    ip6_frag_id;
81028         struct sk_buff  *frag_list;
81029         skb_frag_t      frags[MAX_SKB_FRAGS];
81030 @@ -168,6 +169,14 @@ enum {
81031         SKB_FCLONE_CLONE,
81032  };
81033  
81034 +enum {
81035 +       SKB_GSO_TCPV4 = 1 << 0,
81036 +       SKB_GSO_UDPV4 = 1 << 1,
81037 +
81038 +       /* This indicates the skb is from an untrusted source. */
81039 +       SKB_GSO_DODGY = 1 << 2,
81040 +};
81041 +
81042  /** 
81043   *     struct sk_buff - socket buffer
81044   *     @next: Next buffer in list
81045 @@ -189,6 +198,8 @@ enum {
81046   *     @local_df: allow local fragmentation
81047   *     @cloned: Head may be cloned (check refcnt to be sure)
81048   *     @nohdr: Payload reference only, must not modify header
81049 + *     @proto_data_valid: Protocol data validated since arriving at localhost
81050 + *     @proto_csum_blank: Protocol csum must be added before leaving localhost
81051   *     @pkt_type: Packet class
81052   *     @fclone: skbuff clone status
81053   *     @ip_summed: Driver fed us an IP checksum
81054 @@ -265,7 +276,13 @@ struct sk_buff {
81055                                 nfctinfo:3;
81056         __u8                    pkt_type:3,
81057                                 fclone:2,
81058 +#ifndef CONFIG_XEN
81059                                 ipvs_property:1;
81060 +#else
81061 +                               ipvs_property:1,
81062 +                               proto_data_valid:1,
81063 +                               proto_csum_blank:1;
81064 +#endif
81065         __be16                  protocol;
81066  
81067         void                    (*destructor)(struct sk_buff *skb);
81068 @@ -322,7 +339,8 @@ static inline struct sk_buff *alloc_skb_
81069  
81070  extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
81071                                             unsigned int size,
81072 -                                           gfp_t priority);
81073 +                                           gfp_t priority,
81074 +                                           int fclone);
81075  extern void           kfree_skbmem(struct sk_buff *skb);
81076  extern struct sk_buff *skb_clone(struct sk_buff *skb,
81077                                  gfp_t priority);
81078 @@ -1062,7 +1080,7 @@ static inline struct sk_buff *__dev_allo
81079         return skb;
81080  }
81081  #else
81082 -extern struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask);
81083 +extern struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask);
81084  #endif
81085  
81086  /**
81087 @@ -1161,18 +1179,34 @@ static inline int skb_can_coalesce(struc
81088         return 0;
81089  }
81090  
81091 +static inline int __skb_linearize(struct sk_buff *skb)
81092 +{
81093 +       return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM;
81094 +}
81095 +
81096  /**
81097   *     skb_linearize - convert paged skb to linear one
81098   *     @skb: buffer to linearize
81099 - *     @gfp: allocation mode
81100   *
81101   *     If there is no free memory -ENOMEM is returned, otherwise zero
81102   *     is returned and the old skb data released.
81103   */
81104 -extern int __skb_linearize(struct sk_buff *skb, gfp_t gfp);
81105 -static inline int skb_linearize(struct sk_buff *skb, gfp_t gfp)
81106 +static inline int skb_linearize(struct sk_buff *skb)
81107 +{
81108 +       return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0;
81109 +}
81110 +
81111 +/**
81112 + *     skb_linearize_cow - make sure skb is linear and writable
81113 + *     @skb: buffer to process
81114 + *
81115 + *     If there is no free memory -ENOMEM is returned, otherwise zero
81116 + *     is returned and the old skb data released.
81117 + */
81118 +static inline int skb_linearize_cow(struct sk_buff *skb)
81119  {
81120 -       return __skb_linearize(skb, gfp);
81121 +       return skb_is_nonlinear(skb) || skb_cloned(skb) ?
81122 +              __skb_linearize(skb) : 0;
81123  }
81124  
81125  /**
81126 @@ -1269,6 +1303,7 @@ extern void              skb_split(struct sk_b
81127                                  struct sk_buff *skb1, const u32 len);
81128  
81129  extern void           skb_release_data(struct sk_buff *skb);
81130 +extern struct sk_buff *skb_segment(struct sk_buff *skb, int features);
81131  
81132  static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
81133                                        int len, void *buffer)
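
skb_linearize() no longer takes a gfp argument (it simply pulls all fragments into the existing head), and skb_linearize_cow() additionally forces a private copy when the buffer is cloned, which is what callers want before editing headers in place. A minimal usage sketch:

        /* Make the whole packet linear and private before rewriting headers. */
        static int make_headers_writable(struct sk_buff *skb)
        {
                if (skb_linearize_cow(skb))
                        return -ENOMEM; /* the only failure mode is allocation */
                /* skb->data now covers the full payload and is unshared */
                return 0;
        }
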
81134 diff -urNp linux-2.6/include/net/pkt_sched.h new/include/net/pkt_sched.h
81135 --- linux-2.6/include/net/pkt_sched.h   2006-07-03 14:15:19.000000000 +0200
81136 +++ new/include/net/pkt_sched.h 2006-07-07 15:10:03.000000000 +0200
81137 @@ -218,12 +218,13 @@ extern struct qdisc_rate_table *qdisc_ge
81138                 struct rtattr *tab);
81139  extern void qdisc_put_rtab(struct qdisc_rate_table *tab);
81140  
81141 -extern int qdisc_restart(struct net_device *dev);
81142 +extern void __qdisc_run(struct net_device *dev);
81143  
81144  static inline void qdisc_run(struct net_device *dev)
81145  {
81146 -       while (!netif_queue_stopped(dev) && qdisc_restart(dev) < 0)
81147 -               /* NOTHING */;
81148 +       if (!netif_queue_stopped(dev) &&
81149 +           !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
81150 +               __qdisc_run(dev);
81151  }
81152  
81153  extern int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
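
qdisc_run() is now serialized by the __LINK_STATE_QDISC_RUNNING bit instead of looping on qdisc_restart()'s return value: only the CPU that wins the test_and_set_bit() enters __qdisc_run(), and every other caller returns immediately knowing someone is already draining the queue. The matching clear_bit() belongs in the __qdisc_run() implementation in net/sched/sch_generic.c; schematically (a sketch of that pairing, not the verbatim code):

        void __qdisc_run(struct net_device *dev)
        {
                while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
                        /* keep dequeueing until the qdisc is empty
                         * or the driver stops the queue */ ;

                clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
        }
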
81154 diff -urNp linux-2.6/include/net/protocol.h new/include/net/protocol.h
81155 --- linux-2.6/include/net/protocol.h    2006-07-03 14:15:19.000000000 +0200
81156 +++ new/include/net/protocol.h  2006-07-07 15:10:03.000000000 +0200
81157 @@ -37,6 +37,8 @@
81158  struct net_protocol {
81159         int                     (*handler)(struct sk_buff *skb);
81160         void                    (*err_handler)(struct sk_buff *skb, u32 info);
81161 +       struct sk_buff         *(*gso_segment)(struct sk_buff *skb,
81162 +                                              int features);
81163         int                     no_policy;
81164  };
81165  
81166 diff -urNp linux-2.6/include/net/sock.h new/include/net/sock.h
81167 --- linux-2.6/include/net/sock.h        2006-07-03 14:15:19.000000000 +0200
81168 +++ new/include/net/sock.h      2006-07-07 16:12:19.000000000 +0200
81169 @@ -1032,9 +1032,13 @@ static inline void sk_setup_caps(struct 
81170  {
81171         __sk_dst_set(sk, dst);
81172         sk->sk_route_caps = dst->dev->features;
81173 +       if (sk->sk_route_caps & NETIF_F_GSO)
81174 +               sk->sk_route_caps |= NETIF_F_TSO;
81175         if (sk->sk_route_caps & NETIF_F_TSO) {
81176                 if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len)
81177                         sk->sk_route_caps &= ~NETIF_F_TSO;
81178 +               else 
81179 +                       sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
81180         }
81181  }
81182  
81183 diff -urNp linux-2.6/include/net/tcp.h new/include/net/tcp.h
81184 --- linux-2.6/include/net/tcp.h 2006-07-03 14:15:19.000000000 +0200
81185 +++ new/include/net/tcp.h       2006-07-07 16:12:20.000000000 +0200
81186 @@ -565,13 +565,13 @@ struct tcp_skb_cb {
81187   */
81188  static inline int tcp_skb_pcount(const struct sk_buff *skb)
81189  {
81190 -       return skb_shinfo(skb)->tso_segs;
81191 +       return skb_shinfo(skb)->gso_segs;
81192  }
81193  
81194  /* This is valid iff tcp_skb_pcount() > 1. */
81195  static inline int tcp_skb_mss(const struct sk_buff *skb)
81196  {
81197 -       return skb_shinfo(skb)->tso_size;
81198 +       return skb_shinfo(skb)->gso_size;
81199  }
81200  
81201  static inline void tcp_dec_pcount_approx(__u32 *count,
81202 @@ -1076,6 +1076,8 @@ extern struct request_sock_ops tcp_reque
81203  
81204  extern int tcp_v4_destroy_sock(struct sock *sk);
81205  
81206 +extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features);
81207 +
81208  #ifdef CONFIG_PROC_FS
81209  extern int  tcp4_proc_init(void);
81210  extern void tcp4_proc_exit(void);
81211 diff -urNp linux-2.6/include/xen/balloon.h new/include/xen/balloon.h
81212 --- linux-2.6/include/xen/balloon.h     1970-01-01 01:00:00.000000000 +0100
81213 +++ new/include/xen/balloon.h   2006-05-23 18:42:17.000000000 +0200
81214 @@ -0,0 +1,63 @@
81215 +/******************************************************************************
81216 + * balloon.h
81217 + *
81218 + * Xen balloon driver - enables returning/claiming memory to/from Xen.
81219 + *
81220 + * Copyright (c) 2003, B Dragovic
81221 + * Copyright (c) 2003-2004, M Williamson, K Fraser
81222 + * 
81223 + * This program is free software; you can redistribute it and/or
81224 + * modify it under the terms of the GNU General Public License version 2
81225 + * as published by the Free Software Foundation; or, when distributed
81226 + * separately from the Linux kernel or incorporated into other
81227 + * software packages, subject to the following license:
81228 + * 
81229 + * Permission is hereby granted, free of charge, to any person obtaining a copy
81230 + * of this source file (the "Software"), to deal in the Software without
81231 + * restriction, including without limitation the rights to use, copy, modify,
81232 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
81233 + * and to permit persons to whom the Software is furnished to do so, subject to
81234 + * the following conditions:
81235 + * 
81236 + * The above copyright notice and this permission notice shall be included in
81237 + * all copies or substantial portions of the Software.
81238 + * 
81239 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
81240 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
81241 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
81242 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
81243 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
81244 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
81245 + * IN THE SOFTWARE.
81246 + */
81247 +
81248 +#ifndef __ASM_BALLOON_H__
81249 +#define __ASM_BALLOON_H__
81250 +
81251 +/*
81252 + * Inform the balloon driver that it should allow some slop for device-driver
81253 + * memory activities.
81254 + */
81255 +extern void
81256 +balloon_update_driver_allowance(
81257 +       long delta);
81258 +
81259 +/* Allocate an empty low-memory page range. */
81260 +extern struct page *
81261 +balloon_alloc_empty_page_range(
81262 +       unsigned long nr_pages);
81263 +
81264 +/* Deallocate an empty page range, adding to the balloon. */
81265 +extern void
81266 +balloon_dealloc_empty_page_range(
81267 +       struct page *page, unsigned long nr_pages);
81268 +
81269 +/*
81270 + * Prevent the balloon driver from changing the memory reservation during
81271 + * a driver critical region.
81272 + */
81273 +extern spinlock_t balloon_lock;
81274 +#define balloon_lock(__flags)   spin_lock_irqsave(&balloon_lock, __flags)
81275 +#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
81276 +
81277 +#endif /* __ASM_BALLOON_H__ */
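
Note that balloon_lock(flags)/balloon_unlock(flags) are macros shadowing the spinlock of the same name, so a driver critical region reads as follows (sketch):

        unsigned long flags;

        balloon_lock(flags);    /* spin_lock_irqsave(&balloon_lock, flags) */
        /* ... exchange or pin pages with Xen; the balloon driver cannot
         *     change the memory reservation underneath us ... */
        balloon_unlock(flags);  /* spin_unlock_irqrestore(&balloon_lock, flags) */
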
81278 diff -urNp linux-2.6/include/xen/cpu_hotplug.h new/include/xen/cpu_hotplug.h
81279 --- linux-2.6/include/xen/cpu_hotplug.h 1970-01-01 01:00:00.000000000 +0100
81280 +++ new/include/xen/cpu_hotplug.h       2006-06-07 13:15:16.000000000 +0200
81281 @@ -0,0 +1,44 @@
81282 +#ifndef __XEN_CPU_HOTPLUG_H__
81283 +#define __XEN_CPU_HOTPLUG_H__
81284 +
81285 +#include <linux/config.h>
81286 +#include <linux/kernel.h>
81287 +#include <linux/cpumask.h>
81288 +
81289 +#if defined(CONFIG_HOTPLUG_CPU)
81290 +
81291 +#if defined(CONFIG_X86)
81292 +void cpu_initialize_context(unsigned int cpu);
81293 +#else
81294 +#define cpu_initialize_context(cpu)    ((void)0)
81295 +#endif
81296 +
81297 +int cpu_up_check(unsigned int cpu);
81298 +void init_xenbus_allowed_cpumask(void);
81299 +int smp_suspend(void);
81300 +void smp_resume(void);
81301 +
81302 +void cpu_bringup(void);
81303 +
81304 +#else /* !defined(CONFIG_HOTPLUG_CPU) */
81305 +
81306 +#define cpu_up_check(cpu)              (0)
81307 +#define init_xenbus_allowed_cpumask()  ((void)0)
81308 +
81309 +static inline int smp_suspend(void)
81310 +{
81311 +       if (num_online_cpus() > 1) {
81312 +               printk(KERN_WARNING "Can't suspend SMP guests "
81313 +                      "without CONFIG_HOTPLUG_CPU\n");
81314 +               return -EOPNOTSUPP;
81315 +       }
81316 +       return 0;
81317 +}
81318 +
81319 +static inline void smp_resume(void)
81320 +{
81321 +}
81322 +
81323 +#endif /* !defined(CONFIG_HOTPLUG_CPU) */
81324 +
81325 +#endif /* __XEN_CPU_HOTPLUG_H__ */
81326 diff -urNp linux-2.6/include/xen/driver_util.h new/include/xen/driver_util.h
81327 --- linux-2.6/include/xen/driver_util.h 1970-01-01 01:00:00.000000000 +0100
81328 +++ new/include/xen/driver_util.h       2006-05-23 18:42:17.000000000 +0200
81329 @@ -0,0 +1,16 @@
81330 +
81331 +#ifndef __ASM_XEN_DRIVER_UTIL_H__
81332 +#define __ASM_XEN_DRIVER_UTIL_H__
81333 +
81334 +#include <linux/config.h>
81335 +#include <linux/vmalloc.h>
81336 +
81337 +/* Allocate/destroy a 'vmalloc' VM area. */
81338 +extern struct vm_struct *alloc_vm_area(unsigned long size);
81339 +extern void free_vm_area(struct vm_struct *area);
81340 +
81341 +/* Lock an area so that PTEs are accessible in the current address space. */
81342 +extern void lock_vm_area(struct vm_struct *area);
81343 +extern void unlock_vm_area(struct vm_struct *area);
81344 +
81345 +#endif /* __ASM_XEN_DRIVER_UTIL_H__ */
81346 diff -urNp linux-2.6/include/xen/evtchn.h new/include/xen/evtchn.h
81347 --- linux-2.6/include/xen/evtchn.h      1970-01-01 01:00:00.000000000 +0100
81348 +++ new/include/xen/evtchn.h    2006-05-23 18:42:17.000000000 +0200
81349 @@ -0,0 +1,114 @@
81350 +/******************************************************************************
81351 + * evtchn.h
81352 + * 
81353 + * Communication via Xen event channels.
81354 + * Also definitions for the device that demuxes notifications to userspace.
81355 + * 
81356 + * Copyright (c) 2004-2005, K A Fraser
81357 + * 
81358 + * This program is free software; you can redistribute it and/or
81359 + * modify it under the terms of the GNU General Public License version 2
81360 + * as published by the Free Software Foundation; or, when distributed
81361 + * separately from the Linux kernel or incorporated into other
81362 + * software packages, subject to the following license:
81363 + * 
81364 + * Permission is hereby granted, free of charge, to any person obtaining a copy
81365 + * of this source file (the "Software"), to deal in the Software without
81366 + * restriction, including without limitation the rights to use, copy, modify,
81367 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
81368 + * and to permit persons to whom the Software is furnished to do so, subject to
81369 + * the following conditions:
81370 + * 
81371 + * The above copyright notice and this permission notice shall be included in
81372 + * all copies or substantial portions of the Software.
81373 + * 
81374 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
81375 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
81376 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
81377 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
81378 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
81379 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
81380 + * IN THE SOFTWARE.
81381 + */
81382 +
81383 +#ifndef __ASM_EVTCHN_H__
81384 +#define __ASM_EVTCHN_H__
81385 +
81386 +#include <linux/config.h>
81387 +#include <linux/interrupt.h>
81388 +#include <asm/hypervisor.h>
81389 +#include <asm/ptrace.h>
81390 +#include <asm/synch_bitops.h>
81391 +#include <xen/interface/event_channel.h>
81392 +#include <linux/smp.h>
81393 +
81394 +/*
81395 + * LOW-LEVEL DEFINITIONS
81396 + */
81397 +
81398 +/*
81399 + * Dynamically bind an event source to an IRQ-like callback handler.
81400 + * On some platforms this may not be implemented via the Linux IRQ subsystem.
81401 + * The IRQ argument passed to the callback handler is the same as returned
81402 + * from the bind call. It may not correspond to a Linux IRQ number.
81403 + * Returns IRQ or negative errno.
81404 + * UNBIND: Takes IRQ to unbind from; automatically closes the event channel.
81405 + */
81406 +extern int bind_evtchn_to_irqhandler(
81407 +       unsigned int evtchn,
81408 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
81409 +       unsigned long irqflags,
81410 +       const char *devname,
81411 +       void *dev_id);
81412 +extern int bind_virq_to_irqhandler(
81413 +       unsigned int virq,
81414 +       unsigned int cpu,
81415 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
81416 +       unsigned long irqflags,
81417 +       const char *devname,
81418 +       void *dev_id);
81419 +extern int bind_ipi_to_irqhandler(
81420 +       unsigned int ipi,
81421 +       unsigned int cpu,
81422 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
81423 +       unsigned long irqflags,
81424 +       const char *devname,
81425 +       void *dev_id);
81426 +
81427 +/*
81428 + * Common unbind function for all event sources. Takes IRQ to unbind from.
81429 + * Automatically closes the underlying event channel (even for bindings
81430 + * made with bind_evtchn_to_irqhandler()).
81431 + */
81432 +extern void unbind_from_irqhandler(unsigned int irq, void *dev_id);
81433 +
81434 +extern void irq_resume(void);
81435 +
81436 +/* Entry point for notifications into Linux subsystems. */
81437 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
81438 +
81439 +/* Entry point for notifications into the userland character device. */
81440 +extern void evtchn_device_upcall(int port);
81441 +
81442 +extern void mask_evtchn(int port);
81443 +extern void unmask_evtchn(int port);
81444 +
81445 +static inline void clear_evtchn(int port)
81446 +{
81447 +       shared_info_t *s = HYPERVISOR_shared_info;
81448 +       synch_clear_bit(port, &s->evtchn_pending[0]);
81449 +}
81450 +
81451 +static inline void notify_remote_via_evtchn(int port)
81452 +{
81453 +       struct evtchn_send send = { .port = port };
81454 +       (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
81455 +}
81456 +
81457 +/*
81458 + * Unlike notify_remote_via_evtchn(), this is safe to use across
81459 + * save/restore. Notifications on a broken connection are silently dropped.
81460 + */
81461 +extern void notify_remote_via_irq(int irq);
81462 +
81463 +#endif /* __ASM_EVTCHN_H__ */
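
A frontend driver typically binds the event channel it negotiated over xenbus to a handler and later tears the binding down with unbind_from_irqhandler(); a minimal sketch (handler and device names are illustrative):

        static irqreturn_t demo_interrupt(int irq, void *dev_id, struct pt_regs *regs)
        {
                /* process the notification for dev_id */
                return IRQ_HANDLED;
        }

        static int demo_bind(unsigned int evtchn, void *priv)
        {
                int irq = bind_evtchn_to_irqhandler(evtchn, demo_interrupt,
                                                    0 /* irqflags */, "demo", priv);
                if (irq < 0)
                        return irq;
                /* ... later: unbind_from_irqhandler(irq, priv); */
                return 0;
        }
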
81464 diff -urNp linux-2.6/include/xen/features.h new/include/xen/features.h
81465 --- linux-2.6/include/xen/features.h    1970-01-01 01:00:00.000000000 +0100
81466 +++ new/include/xen/features.h  2006-05-09 12:35:56.000000000 +0200
81467 @@ -0,0 +1,20 @@
81468 +/******************************************************************************
81469 + * features.h
81470 + *
81471 + * Query the features reported by Xen.
81472 + *
81473 + * Copyright (c) 2006, Ian Campbell
81474 + */
81475 +
81476 +#ifndef __ASM_XEN_FEATURES_H__
81477 +#define __ASM_XEN_FEATURES_H__
81478 +
81479 +#include <xen/interface/version.h>
81480 +
81481 +extern void setup_xen_features(void);
81482 +
81483 +extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
81484 +
81485 +#define xen_feature(flag)      (xen_features[flag])
81486 +
81487 +#endif /* __ASM_XEN_FEATURES_H__ */
81488 diff -urNp linux-2.6/include/xen/foreign_page.h new/include/xen/foreign_page.h
81489 --- linux-2.6/include/xen/foreign_page.h        1970-01-01 01:00:00.000000000 +0100
81490 +++ new/include/xen/foreign_page.h      2006-05-23 18:42:17.000000000 +0200
81491 @@ -0,0 +1,30 @@
81492 +/******************************************************************************
81493 + * foreign_page.h
81494 + * 
81495 + * Provide a "foreign" page type, that is owned by a foreign allocator and 
81496 + * not the normal buddy allocator in page_alloc.c
81497 + * 
81498 + * Copyright (c) 2004, K A Fraser
81499 + */
81500 +
81501 +#ifndef __ASM_XEN_FOREIGN_PAGE_H__
81502 +#define __ASM_XEN_FOREIGN_PAGE_H__
81503 +
81504 +#define PG_foreign             PG_arch_1
81505 +
81506 +#define PageForeign(page)      test_bit(PG_foreign, &(page)->flags)
81507 +
81508 +#define SetPageForeign(page, dtor) do {                \
81509 +       set_bit(PG_foreign, &(page)->flags);    \
81510 +       (page)->mapping = (void *)dtor;         \
81511 +} while (0)
81512 +
81513 +#define ClearPageForeign(page) do {            \
81514 +       clear_bit(PG_foreign, &(page)->flags);  \
81515 +       (page)->mapping = NULL;                 \
81516 +} while (0)
81517 +
81518 +#define PageForeignDestructor(page)    \
81519 +       ( (void (*) (struct page *)) (page)->mapping )
81520 +
81521 +#endif /* __ASM_XEN_FOREIGN_PAGE_H__ */
81522 diff -urNp linux-2.6/include/xen/gnttab.h new/include/xen/gnttab.h
81523 --- linux-2.6/include/xen/gnttab.h      1970-01-01 01:00:00.000000000 +0100
81524 +++ new/include/xen/gnttab.h    2006-06-28 14:32:14.000000000 +0200
81525 @@ -0,0 +1,151 @@
81526 +/******************************************************************************
81527 + * gnttab.h
81528 + * 
81529 + * Two sets of functionality:
81530 + * 1. Granting foreign access to our memory reservation.
81531 + * 2. Accessing others' memory reservations via grant references.
81532 + * (i.e., mechanisms for both sender and recipient of grant references)
81533 + * 
81534 + * Copyright (c) 2004-2005, K A Fraser
81535 + * Copyright (c) 2005, Christopher Clark
81536 + * 
81537 + * This program is free software; you can redistribute it and/or
81538 + * modify it under the terms of the GNU General Public License version 2
81539 + * as published by the Free Software Foundation; or, when distributed
81540 + * separately from the Linux kernel or incorporated into other
81541 + * software packages, subject to the following license:
81542 + * 
81543 + * Permission is hereby granted, free of charge, to any person obtaining a copy
81544 + * of this source file (the "Software"), to deal in the Software without
81545 + * restriction, including without limitation the rights to use, copy, modify,
81546 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
81547 + * and to permit persons to whom the Software is furnished to do so, subject to
81548 + * the following conditions:
81549 + * 
81550 + * The above copyright notice and this permission notice shall be included in
81551 + * all copies or substantial portions of the Software.
81552 + * 
81553 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
81554 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
81555 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
81556 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
81557 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
81558 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
81559 + * IN THE SOFTWARE.
81560 + */
81561 +
81562 +#ifndef __ASM_GNTTAB_H__
81563 +#define __ASM_GNTTAB_H__
81564 +
81565 +#include <linux/config.h>
81566 +#include <asm/hypervisor.h>
81567 +#include <xen/interface/grant_table.h>
81568 +#include <xen/features.h>
81569 +
81570 +/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
81571 +#ifdef __ia64__
81572 +#define NR_GRANT_FRAMES 1
81573 +#else
81574 +#define NR_GRANT_FRAMES 4
81575 +#endif
81576 +
81577 +struct gnttab_free_callback {
81578 +       struct gnttab_free_callback *next;
81579 +       void (*fn)(void *);
81580 +       void *arg;
81581 +       u16 count;
81582 +};
81583 +
81584 +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
81585 +                               int readonly);
81586 +
81587 +/*
81588 + * End access through the given grant reference, iff the grant entry is no
81589 + * longer in use.  Return 1 if the grant entry was freed, 0 if it is still in
81590 + * use.
81591 + */
81592 +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
81593 +
81594 +/*
81595 + * Eventually end access through the given grant reference, and once that
81596 + * access has been ended, free the given page too.  Access will be ended
81597 + * immediately iff the grant entry is not in use, otherwise it will happen
81598 + * some time later.  page may be 0, in which case no freeing will occur.
81599 + */
81600 +void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
81601 +                              unsigned long page);
81602 +
81603 +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
81604 +
81605 +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
81606 +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
81607 +
81608 +int gnttab_query_foreign_access(grant_ref_t ref);
81609 +
81610 +/*
81611 + * operations on reserved batches of grant references
81612 + */
81613 +int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
81614 +
81615 +void gnttab_free_grant_reference(grant_ref_t ref);
81616 +
81617 +void gnttab_free_grant_references(grant_ref_t head);
81618 +
81619 +int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
81620 +
81621 +int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
81622 +
81623 +void gnttab_release_grant_reference(grant_ref_t *private_head,
81624 +                                   grant_ref_t release);
81625 +
81626 +void gnttab_request_free_callback(struct gnttab_free_callback *callback,
81627 +                                 void (*fn)(void *), void *arg, u16 count);
81628 +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
81629 +
81630 +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
81631 +                                    unsigned long frame, int readonly);
81632 +
81633 +void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
81634 +                                      unsigned long pfn);
81635 +
81636 +#ifdef __ia64__
81637 +#define gnttab_map_vaddr(map) __va(map.dev_bus_addr)
81638 +#else
81639 +#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
81640 +#endif
81641 +
81642 +int gnttab_suspend(void);
81643 +int gnttab_resume(void);
81644 +
81645 +static inline void
81646 +gnttab_set_map_op(struct gnttab_map_grant_ref *map, unsigned long addr,
81647 +                 uint32_t flags, grant_ref_t ref, domid_t domid)
81648 +{
81649 +       if (flags & GNTMAP_contains_pte)
81650 +               map->host_addr = addr;
81651 +       else if (xen_feature(XENFEAT_auto_translated_physmap))
81652 +               map->host_addr = __pa(addr);
81653 +       else
81654 +               map->host_addr = addr;
81655 +
81656 +       map->flags = flags;
81657 +       map->ref = ref;
81658 +       map->dom = domid;
81659 +}
81660 +
81661 +static inline void
81662 +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, unsigned long addr,
81663 +                   uint32_t flags, grant_handle_t handle)
81664 +{
81665 +       if (flags & GNTMAP_contains_pte)
81666 +               unmap->host_addr = addr;
81667 +       else if (xen_feature(XENFEAT_auto_translated_physmap))
81668 +               unmap->host_addr = __pa(addr);
81669 +       else
81670 +               unmap->host_addr = addr;
81671 +
81672 +       unmap->handle = handle;
81673 +       unmap->dev_bus_addr = 0;
81674 +}
81675 +
81676 +#endif /* __ASM_GNTTAB_H__ */
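/*
 * Usage sketch (illustrative only, not applied by this patch): sharing a
 * single page with another domain via the grant-table API declared above.
 * It assumes <linux/mm.h> for the page allocator helpers and a
 * caller-supplied remote domain id; a real backend would keep the grant
 * alive until the peer signals that it is finished with the page.
 */
#include <linux/mm.h>
#include <linux/errno.h>

static int example_share_page(domid_t otherend_id)
{
	struct page *page = alloc_page(GFP_KERNEL);
	int ref;

	if (page == NULL)
		return -ENOMEM;

	/* Grant the remote domain read/write access to this frame. */
	ref = gnttab_grant_foreign_access(otherend_id, page_to_pfn(page), 0);
	if (ref < 0) {
		__free_page(page);
		return ref;
	}

	/* ... advertise 'ref' to the other end, e.g. through xenstore ... */

	/* Later: revoke the grant; gnttab frees the page once it is unused. */
	gnttab_end_foreign_access(ref, 0, (unsigned long)page_address(page));
	return 0;
}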
81677 diff -urNp linux-2.6/include/xen/hypervisor_sysfs.h new/include/xen/hypervisor_sysfs.h
81678 --- linux-2.6/include/xen/hypervisor_sysfs.h    1970-01-01 01:00:00.000000000 +0100
81679 +++ new/include/xen/hypervisor_sysfs.h  2006-05-09 12:35:56.000000000 +0200
81680 @@ -0,0 +1,32 @@
81681 +/*
81682 + *  copyright (c) 2006 IBM Corporation
81683 + *  Authored by: Mike D. Day <ncmike@us.ibm.com>
81684 + *
81685 + *  This program is free software; you can redistribute it and/or modify
81686 + *  it under the terms of the GNU General Public License version 2 as
81687 + *  published by the Free Software Foundation.
81688 + */
81689 +
81690 +#ifndef _HYP_SYSFS_H_
81691 +#define _HYP_SYSFS_H_
81692 +
81693 +#include <linux/kobject.h>
81694 +#include <linux/sysfs.h>
81695 +
81696 +#define HYPERVISOR_ATTR_RO(_name) \
81697 +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
81698 +
81699 +#define HYPERVISOR_ATTR_RW(_name) \
81700 +static struct hyp_sysfs_attr _name##_attr = \
81701 +       __ATTR(_name, 0644, _name##_show, _name##_store)
81702 +
81703 +extern struct subsystem hypervisor_subsys;
81704 +
81705 +struct hyp_sysfs_attr {
81706 +       struct attribute attr;
81707 +       ssize_t (*show)(struct hyp_sysfs_attr *, char *);
81708 +       ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
81709 +       void *hyp_attr_data;
81710 +};
81711 +
81712 +#endif /* _HYP_SYSFS_H_ */
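/*
 * Usage sketch (illustrative only, not applied by this patch): defining a
 * read-only hypervisor sysfs attribute with HYPERVISOR_ATTR_RO() above.
 * The attribute name "example" and its value are placeholders, and the
 * registration call assumes the 2.6.16-era sysfs API in which
 * struct subsystem embeds a kset (plus <linux/kernel.h> and <linux/init.h>).
 */
static ssize_t example_show(struct hyp_sysfs_attr *attr, char *buffer)
{
	/* __ATTR_RO(example) installs this function as example_attr.show. */
	return sprintf(buffer, "42\n");
}
HYPERVISOR_ATTR_RO(example);

static int __init example_attr_init(void)
{
	return sysfs_create_file(&hypervisor_subsys.kset.kobj,
				 &example_attr.attr);
}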
81713 diff -urNp linux-2.6/include/xen/interface/acm.h new/include/xen/interface/acm.h
81714 --- linux-2.6/include/xen/interface/acm.h       1970-01-01 01:00:00.000000000 +0100
81715 +++ new/include/xen/interface/acm.h     2006-06-28 14:32:14.000000000 +0200
81716 @@ -0,0 +1,188 @@
81717 +/*
81718 + * acm.h: Xen access control module interface definitions
81719 + *
81720 + * Reiner Sailer <sailer@watson.ibm.com>
81721 + * Copyright (c) 2005, International Business Machines Corporation.
81722 + */
81723 +
81724 +#ifndef _XEN_PUBLIC_ACM_H
81725 +#define _XEN_PUBLIC_ACM_H
81726 +
81727 +#include "xen.h"
81728 +#include "sched_ctl.h"
81729 +
81730 +/* if ACM_DEBUG is defined, all hooks should
81731 + * print a short trace message (comment it out
81732 + * when not in testing mode)
81733 + */
81734 +/* #define ACM_DEBUG */
81735 +
81736 +#ifdef ACM_DEBUG
81737 +#  define printkd(fmt, args...) printk(fmt,## args)
81738 +#else
81739 +#  define printkd(fmt, args...)
81740 +#endif
81741 +
81742 +/* default ssid reference value if not supplied */
81743 +#define ACM_DEFAULT_SSID  0x0
81744 +#define ACM_DEFAULT_LOCAL_SSID  0x0
81745 +
81746 +/* Internal ACM ERROR types */
81747 +#define ACM_OK     0
81748 +#define ACM_UNDEF   -1
81749 +#define ACM_INIT_SSID_ERROR  -2
81750 +#define ACM_INIT_SOID_ERROR  -3
81751 +#define ACM_ERROR          -4
81752 +
81753 +/* External ACCESS DECISIONS */
81754 +#define ACM_ACCESS_PERMITTED        0
81755 +#define ACM_ACCESS_DENIED           -111
81756 +#define ACM_NULL_POINTER_ERROR      -200
81757 +
81758 +/* primary policy in lower 4 bits */
81759 +#define ACM_NULL_POLICY 0
81760 +#define ACM_CHINESE_WALL_POLICY 1
81761 +#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2
81762 +#define ACM_POLICY_UNDEFINED 15
81763 +
81764 +/* combinations have secondary policy component in higher 4bit */
81765 +#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY \
81766 +    ((ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY << 4) | ACM_CHINESE_WALL_POLICY)
81767 +
81768 +/* policy: */
81769 +#define ACM_POLICY_NAME(X) \
81770 + ((X) == (ACM_NULL_POLICY)) ? "NULL" :                        \
81771 +    ((X) == (ACM_CHINESE_WALL_POLICY)) ? "CHINESE WALL" :        \
81772 +    ((X) == (ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "SIMPLE TYPE ENFORCEMENT" : \
81773 +    ((X) == (ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT" : \
81774 +     "UNDEFINED"
81775 +
81776 +/* the following policy versions must be increased
81777 + * whenever the interpretation of the related
81778 + * policy's data structure changes
81779 + */
81780 +#define ACM_POLICY_VERSION 2
81781 +#define ACM_CHWALL_VERSION 1
81782 +#define ACM_STE_VERSION  1
81783 +
81784 +/* defines a ssid reference used by xen */
81785 +typedef uint32_t ssidref_t;
81786 +
81787 +/* hooks that are known to domains */
81788 +#define ACMHOOK_none    0
81789 +#define ACMHOOK_sharing 1
81790 +
81791 +/* -------security policy relevant type definitions-------- */
81792 +
81793 +/* type identifier; compares to "equal" or "not equal" */
81794 +typedef uint16_t domaintype_t;
81795 +
81796 +/* CHINESE WALL POLICY DATA STRUCTURES
81797 + *
81798 + * current accumulated conflict type set:
81799 + * When a domain is started and has a type that is in
81800 + * a conflict set, the counts of the conflicting types are
81801 + * incremented in the aggregate set. When a domain is destroyed,
81802 + * the counts of the types conflicting with its type are decremented.
81803 + * If a domain has multiple types, this procedure works over
81804 + * all those types.
81805 + *
81806 + * conflict_aggregate_set[i] holds the number of
81807 + *   running domains that have a conflict with type i.
81808 + *
81809 + * running_types[i] holds the number of running domains
81810 + *        that include type i in their ssidref-referenced type set
81811 + *
81812 + * conflict_sets[i][j] is "0" if type j has no conflict
81813 + *    with type i and is "1" otherwise.
81814 + */
81815 +/* high-16 = version, low-16 = check magic */
81816 +#define ACM_MAGIC  0x0001debc
81817 +
81818 +/* each offset is in bytes from the start of the struct
81819 + * it is part of */
81820 +
81821 +/* each buffer consists of all policy information for
81822 + * the respective policy given in the policy code
81823 + *
81824 + * acm_policy_buffer, acm_chwall_policy_buffer,
81825 + * and acm_ste_policy_buffer need to stay 32-bit aligned
81826 + * because we create binary policies also with external
81827 + * tools that assume packed representations (e.g. the Java tool)
81828 + */
81829 +struct acm_policy_buffer {
81830 +    uint32_t policy_version; /* ACM_POLICY_VERSION */
81831 +    uint32_t magic;
81832 +    uint32_t len;
81833 +    uint32_t policy_reference_offset;
81834 +    uint32_t primary_policy_code;
81835 +    uint32_t primary_buffer_offset;
81836 +    uint32_t secondary_policy_code;
81837 +    uint32_t secondary_buffer_offset;
81838 +};
81839 +
81840 +struct acm_policy_reference_buffer {
81841 +    uint32_t len;
81842 +};
81843 +
81844 +struct acm_chwall_policy_buffer {
81845 +    uint32_t policy_version; /* ACM_CHWALL_VERSION */
81846 +    uint32_t policy_code;
81847 +    uint32_t chwall_max_types;
81848 +    uint32_t chwall_max_ssidrefs;
81849 +    uint32_t chwall_max_conflictsets;
81850 +    uint32_t chwall_ssid_offset;
81851 +    uint32_t chwall_conflict_sets_offset;
81852 +    uint32_t chwall_running_types_offset;
81853 +    uint32_t chwall_conflict_aggregate_offset;
81854 +};
81855 +
81856 +struct acm_ste_policy_buffer {
81857 +    uint32_t policy_version; /* ACM_STE_VERSION */
81858 +    uint32_t policy_code;
81859 +    uint32_t ste_max_types;
81860 +    uint32_t ste_max_ssidrefs;
81861 +    uint32_t ste_ssid_offset;
81862 +};
81863 +
81864 +struct acm_stats_buffer {
81865 +    uint32_t magic;
81866 +    uint32_t len;
81867 +    uint32_t primary_policy_code;
81868 +    uint32_t primary_stats_offset;
81869 +    uint32_t secondary_policy_code;
81870 +    uint32_t secondary_stats_offset;
81871 +};
81872 +
81873 +struct acm_ste_stats_buffer {
81874 +    uint32_t ec_eval_count;
81875 +    uint32_t gt_eval_count;
81876 +    uint32_t ec_denied_count;
81877 +    uint32_t gt_denied_count;
81878 +    uint32_t ec_cachehit_count;
81879 +    uint32_t gt_cachehit_count;
81880 +};
81881 +
81882 +struct acm_ssid_buffer {
81883 +    uint32_t len;
81884 +    ssidref_t ssidref;
81885 +    uint32_t policy_reference_offset;
81886 +    uint32_t primary_policy_code;
81887 +    uint32_t primary_max_types;
81888 +    uint32_t primary_types_offset;
81889 +    uint32_t secondary_policy_code;
81890 +    uint32_t secondary_max_types;
81891 +    uint32_t secondary_types_offset;
81892 +};
81893 +
81894 +#endif
81895 +
81896 +/*
81897 + * Local variables:
81898 + * mode: C
81899 + * c-set-style: "BSD"
81900 + * c-basic-offset: 4
81901 + * tab-width: 4
81902 + * indent-tabs-mode: nil
81903 + * End:
81904 + */
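/*
 * Usage sketch (illustrative only, not applied by this patch): how the
 * offset fields of struct acm_policy_buffer locate the per-policy
 * sub-buffers inside a binary policy blob.  'blob' stands for a buffer
 * already fetched from Xen; byte-order conversion (the external tools
 * write the blob in network order) is omitted for brevity.
 */
static void example_locate_chwall(void *blob)
{
    struct acm_policy_buffer *pol = blob;

    if (pol->primary_policy_code == ACM_CHINESE_WALL_POLICY) {
        struct acm_chwall_policy_buffer *chwall =
            (struct acm_chwall_policy_buffer *)
                ((char *)blob + pol->primary_buffer_offset);
        printkd("%s: %u chwall types, %u conflict sets\n",
                ACM_POLICY_NAME(pol->primary_policy_code),
                chwall->chwall_max_types, chwall->chwall_max_conflictsets);
    }
}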
81905 diff -urNp linux-2.6/include/xen/interface/acm_ops.h new/include/xen/interface/acm_ops.h
81906 --- linux-2.6/include/xen/interface/acm_ops.h   1970-01-01 01:00:00.000000000 +0100
81907 +++ new/include/xen/interface/acm_ops.h 2006-06-28 14:32:14.000000000 +0200
81908 @@ -0,0 +1,103 @@
81909 +/*
81910 + * acm_ops.h: Xen access control module hypervisor commands
81911 + *
81912 + * Reiner Sailer <sailer@watson.ibm.com>
81913 + * Copyright (c) 2005,2006 International Business Machines Corporation.
81914 + */
81915 +
81916 +#ifndef __XEN_PUBLIC_ACM_OPS_H__
81917 +#define __XEN_PUBLIC_ACM_OPS_H__
81918 +
81919 +#include "xen.h"
81920 +#include "sched_ctl.h"
81921 +#include "acm.h"
81922 +
81923 +/*
81924 + * Make sure you increment the interface version whenever you modify this file!
81925 + * This makes sure that old versions of acm tools will stop working in a
81926 + * well-defined way (rather than crashing the machine, for instance).
81927 + */
81928 +#define ACM_INTERFACE_VERSION   0xAAAA0008
81929 +
81930 +/************************************************************************/
81931 +
81932 +/*
81933 + * Prototype for this hypercall is:
81934 + *  int acm_op(int cmd, void *args)
81935 + * @cmd  == ACMOP_??? (access control module operation).
81936 + * @args == Operation-specific extra arguments (NULL if none).
81937 + */
81938 +
81939 +
81940 +#define ACMOP_setpolicy         1
81941 +struct acm_setpolicy {
81942 +    /* IN */
81943 +    uint32_t interface_version;
81944 +    XEN_GUEST_HANDLE(void) pushcache;
81945 +    uint32_t pushcache_size;
81946 +};
81947 +
81948 +
81949 +#define ACMOP_getpolicy         2
81950 +struct acm_getpolicy {
81951 +    /* IN */
81952 +    uint32_t interface_version;
81953 +    XEN_GUEST_HANDLE(void) pullcache;
81954 +    uint32_t pullcache_size;
81955 +};
81956 +
81957 +
81958 +#define ACMOP_dumpstats         3
81959 +struct acm_dumpstats {
81960 +    /* IN */
81961 +    uint32_t interface_version;
81962 +    XEN_GUEST_HANDLE(void) pullcache;
81963 +    uint32_t pullcache_size;
81964 +};
81965 +
81966 +
81967 +#define ACMOP_getssid           4
81968 +#define ACM_GETBY_ssidref  1
81969 +#define ACM_GETBY_domainid 2
81970 +struct acm_getssid {
81971 +    /* IN */
81972 +    uint32_t interface_version;
81973 +    uint32_t get_ssid_by; /* ACM_GETBY_* */
81974 +    union {
81975 +        domaintype_t domainid;
81976 +        ssidref_t    ssidref;
81977 +    } id;
81978 +    XEN_GUEST_HANDLE(void) ssidbuf;
81979 +    uint32_t ssidbuf_size;
81980 +};
81981 +
81982 +#define ACMOP_getdecision      5
81983 +struct acm_getdecision {
81984 +    /* IN */
81985 +    uint32_t interface_version;
81986 +    uint32_t get_decision_by1; /* ACM_GETBY_* */
81987 +    uint32_t get_decision_by2; /* ACM_GETBY_* */
81988 +    union {
81989 +        domaintype_t domainid;
81990 +        ssidref_t    ssidref;
81991 +    } id1;
81992 +    union {
81993 +        domaintype_t domainid;
81994 +        ssidref_t    ssidref;
81995 +    } id2;
81996 +    uint32_t hook;
81997 +    /* OUT */
81998 +    uint32_t acm_decision;
81999 +};
82000 +
82001 +#endif /* __XEN_PUBLIC_ACM_OPS_H__ */
82002 +
82003 +/*
82004 + * Local variables:
82005 + * mode: C
82006 + * c-set-style: "BSD"
82007 + * c-basic-offset: 4
82008 + * tab-width: 4
82009 + * indent-tabs-mode: nil
82010 + * End:
82011 + */
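/*
 * Usage sketch (illustrative only, not applied by this patch): filling in
 * an ACMOP_getssid request as described by the hypercall prototype above.
 * How the request actually reaches Xen (a HYPERVISOR_acm_op wrapper or the
 * dom0 privcmd interface) is provided elsewhere and not shown here.
 */
static void example_prepare_getssid(struct acm_getssid *req, domid_t dom,
                                    void *buf, uint32_t size)
{
    req->interface_version = ACM_INTERFACE_VERSION;
    req->get_ssid_by = ACM_GETBY_domainid;    /* look the domain up by id */
    req->id.domainid = dom;
    set_xen_guest_handle(req->ssidbuf, buf);  /* guest buffer for the ssid */
    req->ssidbuf_size = size;
}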
82012 diff -urNp linux-2.6/include/xen/interface/arch-ia64.h new/include/xen/interface/arch-ia64.h
82013 --- linux-2.6/include/xen/interface/arch-ia64.h 1970-01-01 01:00:00.000000000 +0100
82014 +++ new/include/xen/interface/arch-ia64.h       2006-07-07 15:10:03.000000000 +0200
82015 @@ -0,0 +1,420 @@
82016 +/******************************************************************************
82017 + * arch-ia64/hypervisor-if.h
82018 + * 
82019 + * Guest OS interface to IA64 Xen.
82020 + */
82021 +
82022 +#ifndef __HYPERVISOR_IF_IA64_H__
82023 +#define __HYPERVISOR_IF_IA64_H__
82024 +
82025 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
82026 +    typedef struct { type *p; } __guest_handle_ ## name
82027 +
82028 +#define DEFINE_XEN_GUEST_HANDLE(name)   __DEFINE_XEN_GUEST_HANDLE(name, name)
82029 +#define XEN_GUEST_HANDLE(name)          __guest_handle_ ## name
82030 +#define set_xen_guest_handle(hnd, val)  do { (hnd).p = val; } while (0)
82031 +#ifdef __XEN_TOOLS__
82032 +#define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
82033 +#endif
82034 +
82035 +#ifndef __ASSEMBLY__
82036 +/* Guest handles for primitive C types. */
82037 +__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
82038 +__DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
82039 +__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
82040 +DEFINE_XEN_GUEST_HANDLE(char);
82041 +DEFINE_XEN_GUEST_HANDLE(int);
82042 +DEFINE_XEN_GUEST_HANDLE(long);
82043 +DEFINE_XEN_GUEST_HANDLE(void);
82044 +
82045 +typedef unsigned long xen_pfn_t;
82046 +DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
82047 +#endif
82048 +
82049 +/* Arch specific VIRQs definition */
82050 +#define VIRQ_ITC        VIRQ_ARCH_0 /* V. Virtual itc timer */
82051 +
82052 +/* Maximum number of virtual CPUs in multi-processor guests. */
82053 +/* WARNING: before changing this, check that shared_info fits on a page */
82054 +#define MAX_VIRT_CPUS 64
82055 +
82056 +#ifndef __ASSEMBLY__
82057 +
82058 +typedef unsigned long xen_ulong_t;
82059 +
82060 +#define MAX_NR_SECTION  32  /* at most 32 memory holes */
82061 +struct mm_section {
82062 +    unsigned long start;  /* start of memory hole */
82063 +    unsigned long end;    /* end of memory hole */
82064 +};
82065 +typedef struct mm_section mm_section_t;
82066 +
82067 +struct pmt_entry {
82068 +    unsigned long mfn : 56;
82069 +    unsigned long type: 8;
82070 +};
82071 +typedef struct pmt_entry pmt_entry_t;
82072 +
82073 +#define GPFN_MEM          (0UL << 56) /* Guest pfn is normal mem */
82074 +#define GPFN_FRAME_BUFFER (1UL << 56) /* VGA framebuffer */
82075 +#define GPFN_LOW_MMIO     (2UL << 56) /* Low MMIO range */
82076 +#define GPFN_PIB          (3UL << 56) /* PIB base */
82077 +#define GPFN_IOSAPIC      (4UL << 56) /* IOSAPIC base */
82078 +#define GPFN_LEGACY_IO    (5UL << 56) /* Legacy I/O base */
82079 +#define GPFN_GFW          (6UL << 56) /* Guest Firmware */
82080 +#define GPFN_HIGH_MMIO    (7UL << 56) /* High MMIO range */
82081 +
82082 +#define GPFN_IO_MASK     (7UL << 56)  /* Guest pfn is I/O type */
82083 +#define GPFN_INV_MASK    (31UL << 59) /* Guest pfn is invalid */
82084 +
82085 +#define INVALID_MFN       (~0UL)
82086 +
82087 +#define MEM_G   (1UL << 30)
82088 +#define MEM_M   (1UL << 20)
82089 +
82090 +#define MMIO_START       (3 * MEM_G)
82091 +#define MMIO_SIZE        (512 * MEM_M)
82092 +
82093 +#define VGA_IO_START     0xA0000UL
82094 +#define VGA_IO_SIZE      0x20000
82095 +
82096 +#define LEGACY_IO_START  (MMIO_START + MMIO_SIZE)
82097 +#define LEGACY_IO_SIZE   (64*MEM_M)
82098 +
82099 +#define IO_PAGE_START (LEGACY_IO_START + LEGACY_IO_SIZE)
82100 +#define IO_PAGE_SIZE  PAGE_SIZE
82101 +
82102 +#define STORE_PAGE_START (IO_PAGE_START + IO_PAGE_SIZE)
82103 +#define STORE_PAGE_SIZE         PAGE_SIZE
82104 +
82105 +#define IO_SAPIC_START   0xfec00000UL
82106 +#define IO_SAPIC_SIZE    0x100000
82107 +
82108 +#define PIB_START 0xfee00000UL
82109 +#define PIB_SIZE 0x200000
82110 +
82111 +#define GFW_START        (4*MEM_G -16*MEM_M)
82112 +#define GFW_SIZE         (16*MEM_M)
82113 +
82114 +/*
82115 + * NB. This may become a 64-bit count with no shift. If this happens then the 
82116 + * structure size will still be 8 bytes, so no other alignments will change.
82117 + */
82118 +struct tsc_timestamp {
82119 +    unsigned int  tsc_bits;      /* 0: 32 bits read from the CPU's TSC. */
82120 +    unsigned int  tsc_bitshift;  /* 4: 'tsc_bits' uses N:N+31 of TSC.   */
82121 +}; /* 8 bytes */
82122 +typedef struct tsc_timestamp tsc_timestamp_t;
82123 +
82124 +struct pt_fpreg {
82125 +    union {
82126 +        unsigned long bits[2];
82127 +        long double __dummy;    /* force 16-byte alignment */
82128 +    } u;
82129 +};
82130 +
82131 +struct cpu_user_regs {
82132 +    /* The following registers are saved by SAVE_MIN: */
82133 +    unsigned long b6;  /* scratch */
82134 +    unsigned long b7;  /* scratch */
82135 +
82136 +    unsigned long ar_csd; /* used by cmp8xchg16 (scratch) */
82137 +    unsigned long ar_ssd; /* reserved for future use (scratch) */
82138 +
82139 +    unsigned long r8;  /* scratch (return value register 0) */
82140 +    unsigned long r9;  /* scratch (return value register 1) */
82141 +    unsigned long r10; /* scratch (return value register 2) */
82142 +    unsigned long r11; /* scratch (return value register 3) */
82143 +
82144 +    unsigned long cr_ipsr; /* interrupted task's psr */
82145 +    unsigned long cr_iip;  /* interrupted task's instruction pointer */
82146 +    unsigned long cr_ifs;  /* interrupted task's function state */
82147 +
82148 +    unsigned long ar_unat; /* interrupted task's NaT register (preserved) */
82149 +    unsigned long ar_pfs;  /* prev function state  */
82150 +    unsigned long ar_rsc;  /* RSE configuration */
82151 +    /* The following two are valid only if cr_ipsr.cpl > 0: */
82152 +    unsigned long ar_rnat;  /* RSE NaT */
82153 +    unsigned long ar_bspstore; /* RSE bspstore */
82154 +
82155 +    unsigned long pr;  /* 64 predicate registers (1 bit each) */
82156 +    unsigned long b0;  /* return pointer (bp) */
82157 +    unsigned long loadrs;  /* size of dirty partition << 16 */
82158 +
82159 +    unsigned long r1;  /* the gp pointer */
82160 +    unsigned long r12; /* interrupted task's memory stack pointer */
82161 +    unsigned long r13; /* thread pointer */
82162 +
82163 +    unsigned long ar_fpsr;  /* floating point status (preserved) */
82164 +    unsigned long r15;  /* scratch */
82165 +
82166 + /* The remaining registers are NOT saved for system calls.  */
82167 +
82168 +    unsigned long r14;  /* scratch */
82169 +    unsigned long r2;  /* scratch */
82170 +    unsigned long r3;  /* scratch */
82171 +    unsigned long r16;  /* scratch */
82172 +    unsigned long r17;  /* scratch */
82173 +    unsigned long r18;  /* scratch */
82174 +    unsigned long r19;  /* scratch */
82175 +    unsigned long r20;  /* scratch */
82176 +    unsigned long r21;  /* scratch */
82177 +    unsigned long r22;  /* scratch */
82178 +    unsigned long r23;  /* scratch */
82179 +    unsigned long r24;  /* scratch */
82180 +    unsigned long r25;  /* scratch */
82181 +    unsigned long r26;  /* scratch */
82182 +    unsigned long r27;  /* scratch */
82183 +    unsigned long r28;  /* scratch */
82184 +    unsigned long r29;  /* scratch */
82185 +    unsigned long r30;  /* scratch */
82186 +    unsigned long r31;  /* scratch */
82187 +    unsigned long ar_ccv;  /* compare/exchange value (scratch) */
82188 +
82189 +    /*
82190 +     * Floating point registers that the kernel considers scratch:
82191 +     */
82192 +    struct pt_fpreg f6;  /* scratch */
82193 +    struct pt_fpreg f7;  /* scratch */
82194 +    struct pt_fpreg f8;  /* scratch */
82195 +    struct pt_fpreg f9;  /* scratch */
82196 +    struct pt_fpreg f10;  /* scratch */
82197 +    struct pt_fpreg f11;  /* scratch */
82198 +    unsigned long r4;  /* preserved */
82199 +    unsigned long r5;  /* preserved */
82200 +    unsigned long r6;  /* preserved */
82201 +    unsigned long r7;  /* preserved */
82202 +    unsigned long eml_unat;    /* used for emulating instruction */
82203 +    unsigned long rfi_pfs;     /* used for emulating rfi */
82204 +
82205 +};
82206 +typedef struct cpu_user_regs cpu_user_regs_t;
82207 +
82208 +union vac {
82209 +    unsigned long value;
82210 +    struct {
82211 +        int a_int:1;
82212 +        int a_from_int_cr:1;
82213 +        int a_to_int_cr:1;
82214 +        int a_from_psr:1;
82215 +        int a_from_cpuid:1;
82216 +        int a_cover:1;
82217 +        int a_bsw:1;
82218 +        long reserved:57;
82219 +    };
82220 +};
82221 +typedef union vac vac_t;
82222 +
82223 +union vdc {
82224 +    unsigned long value;
82225 +    struct {
82226 +        int d_vmsw:1;
82227 +        int d_extint:1;
82228 +        int d_ibr_dbr:1;
82229 +        int d_pmc:1;
82230 +        int d_to_pmd:1;
82231 +        int d_itm:1;
82232 +        long reserved:58;
82233 +    };
82234 +};
82235 +typedef union vdc vdc_t;
82236 +
82237 +struct mapped_regs {
82238 +    union vac   vac;
82239 +    union vdc   vdc;
82240 +    unsigned long  virt_env_vaddr;
82241 +    unsigned long  reserved1[29];
82242 +    unsigned long  vhpi;
82243 +    unsigned long  reserved2[95];
82244 +    union {
82245 +        unsigned long  vgr[16];
82246 +        unsigned long bank1_regs[16]; // bank1 regs (r16-r31) when bank0 active
82247 +    };
82248 +    union {
82249 +        unsigned long  vbgr[16];
82250 +        unsigned long bank0_regs[16]; // bank0 regs (r16-r31) when bank1 active
82251 +    };
82252 +    unsigned long  vnat;
82253 +    unsigned long  vbnat;
82254 +    unsigned long  vcpuid[5];
82255 +    unsigned long  reserved3[11];
82256 +    unsigned long  vpsr;
82257 +    unsigned long  vpr;
82258 +    unsigned long  reserved4[76];
82259 +    union {
82260 +        unsigned long  vcr[128];
82261 +        struct {
82262 +            unsigned long dcr;  // CR0
82263 +            unsigned long itm;
82264 +            unsigned long iva;
82265 +            unsigned long rsv1[5];
82266 +            unsigned long pta;  // CR8
82267 +            unsigned long rsv2[7];
82268 +            unsigned long ipsr;  // CR16
82269 +            unsigned long isr;
82270 +            unsigned long rsv3;
82271 +            unsigned long iip;
82272 +            unsigned long ifa;
82273 +            unsigned long itir;
82274 +            unsigned long iipa;
82275 +            unsigned long ifs;
82276 +            unsigned long iim;  // CR24
82277 +            unsigned long iha;
82278 +            unsigned long rsv4[38];
82279 +            unsigned long lid;  // CR64
82280 +            unsigned long ivr;
82281 +            unsigned long tpr;
82282 +            unsigned long eoi;
82283 +            unsigned long irr[4];
82284 +            unsigned long itv;  // CR72
82285 +            unsigned long pmv;
82286 +            unsigned long cmcv;
82287 +            unsigned long rsv5[5];
82288 +            unsigned long lrr0;  // CR80
82289 +            unsigned long lrr1;
82290 +            unsigned long rsv6[46];
82291 +        };
82292 +    };
82293 +    union {
82294 +        unsigned long  reserved5[128];
82295 +        struct {
82296 +            unsigned long precover_ifs;
82297 +            unsigned long unat;  // not sure if this is needed until NaT arch is done
82298 +            int interrupt_collection_enabled; // virtual psr.ic
82299 +            /* virtual interrupt deliverable flag is evtchn_upcall_mask in
82300 +             * shared info area now. interrupt_mask_addr is the address
82301 +             * of evtchn_upcall_mask for current vcpu
82302 +             */
82303 +            unsigned long interrupt_mask_addr;
82304 +            int pending_interruption;
82305 +            int incomplete_regframe; // see SDM vol2 6.8
82306 +            unsigned long reserved5_1[4];
82307 +            int metaphysical_mode; // 1 = use metaphys mapping, 0 = use virtual
82308 +            int banknum; // 0 or 1, which virtual register bank is active
82309 +            unsigned long rrs[8]; // region registers
82310 +            unsigned long krs[8]; // kernel registers
82311 +            unsigned long pkrs[8]; // protection key registers
82312 +            unsigned long tmp[8]; // temp registers (e.g. for hyperprivops)
82313 +            // FIXME: tmp[8] temporarily being used for virtual psr.pp
82314 +        };
82315 +    };
82316 +    unsigned long  reserved6[3456];
82317 +    unsigned long  vmm_avail[128];
82318 +    unsigned long  reserved7[4096];
82319 +};
82320 +typedef struct mapped_regs mapped_regs_t;
82321 +
82322 +struct arch_vcpu_info {
82323 +};
82324 +typedef struct arch_vcpu_info arch_vcpu_info_t;
82325 +
82326 +typedef mapped_regs_t vpd_t;
82327 +
82328 +struct arch_shared_info {
82329 +    unsigned int flags;
82330 +    unsigned long start_info_pfn;
82331 +
82332 +    /* Interrupt vector for event channel.  */
82333 +    int evtchn_vector;
82334 +};
82335 +typedef struct arch_shared_info arch_shared_info_t;
82336 +
82337 +struct arch_initrd_info {
82338 +    unsigned long start;
82339 +    unsigned long size;
82340 +};
82341 +typedef struct arch_initrd_info arch_initrd_info_t;
82342 +
82343 +typedef unsigned long xen_callback_t;
82344 +
82345 +#define IA64_COMMAND_LINE_SIZE 512
82346 +struct vcpu_guest_context {
82347 +#define VGCF_FPU_VALID (1<<0)
82348 +#define VGCF_VMX_GUEST (1<<1)
82349 +#define VGCF_IN_KERNEL (1<<2)
82350 +    unsigned long flags;       /* VGCF_* flags */
82351 +    unsigned long pt_base;     /* PMT table base */
82352 +    unsigned long share_io_pg; /* Shared page for I/O emulation */
82353 +    unsigned long sys_pgnr;    /* System pages out of domain memory */
82354 +    unsigned long vm_assist;   /* VMASST_TYPE_* bitmap, now none on IPF */
82355 +
82356 +    struct cpu_user_regs regs;
82357 +    struct mapped_regs *privregs;
82358 +    struct arch_shared_info shared;
82359 +    struct arch_initrd_info initrd;
82360 +    char cmdline[IA64_COMMAND_LINE_SIZE];
82361 +};
82362 +typedef struct vcpu_guest_context vcpu_guest_context_t;
82363 +DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
82364 +
82365 +// dom0 vp op
82366 +#define __HYPERVISOR_ia64_dom0vp_op     __HYPERVISOR_arch_0
82367 +#define IA64_DOM0VP_ioremap             0       // map io space in machine
82368 +                                                // address to dom0 physical
82369 +                                                // address space.
82370 +                                                // currently the assigned
82371 +                                                // physical address equals
82372 +                                                // the machine address
82373 +#define IA64_DOM0VP_phystomach          1       // convert a pseudo physical
82374 +                                                // page frame number
82375 +                                                // to the corresponding
82376 +                                                // machine page frame number.
82377 +                                                // if no page is assigned,
82378 +                                                // INVALID_MFN or GPFN_INV_MASK
82379 +                                                // is returned depending on
82380 +                                                // domain's non-vti/vti mode.
82381 +#define IA64_DOM0VP_machtophys          3       // convert a machine page
82382 +                                                // frame number
82383 +                                                // to the corresponding
82384 +                                                // pseudo physical page frame
82385 +                                                // number of the caller domain
82386 +#define IA64_DOM0VP_zap_physmap         17      // unmap and free pages
82387 +                                                // contained in the specified
82388 +                                                // pseudo physical region
82389 +#define IA64_DOM0VP_add_physmap         18      // assign machine page frame
82390 +                                                // to dom0's pseudo physical
82391 +                                                // address space.
82392 +// flags for page assignment to pseudo physical address space
82393 +#define _ASSIGN_readonly                0
82394 +#define ASSIGN_readonly                 (1UL << _ASSIGN_readonly)
82395 +#define ASSIGN_writable                 (0UL << _ASSIGN_readonly) // dummy flag
82396 +
82397 +#endif /* !__ASSEMBLY__ */
82398 +
82399 +/* Hyperprivops.  */
82400 +#define HYPERPRIVOP_RFI                        0x1
82401 +#define HYPERPRIVOP_RSM_DT             0x2
82402 +#define HYPERPRIVOP_SSM_DT             0x3
82403 +#define HYPERPRIVOP_COVER              0x4
82404 +#define HYPERPRIVOP_ITC_D              0x5
82405 +#define HYPERPRIVOP_ITC_I              0x6
82406 +#define HYPERPRIVOP_SSM_I              0x7
82407 +#define HYPERPRIVOP_GET_IVR            0x8
82408 +#define HYPERPRIVOP_GET_TPR            0x9
82409 +#define HYPERPRIVOP_SET_TPR            0xa
82410 +#define HYPERPRIVOP_EOI                        0xb
82411 +#define HYPERPRIVOP_SET_ITM            0xc
82412 +#define HYPERPRIVOP_THASH              0xd
82413 +#define HYPERPRIVOP_PTC_GA             0xe
82414 +#define HYPERPRIVOP_ITR_D              0xf
82415 +#define HYPERPRIVOP_GET_RR             0x10
82416 +#define HYPERPRIVOP_SET_RR             0x11
82417 +#define HYPERPRIVOP_SET_KR             0x12
82418 +#define HYPERPRIVOP_FC                 0x13
82419 +#define HYPERPRIVOP_GET_CPUID          0x14
82420 +#define HYPERPRIVOP_GET_PMD            0x15
82421 +#define HYPERPRIVOP_GET_EFLAG          0x16
82422 +#define HYPERPRIVOP_SET_EFLAG          0x17
82423 +#define HYPERPRIVOP_MAX                        0x17
82424 +
82425 +#endif /* __HYPERVISOR_IF_IA64_H__ */
82426 +
82427 +/*
82428 + * Local variables:
82429 + * mode: C
82430 + * c-set-style: "BSD"
82431 + * c-basic-offset: 4
82432 + * tab-width: 4
82433 + * indent-tabs-mode: nil
82434 + * End:
82435 + */
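/*
 * Usage sketch (illustrative only, not applied by this patch): what the
 * guest-handle macros at the top of this header amount to.  A
 * XEN_GUEST_HANDLE is a typed wrapper around a guest pointer, so hypercall
 * argument structures keep a fixed layout regardless of how the pointer is
 * later translated by Xen.
 */
static inline void example_handle_init(unsigned long *pfns)
{
    XEN_GUEST_HANDLE(ulong) h;      /* i.e. struct { unsigned long *p; } */

    set_xen_guest_handle(h, pfns);  /* expands to h.p = pfns */
    (void)h;                        /* 'h' is now ready to embed in a request */
}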
82436 diff -urNp linux-2.6/include/xen/interface/arch-x86_32.h new/include/xen/interface/arch-x86_32.h
82437 --- linux-2.6/include/xen/interface/arch-x86_32.h       1970-01-01 01:00:00.000000000 +0100
82438 +++ new/include/xen/interface/arch-x86_32.h     2006-07-07 15:10:03.000000000 +0200
82439 @@ -0,0 +1,233 @@
82440 +/******************************************************************************
82441 + * arch-x86_32.h
82442 + * 
82443 + * Guest OS interface to x86 32-bit Xen.
82444 + * 
82445 + * Copyright (c) 2004, K A Fraser
82446 + */
82447 +
82448 +#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
82449 +#define __XEN_PUBLIC_ARCH_X86_32_H__
82450 +
82451 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
82452 +    typedef struct { type *p; } __guest_handle_ ## name
82453 +
82454 +#define DEFINE_XEN_GUEST_HANDLE(name)   __DEFINE_XEN_GUEST_HANDLE(name, name)
82455 +#define XEN_GUEST_HANDLE(name)          __guest_handle_ ## name
82456 +#define set_xen_guest_handle(hnd, val)  do { (hnd).p = val; } while (0)
82457 +#ifdef __XEN_TOOLS__
82458 +#define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
82459 +#endif
82460 +
82461 +#ifndef __ASSEMBLY__
82462 +/* Guest handles for primitive C types. */
82463 +__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
82464 +__DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
82465 +__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
82466 +DEFINE_XEN_GUEST_HANDLE(char);
82467 +DEFINE_XEN_GUEST_HANDLE(int);
82468 +DEFINE_XEN_GUEST_HANDLE(long);
82469 +DEFINE_XEN_GUEST_HANDLE(void);
82470 +
82471 +typedef unsigned long xen_pfn_t;
82472 +DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
82473 +#endif
82474 +
82475 +/*
82476 + * SEGMENT DESCRIPTOR TABLES
82477 + */
82478 +/*
82479 + * A number of GDT entries are reserved by Xen. These are not situated at the
82480 + * start of the GDT because some stupid OSes export hard-coded selector values
82481 + * in their ABI. These hard-coded values are always near the start of the GDT,
82482 + * so Xen places itself out of the way, at the far end of the GDT.
82483 + */
82484 +#define FIRST_RESERVED_GDT_PAGE  14
82485 +#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
82486 +#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
82487 +
82488 +/*
82489 + * These flat segments are in the Xen-private section of every GDT. Since these
82490 + * are also present in the initial GDT, many OSes will be able to avoid
82491 + * installing their own GDT.
82492 + */
82493 +#define FLAT_RING1_CS 0xe019    /* GDT index 259 */
82494 +#define FLAT_RING1_DS 0xe021    /* GDT index 260 */
82495 +#define FLAT_RING1_SS 0xe021    /* GDT index 260 */
82496 +#define FLAT_RING3_CS 0xe02b    /* GDT index 261 */
82497 +#define FLAT_RING3_DS 0xe033    /* GDT index 262 */
82498 +#define FLAT_RING3_SS 0xe033    /* GDT index 262 */
82499 +
82500 +#define FLAT_KERNEL_CS FLAT_RING1_CS
82501 +#define FLAT_KERNEL_DS FLAT_RING1_DS
82502 +#define FLAT_KERNEL_SS FLAT_RING1_SS
82503 +#define FLAT_USER_CS    FLAT_RING3_CS
82504 +#define FLAT_USER_DS    FLAT_RING3_DS
82505 +#define FLAT_USER_SS    FLAT_RING3_SS
82506 +
82507 +/* And the trap vector is... */
82508 +#define TRAP_INSTR "int $0x82"
82509 +
82510 +/*
82511 + * Virtual addresses beyond this are not modifiable by guest OSes. The 
82512 + * machine->physical mapping table starts at this address, read-only.
82513 + */
82514 +#ifdef CONFIG_X86_PAE
82515 +#define __HYPERVISOR_VIRT_START 0xF5800000
82516 +#define __MACH2PHYS_VIRT_START  0xF5800000
82517 +#define __MACH2PHYS_VIRT_END    0xF6800000
82518 +#else
82519 +#define __HYPERVISOR_VIRT_START 0xFC000000
82520 +#define __MACH2PHYS_VIRT_START  0xFC000000
82521 +#define __MACH2PHYS_VIRT_END    0xFC400000
82522 +#endif
82523 +
82524 +#ifndef HYPERVISOR_VIRT_START
82525 +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
82526 +#endif
82527 +
82528 +#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
82529 +#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
82530 +#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>2)
82531 +#ifndef machine_to_phys_mapping
82532 +#define machine_to_phys_mapping ((unsigned long *)MACH2PHYS_VIRT_START)
82533 +#endif
82534 +
82535 +/* Maximum number of virtual CPUs in multi-processor guests. */
82536 +#define MAX_VIRT_CPUS 32
82537 +
82538 +#ifndef __ASSEMBLY__
82539 +
82540 +typedef unsigned long xen_ulong_t;
82541 +
82542 +/*
82543 + * Send an array of these to HYPERVISOR_set_trap_table()
82544 + */
82545 +#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
82546 +#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
82547 +#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
82548 +#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))
82549 +struct trap_info {
82550 +    uint8_t       vector;  /* exception vector                              */
82551 +    uint8_t       flags;   /* 0-3: privilege level; 4: clear event enable?  */
82552 +    uint16_t      cs;      /* code selector                                 */
82553 +    unsigned long address; /* code offset                                   */
82554 +};
82555 +typedef struct trap_info trap_info_t;
82556 +DEFINE_XEN_GUEST_HANDLE(trap_info_t);
82557 +
82558 +struct cpu_user_regs {
82559 +    uint32_t ebx;
82560 +    uint32_t ecx;
82561 +    uint32_t edx;
82562 +    uint32_t esi;
82563 +    uint32_t edi;
82564 +    uint32_t ebp;
82565 +    uint32_t eax;
82566 +    uint16_t error_code;    /* private */
82567 +    uint16_t entry_vector;  /* private */
82568 +    uint32_t eip;
82569 +    uint16_t cs;
82570 +    uint8_t  saved_upcall_mask;
82571 +    uint8_t  _pad0;
82572 +    uint32_t eflags;        /* eflags.IF == !saved_upcall_mask */
82573 +    uint32_t esp;
82574 +    uint16_t ss, _pad1;
82575 +    uint16_t es, _pad2;
82576 +    uint16_t ds, _pad3;
82577 +    uint16_t fs, _pad4;
82578 +    uint16_t gs, _pad5;
82579 +};
82580 +typedef struct cpu_user_regs cpu_user_regs_t;
82581 +DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
82582 +
82583 +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
82584 +
82585 +/*
82586 + * The following is all CPU context. Note that the fpu_ctxt block is filled 
82587 + * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
82588 + */
82589 +struct vcpu_guest_context {
82590 +    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
82591 +    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
82592 +#define VGCF_I387_VALID                (1<<0)
82593 +#define VGCF_HVM_GUEST                 (1<<1)
82594 +#define VGCF_IN_KERNEL                 (1<<2)
82595 +#define _VGCF_i387_valid               0
82596 +#define VGCF_i387_valid                (1<<_VGCF_i387_valid)
82597 +#define _VGCF_hvm_guest                1
82598 +#define VGCF_hvm_guest                 (1<<_VGCF_hvm_guest)
82599 +#define _VGCF_in_kernel                2
82600 +#define VGCF_in_kernel                 (1<<_VGCF_in_kernel)
82601 +#define _VGCF_failsafe_disables_events 3
82602 +#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events)
82603 +    unsigned long flags;                    /* VGCF_* flags                 */
82604 +    struct cpu_user_regs user_regs;         /* User-level CPU registers     */
82605 +    struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
82606 +    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
82607 +    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
82608 +    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
82609 +    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
82610 +    unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
82611 +    unsigned long event_callback_cs;        /* CS:EIP of event callback     */
82612 +    unsigned long event_callback_eip;
82613 +    unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
82614 +    unsigned long failsafe_callback_eip;
82615 +    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
82616 +};
82617 +typedef struct vcpu_guest_context vcpu_guest_context_t;
82618 +DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
82619 +
82620 +/*
82621 + * Page-directory addresses above 4GB do not fit into architectural %cr3.
82622 + * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
82623 + * must use the following accessor macros to pack/unpack valid MFNs.
82624 + */
82625 +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
82626 +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
82627 +
82628 +struct arch_shared_info {
82629 +    unsigned long max_pfn;                  /* max pfn that appears in table */
82630 +    /* Frame containing list of mfns containing list of mfns containing p2m. */
82631 +    xen_pfn_t     pfn_to_mfn_frame_list_list;
82632 +    unsigned long nmi_reason;
82633 +};
82634 +typedef struct arch_shared_info arch_shared_info_t;
82635 +
82636 +struct arch_vcpu_info {
82637 +    unsigned long cr2;
82638 +    unsigned long pad[5]; /* sizeof(vcpu_info_t) == 64 */
82639 +};
82640 +typedef struct arch_vcpu_info arch_vcpu_info_t;
82641 +
82642 +struct xen_callback {
82643 +    unsigned long cs;
82644 +    unsigned long eip;
82645 +};
82646 +typedef struct xen_callback xen_callback_t;
82647 +
82648 +#endif /* !__ASSEMBLY__ */
82649 +
82650 +/*
82651 + * Prefix forces emulation of some non-trapping instructions.
82652 + * Currently only CPUID.
82653 + */
82654 +#ifdef __ASSEMBLY__
82655 +#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
82656 +#define XEN_CPUID          XEN_EMULATE_PREFIX cpuid
82657 +#else
82658 +#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
82659 +#define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
82660 +#endif
82661 +
82662 +#endif
82663 +
82664 +/*
82665 + * Local variables:
82666 + * mode: C
82667 + * c-set-style: "BSD"
82668 + * c-basic-offset: 4
82669 + * tab-width: 4
82670 + * indent-tabs-mode: nil
82671 + * End:
82672 + */
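/*
 * Usage sketch (illustrative only, not applied by this patch): building one
 * virtual IDT entry for HYPERVISOR_set_trap_table() with the TI_* helpers
 * above.  The vector and handler are placeholders; a real trap table is an
 * array of these entries terminated by an all-zero entry.
 */
static void example_fill_trap(struct trap_info *ti, void (*handler)(void))
{
    ti->vector  = 0x80;                 /* e.g. the legacy int $0x80 vector */
    ti->flags   = 0;
    TI_SET_DPL(ti, 3);                  /* user mode may raise this trap */
    TI_SET_IF(ti, 1);                   /* mask event delivery on entry */
    ti->cs      = FLAT_KERNEL_CS;
    ti->address = (unsigned long)handler;
}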
82673 diff -urNp linux-2.6/include/xen/interface/arch-x86_64.h new/include/xen/interface/arch-x86_64.h
82674 --- linux-2.6/include/xen/interface/arch-x86_64.h       1970-01-01 01:00:00.000000000 +0100
82675 +++ new/include/xen/interface/arch-x86_64.h     2006-07-07 15:10:03.000000000 +0200
82676 @@ -0,0 +1,299 @@
82677 +/******************************************************************************
82678 + * arch-x86_64.h
82679 + * 
82680 + * Guest OS interface to x86 64-bit Xen.
82681 + * 
82682 + * Copyright (c) 2004, K A Fraser
82683 + */
82684 +
82685 +#ifndef __XEN_PUBLIC_ARCH_X86_64_H__
82686 +#define __XEN_PUBLIC_ARCH_X86_64_H__
82687 +
82688 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
82689 +    typedef struct { type *p; } __guest_handle_ ## name
82690 +
82691 +#define DEFINE_XEN_GUEST_HANDLE(name)   __DEFINE_XEN_GUEST_HANDLE(name, name)
82692 +#define XEN_GUEST_HANDLE(name)          __guest_handle_ ## name
82693 +#define set_xen_guest_handle(hnd, val)  do { (hnd).p = val; } while (0)
82694 +#ifdef __XEN_TOOLS__
82695 +#define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
82696 +#endif
82697 +
82698 +#ifndef __ASSEMBLY__
82699 +/* Guest handles for primitive C types. */
82700 +__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
82701 +__DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
82702 +__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
82703 +DEFINE_XEN_GUEST_HANDLE(char);
82704 +DEFINE_XEN_GUEST_HANDLE(int);
82705 +DEFINE_XEN_GUEST_HANDLE(long);
82706 +DEFINE_XEN_GUEST_HANDLE(void);
82707 +
82708 +typedef unsigned long xen_pfn_t;
82709 +DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
82710 +#endif
82711 +
82712 +/*
82713 + * SEGMENT DESCRIPTOR TABLES
82714 + */
82715 +/*
82716 + * A number of GDT entries are reserved by Xen. These are not situated at the
82717 + * start of the GDT because some stupid OSes export hard-coded selector values
82718 + * in their ABI. These hard-coded values are always near the start of the GDT,
82719 + * so Xen places itself out of the way, at the far end of the GDT.
82720 + */
82721 +#define FIRST_RESERVED_GDT_PAGE  14
82722 +#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
82723 +#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
82724 +
82725 +/*
82726 + * 64-bit segment selectors
82727 + * These flat segments are in the Xen-private section of every GDT. Since these
82728 + * are also present in the initial GDT, many OSes will be able to avoid
82729 + * installing their own GDT.
82730 + */
82731 +
82732 +#define FLAT_RING3_CS32 0xe023  /* GDT index 260 */
82733 +#define FLAT_RING3_CS64 0xe033  /* GDT index 261 */
82734 +#define FLAT_RING3_DS32 0xe02b  /* GDT index 262 */
82735 +#define FLAT_RING3_DS64 0x0000  /* NULL selector */
82736 +#define FLAT_RING3_SS32 0xe02b  /* GDT index 262 */
82737 +#define FLAT_RING3_SS64 0xe02b  /* GDT index 262 */
82738 +
82739 +#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
82740 +#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
82741 +#define FLAT_KERNEL_DS   FLAT_KERNEL_DS64
82742 +#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
82743 +#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
82744 +#define FLAT_KERNEL_CS   FLAT_KERNEL_CS64
82745 +#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
82746 +#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
82747 +#define FLAT_KERNEL_SS   FLAT_KERNEL_SS64
82748 +
82749 +#define FLAT_USER_DS64 FLAT_RING3_DS64
82750 +#define FLAT_USER_DS32 FLAT_RING3_DS32
82751 +#define FLAT_USER_DS   FLAT_USER_DS64
82752 +#define FLAT_USER_CS64 FLAT_RING3_CS64
82753 +#define FLAT_USER_CS32 FLAT_RING3_CS32
82754 +#define FLAT_USER_CS   FLAT_USER_CS64
82755 +#define FLAT_USER_SS64 FLAT_RING3_SS64
82756 +#define FLAT_USER_SS32 FLAT_RING3_SS32
82757 +#define FLAT_USER_SS   FLAT_USER_SS64
82758 +
82759 +/* And the trap vector is... */
82760 +#define TRAP_INSTR "syscall"
82761 +
82762 +#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
82763 +#define __HYPERVISOR_VIRT_END   0xFFFF880000000000
82764 +#define __MACH2PHYS_VIRT_START  0xFFFF800000000000
82765 +#define __MACH2PHYS_VIRT_END    0xFFFF804000000000
82766 +
82767 +#ifndef HYPERVISOR_VIRT_START
82768 +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
82769 +#define HYPERVISOR_VIRT_END   mk_unsigned_long(__HYPERVISOR_VIRT_END)
82770 +#endif
82771 +
82772 +#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
82773 +#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
82774 +#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
82775 +#ifndef machine_to_phys_mapping
82776 +#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
82777 +#endif
82778 +
82779 +/* Maximum number of virtual CPUs in multi-processor guests. */
82780 +#define MAX_VIRT_CPUS 32
82781 +
82782 +#ifndef __ASSEMBLY__
82783 +
82784 +typedef unsigned long xen_ulong_t;
82785 +
82786 +/*
82787 + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
82788 + *  @which == SEGBASE_*  ;  @base == 64-bit base address
82789 + * Returns 0 on success.
82790 + */
82791 +#define SEGBASE_FS          0
82792 +#define SEGBASE_GS_USER     1
82793 +#define SEGBASE_GS_KERNEL   2
82794 +#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
82795 +
82796 +/*
82797 + * int HYPERVISOR_iret(void)
82798 + * All arguments are on the kernel stack, in the following format.
82799 + * Never returns if successful. Current kernel context is lost.
82800 + * The saved CS is mapped as follows:
82801 + *   RING0 -> RING3 kernel mode.
82802 + *   RING1 -> RING3 kernel mode.
82803 + *   RING2 -> RING3 kernel mode.
82804 + *   RING3 -> RING3 user mode.
82805 + * However RING0 indicates that the guest kernel should return to itself
82806 + * directly with
82807 + *      orb   $3,1*8(%rsp)
82808 + *      iretq
82809 + * If flags contains VGCF_IN_SYSCALL:
82810 + *   Restore RAX, RIP, RFLAGS, RSP.
82811 + *   Discard R11, RCX, CS, SS.
82812 + * Otherwise:
82813 + *   Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
82814 + * All other registers are saved on hypercall entry and restored to user.
82815 + */
82816 +/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
82817 +#define VGCF_IN_SYSCALL (1<<8)
82818 +struct iret_context {
82819 +    /* Top of stack (%rsp at point of hypercall). */
82820 +    uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
82821 +    /* Bottom of iret stack frame. */
82822 +};
82823 +
82824 +/*
82825 + * Send an array of these to HYPERVISOR_set_trap_table().
82826 + * N.B. As in x86/32 mode, the privilege level specifies which modes may enter
82827 + * a trap via a software interrupt. Since rings 1 and 2 are unavailable, we
82828 + * allocate privilege levels as follows:
82829 + *  Level == 0: No one may enter
82830 + *  Level == 1: Kernel may enter
82831 + *  Level == 2: Kernel may enter
82832 + *  Level == 3: Everyone may enter
82833 + */
82834 +#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
82835 +#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
82836 +#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
82837 +#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))
82838 +struct trap_info {
82839 +    uint8_t       vector;  /* exception vector                              */
82840 +    uint8_t       flags;   /* 0-3: privilege level; 4: clear event enable?  */
82841 +    uint16_t      cs;      /* code selector                                 */
82842 +    unsigned long address; /* code offset                                   */
82843 +};
82844 +typedef struct trap_info trap_info_t;
82845 +DEFINE_XEN_GUEST_HANDLE(trap_info_t);
82846 +
82847 +#ifdef __GNUC__
82848 +/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
82849 +#define __DECL_REG(name) union { uint64_t r ## name, e ## name; }
82850 +#else
82851 +/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
82852 +#define __DECL_REG(name) uint64_t r ## name
82853 +#endif
82854 +
82855 +struct cpu_user_regs {
82856 +    uint64_t r15;
82857 +    uint64_t r14;
82858 +    uint64_t r13;
82859 +    uint64_t r12;
82860 +    __DECL_REG(bp);
82861 +    __DECL_REG(bx);
82862 +    uint64_t r11;
82863 +    uint64_t r10;
82864 +    uint64_t r9;
82865 +    uint64_t r8;
82866 +    __DECL_REG(ax);
82867 +    __DECL_REG(cx);
82868 +    __DECL_REG(dx);
82869 +    __DECL_REG(si);
82870 +    __DECL_REG(di);
82871 +    uint32_t error_code;    /* private */
82872 +    uint32_t entry_vector;  /* private */
82873 +    __DECL_REG(ip);
82874 +    uint16_t cs, _pad0[1];
82875 +    uint8_t  saved_upcall_mask;
82876 +    uint8_t  _pad1[3];
82877 +    __DECL_REG(flags);      /* rflags.IF == !saved_upcall_mask */
82878 +    __DECL_REG(sp);
82879 +    uint16_t ss, _pad2[3];
82880 +    uint16_t es, _pad3[3];
82881 +    uint16_t ds, _pad4[3];
82882 +    uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base.     */
82883 +    uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
82884 +};
82885 +typedef struct cpu_user_regs cpu_user_regs_t;
82886 +DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
82887 +
82888 +#undef __DECL_REG
82889 +
82890 +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
82891 +
82892 +/*
82893 + * The following is all CPU context. Note that the fpu_ctxt block is filled 
82894 + * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
82895 + */
82896 +struct vcpu_guest_context {
82897 +    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
82898 +    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
82899 +#define VGCF_I387_VALID                (1<<0)
82900 +#define VGCF_HVM_GUEST                 (1<<1)
82901 +#define VGCF_IN_KERNEL                 (1<<2)
82902 +#define _VGCF_i387_valid               0
82903 +#define VGCF_i387_valid                (1<<_VGCF_i387_valid)
82904 +#define _VGCF_hvm_guest                1
82905 +#define VGCF_hvm_guest                 (1<<_VGCF_hvm_guest)
82906 +#define _VGCF_in_kernel                2
82907 +#define VGCF_in_kernel                 (1<<_VGCF_in_kernel)
82908 +#define _VGCF_failsafe_disables_events 3
82909 +#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events)
82910 +#define _VGCF_syscall_disables_events  4
82911 +#define VGCF_syscall_disables_events   (1<<_VGCF_syscall_disables_events)
82912 +    unsigned long flags;                    /* VGCF_* flags                 */
82913 +    struct cpu_user_regs user_regs;         /* User-level CPU registers     */
82914 +    struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
82915 +    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
82916 +    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
82917 +    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
82918 +    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
82919 +    unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
82920 +    unsigned long event_callback_eip;
82921 +    unsigned long failsafe_callback_eip;
82922 +    unsigned long syscall_callback_eip;
82923 +    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
82924 +    /* Segment base addresses. */
82925 +    uint64_t      fs_base;
82926 +    uint64_t      gs_base_kernel;
82927 +    uint64_t      gs_base_user;
82928 +};
82929 +typedef struct vcpu_guest_context vcpu_guest_context_t;
82930 +DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
82931 +
82932 +#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
82933 +#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
82934 +
82935 +struct arch_shared_info {
82936 +    unsigned long max_pfn;                  /* max pfn that appears in table */
82937 +    /* Frame containing list of mfns containing list of mfns containing p2m. */
82938 +    xen_pfn_t     pfn_to_mfn_frame_list_list;
82939 +    unsigned long nmi_reason;
82940 +};
82941 +typedef struct arch_shared_info arch_shared_info_t;
82942 +
82943 +struct arch_vcpu_info {
82944 +    unsigned long cr2;
82945 +    unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
82946 +};
82947 +typedef struct arch_vcpu_info  arch_vcpu_info_t;
82948 +
82949 +typedef unsigned long xen_callback_t;
82950 +
82951 +#endif /* !__ASSEMBLY__ */
82952 +
82953 +/*
82954 + * Prefix forces emulation of some non-trapping instructions.
82955 + * Currently only CPUID.
82956 + */
82957 +#ifdef __ASSEMBLY__
82958 +#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
82959 +#define XEN_CPUID          XEN_EMULATE_PREFIX cpuid
82960 +#else
82961 +#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
82962 +#define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
82963 +#endif
82964 +
82965 +#endif
82966 +
82967 +/*
82968 + * Local variables:
82969 + * mode: C
82970 + * c-set-style: "BSD"
82971 + * c-basic-offset: 4
82972 + * tab-width: 4
82973 + * indent-tabs-mode: nil
82974 + * End:
82975 + */
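/*
 * Usage sketch (illustrative only, not applied by this patch): the
 * __DECL_REG() trick above lets gcc-built code reach each general-purpose
 * slot through either its 64-bit or its 32-bit name, so code shared with
 * x86_32 can keep using the 'e'-prefixed names.
 */
static inline unsigned long example_guest_ip(const struct cpu_user_regs *regs)
{
    /* regs->rip and regs->eip name the same 64-bit field. */
    return regs->rip;
}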
82976 diff -urNp linux-2.6/include/xen/interface/callback.h new/include/xen/interface/callback.h
82977 --- linux-2.6/include/xen/interface/callback.h  1970-01-01 01:00:00.000000000 +0100
82978 +++ new/include/xen/interface/callback.h        2006-06-07 13:29:36.000000000 +0200
82979 @@ -0,0 +1,74 @@
82980 +/******************************************************************************
82981 + * callback.h
82982 + *
82983 + * Register guest OS callbacks with Xen.
82984 + *
82985 + * Copyright (c) 2006, Ian Campbell
82986 + */
82987 +
82988 +#ifndef __XEN_PUBLIC_CALLBACK_H__
82989 +#define __XEN_PUBLIC_CALLBACK_H__
82990 +
82991 +#include "xen.h"
82992 +
82993 +/*
82994 + * Prototype for this hypercall is:
82995 + *   long callback_op(int cmd, void *extra_args)
82996 + * @cmd        == CALLBACKOP_??? (callback operation).
82997 + * @extra_args == Operation-specific extra arguments (NULL if none).
82998 + */
82999 +
83000 +#define CALLBACKTYPE_event                 0
83001 +#define CALLBACKTYPE_failsafe              1
83002 +#define CALLBACKTYPE_syscall               2 /* x86_64 only */
83003 +/*
83004 + * sysenter is only available on x86_32 with the
83005 + * supervisor_mode_kernel option enabled.
83006 + */
83007 +#define CALLBACKTYPE_sysenter              3
83008 +#define CALLBACKTYPE_nmi                   4
83009 +
83010 +/*
83011 + * Disable event delivery during callback? This flag is ignored for event and
83012 + * NMI callbacks: event delivery is unconditionally disabled.
83013 + */
83014 +#define _CALLBACKF_mask_events             0
83015 +#define CALLBACKF_mask_events              (1U << _CALLBACKF_mask_events)
83016 +
83017 +/*
83018 + * Register a callback.
83019 + */
83020 +#define CALLBACKOP_register                0
83021 +struct callback_register {
83022 +    uint16_t type;
83023 +    uint16_t flags;
83024 +    xen_callback_t address;
83025 +};
83026 +typedef struct callback_register callback_register_t;
83027 +DEFINE_XEN_GUEST_HANDLE(callback_register_t);
83028 +
83029 +/*
83030 + * Unregister a callback.
83031 + *
83032 + * Not all callbacks can be unregistered. -EINVAL will be returned if
83033 + * you attempt to unregister such a callback.
83034 + */
83035 +#define CALLBACKOP_unregister              1
83036 +struct callback_unregister {
83037 +    uint16_t type;
83038 +    uint16_t _unused;
83039 +};
83040 +typedef struct callback_unregister callback_unregister_t;
83041 +DEFINE_XEN_GUEST_HANDLE(callback_unregister_t);
83042 +
83043 +#endif /* __XEN_PUBLIC_CALLBACK_H__ */
83044 +
83045 +/*
83046 + * Local variables:
83047 + * mode: C
83048 + * c-set-style: "BSD"
83049 + * c-basic-offset: 4
83050 + * tab-width: 4
83051 + * indent-tabs-mode: nil
83052 + * End:
83053 + */
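/*
 * Usage sketch (illustrative only, not applied by this patch): registering
 * the event callback described above.  It assumes the HYPERVISOR_callback_op()
 * hypercall wrapper added elsewhere in this patch and an architecture-
 * appropriate xen_callback_t value (a CS:EIP pair on x86_32, a plain
 * address on x86_64 and ia64).
 */
static int example_register_event_callback(xen_callback_t addr)
{
    struct callback_register event = {
        .type    = CALLBACKTYPE_event,
        .flags   = 0,               /* ignored for event callbacks */
        .address = addr,
    };

    return HYPERVISOR_callback_op(CALLBACKOP_register, &event);
}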
83054 diff -urNp linux-2.6/include/xen/interface/dom0_ops.h new/include/xen/interface/dom0_ops.h
83055 --- linux-2.6/include/xen/interface/dom0_ops.h  1970-01-01 01:00:00.000000000 +0100
83056 +++ new/include/xen/interface/dom0_ops.h        2006-07-07 15:10:03.000000000 +0200
83057 @@ -0,0 +1,598 @@
83058 +/******************************************************************************
83059 + * dom0_ops.h
83060 + * 
83061 + * Process command requests from domain-0 guest OS.
83062 + * 
83063 + * Copyright (c) 2002-2003, B Dragovic
83064 + * Copyright (c) 2002-2004, K Fraser
83065 + */
83066 +
83067 +
83068 +#ifndef __XEN_PUBLIC_DOM0_OPS_H__
83069 +#define __XEN_PUBLIC_DOM0_OPS_H__
83070 +
83071 +#include "xen.h"
83072 +#include "sched_ctl.h"
83073 +
83074 +/*
83075 + * Make sure you increment the interface version whenever you modify this file!
83076 + * This makes sure that old versions of dom0 tools will stop working in a
83077 + * well-defined way (rather than crashing the machine, for instance).
83078 + */
83079 +#define DOM0_INTERFACE_VERSION   0x03000001
83080 +
83081 +/************************************************************************/
83082 +
83083 +#define DOM0_GETMEMLIST        2
83084 +struct dom0_getmemlist {
83085 +    /* IN variables. */
83086 +    domid_t       domain;
83087 +    uint64_t max_pfns;
83088 +    XEN_GUEST_HANDLE(xen_pfn_t) buffer;
83089 +    /* OUT variables. */
83090 +    uint64_t num_pfns;
83091 +};
83092 +typedef struct dom0_getmemlist dom0_getmemlist_t;
83093 +DEFINE_XEN_GUEST_HANDLE(dom0_getmemlist_t);
83094 +
83095 +#define DOM0_SCHEDCTL          6
83096 + /* struct sched_ctl_cmd is from sched_ctl.h   */
83097 +typedef struct sched_ctl_cmd dom0_schedctl_t;
83098 +DEFINE_XEN_GUEST_HANDLE(dom0_schedctl_t);
83099 +
83100 +#define DOM0_ADJUSTDOM         7
83101 +/* struct sched_adjdom_cmd is from sched_ctl.h */
83102 +typedef struct sched_adjdom_cmd dom0_adjustdom_t;
83103 +DEFINE_XEN_GUEST_HANDLE(dom0_adjustdom_t);
83104 +
83105 +#define DOM0_CREATEDOMAIN      8
83106 +struct dom0_createdomain {
83107 +    /* IN parameters */
83108 +    uint32_t ssidref;
83109 +    xen_domain_handle_t handle;
83110 +    /* IN/OUT parameters. */
83111 +    /* Identifier for new domain (auto-allocate if zero is specified). */
83112 +    domid_t domain;
83113 +};
83114 +typedef struct dom0_createdomain dom0_createdomain_t;
83115 +DEFINE_XEN_GUEST_HANDLE(dom0_createdomain_t);
83116 +
83117 +#define DOM0_DESTROYDOMAIN     9
83118 +struct dom0_destroydomain {
83119 +    /* IN variables. */
83120 +    domid_t domain;
83121 +};
83122 +typedef struct dom0_destroydomain dom0_destroydomain_t;
83123 +DEFINE_XEN_GUEST_HANDLE(dom0_destroydomain_t);
83124 +
83125 +#define DOM0_PAUSEDOMAIN      10
83126 +struct dom0_pausedomain {
83127 +    /* IN parameters. */
83128 +    domid_t domain;
83129 +};
83130 +typedef struct dom0_pausedomain dom0_pausedomain_t;
83131 +DEFINE_XEN_GUEST_HANDLE(dom0_pausedomain_t);
83132 +
83133 +#define DOM0_UNPAUSEDOMAIN    11
83134 +struct dom0_unpausedomain {
83135 +    /* IN parameters. */
83136 +    domid_t domain;
83137 +};
83138 +typedef struct dom0_unpausedomain dom0_unpausedomain_t;
83139 +DEFINE_XEN_GUEST_HANDLE(dom0_unpausedomain_t);
83140 +
83141 +#define DOM0_GETDOMAININFO    12
83142 +struct dom0_getdomaininfo {
83143 +    /* IN variables. */
83144 +    domid_t  domain;                  /* NB. IN/OUT variable. */
83145 +    /* OUT variables. */
83146 +#define DOMFLAGS_DYING     (1<<0) /* Domain is scheduled to die.             */
83147 +#define DOMFLAGS_SHUTDOWN  (1<<2) /* The guest OS has shut down.             */
83148 +#define DOMFLAGS_PAUSED    (1<<3) /* Currently paused by control software.   */
83149 +#define DOMFLAGS_BLOCKED   (1<<4) /* Currently blocked pending an event.     */
83150 +#define DOMFLAGS_RUNNING   (1<<5) /* Domain is currently running.            */
83151 +#define DOMFLAGS_CPUMASK      255 /* CPU to which this domain is bound.      */
83152 +#define DOMFLAGS_CPUSHIFT       8
83153 +#define DOMFLAGS_SHUTDOWNMASK 255 /* DOMFLAGS_SHUTDOWN guest-supplied code.  */
83154 +#define DOMFLAGS_SHUTDOWNSHIFT 16
83155 +    uint32_t flags;
83156 +    uint64_t tot_pages;
83157 +    uint64_t max_pages;
83158 +    xen_pfn_t shared_info_frame;  /* MFN of shared_info struct */
83159 +    uint64_t cpu_time;
83160 +    uint32_t nr_online_vcpus;     /* Number of VCPUs currently online. */
83161 +    uint32_t max_vcpu_id;         /* Maximum VCPUID in use by this domain. */
83162 +    uint32_t ssidref;
83163 +    xen_domain_handle_t handle;
83164 +};
83165 +typedef struct dom0_getdomaininfo dom0_getdomaininfo_t;
83166 +DEFINE_XEN_GUEST_HANDLE(dom0_getdomaininfo_t);
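A short sketch of decoding the packed flags word above (bound CPU in bits 8-15, shutdown code in bits 16-23); printk() stands in for whatever the caller actually does with the result.

    static void decode_domain_flags(const struct dom0_getdomaininfo *info)
    {
        unsigned int cpu      = (info->flags >> DOMFLAGS_CPUSHIFT) & DOMFLAGS_CPUMASK;
        unsigned int shutcode = (info->flags >> DOMFLAGS_SHUTDOWNSHIFT) & DOMFLAGS_SHUTDOWNMASK;

        if (info->flags & DOMFLAGS_RUNNING)
            printk("domain %u running on CPU %u\n", info->domain, cpu);
        else if (info->flags & DOMFLAGS_SHUTDOWN)
            printk("domain %u shut down, reason code %u\n", info->domain, shutcode);
    }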
83167 +
83168 +#define DOM0_SETVCPUCONTEXT   13
83169 +struct dom0_setvcpucontext {
83170 +    /* IN variables. */
83171 +    domid_t               domain;
83172 +    uint32_t              vcpu;
83173 +    /* IN/OUT parameters */
83174 +    XEN_GUEST_HANDLE(vcpu_guest_context_t) ctxt;
83175 +};
83176 +typedef struct dom0_setvcpucontext dom0_setvcpucontext_t;
83177 +DEFINE_XEN_GUEST_HANDLE(dom0_setvcpucontext_t);
83178 +
83179 +#define DOM0_MSR              15
83180 +struct dom0_msr {
83181 +    /* IN variables. */
83182 +    uint32_t write;
83183 +    cpumap_t cpu_mask;
83184 +    uint32_t msr;
83185 +    uint32_t in1;
83186 +    uint32_t in2;
83187 +    /* OUT variables. */
83188 +    uint32_t out1;
83189 +    uint32_t out2;
83190 +};
83191 +typedef struct dom0_msr dom0_msr_t;
83192 +DEFINE_XEN_GUEST_HANDLE(dom0_msr_t);
83193 +
83194 +/*
83195 + * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
83196 + * 1 January, 1970 if the current system time was <system_time>.
83197 + */
83198 +#define DOM0_SETTIME          17
83199 +struct dom0_settime {
83200 +    /* IN variables. */
83201 +    uint32_t secs;
83202 +    uint32_t nsecs;
83203 +    uint64_t system_time;
83204 +};
83205 +typedef struct dom0_settime dom0_settime_t;
83206 +DEFINE_XEN_GUEST_HANDLE(dom0_settime_t);
83207 +
83208 +#define DOM0_GETPAGEFRAMEINFO 18
83209 +#define LTAB_SHIFT 28
83210 +#define NOTAB 0         /* normal page */
83211 +#define L1TAB (1<<LTAB_SHIFT)
83212 +#define L2TAB (2<<LTAB_SHIFT)
83213 +#define L3TAB (3<<LTAB_SHIFT)
83214 +#define L4TAB (4<<LTAB_SHIFT)
83215 +#define LPINTAB  (1<<31)
83216 +#define XTAB  (0xf<<LTAB_SHIFT) /* invalid page */
83217 +#define LTAB_MASK XTAB
83218 +#define LTABTYPE_MASK (0x7<<LTAB_SHIFT)
83219 +
83220 +struct dom0_getpageframeinfo {
83221 +    /* IN variables. */
83222 +    xen_pfn_t mfn;         /* Machine page frame number to query.       */
83223 +    domid_t domain;        /* To which domain does the frame belong?    */
83224 +    /* OUT variables. */
83225 +    /* Is the page PINNED to a type? */
83226 +    uint32_t type;         /* see above type defs */
83227 +};
83228 +typedef struct dom0_getpageframeinfo dom0_getpageframeinfo_t;
83229 +DEFINE_XEN_GUEST_HANDLE(dom0_getpageframeinfo_t);
83230 +
83231 +/*
83232 + * Read console content from Xen buffer ring.
83233 + */
83234 +#define DOM0_READCONSOLE      19
83235 +struct dom0_readconsole {
83236 +    /* IN variables. */
83237 +    uint32_t clear;            /* Non-zero -> clear after reading. */
83238 +    /* IN/OUT variables. */
83239 +    XEN_GUEST_HANDLE(char) buffer; /* In: Buffer start; Out: Used buffer start */
83240 +    uint32_t count;            /* In: Buffer size;  Out: Used buffer size  */
83241 +};
83242 +typedef struct dom0_readconsole dom0_readconsole_t;
83243 +DEFINE_XEN_GUEST_HANDLE(dom0_readconsole_t);
83244 +
83245 +/*
83246 + * Set which physical cpus a vcpu can execute on.
83247 + */
83248 +#define DOM0_SETVCPUAFFINITY  20
83249 +struct dom0_setvcpuaffinity {
83250 +    /* IN variables. */
83251 +    domid_t   domain;
83252 +    uint32_t  vcpu;
83253 +    cpumap_t  cpumap;
83254 +};
83255 +typedef struct dom0_setvcpuaffinity dom0_setvcpuaffinity_t;
83256 +DEFINE_XEN_GUEST_HANDLE(dom0_setvcpuaffinity_t);
83257 +
83258 +/* Get trace buffers machine base address */
83259 +#define DOM0_TBUFCONTROL       21
83260 +struct dom0_tbufcontrol {
83261 +    /* IN variables */
83262 +#define DOM0_TBUF_GET_INFO     0
83263 +#define DOM0_TBUF_SET_CPU_MASK 1
83264 +#define DOM0_TBUF_SET_EVT_MASK 2
83265 +#define DOM0_TBUF_SET_SIZE     3
83266 +#define DOM0_TBUF_ENABLE       4
83267 +#define DOM0_TBUF_DISABLE      5
83268 +    uint32_t      op;
83269 +    /* IN/OUT variables */
83270 +    cpumap_t      cpu_mask;
83271 +    uint32_t      evt_mask;
83272 +    /* OUT variables */
83273 +    xen_pfn_t buffer_mfn;
83274 +    uint32_t size;
83275 +};
83276 +typedef struct dom0_tbufcontrol dom0_tbufcontrol_t;
83277 +DEFINE_XEN_GUEST_HANDLE(dom0_tbufcontrol_t);
83278 +
83279 +/*
83280 + * Get physical information about the host machine
83281 + */
83282 +#define DOM0_PHYSINFO         22
83283 +struct dom0_physinfo {
83284 +    uint32_t threads_per_core;
83285 +    uint32_t cores_per_socket;
83286 +    uint32_t sockets_per_node;
83287 +    uint32_t nr_nodes;
83288 +    uint32_t cpu_khz;
83289 +    uint64_t total_pages;
83290 +    uint64_t free_pages;
83291 +    uint64_t scrub_pages;
83292 +    uint32_t hw_cap[8];
83293 +};
83294 +typedef struct dom0_physinfo dom0_physinfo_t;
83295 +DEFINE_XEN_GUEST_HANDLE(dom0_physinfo_t);
83296 +
83297 +/*
83298 + * Get the ID of the current scheduler.
83299 + */
83300 +#define DOM0_SCHED_ID        24
83301 +struct dom0_sched_id {
83302 +    /* OUT variable */
83303 +    uint32_t sched_id;
83304 +};
83305 +typedef struct dom0_sched_id dom0_sched_id_t;
83306 +DEFINE_XEN_GUEST_HANDLE(dom0_sched_id_t);
83307 +
83308 +/*
83309 + * Control shadow pagetables operation
83310 + */
83311 +#define DOM0_SHADOW_CONTROL  25
83312 +
83313 +#define DOM0_SHADOW_CONTROL_OP_OFF         0
83314 +#define DOM0_SHADOW_CONTROL_OP_ENABLE_TEST 1
83315 +#define DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY 2
83316 +#define DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE 3
83317 +
83318 +#define DOM0_SHADOW_CONTROL_OP_FLUSH       10     /* table ops */
83319 +#define DOM0_SHADOW_CONTROL_OP_CLEAN       11
83320 +#define DOM0_SHADOW_CONTROL_OP_PEEK        12
83321 +
83322 +struct dom0_shadow_control_stats {
83323 +    uint32_t fault_count;
83324 +    uint32_t dirty_count;
83325 +    uint32_t dirty_net_count;
83326 +    uint32_t dirty_block_count;
83327 +};
83328 +typedef struct dom0_shadow_control_stats dom0_shadow_control_stats_t;
83329 +DEFINE_XEN_GUEST_HANDLE(dom0_shadow_control_stats_t);
83330 +
83331 +struct dom0_shadow_control {
83332 +    /* IN variables. */
83333 +    domid_t        domain;
83334 +    uint32_t       op;
83335 +    XEN_GUEST_HANDLE(ulong) dirty_bitmap;
83336 +    /* IN/OUT variables. */
83337 +    uint64_t       pages;        /* size of buffer, updated with actual size */
83338 +    /* OUT variables. */
83339 +    struct dom0_shadow_control_stats stats;
83340 +};
83341 +typedef struct dom0_shadow_control dom0_shadow_control_t;
83342 +DEFINE_XEN_GUEST_HANDLE(dom0_shadow_control_t);
83343 +
83344 +#define DOM0_SETDOMAINMAXMEM   28
83345 +struct dom0_setdomainmaxmem {
83346 +    /* IN variables. */
83347 +    domid_t  domain;
83348 +    uint64_t max_memkb;
83349 +};
83350 +typedef struct dom0_setdomainmaxmem dom0_setdomainmaxmem_t;
83351 +DEFINE_XEN_GUEST_HANDLE(dom0_setdomainmaxmem_t);
83352 +
83353 +#define DOM0_GETPAGEFRAMEINFO2 29   /* batched interface */
83354 +struct dom0_getpageframeinfo2 {
83355 +    /* IN variables. */
83356 +    domid_t  domain;
83357 +    uint64_t num;
83358 +    /* IN/OUT variables. */
83359 +    XEN_GUEST_HANDLE(ulong) array;
83360 +};
83361 +typedef struct dom0_getpageframeinfo2 dom0_getpageframeinfo2_t;
83362 +DEFINE_XEN_GUEST_HANDLE(dom0_getpageframeinfo2_t);
83363 +
83364 +/*
83365 + * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type.
83366 + * On x86, @type is an architecture-defined MTRR memory type.
83367 + * On success, returns the MTRR that was used (@reg) and a handle that can
83368 + * be passed to DOM0_DEL_MEMTYPE to accurately tear down the new setting.
83369 + * (x86-specific).
83370 + */
83371 +#define DOM0_ADD_MEMTYPE         31
83372 +struct dom0_add_memtype {
83373 +    /* IN variables. */
83374 +    xen_pfn_t mfn;
83375 +    uint64_t nr_mfns;
83376 +    uint32_t type;
83377 +    /* OUT variables. */
83378 +    uint32_t handle;
83379 +    uint32_t reg;
83380 +};
83381 +typedef struct dom0_add_memtype dom0_add_memtype_t;
83382 +DEFINE_XEN_GUEST_HANDLE(dom0_add_memtype_t);
83383 +
83384 +/*
83385 + * Tear down an existing memory-range type. If @handle is remembered then it
83386 + * should be passed in to accurately tear down the correct setting (in case
83387 + * of overlapping memory regions with differing types). If it is not known
83388 + * then @handle should be set to zero. In all cases @reg must be set.
83389 + * (x86-specific).
83390 + */
83391 +#define DOM0_DEL_MEMTYPE         32
83392 +struct dom0_del_memtype {
83393 +    /* IN variables. */
83394 +    uint32_t handle;
83395 +    uint32_t reg;
83396 +};
83397 +typedef struct dom0_del_memtype dom0_del_memtype_t;
83398 +DEFINE_XEN_GUEST_HANDLE(dom0_del_memtype_t);
83399 +
83400 +/* Read current type of an MTRR (x86-specific). */
83401 +#define DOM0_READ_MEMTYPE        33
83402 +struct dom0_read_memtype {
83403 +    /* IN variables. */
83404 +    uint32_t reg;
83405 +    /* OUT variables. */
83406 +    xen_pfn_t mfn;
83407 +    uint64_t nr_mfns;
83408 +    uint32_t type;
83409 +};
83410 +typedef struct dom0_read_memtype dom0_read_memtype_t;
83411 +DEFINE_XEN_GUEST_HANDLE(dom0_read_memtype_t);
83412 +
83413 +/* Interface for controlling Xen software performance counters. */
83414 +#define DOM0_PERFCCONTROL        34
83415 +/* Sub-operations: */
83416 +#define DOM0_PERFCCONTROL_OP_RESET 1   /* Reset all counters to zero. */
83417 +#define DOM0_PERFCCONTROL_OP_QUERY 2   /* Get perfctr information. */
83418 +struct dom0_perfc_desc {
83419 +    char         name[80];             /* name of perf counter */
83420 +    uint32_t     nr_vals;              /* number of values for this counter */
83421 +    uint32_t     vals[64];             /* array of values */
83422 +};
83423 +typedef struct dom0_perfc_desc dom0_perfc_desc_t;
83424 +DEFINE_XEN_GUEST_HANDLE(dom0_perfc_desc_t);
83425 +
83426 +struct dom0_perfccontrol {
83427 +    /* IN variables. */
83428 +    uint32_t       op;                /*  DOM0_PERFCCONTROL_OP_??? */
83429 +    /* OUT variables. */
83430 +    uint32_t       nr_counters;       /*  number of counters */
83431 +    XEN_GUEST_HANDLE(dom0_perfc_desc_t) desc; /*  counter information (or NULL) */
83432 +};
83433 +typedef struct dom0_perfccontrol dom0_perfccontrol_t;
83434 +DEFINE_XEN_GUEST_HANDLE(dom0_perfccontrol_t);
83435 +
83436 +#define DOM0_MICROCODE           35
83437 +struct dom0_microcode {
83438 +    /* IN variables. */
83439 +    XEN_GUEST_HANDLE(void) data;          /* Pointer to microcode data */
83440 +    uint32_t length;                  /* Length of microcode data. */
83441 +};
83442 +typedef struct dom0_microcode dom0_microcode_t;
83443 +DEFINE_XEN_GUEST_HANDLE(dom0_microcode_t);
83444 +
83445 +#define DOM0_IOPORT_PERMISSION   36
83446 +struct dom0_ioport_permission {
83447 +    domid_t  domain;                  /* domain to be affected */
83448 +    uint32_t first_port;              /* first port in range */
83449 +    uint32_t nr_ports;                /* size of port range */
83450 +    uint8_t  allow_access;            /* allow or deny access to range? */
83451 +};
83452 +typedef struct dom0_ioport_permission dom0_ioport_permission_t;
83453 +DEFINE_XEN_GUEST_HANDLE(dom0_ioport_permission_t);
83454 +
83455 +#define DOM0_GETVCPUCONTEXT      37
83456 +struct dom0_getvcpucontext {
83457 +    /* IN variables. */
83458 +    domid_t  domain;                  /* domain to be affected */
83459 +    uint32_t vcpu;                    /* vcpu # */
83460 +    /* OUT variables. */
83461 +    XEN_GUEST_HANDLE(vcpu_guest_context_t) ctxt;
83462 +};
83463 +typedef struct dom0_getvcpucontext dom0_getvcpucontext_t;
83464 +DEFINE_XEN_GUEST_HANDLE(dom0_getvcpucontext_t);
83465 +
83466 +#define DOM0_GETVCPUINFO         43
83467 +struct dom0_getvcpuinfo {
83468 +    /* IN variables. */
83469 +    domid_t  domain;                  /* domain to be affected */
83470 +    uint32_t vcpu;                    /* vcpu # */
83471 +    /* OUT variables. */
83472 +    uint8_t  online;                  /* currently online (not hotplugged)? */
83473 +    uint8_t  blocked;                 /* blocked waiting for an event? */
83474 +    uint8_t  running;                 /* currently scheduled on its CPU? */
83475 +    uint64_t cpu_time;                /* total cpu time consumed (ns) */
83476 +    uint32_t cpu;                     /* current mapping   */
83477 +    cpumap_t cpumap;                  /* allowable mapping */
83478 +};
83479 +typedef struct dom0_getvcpuinfo dom0_getvcpuinfo_t;
83480 +DEFINE_XEN_GUEST_HANDLE(dom0_getvcpuinfo_t);
83481 +
83482 +#define DOM0_GETDOMAININFOLIST   38
83483 +struct dom0_getdomaininfolist {
83484 +    /* IN variables. */
83485 +    domid_t               first_domain;
83486 +    uint32_t              max_domains;
83487 +    XEN_GUEST_HANDLE(dom0_getdomaininfo_t) buffer;
83488 +    /* OUT variables. */
83489 +    uint32_t              num_domains;
83490 +};
83491 +typedef struct dom0_getdomaininfolist dom0_getdomaininfolist_t;
83492 +DEFINE_XEN_GUEST_HANDLE(dom0_getdomaininfolist_t);
83493 +
83494 +#define DOM0_PLATFORM_QUIRK      39
83495 +#define QUIRK_NOIRQBALANCING      1 /* Do not restrict IO-APIC RTE targets */
83496 +#define QUIRK_IOAPIC_BAD_REGSEL   2 /* IO-APIC REGSEL forgets its value    */
83497 +#define QUIRK_IOAPIC_GOOD_REGSEL  3 /* IO-APIC REGSEL behaves properly     */
83498 +struct dom0_platform_quirk {
83499 +    /* IN variables. */
83500 +    uint32_t quirk_id;
83501 +};
83502 +typedef struct dom0_platform_quirk dom0_platform_quirk_t;
83503 +DEFINE_XEN_GUEST_HANDLE(dom0_platform_quirk_t);
83504 +
83505 +#define DOM0_PHYSICAL_MEMORY_MAP 40   /* Unimplemented from 3.0.3 onwards */
83506 +struct dom0_memory_map_entry {
83507 +    uint64_t start, end;
83508 +    uint32_t flags; /* reserved */
83509 +    uint8_t  is_ram;
83510 +};
83511 +typedef struct dom0_memory_map_entry dom0_memory_map_entry_t;
83512 +DEFINE_XEN_GUEST_HANDLE(dom0_memory_map_entry_t);
83513 +
83514 +struct dom0_physical_memory_map {
83515 +    /* IN variables. */
83516 +    uint32_t max_map_entries;
83517 +    /* OUT variables. */
83518 +    uint32_t nr_map_entries;
83519 +    XEN_GUEST_HANDLE(dom0_memory_map_entry_t) memory_map;
83520 +};
83521 +typedef struct dom0_physical_memory_map dom0_physical_memory_map_t;
83522 +DEFINE_XEN_GUEST_HANDLE(dom0_physical_memory_map_t);
83523 +
83524 +#define DOM0_MAX_VCPUS 41
83525 +struct dom0_max_vcpus {
83526 +    domid_t  domain;        /* domain to be affected */
83527 +    uint32_t max;           /* maximum number of vcpus */
83528 +};
83529 +typedef struct dom0_max_vcpus dom0_max_vcpus_t;
83530 +DEFINE_XEN_GUEST_HANDLE(dom0_max_vcpus_t);
83531 +
83532 +#define DOM0_SETDOMAINHANDLE 44
83533 +struct dom0_setdomainhandle {
83534 +    domid_t domain;
83535 +    xen_domain_handle_t handle;
83536 +};
83537 +typedef struct dom0_setdomainhandle dom0_setdomainhandle_t;
83538 +DEFINE_XEN_GUEST_HANDLE(dom0_setdomainhandle_t);
83539 +
83540 +#define DOM0_SETDEBUGGING 45
83541 +struct dom0_setdebugging {
83542 +    domid_t domain;
83543 +    uint8_t enable;
83544 +};
83545 +typedef struct dom0_setdebugging dom0_setdebugging_t;
83546 +DEFINE_XEN_GUEST_HANDLE(dom0_setdebugging_t);
83547 +
83548 +#define DOM0_IRQ_PERMISSION 46
83549 +struct dom0_irq_permission {
83550 +    domid_t domain;          /* domain to be affected */
83551 +    uint8_t pirq;
83552 +    uint8_t allow_access;    /* flag to specify enable/disable of IRQ access */
83553 +};
83554 +typedef struct dom0_irq_permission dom0_irq_permission_t;
83555 +DEFINE_XEN_GUEST_HANDLE(dom0_irq_permission_t);
83556 +
83557 +#define DOM0_IOMEM_PERMISSION 47
83558 +struct dom0_iomem_permission {
83559 +    domid_t  domain;          /* domain to be affected */
83560 +    xen_pfn_t first_mfn;      /* first page (physical page number) in range */
83561 +    uint64_t nr_mfns;         /* number of pages in range (>0) */
83562 +    uint8_t allow_access;     /* allow (!0) or deny (0) access to range? */
83563 +};
83564 +typedef struct dom0_iomem_permission dom0_iomem_permission_t;
83565 +DEFINE_XEN_GUEST_HANDLE(dom0_iomem_permission_t);
83566 +
83567 +#define DOM0_HYPERCALL_INIT   48
83568 +struct dom0_hypercall_init {
83569 +    domid_t  domain;          /* domain to be affected */
83570 +    xen_pfn_t mfn;            /* machine frame to be initialised */
83571 +};
83572 +typedef struct dom0_hypercall_init dom0_hypercall_init_t;
83573 +DEFINE_XEN_GUEST_HANDLE(dom0_hypercall_init_t);
83574 +
83575 +#define DOM0_DOMAIN_SETUP     49
83576 +#define _XEN_DOMAINSETUP_hvm_guest 0
83577 +#define XEN_DOMAINSETUP_hvm_guest  (1UL<<_XEN_DOMAINSETUP_hvm_guest)
83578 +typedef struct dom0_domain_setup {
83579 +    domid_t  domain;          /* domain to be affected */
83580 +    unsigned long flags;      /* XEN_DOMAINSETUP_* */
83581 +#ifdef __ia64__
83582 +    unsigned long bp;         /* mpaddr of boot param area */
83583 +    unsigned long maxmem;        /* Highest memory address for MDT.  */
83584 +#endif
83585 +} dom0_domain_setup_t;
83586 +DEFINE_XEN_GUEST_HANDLE(dom0_domain_setup_t);
83587 +
83588 +#define DOM0_SETTIMEOFFSET    50
83589 +struct dom0_settimeoffset {
83590 +    domid_t  domain;
83591 +    int32_t  time_offset_seconds; /* applied to domain wallclock time */
83592 +};
83593 +typedef struct dom0_settimeoffset dom0_settimeoffset_t;
83594 +DEFINE_XEN_GUEST_HANDLE(dom0_settimeoffset_t);
83595 +
83596 +struct dom0_op {
83597 +    uint32_t cmd;
83598 +    uint32_t interface_version; /* DOM0_INTERFACE_VERSION */
83599 +    union {
83600 +        struct dom0_createdomain      createdomain;
83601 +        struct dom0_pausedomain       pausedomain;
83602 +        struct dom0_unpausedomain     unpausedomain;
83603 +        struct dom0_destroydomain     destroydomain;
83604 +        struct dom0_getmemlist        getmemlist;
83605 +        struct sched_ctl_cmd          schedctl;
83606 +        struct sched_adjdom_cmd       adjustdom;
83607 +        struct dom0_setvcpucontext    setvcpucontext;
83608 +        struct dom0_getdomaininfo     getdomaininfo;
83609 +        struct dom0_getpageframeinfo  getpageframeinfo;
83610 +        struct dom0_msr               msr;
83611 +        struct dom0_settime           settime;
83612 +        struct dom0_readconsole       readconsole;
83613 +        struct dom0_setvcpuaffinity   setvcpuaffinity;
83614 +        struct dom0_tbufcontrol       tbufcontrol;
83615 +        struct dom0_physinfo          physinfo;
83616 +        struct dom0_sched_id          sched_id;
83617 +        struct dom0_shadow_control    shadow_control;
83618 +        struct dom0_setdomainmaxmem   setdomainmaxmem;
83619 +        struct dom0_getpageframeinfo2 getpageframeinfo2;
83620 +        struct dom0_add_memtype       add_memtype;
83621 +        struct dom0_del_memtype       del_memtype;
83622 +        struct dom0_read_memtype      read_memtype;
83623 +        struct dom0_perfccontrol      perfccontrol;
83624 +        struct dom0_microcode         microcode;
83625 +        struct dom0_ioport_permission ioport_permission;
83626 +        struct dom0_getvcpucontext    getvcpucontext;
83627 +        struct dom0_getvcpuinfo       getvcpuinfo;
83628 +        struct dom0_getdomaininfolist getdomaininfolist;
83629 +        struct dom0_platform_quirk    platform_quirk;
83630 +        struct dom0_physical_memory_map physical_memory_map;
83631 +        struct dom0_max_vcpus         max_vcpus;
83632 +        struct dom0_setdomainhandle   setdomainhandle;
83633 +        struct dom0_setdebugging      setdebugging;
83634 +        struct dom0_irq_permission    irq_permission;
83635 +        struct dom0_iomem_permission  iomem_permission;
83636 +        struct dom0_hypercall_init    hypercall_init;
83637 +        struct dom0_domain_setup      domain_setup;
83638 +        struct dom0_settimeoffset     settimeoffset;
83639 +        uint8_t                       pad[128];
83640 +    } u;
83641 +};
83642 +typedef struct dom0_op dom0_op_t;
83643 +DEFINE_XEN_GUEST_HANDLE(dom0_op_t);
83644 +
83645 +#endif /* __XEN_PUBLIC_DOM0_OPS_H__ */
83646 +
83647 +/*
83648 + * Local variables:
83649 + * mode: C
83650 + * c-set-style: "BSD"
83651 + * c-basic-offset: 4
83652 + * tab-width: 4
83653 + * indent-tabs-mode: nil
83654 + * End:
83655 + */
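A sketch of issuing one of the operations defined above through the multiplexing dom0_op structure; HYPERVISOR_dom0_op() is assumed to be the privileged (domain-0) kernel's wrapper for the dom0_op hypercall.

    static long query_domain_info(domid_t dom, struct dom0_getdomaininfo *out)
    {
        struct dom0_op op = {
            .cmd               = DOM0_GETDOMAININFO,
            .interface_version = DOM0_INTERFACE_VERSION,
        };
        long rc;

        op.u.getdomaininfo.domain = dom;   /* NB. IN/OUT field */
        rc = HYPERVISOR_dom0_op(&op);
        if (rc == 0)
            *out = op.u.getdomaininfo;
        return rc;
    }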
83656 diff -urNp linux-2.6/include/xen/interface/event_channel.h new/include/xen/interface/event_channel.h
83657 --- linux-2.6/include/xen/interface/event_channel.h     1970-01-01 01:00:00.000000000 +0100
83658 +++ new/include/xen/interface/event_channel.h   2006-05-23 18:42:17.000000000 +0200
83659 @@ -0,0 +1,233 @@
83660 +/******************************************************************************
83661 + * event_channel.h
83662 + * 
83663 + * Event channels between domains.
83664 + * 
83665 + * Copyright (c) 2003-2004, K A Fraser.
83666 + */
83667 +
83668 +#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
83669 +#define __XEN_PUBLIC_EVENT_CHANNEL_H__
83670 +
83671 +/*
83672 + * Prototype for this hypercall is:
83673 + *  int event_channel_op(int cmd, void *args)
83674 + * @cmd  == EVTCHNOP_??? (event-channel operation).
83675 + * @args == Operation-specific extra arguments (NULL if none).
83676 + */
83677 +
83678 +typedef uint32_t evtchn_port_t;
83679 +DEFINE_XEN_GUEST_HANDLE(evtchn_port_t);
83680 +
83681 +/*
83682 + * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
83683 + * accepting interdomain bindings from domain <remote_dom>. A fresh port
83684 + * is allocated in <dom> and returned as <port>.
83685 + * NOTES:
83686 + *  1. If the caller is unprivileged then <dom> must be DOMID_SELF.
83687 + *  2. <remote_dom> may be DOMID_SELF, allowing loopback connections.
83688 + */
83689 +#define EVTCHNOP_alloc_unbound    6
83690 +struct evtchn_alloc_unbound {
83691 +    /* IN parameters */
83692 +    domid_t dom, remote_dom;
83693 +    /* OUT parameters */
83694 +    evtchn_port_t port;
83695 +};
83696 +typedef struct evtchn_alloc_unbound evtchn_alloc_unbound_t;
83697 +
83698 +/*
83699 + * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
83700 + * the calling domain and <remote_dom>. <remote_dom,remote_port> must identify
83701 + * a port that is unbound and marked as accepting bindings from the calling
83702 + * domain. A fresh port is allocated in the calling domain and returned as
83703 + * <local_port>.
83704 + * NOTES:
83705 + *  1. <remote_dom> may be DOMID_SELF, allowing loopback connections.
83706 + */
83707 +#define EVTCHNOP_bind_interdomain 0
83708 +struct evtchn_bind_interdomain {
83709 +    /* IN parameters. */
83710 +    domid_t remote_dom;
83711 +    evtchn_port_t remote_port;
83712 +    /* OUT parameters. */
83713 +    evtchn_port_t local_port;
83714 +};
83715 +typedef struct evtchn_bind_interdomain evtchn_bind_interdomain_t;
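A sketch of the usual two-sided handshake built from the two operations above: one domain offers an unbound port to a peer (typically advertising it via xenstore), and the peer then binds to it. HYPERVISOR_event_channel_op() is an assumed hypercall wrapper.

    /* Offering side: allocate a port that domain 'peer' may bind to later. */
    static int offer_port_to_peer(domid_t peer, evtchn_port_t *port)
    {
        struct evtchn_alloc_unbound alloc = {
            .dom        = DOMID_SELF,
            .remote_dom = peer,
        };
        int rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &alloc);
        if (rc == 0)
            *port = alloc.port;     /* advertise this to the peer */
        return rc;
    }

    /* Binding side: connect to the advertised <peer, remote_port> pair. */
    static int connect_to_peer(domid_t peer, evtchn_port_t remote_port,
                               evtchn_port_t *local_port)
    {
        struct evtchn_bind_interdomain bind = {
            .remote_dom  = peer,
            .remote_port = remote_port,
        };
        int rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &bind);
        if (rc == 0)
            *local_port = bind.local_port;
        return rc;
    }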
83716 +
83717 +/*
83718 + * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified
83719 + * vcpu.
83720 + * NOTES:
83721 + *  1. Virtual IRQs are classified as per-vcpu or global. See the VIRQ list
83722 + *     in xen.h for the classification of each VIRQ.
83723 + *  2. Global VIRQs must be allocated on VCPU0 but can subsequently be
83724 + *     re-bound via EVTCHNOP_bind_vcpu.
83725 + *  3. Per-vcpu VIRQs may be bound to at most one event channel per vcpu.
83726 + *     The allocated event channel is bound to the specified vcpu and the
83727 + *     binding cannot be changed.
83728 + */
83729 +#define EVTCHNOP_bind_virq        1
83730 +struct evtchn_bind_virq {
83731 +    /* IN parameters. */
83732 +    uint32_t virq;
83733 +    uint32_t vcpu;
83734 +    /* OUT parameters. */
83735 +    evtchn_port_t port;
83736 +};
83737 +typedef struct evtchn_bind_virq evtchn_bind_virq_t;
83738 +
83739 +/*
83740 + * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.
83741 + * NOTES:
83742 + *  1. A physical IRQ may be bound to at most one event channel per domain.
83743 + *  2. Only a sufficiently-privileged domain may bind to a physical IRQ.
83744 + */
83745 +#define EVTCHNOP_bind_pirq        2
83746 +struct evtchn_bind_pirq {
83747 +    /* IN parameters. */
83748 +    uint32_t pirq;
83749 +#define BIND_PIRQ__WILL_SHARE 1
83750 +    uint32_t flags; /* BIND_PIRQ__* */
83751 +    /* OUT parameters. */
83752 +    evtchn_port_t port;
83753 +};
83754 +typedef struct evtchn_bind_pirq evtchn_bind_pirq_t;
83755 +
83756 +/*
83757 + * EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
83758 + * NOTES:
83759 + *  1. The allocated event channel is bound to the specified vcpu. The binding
83760 + *     may not be changed.
83761 + */
83762 +#define EVTCHNOP_bind_ipi         7
83763 +struct evtchn_bind_ipi {
83764 +    uint32_t vcpu;
83765 +    /* OUT parameters. */
83766 +    evtchn_port_t port;
83767 +};
83768 +typedef struct evtchn_bind_ipi evtchn_bind_ipi_t;
83769 +
83770 +/*
83771 + * EVTCHNOP_close: Close a local event channel <port>. If the channel is
83772 + * interdomain then the remote end is placed in the unbound state
83773 + * (EVTCHNSTAT_unbound), awaiting a new connection.
83774 + */
83775 +#define EVTCHNOP_close            3
83776 +struct evtchn_close {
83777 +    /* IN parameters. */
83778 +    evtchn_port_t port;
83779 +};
83780 +typedef struct evtchn_close evtchn_close_t;
83781 +
83782 +/*
83783 + * EVTCHNOP_send: Send an event to the remote end of the channel whose local
83784 + * endpoint is <port>.
83785 + */
83786 +#define EVTCHNOP_send             4
83787 +struct evtchn_send {
83788 +    /* IN parameters. */
83789 +    evtchn_port_t port;
83790 +};
83791 +typedef struct evtchn_send evtchn_send_t;
83792 +
83793 +/*
83794 + * EVTCHNOP_status: Get the current status of the communication channel which
83795 + * has an endpoint at <dom, port>.
83796 + * NOTES:
83797 + *  1. <dom> may be specified as DOMID_SELF.
83798 + *  2. Only a sufficiently-privileged domain may obtain the status of an event
83799 + *     channel for which <dom> is not DOMID_SELF.
83800 + */
83801 +#define EVTCHNOP_status           5
83802 +struct evtchn_status {
83803 +    /* IN parameters */
83804 +    domid_t  dom;
83805 +    evtchn_port_t port;
83806 +    /* OUT parameters */
83807 +#define EVTCHNSTAT_closed       0  /* Channel is not in use.                 */
83808 +#define EVTCHNSTAT_unbound      1  /* Channel is waiting interdom connection.*/
83809 +#define EVTCHNSTAT_interdomain  2  /* Channel is connected to remote domain. */
83810 +#define EVTCHNSTAT_pirq         3  /* Channel is bound to a phys IRQ line.   */
83811 +#define EVTCHNSTAT_virq         4  /* Channel is bound to a virtual IRQ line */
83812 +#define EVTCHNSTAT_ipi          5  /* Channel is bound to a virtual IPI line */
83813 +    uint32_t status;
83814 +    uint32_t vcpu;                 /* VCPU to which this channel is bound.   */
83815 +    union {
83816 +        struct {
83817 +            domid_t dom;
83818 +        } unbound; /* EVTCHNSTAT_unbound */
83819 +        struct {
83820 +            domid_t dom;
83821 +            evtchn_port_t port;
83822 +        } interdomain; /* EVTCHNSTAT_interdomain */
83823 +        uint32_t pirq;      /* EVTCHNSTAT_pirq        */
83824 +        uint32_t virq;      /* EVTCHNSTAT_virq        */
83825 +    } u;
83826 +};
83827 +typedef struct evtchn_status evtchn_status_t;
83828 +
83829 +/*
83830 + * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
83831 + * event is pending.
83832 + * NOTES:
83833 + *  1. IPI-bound channels always notify the vcpu specified at bind time.
83834 + *     This binding cannot be changed.
83835 + *  2. Per-VCPU VIRQ channels always notify the vcpu specified at bind time.
83836 + *     This binding cannot be changed.
83837 + *  3. All other channels notify vcpu0 by default. This default is set when
83838 + *     the channel is allocated (a port that is freed and subsequently reused
83839 + *     has its binding reset to vcpu0).
83840 + */
83841 +#define EVTCHNOP_bind_vcpu        8
83842 +struct evtchn_bind_vcpu {
83843 +    /* IN parameters. */
83844 +    evtchn_port_t port;
83845 +    uint32_t vcpu;
83846 +};
83847 +typedef struct evtchn_bind_vcpu evtchn_bind_vcpu_t;
83848 +
83849 +/*
83850 + * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
83851 + * a notification to the appropriate VCPU if an event is pending.
83852 + */
83853 +#define EVTCHNOP_unmask           9
83854 +struct evtchn_unmask {
83855 +    /* IN parameters. */
83856 +    evtchn_port_t port;
83857 +};
83858 +typedef struct evtchn_unmask evtchn_unmask_t;
83859 +
83860 +/*
83861 + * Argument to event_channel_op_compat() hypercall. Superseded by the new
83862 + * event_channel_op() hypercall since 0x00030202.
83863 + */
83864 +struct evtchn_op {
83865 +    uint32_t cmd; /* EVTCHNOP_* */
83866 +    union {
83867 +        struct evtchn_alloc_unbound    alloc_unbound;
83868 +        struct evtchn_bind_interdomain bind_interdomain;
83869 +        struct evtchn_bind_virq        bind_virq;
83870 +        struct evtchn_bind_pirq        bind_pirq;
83871 +        struct evtchn_bind_ipi         bind_ipi;
83872 +        struct evtchn_close            close;
83873 +        struct evtchn_send             send;
83874 +        struct evtchn_status           status;
83875 +        struct evtchn_bind_vcpu        bind_vcpu;
83876 +        struct evtchn_unmask           unmask;
83877 +    } u;
83878 +};
83879 +typedef struct evtchn_op evtchn_op_t;
83880 +DEFINE_XEN_GUEST_HANDLE(evtchn_op_t);
83881 +
83882 +#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
83883 +
83884 +/*
83885 + * Local variables:
83886 + * mode: C
83887 + * c-set-style: "BSD"
83888 + * c-basic-offset: 4
83889 + * tab-width: 4
83890 + * indent-tabs-mode: nil
83891 + * End:
83892 + */
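A small sketch of driving the older single-argument interface mentioned above, here sending an event on an already-bound port; HYPERVISOR_event_channel_op_compat() is an assumed wrapper name.

    static int notify_remote_compat(evtchn_port_t port)
    {
        struct evtchn_op op = {
            .cmd    = EVTCHNOP_send,
            .u.send = { .port = port },
        };
        return HYPERVISOR_event_channel_op_compat(&op);
    }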
83893 diff -urNp linux-2.6/include/xen/interface/features.h new/include/xen/interface/features.h
83894 --- linux-2.6/include/xen/interface/features.h  1970-01-01 01:00:00.000000000 +0100
83895 +++ new/include/xen/interface/features.h        2006-05-09 12:35:56.000000000 +0200
83896 @@ -0,0 +1,53 @@
83897 +/******************************************************************************
83898 + * features.h
83899 + * 
83900 + * Feature flags, reported by XENVER_get_features.
83901 + * 
83902 + * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
83903 + */
83904 +
83905 +#ifndef __XEN_PUBLIC_FEATURES_H__
83906 +#define __XEN_PUBLIC_FEATURES_H__
83907 +
83908 +/*
83909 + * If set, the guest does not need to write-protect its pagetables, and can
83910 + * update them via direct writes.
83911 + */
83912 +#define XENFEAT_writable_page_tables       0
83913 +
83914 +/*
83915 + * If set, the guest does not need to write-protect its segment descriptor
83916 + * tables, and can update them via direct writes.
83917 + */
83918 +#define XENFEAT_writable_descriptor_tables 1
83919 +
83920 +/*
83921 + * If set, translation between the guest's 'pseudo-physical' address space
83922 + * and the host's machine address space is handled by the hypervisor. In this
83923 + * mode the guest does not need to perform phys-to/from-machine translations
83924 + * when performing page table operations.
83925 + */
83926 +#define XENFEAT_auto_translated_physmap    2
83927 +
83928 +/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
83929 +#define XENFEAT_supervisor_mode_kernel     3
83930 +
83931 +/*
83932 + * If set, the guest does not need to allocate x86 PAE page directories
83933 + * below 4GB. This flag is usually implied by auto_translated_physmap.
83934 + */
83935 +#define XENFEAT_pae_pgdir_above_4gb        4
83936 +
83937 +#define XENFEAT_NR_SUBMAPS 1
83938 +
83939 +#endif /* __XEN_PUBLIC_FEATURES_H__ */
83940 +
83941 +/*
83942 + * Local variables:
83943 + * mode: C
83944 + * c-set-style: "BSD"
83945 + * c-basic-offset: 4
83946 + * tab-width: 4
83947 + * indent-tabs-mode: nil
83948 + * End:
83949 + */
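A sketch of how a guest might test the feature bits defined above, assuming xen_features[] has been populated at boot from the XENVER_get_features version operation (each submap being a 32-bit word of flags).

    static uint32_t xen_features[XENFEAT_NR_SUBMAPS];

    static inline int xen_feature(unsigned int flag)
    {
        return (xen_features[flag / 32] >> (flag % 32)) & 1;
    }

    /* e.g.: if (xen_feature(XENFEAT_writable_page_tables)) { ...skip write-protecting pagetables... } */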
83950 diff -urNp linux-2.6/include/xen/interface/grant_table.h new/include/xen/interface/grant_table.h
83951 --- linux-2.6/include/xen/interface/grant_table.h       1970-01-01 01:00:00.000000000 +0100
83952 +++ new/include/xen/interface/grant_table.h     2006-06-07 13:29:36.000000000 +0200
83953 @@ -0,0 +1,317 @@
83954 +/******************************************************************************
83955 + * grant_table.h
83956 + * 
83957 + * Interface for granting foreign access to page frames, and receiving
83958 + * page-ownership transfers.
83959 + * 
83960 + * Copyright (c) 2004, K A Fraser
83961 + */
83962 +
83963 +#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
83964 +#define __XEN_PUBLIC_GRANT_TABLE_H__
83965 +
83966 +
83967 +/***********************************
83968 + * GRANT TABLE REPRESENTATION
83969 + */
83970 +
83971 +/* Some rough guidelines on accessing and updating grant-table entries
83972 + * in a concurrency-safe manner. For more information, Linux contains a
83973 + * reference implementation for guest OSes (arch/xen/kernel/grant_table.c).
83974 + * 
83975 + * NB. WMB is a no-op on current-generation x86 processors. However, a
83976 + *     compiler barrier will still be required.
83977 + * 
83978 + * Introducing a valid entry into the grant table:
83979 + *  1. Write ent->domid.
83980 + *  2. Write ent->frame:
83981 + *      GTF_permit_access:   Frame to which access is permitted.
83982 + *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
83983 + *                           frame, or zero if none.
83984 + *  3. Write memory barrier (WMB).
83985 + *  4. Write ent->flags, inc. valid type.
83986 + * 
83987 + * Invalidating an unused GTF_permit_access entry:
83988 + *  1. flags = ent->flags.
83989 + *  2. Observe that !(flags & (GTF_reading|GTF_writing)).
83990 + *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
83991 + *  NB. No need for WMB as reuse of entry is control-dependent on success of
83992 + *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
83993 + *
83994 + * Invalidating an in-use GTF_permit_access entry:
83995 + *  This cannot be done directly. Request assistance from the domain controller
83996 + *  which can set a timeout on the use of a grant entry and take necessary
83997 + *  action. (NB. This is not yet implemented!).
83998 + * 
83999 + * Invalidating an unused GTF_accept_transfer entry:
84000 + *  1. flags = ent->flags.
84001 + *  2. Observe that !(flags & GTF_transfer_committed). [*]
84002 + *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
84003 + *  NB. No need for WMB as reuse of entry is control-dependent on success of
84004 + *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
84005 + *  [*] If GTF_transfer_committed is set then the grant entry is 'committed'.
84006 + *      The guest must /not/ modify the grant entry until the address of the
84007 + *      transferred frame is written. It is safe for the guest to spin waiting
84008 + *      for this to occur (detect by observing GTF_transfer_completed in
84009 + *      ent->flags).
84010 + *
84011 + * Invalidating a committed GTF_accept_transfer entry:
84012 + *  1. Wait for (ent->flags & GTF_transfer_completed).
84013 + *
84014 + * Changing a GTF_permit_access from writable to read-only:
84015 + *  Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing.
84016 + * 
84017 + * Changing a GTF_permit_access from read-only to writable:
84018 + *  Use SMP-safe bit-setting instruction.
84019 + */
84020 +
84021 +/*
84022 + * A grant table comprises a packed array of grant entries in one or more
84023 + * page frames shared between Xen and a guest.
84024 + * [XEN]: This field is written by Xen and read by the sharing guest.
84025 + * [GST]: This field is written by the guest and read by Xen.
84026 + */
84027 +struct grant_entry {
84028 +    /* GTF_xxx: various type and flag information.  [XEN,GST] */
84029 +    uint16_t flags;
84030 +    /* The domain being granted foreign privileges. [GST] */
84031 +    domid_t  domid;
84032 +    /*
84033 +     * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
84034 +     * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN]
84035 +     */
84036 +    uint32_t frame;
84037 +};
84038 +typedef struct grant_entry grant_entry_t;
84039 +
84040 +/*
84041 + * Type of grant entry.
84042 + *  GTF_invalid: This grant entry grants no privileges.
84043 + *  GTF_permit_access: Allow @domid to map/access @frame.
84044 + *  GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
84045 + *                       to this guest. Xen writes the page number to @frame.
84046 + */
84047 +#define GTF_invalid         (0U<<0)
84048 +#define GTF_permit_access   (1U<<0)
84049 +#define GTF_accept_transfer (2U<<0)
84050 +#define GTF_type_mask       (3U<<0)
84051 +
84052 +/*
84053 + * Subflags for GTF_permit_access.
84054 + *  GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
84055 + *  GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
84056 + *  GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
84057 + */
84058 +#define _GTF_readonly       (2)
84059 +#define GTF_readonly        (1U<<_GTF_readonly)
84060 +#define _GTF_reading        (3)
84061 +#define GTF_reading         (1U<<_GTF_reading)
84062 +#define _GTF_writing        (4)
84063 +#define GTF_writing         (1U<<_GTF_writing)
84064 +
84065 +/*
84066 + * Subflags for GTF_accept_transfer:
84067 + *  GTF_transfer_committed: Xen sets this flag to indicate that it is committed
84068 + *      to transferring ownership of a page frame. When a guest sees this flag
84069 + *      it must /not/ modify the grant entry until GTF_transfer_completed is
84070 + *      set by Xen.
84071 + *  GTF_transfer_completed: It is safe for the guest to spin-wait on this flag
84072 + *      after reading GTF_transfer_committed. Xen will always write the frame
84073 + *      address, followed by ORing this flag, in a timely manner.
84074 + */
84075 +#define _GTF_transfer_committed (2)
84076 +#define GTF_transfer_committed  (1U<<_GTF_transfer_committed)
84077 +#define _GTF_transfer_completed (3)
84078 +#define GTF_transfer_completed  (1U<<_GTF_transfer_completed)
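Following the update rules given above, a minimal sketch of introducing a valid GTF_permit_access entry: write domid and frame first, then a write barrier, then the flags (wmb() is the usual kernel write barrier).

    static void grant_frame_readonly(grant_entry_t *ent, domid_t remote, uint32_t frame)
    {
        ent->domid = remote;            /* step 1: the domain being granted access */
        ent->frame = frame;             /* step 2: the frame it may map            */
        wmb();                          /* step 3: order the above before flags    */
        ent->flags = GTF_permit_access | GTF_readonly;   /* step 4: entry is valid */
    }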
84079 +
84080 +
84081 +/***********************************
84082 + * GRANT TABLE QUERIES AND USES
84083 + */
84084 +
84085 +/*
84086 + * Reference to a grant entry in a specified domain's grant table.
84087 + */
84088 +typedef uint32_t grant_ref_t;
84089 +
84090 +/*
84091 + * Handle to track a mapping created via a grant reference.
84092 + */
84093 +typedef uint32_t grant_handle_t;
84094 +
84095 +/*
84096 + * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
84097 + * by devices and/or host CPUs. If successful, <handle> is a tracking number
84098 + * that must be presented later to destroy the mapping(s). On error, <handle>
84099 + * is a negative status code.
84100 + * NOTES:
84101 + *  1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address
84102 + *     via which I/O devices may access the granted frame.
84103 + *  2. If GNTMAP_host_map is specified then a mapping will be added at
84104 + *     either a host virtual address in the current address space, or at
84105 + *     a PTE at the specified machine address.  The type of mapping to
84106 + *     perform is selected through the GNTMAP_contains_pte flag, and the 
84107 + *     address is specified in <host_addr>.
84108 + *  3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a
84109 + *     host mapping is destroyed by other means then it is *NOT* guaranteed
84110 + *     to be accounted to the correct grant reference!
84111 + */
84112 +#define GNTTABOP_map_grant_ref        0
84113 +struct gnttab_map_grant_ref {
84114 +    /* IN parameters. */
84115 +    uint64_t host_addr;
84116 +    uint32_t flags;               /* GNTMAP_* */
84117 +    grant_ref_t ref;
84118 +    domid_t  dom;
84119 +    /* OUT parameters. */
84120 +    int16_t  status;              /* GNTST_* */
84121 +    grant_handle_t handle;
84122 +    uint64_t dev_bus_addr;
84123 +};
84124 +typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t;
84125 +DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t);
84126 +
84127 +/*
84128 + * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
84129 + * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that
84130 + * field is ignored. If non-zero, they must refer to a device/host mapping
84131 + * that is tracked by <handle>.
84132 + * NOTES:
84133 + *  1. The call may fail in an undefined manner if either mapping is not
84134 + *     tracked by <handle>.
84135 + *  2. After executing a batch of unmaps, it is guaranteed that no stale
84136 + *     mappings will remain in the device or host TLBs.
84137 + */
84138 +#define GNTTABOP_unmap_grant_ref      1
84139 +struct gnttab_unmap_grant_ref {
84140 +    /* IN parameters. */
84141 +    uint64_t host_addr;
84142 +    uint64_t dev_bus_addr;
84143 +    grant_handle_t handle;
84144 +    /* OUT parameters. */
84145 +    int16_t  status;              /* GNTST_* */
84146 +};
84147 +typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t;
84148 +DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
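A sketch of mapping a foreign frame for host-CPU access with the structures above; HYPERVISOR_grant_table_op(cmd, uop, count) is assumed to wrap the grant-table hypercall, and host_vaddr is a virtual address chosen by the caller.

    static int map_foreign_frame(domid_t dom, grant_ref_t ref,
                                 uint64_t host_vaddr, grant_handle_t *handle)
    {
        struct gnttab_map_grant_ref map = {
            .host_addr = host_vaddr,
            .flags     = GNTMAP_host_map,   /* host mapping at host_addr */
            .ref       = ref,
            .dom       = dom,
        };

        if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map, 1))
            return -1;
        if (map.status != GNTST_okay)
            return map.status;              /* negative GNTST_* code */
        *handle = map.handle;               /* keep for GNTTABOP_unmap_grant_ref */
        return 0;
    }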
84149 +
84150 +/*
84151 + * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
84152 + * <nr_frames> pages. The frame addresses are written to the <frame_list>.
84153 + * Only <nr_frames> addresses are written, even if the table is larger.
84154 + * NOTES:
84155 + *  1. <dom> may be specified as DOMID_SELF.
84156 + *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
84157 + *  3. Xen may not support more than a single grant-table page per domain.
84158 + */
84159 +#define GNTTABOP_setup_table          2
84160 +struct gnttab_setup_table {
84161 +    /* IN parameters. */
84162 +    domid_t  dom;
84163 +    uint32_t nr_frames;
84164 +    /* OUT parameters. */
84165 +    int16_t  status;              /* GNTST_* */
84166 +    XEN_GUEST_HANDLE(ulong) frame_list;
84167 +};
84168 +typedef struct gnttab_setup_table gnttab_setup_table_t;
84169 +DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t);
84170 +
84171 +/*
84172 + * GNTTABOP_dump_table: Dump the contents of the grant table to the
84173 + * xen console. Debugging use only.
84174 + */
84175 +#define GNTTABOP_dump_table           3
84176 +struct gnttab_dump_table {
84177 +    /* IN parameters. */
84178 +    domid_t dom;
84179 +    /* OUT parameters. */
84180 +    int16_t status;               /* GNTST_* */
84181 +};
84182 +typedef struct gnttab_dump_table gnttab_dump_table_t;
84183 +DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t);
84184 +
84185 +/*
84186 + * GNTTABOP_transfer_grant_ref: Transfer <frame> to a foreign domain. The
84187 + * foreign domain has previously registered its interest in the transfer via
84188 + * <domid, ref>.
84189 + * 
84190 + * Note that, even if the transfer fails, the specified page no longer belongs
84191 + * to the calling domain *unless* the error is GNTST_bad_page.
84192 + */
84193 +#define GNTTABOP_transfer                4
84194 +struct gnttab_transfer {
84195 +    /* IN parameters. */
84196 +    xen_pfn_t     mfn;
84197 +    domid_t       domid;
84198 +    grant_ref_t   ref;
84199 +    /* OUT parameters. */
84200 +    int16_t       status;
84201 +};
84202 +typedef struct gnttab_transfer gnttab_transfer_t;
84203 +DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t);
84204 +
84205 +/*
84206 + * Bitfield values for gnttab_map_grant_ref.flags (GNTMAP_*).
84207 + */
84208 + /* Map the grant entry for access by I/O devices. */
84209 +#define _GNTMAP_device_map      (0)
84210 +#define GNTMAP_device_map       (1<<_GNTMAP_device_map)
84211 + /* Map the grant entry for access by host CPUs. */
84212 +#define _GNTMAP_host_map        (1)
84213 +#define GNTMAP_host_map         (1<<_GNTMAP_host_map)
84214 + /* Accesses to the granted frame will be restricted to read-only access. */
84215 +#define _GNTMAP_readonly        (2)
84216 +#define GNTMAP_readonly         (1<<_GNTMAP_readonly)
84217 + /*
84218 +  * GNTMAP_host_map subflag:
84219 +  *  0 => The host mapping is usable only by the guest OS.
84220 +  *  1 => The host mapping is usable by guest OS + current application.
84221 +  */
84222 +#define _GNTMAP_application_map (3)
84223 +#define GNTMAP_application_map  (1<<_GNTMAP_application_map)
84224 +
84225 + /*
84226 +  * GNTMAP_contains_pte subflag:
84227 +  *  0 => This map request contains a host virtual address.
84228 +  *  1 => This map request contains the machine address of the PTE to update.
84229 +  */
84230 +#define _GNTMAP_contains_pte    (4)
84231 +#define GNTMAP_contains_pte     (1<<_GNTMAP_contains_pte)
84232 +
84233 +/*
84234 + * Values for error status returns. All errors are -ve.
84235 + */
84236 +#define GNTST_okay             (0)  /* Normal return.                        */
84237 +#define GNTST_general_error    (-1) /* General undefined error.              */
84238 +#define GNTST_bad_domain       (-2) /* Unrecognised domain id.               */
84239 +#define GNTST_bad_gntref       (-3) /* Unrecognised or inappropriate gntref. */
84240 +#define GNTST_bad_handle       (-4) /* Unrecognised or inappropriate handle. */
84241 +#define GNTST_bad_virt_addr    (-5) /* Inappropriate virtual address to map. */
84242 +#define GNTST_bad_dev_addr     (-6) /* Inappropriate device address to unmap.*/
84243 +#define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.              */
84244 +#define GNTST_permission_denied (-8) /* Not enough privilege for operation.  */
84245 +#define GNTST_bad_page         (-9) /* Specified page was invalid for op.    */
84246 +
84247 +#define GNTTABOP_error_msgs {                   \
84248 +    "okay",                                     \
84249 +    "undefined error",                          \
84250 +    "unrecognised domain id",                   \
84251 +    "invalid grant reference",                  \
84252 +    "invalid mapping handle",                   \
84253 +    "invalid virtual address",                  \
84254 +    "invalid device address",                   \
84255 +    "no spare translation slot in the I/O MMU", \
84256 +    "permission denied",                        \
84257 +    "bad page"                                  \
84258 +}
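Since the GNTST_* values above are zero or negative, the message table can be indexed by the negated status; a small illustrative helper:

    static const char *gnttab_strerror(int16_t status)
    {
        static const char *msgs[] = GNTTABOP_error_msgs;

        if (status > 0 || -status >= (int)(sizeof(msgs) / sizeof(msgs[0])))
            return "unknown grant-table status";
        return msgs[-status];
    }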
84259 +
84260 +#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
84261 +
84262 +/*
84263 + * Local variables:
84264 + * mode: C
84265 + * c-set-style: "BSD"
84266 + * c-basic-offset: 4
84267 + * tab-width: 4
84268 + * indent-tabs-mode: nil
84269 + * End:
84270 + */
84271 diff -urNp linux-2.6/include/xen/interface/hvm/hvm_info_table.h new/include/xen/interface/hvm/hvm_info_table.h
84272 --- linux-2.6/include/xen/interface/hvm/hvm_info_table.h        1970-01-01 01:00:00.000000000 +0100
84273 +++ new/include/xen/interface/hvm/hvm_info_table.h      2006-05-09 12:35:56.000000000 +0200
84274 @@ -0,0 +1,24 @@
84275 +/******************************************************************************
84276 + * hvm/hvm_info_table.h
84277 + * 
84278 + * HVM parameter and information table, written into guest memory map.
84279 + */
84280 +
84281 +#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
84282 +#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
84283 +
84284 +#define HVM_INFO_PFN         0x09F
84285 +#define HVM_INFO_OFFSET      0x800
84286 +#define HVM_INFO_PADDR       ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
84287 +
84288 +struct hvm_info_table {
84289 +    char        signature[8]; /* "HVM INFO" */
84290 +    uint32_t    length;
84291 +    uint8_t     checksum;
84292 +    uint8_t     acpi_enabled;
84293 +    uint8_t     apic_enabled;
84294 +    uint8_t     pae_enabled;
84295 +    uint32_t    nr_vcpus;
84296 +};
84297 +
84298 +#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
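An illustrative sketch of locating the table from inside an HVM guest: it sits at guest-physical HVM_INFO_PADDR, so with a direct physical mapping available (phys_to_virt() is assumed usable here) the signature can be checked before trusting the contents.

    static struct hvm_info_table *get_hvm_info_table(void)
    {
        struct hvm_info_table *t = phys_to_virt(HVM_INFO_PADDR);

        if (memcmp(t->signature, "HVM INFO", 8) != 0)
            return NULL;        /* table absent or corrupt */
        return t;
    }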
84299 diff -urNp linux-2.6/include/xen/interface/hvm/ioreq.h new/include/xen/interface/hvm/ioreq.h
84300 --- linux-2.6/include/xen/interface/hvm/ioreq.h 1970-01-01 01:00:00.000000000 +0100
84301 +++ new/include/xen/interface/hvm/ioreq.h       2006-07-07 15:10:03.000000000 +0200
84302 @@ -0,0 +1,93 @@
84303 +/*
84304 + * ioreq.h: I/O request definitions for device models
84305 + * Copyright (c) 2004, Intel Corporation.
84306 + *
84307 + * This program is free software; you can redistribute it and/or modify it
84308 + * under the terms and conditions of the GNU General Public License,
84309 + * version 2, as published by the Free Software Foundation.
84310 + *
84311 + * This program is distributed in the hope it will be useful, but WITHOUT
84312 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
84313 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
84314 + * more details.
84315 + *
84316 + * You should have received a copy of the GNU General Public License along with
84317 + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
84318 + * Place - Suite 330, Boston, MA 02111-1307 USA.
84319 + *
84320 + */
84321 +
84322 +#ifndef _IOREQ_H_
84323 +#define _IOREQ_H_
84324 +
84325 +#define IOREQ_READ      1
84326 +#define IOREQ_WRITE     0
84327 +
84328 +#define STATE_INVALID           0
84329 +#define STATE_IOREQ_READY       1
84330 +#define STATE_IOREQ_INPROCESS   2
84331 +#define STATE_IORESP_READY      3
84332 +#define STATE_IORESP_HOOK       4
84333 +
84334 +#define IOREQ_TYPE_PIO          0 /* pio */
84335 +#define IOREQ_TYPE_COPY         1 /* mmio ops */
84336 +#define IOREQ_TYPE_AND          2
84337 +#define IOREQ_TYPE_OR           3
84338 +#define IOREQ_TYPE_XOR          4
84339 +#define IOREQ_TYPE_XCHG         5
84340 +
84341 +/*
84342 + * The VMExit dispatcher should cooperate with the instruction decoder to
84343 + * prepare this structure and notify the service OS and the device model
84344 + * (DM) by sending a virq.
84345 + */
84346 +struct ioreq {
84347 +    uint64_t addr;          /*  physical address            */
84348 +    uint64_t size;          /*  size in bytes               */
84349 +    uint64_t count;         /*  for rep prefixes            */
84350 +    union {
84351 +        uint64_t data;      /*  data                        */
84352 +        void    *pdata;     /*  pointer to data             */
84353 +    } u;
84354 +    uint8_t state:4;
84355 +    uint8_t pdata_valid:1;  /* if 1, use pdata above        */
84356 +    uint8_t dir:1;          /*  1=read, 0=write             */
84357 +    uint8_t df:1;
84358 +    uint8_t type;           /* I/O type                     */
84359 +    uint64_t io_count;      /* How many IO done on a vcpu   */
84360 +};
84361 +typedef struct ioreq ioreq_t;
84362 +
84363 +struct global_iodata {
84364 +    uint16_t    pic_elcr;
84365 +    uint16_t    pic_irr;
84366 +    uint16_t    pic_last_irr;
84367 +    uint16_t    pic_clear_irr;
84368 +};
84369 +typedef struct global_iodata global_iodata_t;
84370 +
84371 +struct vcpu_iodata {
84372 +    struct ioreq         vp_ioreq;
84373 +    /* Event channel port */
84374 +    unsigned int    vp_eport;   /* VMX vcpu uses this to notify DM */
84375 +    unsigned int    dm_eport;   /* DM uses this to notify VMX vcpu */
84376 +};
84377 +typedef struct vcpu_iodata vcpu_iodata_t;
84378 +
84379 +struct shared_iopage {
84380 +    struct global_iodata sp_global;
84381 +    struct vcpu_iodata   vcpu_iodata[1];
84382 +};
84383 +typedef struct shared_iopage shared_iopage_t;
84384 +
84385 +#endif /* _IOREQ_H_ */
84386 +
84387 +/*
84388 + * Local variables:
84389 + * mode: C
84390 + * c-set-style: "BSD"
84391 + * c-basic-offset: 4
84392 + * tab-width: 4
84393 + * indent-tabs-mode: nil
84394 + * End:
84395 + */
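A sketch of the device-model side of one request using the structures above: pick up a slot in STATE_IOREQ_READY, emulate it, mark the response ready, and notify the vcpu through dm_eport. emulate_pio() and notify_via_evtchn() are hypothetical helpers.

    static void handle_ioreq(struct shared_iopage *sp, unsigned int vcpu)
    {
        struct ioreq *req = &sp->vcpu_iodata[vcpu].vp_ioreq;

        if (req->state != STATE_IOREQ_READY)
            return;
        req->state = STATE_IOREQ_INPROCESS;

        if (req->type == IOREQ_TYPE_PIO && req->dir == IOREQ_READ)
            req->u.data = emulate_pio(req->addr, req->size);   /* hypothetical */

        req->state = STATE_IORESP_READY;
        notify_via_evtchn(sp->vcpu_iodata[vcpu].dm_eport);     /* hypothetical */
    }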
84396 diff -urNp linux-2.6/include/xen/interface/hvm/vmx_assist.h new/include/xen/interface/hvm/vmx_assist.h
84397 --- linux-2.6/include/xen/interface/hvm/vmx_assist.h    1970-01-01 01:00:00.000000000 +0100
84398 +++ new/include/xen/interface/hvm/vmx_assist.h  2006-05-23 18:42:17.000000000 +0200
84399 @@ -0,0 +1,98 @@
84400 +/*
84401 + * vmx_assist.h: Context definitions for the VMXASSIST world switch.
84402 + *
84403 + * Leendert van Doorn, leendert@watson.ibm.com
84404 + * Copyright (c) 2005, International Business Machines Corporation.
84405 + */
84406 +
84407 +#ifndef _VMX_ASSIST_H_
84408 +#define _VMX_ASSIST_H_
84409 +
84410 +#define VMXASSIST_BASE         0xD0000
84411 +#define VMXASSIST_MAGIC        0x17101966
84412 +#define VMXASSIST_MAGIC_OFFSET (VMXASSIST_BASE+8)
84413 +
84414 +#define VMXASSIST_NEW_CONTEXT (VMXASSIST_BASE + 12)
84415 +#define VMXASSIST_OLD_CONTEXT (VMXASSIST_NEW_CONTEXT + 4)
84416 +
84417 +#ifndef __ASSEMBLY__
84418 +
84419 +union vmcs_arbytes {
84420 +    struct arbyte_fields {
84421 +        unsigned int seg_type : 4,
84422 +            s         : 1,
84423 +            dpl       : 2,
84424 +            p         : 1,
84425 +            reserved0 : 4,
84426 +            avl       : 1,
84427 +            reserved1 : 1,
84428 +            default_ops_size: 1,
84429 +            g         : 1,
84430 +            null_bit  : 1,
84431 +            reserved2 : 15;
84432 +    } fields;
84433 +    unsigned int bytes;
84434 +};
84435 +
84436 +/*
84437 + * World switch state
84438 + */
84439 +struct vmx_assist_context {
84440 +    uint32_t  eip;        /* execution pointer */
84441 +    uint32_t  esp;        /* stack pointer */
84442 +    uint32_t  eflags;     /* flags register */
84443 +    uint32_t  cr0;
84444 +    uint32_t  cr3;        /* page table directory */
84445 +    uint32_t  cr4;
84446 +    uint32_t  idtr_limit; /* idt */
84447 +    uint32_t  idtr_base;
84448 +    uint32_t  gdtr_limit; /* gdt */
84449 +    uint32_t  gdtr_base;
84450 +    uint32_t  cs_sel;     /* cs selector */
84451 +    uint32_t  cs_limit;
84452 +    uint32_t  cs_base;
84453 +    union vmcs_arbytes cs_arbytes;
84454 +    uint32_t  ds_sel;     /* ds selector */
84455 +    uint32_t  ds_limit;
84456 +    uint32_t  ds_base;
84457 +    union vmcs_arbytes ds_arbytes;
84458 +    uint32_t  es_sel;     /* es selector */
84459 +    uint32_t  es_limit;
84460 +    uint32_t  es_base;
84461 +    union vmcs_arbytes es_arbytes;
84462 +    uint32_t  ss_sel;     /* ss selector */
84463 +    uint32_t  ss_limit;
84464 +    uint32_t  ss_base;
84465 +    union vmcs_arbytes ss_arbytes;
84466 +    uint32_t  fs_sel;     /* fs selector */
84467 +    uint32_t  fs_limit;
84468 +    uint32_t  fs_base;
84469 +    union vmcs_arbytes fs_arbytes;
84470 +    uint32_t  gs_sel;     /* gs selector */
84471 +    uint32_t  gs_limit;
84472 +    uint32_t  gs_base;
84473 +    union vmcs_arbytes gs_arbytes;
84474 +    uint32_t  tr_sel;     /* task selector */
84475 +    uint32_t  tr_limit;
84476 +    uint32_t  tr_base;
84477 +    union vmcs_arbytes tr_arbytes;
84478 +    uint32_t  ldtr_sel;   /* ldtr selector */
84479 +    uint32_t  ldtr_limit;
84480 +    uint32_t  ldtr_base;
84481 +    union vmcs_arbytes ldtr_arbytes;
84482 +};
84483 +typedef struct vmx_assist_context vmx_assist_context_t;
84484 +
84485 +#endif /* __ASSEMBLY__ */
84486 +
84487 +#endif /* _VMX_ASSIST_H_ */
84488 +
84489 +/*
84490 + * Local variables:
84491 + * mode: C
84492 + * c-set-style: "BSD"
84493 + * c-basic-offset: 4
84494 + * tab-width: 4
84495 + * indent-tabs-mode: nil
84496 + * End:
84497 + */
84498 diff -urNp linux-2.6/include/xen/interface/io/blkif.h new/include/xen/interface/io/blkif.h
84499 --- linux-2.6/include/xen/interface/io/blkif.h  1970-01-01 01:00:00.000000000 +0100
84500 +++ new/include/xen/interface/io/blkif.h        2006-05-23 18:42:17.000000000 +0200
84501 @@ -0,0 +1,87 @@
84502 +/******************************************************************************
84503 + * blkif.h
84504 + * 
84505 + * Unified block-device I/O interface for Xen guest OSes.
84506 + * 
84507 + * Copyright (c) 2003-2004, Keir Fraser
84508 + */
84509 +
84510 +#ifndef __XEN_PUBLIC_IO_BLKIF_H__
84511 +#define __XEN_PUBLIC_IO_BLKIF_H__
84512 +
84513 +#include "ring.h"
84514 +#include "../grant_table.h"
84515 +
84516 +/*
84517 + * Front->back notifications: When enqueuing a new request, sending a
84518 + * notification can be made conditional on req_event (i.e., the generic
84519 + * hold-off mechanism provided by the ring macros). Backends must set
84520 + * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
84521 + * 
84522 + * Back->front notifications: When enqueuing a new response, sending a
84523 + * notification can be made conditional on rsp_event (i.e., the generic
84524 + * hold-off mechanism provided by the ring macros). Frontends must set
84525 + * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
84526 + */
84527 +
84528 +#ifndef blkif_vdev_t
84529 +#define blkif_vdev_t   uint16_t
84530 +#endif
84531 +#define blkif_sector_t uint64_t
84532 +
84533 +#define BLKIF_OP_READ      0
84534 +#define BLKIF_OP_WRITE     1
84535 +
84536 +/*
84537 + * Maximum scatter/gather segments per request.
84538 + * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
84539 + * NB. This could be 12 if the ring indexes weren't stored in the same page.
84540 + */
84541 +#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
84542 +
84543 +struct blkif_request {
84544 +    uint8_t        operation;    /* BLKIF_OP_???                         */
84545 +    uint8_t        nr_segments;  /* number of segments                   */
84546 +    blkif_vdev_t   handle;       /* only for read/write requests         */
84547 +    uint64_t       id;           /* private guest value, echoed in resp  */
84548 +    blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
84549 +    struct blkif_request_segment {
84550 +        grant_ref_t gref;        /* reference to I/O buffer frame        */
84551 +        /* @first_sect: first sector in frame to transfer (inclusive).   */
84552 +        /* @last_sect: last sector in frame to transfer (inclusive).     */
84553 +        uint8_t     first_sect, last_sect;
84554 +    } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
84555 +};
84556 +typedef struct blkif_request blkif_request_t;
84557 +
84558 +struct blkif_response {
84559 +    uint64_t        id;              /* copied from request */
84560 +    uint8_t         operation;       /* copied from request */
84561 +    int16_t         status;          /* BLKIF_RSP_???       */
84562 +};
84563 +typedef struct blkif_response blkif_response_t;
84564 +
84565 +#define BLKIF_RSP_ERROR  -1 /* non-specific 'error' */
84566 +#define BLKIF_RSP_OKAY    0 /* non-specific 'okay'  */
84567 +
84568 +/*
84569 + * Generate blkif ring structures and types.
84570 + */
84571 +
84572 +DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
84573 +
84574 +#define VDISK_CDROM        0x1
84575 +#define VDISK_REMOVABLE    0x2
84576 +#define VDISK_READONLY     0x4
84577 +
84578 +#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
84579 +
84580 +/*
84581 + * Local variables:
84582 + * mode: C
84583 + * c-set-style: "BSD"
84584 + * c-basic-offset: 4
84585 + * tab-width: 4
84586 + * indent-tabs-mode: nil
84587 + * End:
84588 + */
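
To make the request layout concrete, here is a minimal sketch of filling in a single-segment BLKIF_OP_READ request. The grant reference, virtual device handle, id and sector values are placeholders, and granting the buffer frame plus pushing the request onto the ring are outside the scope of this header.

    #include <stdint.h>
    #include <string.h>
    #include <xen/interface/io/blkif.h>    /* header added by this patch */

    #define EXAMPLE_GREF   42              /* placeholder grant reference       */
    #define EXAMPLE_VDEV   0x0800          /* placeholder virtual device handle */

    static void fill_read_request(blkif_request_t *req, uint64_t id,
                                  blkif_sector_t sector)
    {
        memset(req, 0, sizeof(*req));
        req->operation     = BLKIF_OP_READ;
        req->nr_segments   = 1;            /* <= BLKIF_MAX_SEGMENTS_PER_REQUEST */
        req->handle        = EXAMPLE_VDEV;
        req->id            = id;           /* echoed back in blkif_response.id  */
        req->sector_number = sector;

        req->seg[0].gref       = EXAMPLE_GREF;
        req->seg[0].first_sect = 0;        /* first sector in the frame (inclusive) */
        req->seg[0].last_sect  = 7;        /* last sector in the frame (inclusive)  */
    }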
84589 diff -urNp linux-2.6/include/xen/interface/io/console.h new/include/xen/interface/io/console.h
84590 --- linux-2.6/include/xen/interface/io/console.h        1970-01-01 01:00:00.000000000 +0100
84591 +++ new/include/xen/interface/io/console.h      2006-05-09 12:35:56.000000000 +0200
84592 @@ -0,0 +1,33 @@
84593 +/******************************************************************************
84594 + * console.h
84595 + * 
84596 + * Console I/O interface for Xen guest OSes.
84597 + * 
84598 + * Copyright (c) 2005, Keir Fraser
84599 + */
84600 +
84601 +#ifndef __XEN_PUBLIC_IO_CONSOLE_H__
84602 +#define __XEN_PUBLIC_IO_CONSOLE_H__
84603 +
84604 +typedef uint32_t XENCONS_RING_IDX;
84605 +
84606 +#define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring)-1))
84607 +
84608 +struct xencons_interface {
84609 +    char in[1024];
84610 +    char out[2048];
84611 +    XENCONS_RING_IDX in_cons, in_prod;
84612 +    XENCONS_RING_IDX out_cons, out_prod;
84613 +};
84614 +
84615 +#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */
84616 +
84617 +/*
84618 + * Local variables:
84619 + * mode: C
84620 + * c-set-style: "BSD"
84621 + * c-basic-offset: 4
84622 + * tab-width: 4
84623 + * indent-tabs-mode: nil
84624 + * End:
84625 + */
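
A sketch of the consuming side of the `in` ring follows. The handle_char() callback is hypothetical, and the memory barriers that real code inserts around the index accesses are only indicated in comments.

    #include <stdint.h>
    #include <xen/interface/io/console.h>  /* header added by this patch */

    extern void handle_char(char c);       /* hypothetical per-character handler */

    /* Drain everything the backend has produced into the `in` ring. */
    static void drain_console_input(struct xencons_interface *intf)
    {
        XENCONS_RING_IDX cons = intf->in_cons;
        XENCONS_RING_IDX prod = intf->in_prod;

        /* real code: rmb() here, so the data is read after the producer index */
        while (cons != prod)
            handle_char(intf->in[MASK_XENCONS_IDX(cons++, intf->in)]);

        /* real code: mb() here, so the data is consumed before the index moves */
        intf->in_cons = cons;
    }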
84626 diff -urNp linux-2.6/include/xen/interface/io/netif.h new/include/xen/interface/io/netif.h
84627 --- linux-2.6/include/xen/interface/io/netif.h  1970-01-01 01:00:00.000000000 +0100
84628 +++ new/include/xen/interface/io/netif.h        2006-07-07 15:10:03.000000000 +0200
84629 @@ -0,0 +1,148 @@
84630 +/******************************************************************************
84631 + * netif.h
84632 + * 
84633 + * Unified network-device I/O interface for Xen guest OSes.
84634 + * 
84635 + * Copyright (c) 2003-2004, Keir Fraser
84636 + */
84637 +
84638 +#ifndef __XEN_PUBLIC_IO_NETIF_H__
84639 +#define __XEN_PUBLIC_IO_NETIF_H__
84640 +
84641 +#include "ring.h"
84642 +#include "../grant_table.h"
84643 +
84644 +/*
84645 + * Note that there is *never* any need to notify the backend when
84646 + * enqueuing receive requests (struct netif_rx_request). Notifications
84647 + * after enqueuing any other type of message should be conditional on
84648 + * the appropriate req_event or rsp_event field in the shared ring.
84649 + */
84650 +
84651 +/*
84652 + * This is the 'wire' format for packets:
84653 + *  Request 1: netif_tx_request -- NETTXF_* (any flags)
84654 + * [Request 2: netif_tx_extra]  (only if request 1 has NETTXF_extra_info)
84655 + * [Request 3: netif_tx_extra]  (only if request 2 has XEN_NETIF_EXTRA_MORE)
84656 + *  Request 4: netif_tx_request -- NETTXF_more_data
84657 + *  Request 5: netif_tx_request -- NETTXF_more_data
84658 + *  ...
84659 + *  Request N: netif_tx_request -- 0
84660 + */
84661 +
84662 +/* Protocol checksum field is blank in the packet (hardware offload)? */
84663 +#define _NETTXF_csum_blank     (0)
84664 +#define  NETTXF_csum_blank     (1U<<_NETTXF_csum_blank)
84665 +
84666 +/* Packet data has been validated against protocol checksum. */
84667 +#define _NETTXF_data_validated (1)
84668 +#define  NETTXF_data_validated (1U<<_NETTXF_data_validated)
84669 +
84670 +/* Packet continues in the next request descriptor. */
84671 +#define _NETTXF_more_data      (2)
84672 +#define  NETTXF_more_data      (1U<<_NETTXF_more_data)
84673 +
84674 +/* Packet to be followed by extra descriptor(s). */
84675 +#define _NETTXF_extra_info     (3)
84676 +#define  NETTXF_extra_info     (1U<<_NETTXF_extra_info)
84677 +
84678 +struct netif_tx_request {
84679 +    grant_ref_t gref;      /* Reference to buffer page */
84680 +    uint16_t offset;       /* Offset within buffer page */
84681 +    uint16_t flags;        /* NETTXF_* */
84682 +    uint16_t id;           /* Echoed in response message. */
84683 +    uint16_t size;         /* Packet size in bytes.       */
84684 +};
84685 +typedef struct netif_tx_request netif_tx_request_t;
84686 +
84687 +/* Types of netif_extra_info descriptors. */
84688 +#define XEN_NETIF_EXTRA_TYPE_NONE  (0)  /* Never used - invalid */
84689 +#define XEN_NETIF_EXTRA_TYPE_GSO   (1)  /* u.gso */
84690 +#define XEN_NETIF_EXTRA_TYPE_MAX   (2)
84691 +
84692 +/* netif_extra_info flags. */
84693 +#define _XEN_NETIF_EXTRA_FLAG_MORE (0)
84694 +#define XEN_NETIF_EXTRA_FLAG_MORE  (1U<<_XEN_NETIF_EXTRA_FLAG_MORE)
84695 +
84696 +/* GSO types - only TCPv4 currently supported. */
84697 +#define XEN_NETIF_GSO_TCPV4        (1)
84698 +
84699 +/*
84700 + * This structure needs to fit within both netif_tx_request and
84701 + * netif_rx_response for compatibility.
84702 + */
84703 +struct netif_extra_info {
84704 +    uint8_t type;  /* XEN_NETIF_EXTRA_TYPE_* */
84705 +    uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
84706 +
84707 +    union {
84708 +        struct {
84709 +            /*
84710 +             * Maximum payload size of each segment. For example, for TCP this
84711 +             * is just the path MSS.
84712 +             */
84713 +            uint16_t size;
84714 +
84715 +            /*
84716 +             * GSO type. This determines the protocol of the packet and any
84717 +             * extra features required to segment the packet properly.
84718 +             */
84719 +            uint16_t type; /* XEN_NETIF_GSO_* */
84720 +        } gso;
84721 +
84722 +        uint16_t pad[3];
84723 +    } u;
84724 +};
84725 +
84726 +struct netif_tx_response {
84727 +    uint16_t id;
84728 +    int16_t  status;       /* NETIF_RSP_* */
84729 +};
84730 +typedef struct netif_tx_response netif_tx_response_t;
84731 +
84732 +struct netif_rx_request {
84733 +    uint16_t    id;        /* Echoed in response message.        */
84734 +    grant_ref_t gref;      /* Reference to incoming granted frame */
84735 +};
84736 +typedef struct netif_rx_request netif_rx_request_t;
84737 +
84738 +/* Packet data has been validated against protocol checksum. */
84739 +#define _NETRXF_data_validated (0)
84740 +#define  NETRXF_data_validated (1U<<_NETRXF_data_validated)
84741 +
84742 +/* Protocol checksum field is blank in the packet (hardware offload)? */
84743 +#define _NETRXF_csum_blank     (1)
84744 +#define  NETRXF_csum_blank     (1U<<_NETRXF_csum_blank)
84745 +
84746 +struct netif_rx_response {
84747 +    uint16_t id;
84748 +    uint16_t offset;       /* Offset in page of start of received packet  */
84749 +    uint16_t flags;        /* NETRXF_* */
84750 +    int16_t  status;       /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
84751 +};
84752 +typedef struct netif_rx_response netif_rx_response_t;
84753 +
84754 +/*
84755 + * Generate netif ring structures and types.
84756 + */
84757 +
84758 +DEFINE_RING_TYPES(netif_tx, struct netif_tx_request, struct netif_tx_response);
84759 +DEFINE_RING_TYPES(netif_rx, struct netif_rx_request, struct netif_rx_response);
84760 +
84761 +#define NETIF_RSP_DROPPED         -2
84762 +#define NETIF_RSP_ERROR           -1
84763 +#define NETIF_RSP_OKAY             0
84764 +/* No response: used for auxiliary requests (e.g., netif_tx_extra). */
84765 +#define NETIF_RSP_NULL             1
84766 +
84767 +#endif
84768 +
84769 +/*
84770 + * Local variables:
84771 + * mode: C
84772 + * c-set-style: "BSD"
84773 + * c-basic-offset: 4
84774 + * tab-width: 4
84775 + * indent-tabs-mode: nil
84776 + * End:
84777 + */
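
The sketch below fills a transmit request for a packet that fits in a single slot, with checksum offload requested; the grant reference, offset, id and length are supplied by the (hypothetical) caller.

    #include <stdint.h>
    #include <xen/interface/io/netif.h>    /* header added by this patch */

    static void fill_single_slot_tx(netif_tx_request_t *tx,
                                    grant_ref_t gref, uint16_t offset,
                                    uint16_t id, uint16_t len)
    {
        tx->gref   = gref;                 /* grant covering the packet page     */
        tx->offset = offset;               /* start of the frame within the page */
        tx->flags  = NETTXF_csum_blank |   /* checksum left to the other end     */
                     NETTXF_data_validated;
        /* no NETTXF_more_data: this is the only (and therefore last) slot */
        tx->id     = id;                   /* echoed in netif_tx_response.id     */
        tx->size   = len;                  /* total packet length in bytes       */
    }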
84778 diff -urNp linux-2.6/include/xen/interface/io/pciif.h new/include/xen/interface/io/pciif.h
84779 --- linux-2.6/include/xen/interface/io/pciif.h  1970-01-01 01:00:00.000000000 +0100
84780 +++ new/include/xen/interface/io/pciif.h        2006-05-09 12:35:56.000000000 +0200
84781 @@ -0,0 +1,55 @@
84782 +/*
84783 + * PCI Backend/Frontend Common Data Structures & Macros
84784 + *
84785 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
84786 + */
84787 +#ifndef __XEN_PCI_COMMON_H__
84788 +#define __XEN_PCI_COMMON_H__
84789 +
84790 +/* Be sure to bump this number if you change this file */
84791 +#define XEN_PCI_MAGIC          "7"
84792 +
84793 +/* xen_pci_sharedinfo flags */
84794 +#define _XEN_PCIF_active     (0)
84795 +#define XEN_PCIF_active      (1<<_XEN_PCIF_active)
84796 +
84797 +/* xen_pci_op commands */
84798 +#define XEN_PCI_OP_conf_read    (0)
84799 +#define XEN_PCI_OP_conf_write   (1)
84800 +
84801 +/* xen_pci_op error numbers */
84802 +#define XEN_PCI_ERR_success          (0)
84803 +#define XEN_PCI_ERR_dev_not_found   (-1)
84804 +#define XEN_PCI_ERR_invalid_offset  (-2)
84805 +#define XEN_PCI_ERR_access_denied   (-3)
84806 +#define XEN_PCI_ERR_not_implemented (-4)
84807 +/* XEN_PCI_ERR_op_failed - backend failed to complete the operation */
84808 +#define XEN_PCI_ERR_op_failed       (-5)
84809 +
84810 +struct xen_pci_op {
84811 +       /* IN: what action to perform: XEN_PCI_OP_* */
84812 +       uint32_t cmd;
84813 +
84814 +       /* OUT: will contain an error number (if any) from errno.h */
84815 +       int32_t err;
84816 +
84817 +       /* IN: which device to touch */
84818 +       uint32_t domain; /* PCI Domain/Segment */
84819 +       uint32_t bus;
84820 +       uint32_t devfn;
84821 +
84822 +       /* IN: which configuration registers to touch */
84823 +       int32_t offset;
84824 +       int32_t size;
84825 +
84826 +       /* IN/OUT: Contains the result after a READ or the value to WRITE */
84827 +       uint32_t value;
84828 +};
84829 +
84830 +struct xen_pci_sharedinfo {
84831 +       /* flags - XEN_PCIF_* */
84832 +       uint32_t flags;
84833 +       struct xen_pci_op op;
84834 +};
84835 +
84836 +#endif /* __XEN_PCI_COMMON_H__ */
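
To show the frontend's side of the round trip, here is a sketch of preparing a configuration-space read in the shared structure. Ringing the backend's event channel and waiting for op.err/op.value to come back are deliberately left out, so this is a simplification of the real flow.

    #include <stdint.h>
    #include <xen/interface/io/pciif.h>    /* header added by this patch */

    /* Prepare a config-space read of `size` bytes at `offset` for one device.
     * Notifying the backend and collecting the result are not shown. */
    static void prepare_conf_read(struct xen_pci_sharedinfo *info,
                                  uint32_t domain, uint32_t bus, uint32_t devfn,
                                  int32_t offset, int32_t size)
    {
        struct xen_pci_op *op = &info->op;

        op->cmd    = XEN_PCI_OP_conf_read;
        op->domain = domain;               /* PCI domain/segment                */
        op->bus    = bus;
        op->devfn  = devfn;                /* slot/function encoded as usual    */
        op->offset = offset;               /* config-space register offset      */
        op->size   = size;                 /* 1, 2 or 4 bytes                   */

        info->flags |= XEN_PCIF_active;    /* tell the backend there is work    */
    }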
84837 diff -urNp linux-2.6/include/xen/interface/io/ring.h new/include/xen/interface/io/ring.h
84838 --- linux-2.6/include/xen/interface/io/ring.h   1970-01-01 01:00:00.000000000 +0100
84839 +++ new/include/xen/interface/io/ring.h 2006-06-07 13:29:36.000000000 +0200
84840 @@ -0,0 +1,273 @@
84841 +/******************************************************************************
84842 + * ring.h
84843 + * 
84844 + * Shared producer-consumer ring macros.
84845 + *
84846 + * Tim Deegan and Andrew Warfield November 2004.
84847 + */
84848 +
84849 +#ifndef __XEN_PUBLIC_IO_RING_H__
84850 +#define __XEN_PUBLIC_IO_RING_H__
84851 +
84852 +typedef unsigned int RING_IDX;
84853 +
84854 +/* Round a 32-bit unsigned constant down to the nearest power of two. */
84855 +#define __RD2(_x)  (((_x) & 0x00000002) ? 0x2                  : ((_x) & 0x1))
84856 +#define __RD4(_x)  (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2    : __RD2(_x))
84857 +#define __RD8(_x)  (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4    : __RD4(_x))
84858 +#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8    : __RD8(_x))
84859 +#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
84860 +
84861 +/*
84862 + * Calculate size of a shared ring, given the total available space for the
84863 + * ring and indexes (_sz), and the name tag of the request/response structure.
84864 + * A ring contains as many entries as will fit, rounded down to the nearest 
84865 + * power of two (so we can mask with (size-1) to loop around).
84866 + */
84867 +#define __RING_SIZE(_s, _sz) \
84868 +    (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
84869 +
84870 +/*
84871 + * Macros to make the correct C datatypes for a new kind of ring.
84872 + * 
84873 + * To make a new ring datatype, you need to have two message structures,
84874 + * let's say request_t, and response_t already defined.
84875 + *
84876 + * In a header where you want the ring datatype declared, you then do:
84877 + *
84878 + *     DEFINE_RING_TYPES(mytag, request_t, response_t);
84879 + *
84880 + * These expand out to give you a set of types, as you can see below.
84881 + * The most important of these are:
84882 + * 
84883 + *     mytag_sring_t      - The shared ring.
84884 + *     mytag_front_ring_t - The 'front' half of the ring.
84885 + *     mytag_back_ring_t  - The 'back' half of the ring.
84886 + *
84887 + * To initialize a ring in your code you need to know the location and size
84888 + * of the shared memory area (PAGE_SIZE, for instance). To initialise
84889 + * the front half:
84890 + *
84891 + *     mytag_front_ring_t front_ring;
84892 + *     SHARED_RING_INIT((mytag_sring_t *)shared_page);
84893 + *     FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
84894 + *
84895 + * Initializing the back follows similarly (note that only the front
84896 + * initializes the shared ring):
84897 + *
84898 + *     mytag_back_ring_t back_ring;
84899 + *     BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
84900 + */
84901 +
84902 +#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t)                     \
84903 +                                                                        \
84904 +/* Shared ring entry */                                                 \
84905 +union __name##_sring_entry {                                            \
84906 +    __req_t req;                                                        \
84907 +    __rsp_t rsp;                                                        \
84908 +};                                                                      \
84909 +                                                                        \
84910 +/* Shared ring page */                                                  \
84911 +struct __name##_sring {                                                 \
84912 +    RING_IDX req_prod, req_event;                                       \
84913 +    RING_IDX rsp_prod, rsp_event;                                       \
84914 +    uint8_t  pad[48];                                                   \
84915 +    union __name##_sring_entry ring[1]; /* variable-length */           \
84916 +};                                                                      \
84917 +                                                                        \
84918 +/* "Front" end's private variables */                                   \
84919 +struct __name##_front_ring {                                            \
84920 +    RING_IDX req_prod_pvt;                                              \
84921 +    RING_IDX rsp_cons;                                                  \
84922 +    unsigned int nr_ents;                                               \
84923 +    struct __name##_sring *sring;                                       \
84924 +};                                                                      \
84925 +                                                                        \
84926 +/* "Back" end's private variables */                                    \
84927 +struct __name##_back_ring {                                             \
84928 +    RING_IDX rsp_prod_pvt;                                              \
84929 +    RING_IDX req_cons;                                                  \
84930 +    unsigned int nr_ents;                                               \
84931 +    struct __name##_sring *sring;                                       \
84932 +};                                                                      \
84933 +                                                                        \
84934 +/* Syntactic sugar */                                                   \
84935 +typedef struct __name##_sring __name##_sring_t;                         \
84936 +typedef struct __name##_front_ring __name##_front_ring_t;               \
84937 +typedef struct __name##_back_ring __name##_back_ring_t
84938 +
84939 +/*
84940 + * Macros for manipulating rings.
84941 + * 
84942 + * FRONT_RING_whatever works on the "front end" of a ring: here 
84943 + * requests are pushed on to the ring and responses taken off it.
84944 + * 
84945 + * BACK_RING_whatever works on the "back end" of a ring: here 
84946 + * requests are taken off the ring and responses put on.
84947 + * 
84948 + * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL. 
84949 + * This is OK in 1-for-1 request-response situations where the 
84950 + * requestor (front end) never has more than RING_SIZE()-1
84951 + * outstanding requests.
84952 + */
84953 +
84954 +/* Initialising empty rings */
84955 +#define SHARED_RING_INIT(_s) do {                                       \
84956 +    (_s)->req_prod  = (_s)->rsp_prod  = 0;                              \
84957 +    (_s)->req_event = (_s)->rsp_event = 1;                              \
84958 +    memset((_s)->pad, 0, sizeof((_s)->pad));                            \
84959 +} while(0)
84960 +
84961 +#define FRONT_RING_INIT(_r, _s, __size) do {                            \
84962 +    (_r)->req_prod_pvt = 0;                                             \
84963 +    (_r)->rsp_cons = 0;                                                 \
84964 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
84965 +    (_r)->sring = (_s);                                                 \
84966 +} while (0)
84967 +
84968 +#define BACK_RING_INIT(_r, _s, __size) do {                             \
84969 +    (_r)->rsp_prod_pvt = 0;                                             \
84970 +    (_r)->req_cons = 0;                                                 \
84971 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
84972 +    (_r)->sring = (_s);                                                 \
84973 +} while (0)
84974 +
84975 +/* Initialize to existing shared indexes -- for recovery */
84976 +#define FRONT_RING_ATTACH(_r, _s, __size) do {                          \
84977 +    (_r)->sring = (_s);                                                 \
84978 +    (_r)->req_prod_pvt = (_s)->req_prod;                                \
84979 +    (_r)->rsp_cons = (_s)->rsp_prod;                                    \
84980 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
84981 +} while (0)
84982 +
84983 +#define BACK_RING_ATTACH(_r, _s, __size) do {                           \
84984 +    (_r)->sring = (_s);                                                 \
84985 +    (_r)->rsp_prod_pvt = (_s)->rsp_prod;                                \
84986 +    (_r)->req_cons = (_s)->req_prod;                                    \
84987 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
84988 +} while (0)
84989 +
84990 +/* How big is this ring? */
84991 +#define RING_SIZE(_r)                                                   \
84992 +    ((_r)->nr_ents)
84993 +
84994 +/* Number of free requests (for use on front side only). */
84995 +#define RING_FREE_REQUESTS(_r)                                         \
84996 +    (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))
84997 +
84998 +/* Test if there is an empty slot available on the front ring.
84999 + * (This is only meaningful from the front.)
85000 + */
85001 +#define RING_FULL(_r)                                                   \
85002 +    (RING_FREE_REQUESTS(_r) == 0)
85003 +
85004 +/* Test if there are outstanding messages to be processed on a ring. */
85005 +#define RING_HAS_UNCONSUMED_RESPONSES(_r)                               \
85006 +    ((_r)->sring->rsp_prod - (_r)->rsp_cons)
85007 +
85008 +#define RING_HAS_UNCONSUMED_REQUESTS(_r)                                \
85009 +    ({                                                                 \
85010 +       unsigned int req = (_r)->sring->req_prod - (_r)->req_cons;      \
85011 +       unsigned int rsp = RING_SIZE(_r) -                              \
85012 +                          ((_r)->req_cons - (_r)->rsp_prod_pvt);       \
85013 +       req < rsp ? req : rsp;                                          \
85014 +    })
85015 +
85016 +/* Direct access to individual ring elements, by index. */
85017 +#define RING_GET_REQUEST(_r, _idx)                                      \
85018 +    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
85019 +
85020 +#define RING_GET_RESPONSE(_r, _idx)                                     \
85021 +    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
85022 +
85023 +/* Loop termination condition: Would the specified index overflow the ring? */
85024 +#define RING_REQUEST_CONS_OVERFLOW(_r, _cons)                           \
85025 +    (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
85026 +
85027 +#define RING_PUSH_REQUESTS(_r) do {                                     \
85028 +    wmb(); /* back sees requests /before/ updated producer index */     \
85029 +    (_r)->sring->req_prod = (_r)->req_prod_pvt;                         \
85030 +} while (0)
85031 +
85032 +#define RING_PUSH_RESPONSES(_r) do {                                    \
85033 +    wmb(); /* front sees responses /before/ updated producer index */   \
85034 +    (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt;                         \
85035 +} while (0)
85036 +
85037 +/*
85038 + * Notification hold-off (req_event and rsp_event):
85039 + * 
85040 + * When queueing requests or responses on a shared ring, it may not always be
85041 + * necessary to notify the remote end. For example, if requests are in flight
85042 + * in a backend, the front may be able to queue further requests without
85043 + * notifying the back (if the back checks for new requests when it queues
85044 + * responses).
85045 + * 
85046 + * When enqueuing requests or responses:
85047 + * 
85048 + *  Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument
85049 + *  is a boolean return value. True indicates that the receiver requires an
85050 + *  asynchronous notification.
85051 + * 
85052 + * After dequeuing requests or responses (before sleeping the connection):
85053 + * 
85054 + *  Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES().
85055 + *  The second argument is a boolean return value. True indicates that there
85056 + *  are pending messages on the ring (i.e., the connection should not be put
85057 + *  to sleep).
85058 + * 
85059 + *  These macros will set the req_event/rsp_event field to trigger a
85060 + *  notification on the very next message that is enqueued. If you want to
85061 + *  create batches of work (i.e., only receive a notification after several
85062 + *  messages have been enqueued) then you will need to create a customised
85063 + *  version of the FINAL_CHECK macro in your own code, which sets the event
85064 + *  field appropriately.
85065 + */
85066 +
85067 +#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do {           \
85068 +    RING_IDX __old = (_r)->sring->req_prod;                             \
85069 +    RING_IDX __new = (_r)->req_prod_pvt;                                \
85070 +    wmb(); /* back sees requests /before/ updated producer index */     \
85071 +    (_r)->sring->req_prod = __new;                                      \
85072 +    mb(); /* back sees new requests /before/ we check req_event */      \
85073 +    (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) <           \
85074 +                 (RING_IDX)(__new - __old));                            \
85075 +} while (0)
85076 +
85077 +#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do {          \
85078 +    RING_IDX __old = (_r)->sring->rsp_prod;                             \
85079 +    RING_IDX __new = (_r)->rsp_prod_pvt;                                \
85080 +    wmb(); /* front sees responses /before/ updated producer index */   \
85081 +    (_r)->sring->rsp_prod = __new;                                      \
85082 +    mb(); /* front sees new responses /before/ we check rsp_event */    \
85083 +    (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) <           \
85084 +                 (RING_IDX)(__new - __old));                            \
85085 +} while (0)
85086 +
85087 +#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do {             \
85088 +    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                   \
85089 +    if (_work_to_do) break;                                             \
85090 +    (_r)->sring->req_event = (_r)->req_cons + 1;                        \
85091 +    mb();                                                               \
85092 +    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                   \
85093 +} while (0)
85094 +
85095 +#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do {            \
85096 +    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                  \
85097 +    if (_work_to_do) break;                                             \
85098 +    (_r)->sring->rsp_event = (_r)->rsp_cons + 1;                        \
85099 +    mb();                                                               \
85100 +    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                  \
85101 +} while (0)
85102 +
85103 +#endif /* __XEN_PUBLIC_IO_RING_H__ */
85104 +
85105 +/*
85106 + * Local variables:
85107 + * mode: C
85108 + * c-set-style: "BSD"
85109 + * c-basic-offset: 4
85110 + * tab-width: 4
85111 + * indent-tabs-mode: nil
85112 + * End:
85113 + */
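
Putting the macros above together with the blkif types from io/blkif.h, a front-end sketch might look like the following. Kernel context is assumed (PAGE_SIZE, memset(), and the wmb()/mb() barriers used by the ring macros), and shared_page, fill_read_request() and notify_backend() are illustrative names for pieces a real driver provides elsewhere.

    #include <xen/interface/io/ring.h>
    #include <xen/interface/io/blkif.h>    /* provides the blkif_* ring types */

    extern void *shared_page;              /* one shared, granted page (assumed) */
    extern void fill_read_request(blkif_request_t *req, uint64_t id,
                                  blkif_sector_t sector);   /* illustrative     */
    extern void notify_backend(void);      /* e.g. an event-channel kick        */

    static blkif_front_ring_t front;

    static void ring_setup_and_submit(void)
    {
        blkif_request_t *req;
        int notify;

        /* Only the front end initialises the shared page itself. */
        SHARED_RING_INIT((blkif_sring_t *)shared_page);
        FRONT_RING_INIT(&front, (blkif_sring_t *)shared_page, PAGE_SIZE);

        /* Claim the next slot, fill it, advance the private producer index. */
        req = RING_GET_REQUEST(&front, front.req_prod_pvt);
        fill_read_request(req, 0 /* id */, 0 /* sector */);
        front.req_prod_pvt++;

        /* Publish the request; kick the backend only if it asked to be told. */
        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&front, notify);
        if (notify)
            notify_backend();
    }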
85114 diff -urNp linux-2.6/include/xen/interface/io/tpmif.h new/include/xen/interface/io/tpmif.h
85115 --- linux-2.6/include/xen/interface/io/tpmif.h  1970-01-01 01:00:00.000000000 +0100
85116 +++ new/include/xen/interface/io/tpmif.h        2006-05-23 18:42:17.000000000 +0200
85117 @@ -0,0 +1,59 @@
85118 +/******************************************************************************
85119 + * tpmif.h
85120 + *
85121 + * TPM I/O interface for Xen guest OSes.
85122 + *
85123 + * Copyright (c) 2005, IBM Corporation
85124 + *
85125 + * Author: Stefan Berger, stefanb@us.ibm.com
85126 + * Grant table support: Mahadevan Gomathisankaran
85127 + *
85128 + * This code has been derived from tools/libxc/xen/io/netif.h
85129 + *
85130 + * Copyright (c) 2003-2004, Keir Fraser
85131 + */
85132 +
85133 +#ifndef __XEN_PUBLIC_IO_TPMIF_H__
85134 +#define __XEN_PUBLIC_IO_TPMIF_H__
85135 +
85136 +#include "../grant_table.h"
85137 +
85138 +struct tpmif_tx_request {
85139 +    unsigned long addr;   /* Machine address of packet.   */
85140 +    grant_ref_t ref;      /* grant table access reference */
85141 +    uint16_t unused;
85142 +    uint16_t size;        /* Packet size in bytes.        */
85143 +};
85144 +typedef struct tpmif_tx_request tpmif_tx_request_t;
85145 +
85146 +/*
85147 + * TPMIF_TX_RING_SIZE defines how many request pages the front-end
85148 + * and back-end can exchange (i.e. the size of the ring array).
85149 + */
85150 +typedef uint32_t TPMIF_RING_IDX;
85151 +
85152 +#define TPMIF_TX_RING_SIZE 10
85153 +
85154 +/* This structure must fit in a memory page. */
85155 +
85156 +struct tpmif_ring {
85157 +    struct tpmif_tx_request req;
85158 +};
85159 +typedef struct tpmif_ring tpmif_ring_t;
85160 +
85161 +struct tpmif_tx_interface {
85162 +    struct tpmif_ring ring[TPMIF_TX_RING_SIZE];
85163 +};
85164 +typedef struct tpmif_tx_interface tpmif_tx_interface_t;
85165 +
85166 +#endif
85167 +
85168 +/*
85169 + * Local variables:
85170 + * mode: C
85171 + * c-set-style: "BSD"
85172 + * c-basic-offset: 4
85173 + * tab-width: 4
85174 + * indent-tabs-mode: nil
85175 + * End:
85176 + */
85177 diff -urNp linux-2.6/include/xen/interface/io/xenbus.h new/include/xen/interface/io/xenbus.h
85178 --- linux-2.6/include/xen/interface/io/xenbus.h 1970-01-01 01:00:00.000000000 +0100
85179 +++ new/include/xen/interface/io/xenbus.h       2006-06-05 15:54:33.000000000 +0200
85180 @@ -0,0 +1,45 @@
85181 +/*****************************************************************************
85182 + * xenbus.h
85183 + *
85184 + * Xenbus protocol details.
85185 + *
85186 + * Copyright (C) 2005 XenSource Ltd.
85187 + */
85188 +
85189 +#ifndef _XEN_PUBLIC_IO_XENBUS_H
85190 +#define _XEN_PUBLIC_IO_XENBUS_H
85191 +
85192 +/*
85193 + * The state of either end of the Xenbus, i.e. the current communication
85194 + * status of initialisation across the bus.  States here imply nothing about
85195 + * the state of the connection between the driver and the kernel's device
85196 + * layers.
85197 + */
85198 +enum xenbus_state {
85199 +    XenbusStateUnknown       = 0,
85200 +
85201 +    XenbusStateInitialising  = 1,
85202 +
85203 +    /*
85204 +     * InitWait: Finished early initialisation but waiting for information
85205 +     * from the peer or hotplug scripts.
85206 +     */
85207 +    XenbusStateInitWait      = 2,
85208 +
85209 +    /*
85210 +     * Initialised: Waiting for a connection from the peer.
85211 +     */
85212 +    XenbusStateInitialised   = 3,
85213 +
85214 +    XenbusStateConnected     = 4,
85215 +
85216 +    /*
85217 +     * Closing: The device is being closed due to an error or an unplug event.
85218 +     */
85219 +    XenbusStateClosing       = 5,
85220 +
85221 +    XenbusStateClosed       = 6
85222 +};
85223 +typedef enum xenbus_state XenbusState;
85224 +
85225 +#endif /* _XEN_PUBLIC_IO_XENBUS_H */
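
A typical consumer of this enum is a frontend reacting to the backend's state changes. The sketch below shows the usual shape of that switch; the connect/close helpers are hypothetical stand-ins for driver-specific code.

    #include <xen/interface/io/xenbus.h>   /* header added by this patch */

    extern void frontend_connect(void);    /* hypothetical helpers */
    extern void frontend_close(void);

    static void backend_state_changed(enum xenbus_state backend_state)
    {
        switch (backend_state) {
        case XenbusStateInitialising:
        case XenbusStateInitWait:
        case XenbusStateInitialised:
            /* Backend is not ready yet; keep waiting. */
            break;
        case XenbusStateConnected:
            frontend_connect();
            break;
        case XenbusStateClosing:
        case XenbusStateClosed:
            frontend_close();
            break;
        case XenbusStateUnknown:
        default:
            break;
        }
    }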
85226 diff -urNp linux-2.6/include/xen/interface/io/xs_wire.h new/include/xen/interface/io/xs_wire.h
85227 --- linux-2.6/include/xen/interface/io/xs_wire.h        1970-01-01 01:00:00.000000000 +0100
85228 +++ new/include/xen/interface/io/xs_wire.h      2006-05-09 12:35:56.000000000 +0200
85229 @@ -0,0 +1,97 @@
85230 +/*
85231 + * Details of the "wire" protocol between Xen Store Daemon and client
85232 + * library or guest kernel.
85233 + * Copyright (C) 2005 Rusty Russell IBM Corporation
85234 + */
85235 +
85236 +#ifndef _XS_WIRE_H
85237 +#define _XS_WIRE_H
85238 +
85239 +enum xsd_sockmsg_type
85240 +{
85241 +    XS_DEBUG,
85242 +    XS_DIRECTORY,
85243 +    XS_READ,
85244 +    XS_GET_PERMS,
85245 +    XS_WATCH,
85246 +    XS_UNWATCH,
85247 +    XS_TRANSACTION_START,
85248 +    XS_TRANSACTION_END,
85249 +    XS_INTRODUCE,
85250 +    XS_RELEASE,
85251 +    XS_GET_DOMAIN_PATH,
85252 +    XS_WRITE,
85253 +    XS_MKDIR,
85254 +    XS_RM,
85255 +    XS_SET_PERMS,
85256 +    XS_WATCH_EVENT,
85257 +    XS_ERROR,
85258 +    XS_IS_DOMAIN_INTRODUCED
85259 +};
85260 +
85261 +#define XS_WRITE_NONE "NONE"
85262 +#define XS_WRITE_CREATE "CREATE"
85263 +#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"
85264 +
85265 +/* We hand errors as strings, for portability. */
85266 +struct xsd_errors
85267 +{
85268 +    int errnum;
85269 +    const char *errstring;
85270 +};
85271 +#define XSD_ERROR(x) { x, #x }
85272 +static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
85273 +    XSD_ERROR(EINVAL),
85274 +    XSD_ERROR(EACCES),
85275 +    XSD_ERROR(EEXIST),
85276 +    XSD_ERROR(EISDIR),
85277 +    XSD_ERROR(ENOENT),
85278 +    XSD_ERROR(ENOMEM),
85279 +    XSD_ERROR(ENOSPC),
85280 +    XSD_ERROR(EIO),
85281 +    XSD_ERROR(ENOTEMPTY),
85282 +    XSD_ERROR(ENOSYS),
85283 +    XSD_ERROR(EROFS),
85284 +    XSD_ERROR(EBUSY),
85285 +    XSD_ERROR(EAGAIN),
85286 +    XSD_ERROR(EISCONN)
85287 +};
85288 +
85289 +struct xsd_sockmsg
85290 +{
85291 +    uint32_t type;  /* XS_??? */
85292 +    uint32_t req_id;/* Request identifier, echoed in daemon's response.  */
85293 +    uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */
85294 +    uint32_t len;   /* Length of data following this. */
85295 +
85296 +    /* Generally followed by nul-terminated string(s). */
85297 +};
85298 +
85299 +enum xs_watch_type
85300 +{
85301 +    XS_WATCH_PATH = 0,
85302 +    XS_WATCH_TOKEN
85303 +};
85304 +
85305 +/* Inter-domain shared memory communications. */
85306 +#define XENSTORE_RING_SIZE 1024
85307 +typedef uint32_t XENSTORE_RING_IDX;
85308 +#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1))
85309 +struct xenstore_domain_interface {
85310 +    char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */
85311 +    char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */
85312 +    XENSTORE_RING_IDX req_cons, req_prod;
85313 +    XENSTORE_RING_IDX rsp_cons, rsp_prod;
85314 +};
85315 +
85316 +#endif /* _XS_WIRE_H */
85317 +
85318 +/*
85319 + * Local variables:
85320 + * mode: C
85321 + * c-set-style: "BSD"
85322 + * c-basic-offset: 4
85323 + * tab-width: 4
85324 + * indent-tabs-mode: nil
85325 + * End:
85326 + */
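
The header only fixes the wire format; a client still marshals messages itself. Below is a sketch of building the header for an XS_READ of one nul-terminated path, where write_to_ring() is a hypothetical transport helper that copies bytes into the req ring and advances req_prod.

    #include <errno.h>
    #include <stdint.h>
    #include <string.h>
    #include <xen/interface/io/xs_wire.h>  /* header added by this patch */

    extern void write_to_ring(const void *data, uint32_t len);  /* hypothetical */

    static void send_read_request(const char *path, uint32_t req_id, uint32_t tx_id)
    {
        struct xsd_sockmsg msg;

        msg.type   = XS_READ;
        msg.req_id = req_id;                     /* echoed in the reply         */
        msg.tx_id  = tx_id;                      /* 0 if not in a transaction   */
        msg.len    = (uint32_t)strlen(path) + 1; /* payload: path plus its nul  */

        write_to_ring(&msg, sizeof(msg));        /* header first...             */
        write_to_ring(path, msg.len);            /* ...then the payload         */
    }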
85327 diff -urNp linux-2.6/include/xen/interface/memory.h new/include/xen/interface/memory.h
85328 --- linux-2.6/include/xen/interface/memory.h    1970-01-01 01:00:00.000000000 +0100
85329 +++ new/include/xen/interface/memory.h  2006-07-07 15:10:03.000000000 +0200
85330 @@ -0,0 +1,243 @@
85331 +/******************************************************************************
85332 + * memory.h
85333 + * 
85334 + * Memory reservation and information.
85335 + * 
85336 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
85337 + */
85338 +
85339 +#ifndef __XEN_PUBLIC_MEMORY_H__
85340 +#define __XEN_PUBLIC_MEMORY_H__
85341 +
85342 +/*
85343 + * Increase or decrease the specified domain's memory reservation. Returns the
85344 + * number of extents successfully allocated or freed.
85345 + * arg == addr of struct xen_memory_reservation.
85346 + */
85347 +#define XENMEM_increase_reservation 0
85348 +#define XENMEM_decrease_reservation 1
85349 +#define XENMEM_populate_physmap     6
85350 +struct xen_memory_reservation {
85351 +
85352 +    /*
85353 +     * XENMEM_increase_reservation:
85354 +     *   OUT: MFN (*not* GMFN) bases of extents that were allocated
85355 +     * XENMEM_decrease_reservation:
85356 +     *   IN:  GMFN bases of extents to free
85357 +     * XENMEM_populate_physmap:
85358 +     *   IN:  GPFN bases of extents to populate with memory
85359 +     *   OUT: GMFN bases of extents that were allocated
85360 +     *   (NB. This command also updates the mach_to_phys translation table)
85361 +     */
85362 +    XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
85363 +
85364 +    /* Number of extents, and size/alignment of each (2^extent_order pages). */
85365 +    xen_ulong_t    nr_extents;
85366 +    unsigned int   extent_order;
85367 +
85368 +    /*
85369 +     * Maximum # bits addressable by the user of the allocated region (e.g., 
85370 +     * I/O devices often have a 32-bit limitation even in 64-bit systems). If 
85371 +     * zero then the user has no addressing restriction.
85372 +     * This field is not used by XENMEM_decrease_reservation.
85373 +     */
85374 +    unsigned int   address_bits;
85375 +
85376 +    /*
85377 +     * Domain whose reservation is being changed.
85378 +     * Unprivileged domains can specify only DOMID_SELF.
85379 +     */
85380 +    domid_t        domid;
85381 +};
85382 +typedef struct xen_memory_reservation xen_memory_reservation_t;
85383 +DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
85384 +
85385 +/*
85386 + * An atomic exchange of memory pages. If return code is zero then
85387 + * @out.extent_list provides GMFNs of the newly-allocated memory.
85388 + * Returns zero on complete success, otherwise a negative error code.
85389 + * On complete success then always @nr_exchanged == @in.nr_extents.
85390 + * On partial success @nr_exchanged indicates how much work was done.
85391 + */
85392 +#define XENMEM_exchange             11
85393 +struct xen_memory_exchange {
85394 +    /*
85395 +     * [IN] Details of memory extents to be exchanged (GMFN bases).
85396 +     * Note that @in.address_bits is ignored and unused.
85397 +     */
85398 +    struct xen_memory_reservation in;
85399 +
85400 +    /*
85401 +     * [IN/OUT] Details of new memory extents.
85402 +     * We require that:
85403 +     *  1. @in.domid == @out.domid
85404 +     *  2. @in.nr_extents  << @in.extent_order == 
85405 +     *     @out.nr_extents << @out.extent_order
85406 +     *  3. @in.extent_start and @out.extent_start lists must not overlap
85407 +     *  4. @out.extent_start lists GPFN bases to be populated
85408 +     *  5. @out.extent_start is overwritten with allocated GMFN bases
85409 +     */
85410 +    struct xen_memory_reservation out;
85411 +
85412 +    /*
85413 +     * [OUT] Number of input extents that were successfully exchanged:
85414 +     *  1. The first @nr_exchanged input extents were successfully
85415 +     *     deallocated.
85416 +     *  2. The corresponding first entries in the output extent list correctly
85417 +     *     indicate the GMFNs that were successfully exchanged.
85418 +     *  3. All other input and output extents are untouched.
85419 +     *  4. If not all input extents are exchanged then the return code of this
85420 +     *     command will be non-zero.
85421 +     *  5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER!
85422 +     */
85423 +    xen_ulong_t nr_exchanged;
85424 +};
85425 +typedef struct xen_memory_exchange xen_memory_exchange_t;
85426 +DEFINE_XEN_GUEST_HANDLE(xen_memory_exchange_t);
85427 +
85428 +/*
85429 + * Returns the maximum machine frame number of mapped RAM in this system.
85430 + * This command always succeeds (it never returns an error code).
85431 + * arg == NULL.
85432 + */
85433 +#define XENMEM_maximum_ram_page     2
85434 +
85435 +/*
85436 + * Returns the current or maximum memory reservation, in pages, of the
85437 + * specified domain (may be DOMID_SELF). Returns -ve errcode on failure.
85438 + * arg == addr of domid_t.
85439 + */
85440 +#define XENMEM_current_reservation  3
85441 +#define XENMEM_maximum_reservation  4
85442 +
85443 +/*
85444 + * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
85445 + * mapping table. Architectures which do not have a m2p table do not implement
85446 + * this command.
85447 + * arg == addr of xen_machphys_mfn_list_t.
85448 + */
85449 +#define XENMEM_machphys_mfn_list    5
85450 +struct xen_machphys_mfn_list {
85451 +    /*
85452 +     * Size of the 'extent_start' array. Fewer entries will be filled if the
85453 +     * machphys table is smaller than max_extents * 2MB.
85454 +     */
85455 +    unsigned int max_extents;
85456 +
85457 +    /*
85458 +     * Pointer to buffer to fill with list of extent starts. If there are
85459 +     * any large discontiguities in the machine address space, 2MB gaps in
85460 +     * the machphys table will be represented by an MFN base of zero.
85461 +     */
85462 +    XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
85463 +
85464 +    /*
85465 +     * Number of extents written to the above array. This will be smaller
85466 +     * than 'max_extents' if the machphys table is smaller than max_extents * 2MB.
85467 +     */
85468 +    unsigned int nr_extents;
85469 +};
85470 +typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
85471 +DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
85472 +
85473 +/*
85474 + * Returns the location in virtual address space of the machine_to_phys
85475 + * mapping table. Architectures which do not have a m2p table, or which do not
85476 + * map it by default into guest address space, do not implement this command.
85477 + * arg == addr of xen_machphys_mapping_t.
85478 + */
85479 +#define XENMEM_machphys_mapping     12
85480 +struct xen_machphys_mapping {
85481 +    xen_ulong_t v_start, v_end; /* Start and end virtual addresses.   */
85482 +    xen_ulong_t max_mfn;        /* Maximum MFN that can be looked up. */
85483 +};
85484 +typedef struct xen_machphys_mapping xen_machphys_mapping_t;
85485 +DEFINE_XEN_GUEST_HANDLE(xen_machphys_mapping_t);
85486 +
85487 +/*
85488 + * Sets the GPFN at which a particular page appears in the specified guest's
85489 + * pseudophysical address space.
85490 + * arg == addr of xen_add_to_physmap_t.
85491 + */
85492 +#define XENMEM_add_to_physmap      7
85493 +struct xen_add_to_physmap {
85494 +    /* Which domain to change the mapping for. */
85495 +    domid_t domid;
85496 +
85497 +    /* Source mapping space. */
85498 +#define XENMAPSPACE_shared_info 0 /* shared info page */
85499 +#define XENMAPSPACE_grant_table 1 /* grant table page */
85500 +    unsigned int space;
85501 +
85502 +    /* Index into source mapping space. */
85503 +    xen_ulong_t idx;
85504 +
85505 +    /* GPFN where the source mapping page should appear. */
85506 +    xen_pfn_t     gpfn;
85507 +};
85508 +typedef struct xen_add_to_physmap xen_add_to_physmap_t;
85509 +DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
85510 +
85511 +/*
85512 + * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
85513 + * code on failure. This call only works for auto-translated guests.
85514 + */
85515 +#define XENMEM_translate_gpfn_list  8
85516 +struct xen_translate_gpfn_list {
85517 +    /* Which domain to translate for? */
85518 +    domid_t domid;
85519 +
85520 +    /* Length of list. */
85521 +    xen_ulong_t nr_gpfns;
85522 +
85523 +    /* List of GPFNs to translate. */
85524 +    XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
85525 +
85526 +    /*
85527 +     * Output list to contain MFN translations. May be the same as the input
85528 +     * list (in which case each input GPFN is overwritten with the output MFN).
85529 +     */
85530 +    XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
85531 +};
85532 +typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
85533 +DEFINE_XEN_GUEST_HANDLE(xen_translate_gpfn_list_t);
85534 +
85535 +/*
85536 + * Returns the pseudo-physical memory map as it was when the domain
85537 + * was started.
85538 + */
85539 +#define XENMEM_memory_map           9
85540 +struct xen_memory_map {
85541 +    /*
85542 +     * On call, the number of entries that can be stored in the buffer.
85543 +     * On return, the number of entries that have been stored in the
85544 +     * buffer.
85545 +     */
85546 +    unsigned int nr_entries;
85547 +
85548 +    /*
85549 +     * Entries in the buffer are in the same format as returned by the
85550 +     * BIOS INT 0x15 EAX=0xE820 call.
85551 +     */
85552 +    XEN_GUEST_HANDLE(void) buffer;
85553 +};
85554 +typedef struct xen_memory_map xen_memory_map_t;
85555 +DEFINE_XEN_GUEST_HANDLE(xen_memory_map_t);
85556 +
85557 +/*
85558 + * Returns the real physical memory map. Passes the same structure as
85559 + * XENMEM_memory_map.
85560 + */
85561 +#define XENMEM_machine_memory_map      10
85562 +
85563 +#endif /* __XEN_PUBLIC_MEMORY_H__ */
85564 +
85565 +/*
85566 + * Local variables:
85567 + * mode: C
85568 + * c-set-style: "BSD"
85569 + * c-basic-offset: 4
85570 + * tab-width: 4
85571 + * indent-tabs-mode: nil
85572 + * End:
85573 + */
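
To make the reservation structure concrete, here is a sketch of a balloon-style XENMEM_decrease_reservation call for a batch of order-0 pages. HYPERVISOR_memory_op() and set_xen_guest_handle() are the usual wrappers from the Xen support/arch headers and are assumed here rather than defined by this file.

    #include <xen/interface/xen.h>         /* DOMID_SELF, guest-handle plumbing */
    #include <xen/interface/memory.h>      /* header added by this patch        */

    /* Assumed hypercall wrapper, provided elsewhere by the Xen support code. */
    extern int HYPERVISOR_memory_op(unsigned int cmd, void *arg);

    /* Give `count` GMFNs listed in `frames` back to the hypervisor. */
    static int release_frames(xen_pfn_t *frames, unsigned long count)
    {
        struct xen_memory_reservation reservation = {
            .nr_extents   = count,
            .extent_order = 0,             /* single pages                     */
            .address_bits = 0,             /* not used for decrease            */
            .domid        = DOMID_SELF,
        };

        set_xen_guest_handle(reservation.extent_start, frames);

        /* Returns the number of extents actually freed. */
        return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
    }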
85574 diff -urNp linux-2.6/include/xen/interface/nmi.h new/include/xen/interface/nmi.h
85575 --- linux-2.6/include/xen/interface/nmi.h       1970-01-01 01:00:00.000000000 +0100
85576 +++ new/include/xen/interface/nmi.h     2006-05-23 18:42:17.000000000 +0200
85577 @@ -0,0 +1,60 @@
85578 +/******************************************************************************
85579 + * nmi.h
85580 + * 
85581 + * NMI callback registration and reason codes.
85582 + * 
85583 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
85584 + */
85585 +
85586 +#ifndef __XEN_PUBLIC_NMI_H__
85587 +#define __XEN_PUBLIC_NMI_H__
85588 +
85589 +/*
85590 + * NMI reason codes:
85591 + * Currently these are x86-specific, stored in arch_shared_info.nmi_reason.
85592 + */
85593 + /* I/O-check error reported via ISA port 0x61, bit 6. */
85594 +#define _XEN_NMIREASON_io_error     0
85595 +#define XEN_NMIREASON_io_error      (1UL << _XEN_NMIREASON_io_error)
85596 + /* Parity error reported via ISA port 0x61, bit 7. */
85597 +#define _XEN_NMIREASON_parity_error 1
85598 +#define XEN_NMIREASON_parity_error  (1UL << _XEN_NMIREASON_parity_error)
85599 + /* Unknown hardware-generated NMI. */
85600 +#define _XEN_NMIREASON_unknown      2
85601 +#define XEN_NMIREASON_unknown       (1UL << _XEN_NMIREASON_unknown)
85602 +
85603 +/*
85604 + * long nmi_op(unsigned int cmd, void *arg)
85605 + * NB. All ops return zero on success, else a negative error code.
85606 + */
85607 +
85608 +/*
85609 + * Register NMI callback for this (calling) VCPU. Currently this only makes
85610 + * sense for domain 0, vcpu 0. All other callers receive EINVAL.
85611 + * arg == pointer to xennmi_callback structure.
85612 + */
85613 +#define XENNMI_register_callback   0
85614 +struct xennmi_callback {
85615 +    unsigned long handler_address;
85616 +    unsigned long pad;
85617 +};
85618 +typedef struct xennmi_callback xennmi_callback_t;
85619 +DEFINE_XEN_GUEST_HANDLE(xennmi_callback_t);
85620 +
85621 +/*
85622 + * Deregister NMI callback for this (calling) VCPU.
85623 + * arg == NULL.
85624 + */
85625 +#define XENNMI_unregister_callback 1
85626 +
85627 +#endif /* __XEN_PUBLIC_NMI_H__ */
85628 +
85629 +/*
85630 + * Local variables:
85631 + * mode: C
85632 + * c-set-style: "BSD"
85633 + * c-basic-offset: 4
85634 + * tab-width: 4
85635 + * indent-tabs-mode: nil
85636 + * End:
85637 + */
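
A registration sketch follows; HYPERVISOR_nmi_op() is the hypercall wrapper assumed to be provided by the arch support code, and nmi_entry() is a hypothetical callback (only dom0/vcpu0 may register one, per the comment above).

    #include <xen/interface/nmi.h>         /* header added by this patch */

    /* Assumed hypercall wrapper, provided by the Xen arch support code. */
    extern int HYPERVISOR_nmi_op(unsigned long op, void *arg);

    extern void nmi_entry(void);           /* hypothetical NMI entry point */

    static int install_nmi_callback(void)
    {
        struct xennmi_callback cb = {
            .handler_address = (unsigned long)nmi_entry,
            .pad             = 0,
        };

        return HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
    }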
85638 diff -urNp linux-2.6/include/xen/interface/physdev.h new/include/xen/interface/physdev.h
85639 --- linux-2.6/include/xen/interface/physdev.h   1970-01-01 01:00:00.000000000 +0100
85640 +++ new/include/xen/interface/physdev.h 2006-06-28 14:32:14.000000000 +0200
85641 @@ -0,0 +1,149 @@
85642 +
85643 +#ifndef __XEN_PUBLIC_PHYSDEV_H__
85644 +#define __XEN_PUBLIC_PHYSDEV_H__
85645 +
85646 +/*
85647 + * Prototype for this hypercall is:
85648 + *  int physdev_op(int cmd, void *args)
85649 + * @cmd  == PHYSDEVOP_??? (physdev operation).
85650 + * @args == Operation-specific extra arguments (NULL if none).
85651 + */
85652 +
85653 +/*
85654 + * Notify end-of-interrupt (EOI) for the specified IRQ.
85655 + * @arg == pointer to physdev_eoi structure.
85656 + */
85657 +#define PHYSDEVOP_eoi                   12
85658 +struct physdev_eoi {
85659 +    /* IN */
85660 +    uint32_t irq;
85661 +};
85662 +typedef struct physdev_eoi physdev_eoi_t;
85663 +DEFINE_XEN_GUEST_HANDLE(physdev_eoi_t);
85664 +
85665 +/*
85666 + * Query the status of an IRQ line.
85667 + * @arg == pointer to physdev_irq_status_query structure.
85668 + */
85669 +#define PHYSDEVOP_irq_status_query       5
85670 +struct physdev_irq_status_query {
85671 +    /* IN */
85672 +    uint32_t irq;
85673 +    /* OUT */
85674 +    uint32_t flags; /* XENIRQSTAT_* */
85675 +};
85676 +typedef struct physdev_irq_status_query physdev_irq_status_query_t;
85677 +DEFINE_XEN_GUEST_HANDLE(physdev_irq_status_query_t);
85678 +
85679 +/* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */
85680 +#define _XENIRQSTAT_needs_eoi   (0)
85681 +#define  XENIRQSTAT_needs_eoi   (1U<<_XENIRQSTAT_needs_eoi)
85682 +
85683 +/* IRQ shared by multiple guests? */
85684 +#define _XENIRQSTAT_shared      (1)
85685 +#define  XENIRQSTAT_shared      (1U<<_XENIRQSTAT_shared)
85686 +
85687 +/*
85688 + * Set the current VCPU's I/O privilege level.
85689 + * @arg == pointer to physdev_set_iopl structure.
85690 + */
85691 +#define PHYSDEVOP_set_iopl               6
85692 +struct physdev_set_iopl {
85693 +    /* IN */
85694 +    uint32_t iopl;
85695 +};
85696 +typedef struct physdev_set_iopl physdev_set_iopl_t;
85697 +DEFINE_XEN_GUEST_HANDLE(physdev_set_iopl_t);
85698 +
85699 +/*
85700 + * Set the current VCPU's I/O-port permissions bitmap.
85701 + * @arg == pointer to physdev_set_iobitmap structure.
85702 + */
85703 +#define PHYSDEVOP_set_iobitmap           7
85704 +struct physdev_set_iobitmap {
85705 +    /* IN */
85706 +    uint8_t *bitmap;
85707 +    uint32_t nr_ports;
85708 +};
85709 +typedef struct physdev_set_iobitmap physdev_set_iobitmap_t;
85710 +DEFINE_XEN_GUEST_HANDLE(physdev_set_iobitmap_t);
85711 +
85712 +/*
85713 + * Read or write an IO-APIC register.
85714 + * @arg == pointer to physdev_apic structure.
85715 + */
85716 +#define PHYSDEVOP_apic_read              8
85717 +#define PHYSDEVOP_apic_write             9
85718 +struct physdev_apic {
85719 +    /* IN */
85720 +    unsigned long apic_physbase;
85721 +    uint32_t reg;
85722 +    /* IN or OUT */
85723 +    uint32_t value;
85724 +};
85725 +typedef struct physdev_apic physdev_apic_t;
85726 +DEFINE_XEN_GUEST_HANDLE(physdev_apic_t);
85727 +
85728 +/*
85729 + * Allocate or free a physical upcall vector for the specified IRQ line.
85730 + * @arg == pointer to physdev_irq structure.
85731 + */
85732 +#define PHYSDEVOP_alloc_irq_vector      10
85733 +#define PHYSDEVOP_free_irq_vector       11
85734 +struct physdev_irq {
85735 +    /* IN */
85736 +    uint32_t irq;
85737 +    /* IN or OUT */
85738 +    uint32_t vector;
85739 +};
85740 +typedef struct physdev_irq physdev_irq_t;
85741 +DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
85742 +
85743 +/*
85744 + * Argument to physdev_op_compat() hypercall. Superseded by the new physdev_op()
85745 + * hypercall since 0x00030202.
85746 + */
85747 +struct physdev_op {
85748 +    uint32_t cmd;
85749 +    union {
85750 +        struct physdev_irq_status_query      irq_status_query;
85751 +        struct physdev_set_iopl              set_iopl;
85752 +        struct physdev_set_iobitmap          set_iobitmap;
85753 +        struct physdev_apic                  apic_op;
85754 +        struct physdev_irq                   irq_op;
85755 +    } u;
85756 +};
85757 +typedef struct physdev_op physdev_op_t;
85758 +DEFINE_XEN_GUEST_HANDLE(physdev_op_t);
85759 +
85760 +/*
85761 + * Notify that some PIRQ-bound event channels have been unmasked.
85762 + * ** This command is obsolete since interface version 0x00030202 and is **
85763 + * ** unsupported by newer versions of Xen.                              **
85764 + */
85765 +#define PHYSDEVOP_IRQ_UNMASK_NOTIFY      4
85766 +
85767 +/*
85768 + * These all-capitals physdev operation names are superseded by the new names
85769 + * (defined above) since interface version 0x00030202.
85770 + */
85771 +#define PHYSDEVOP_IRQ_STATUS_QUERY       PHYSDEVOP_irq_status_query
85772 +#define PHYSDEVOP_SET_IOPL               PHYSDEVOP_set_iopl
85773 +#define PHYSDEVOP_SET_IOBITMAP           PHYSDEVOP_set_iobitmap
85774 +#define PHYSDEVOP_APIC_READ              PHYSDEVOP_apic_read
85775 +#define PHYSDEVOP_APIC_WRITE             PHYSDEVOP_apic_write
85776 +#define PHYSDEVOP_ASSIGN_VECTOR          PHYSDEVOP_alloc_irq_vector
85777 +#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi
85778 +#define PHYSDEVOP_IRQ_SHARED             XENIRQSTAT_shared
85779 +
85780 +#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
85781 +
85782 +/*
85783 + * Local variables:
85784 + * mode: C
85785 + * c-set-style: "BSD"
85786 + * c-basic-offset: 4
85787 + * tab-width: 4
85788 + * indent-tabs-mode: nil
85789 + * End:
85790 + */
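
As a usage sketch, acknowledging an IRQ that reports XENIRQSTAT_needs_eoi might look like this; HYPERVISOR_physdev_op() is the hypercall wrapper assumed to be provided by the arch support code.

    #include <stdint.h>
    #include <xen/interface/physdev.h>     /* header added by this patch */

    /* Assumed hypercall wrapper, provided by the Xen arch support code. */
    extern int HYPERVISOR_physdev_op(int cmd, void *arg);

    /* Query whether `irq` needs an explicit EOI, and send one if so. */
    static int maybe_eoi(uint32_t irq)
    {
        struct physdev_irq_status_query query = { .irq = irq };
        struct physdev_eoi eoi = { .irq = irq };
        int rc;

        rc = HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &query);
        if (rc)
            return rc;

        if (query.flags & XENIRQSTAT_needs_eoi)
            rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);

        return rc;
    }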
85791 diff -urNp linux-2.6/include/xen/interface/sched_ctl.h new/include/xen/interface/sched_ctl.h
85792 --- linux-2.6/include/xen/interface/sched_ctl.h 1970-01-01 01:00:00.000000000 +0100
85793 +++ new/include/xen/interface/sched_ctl.h       2006-06-05 15:54:33.000000000 +0200
85794 @@ -0,0 +1,69 @@
85795 +/******************************************************************************
85796 + * Generic scheduler control interface.
85797 + *
85798 + * Mark Williamson, (C) 2004 Intel Research Cambridge
85799 + */
85800 +
85801 +#ifndef __XEN_PUBLIC_SCHED_CTL_H__
85802 +#define __XEN_PUBLIC_SCHED_CTL_H__
85803 +
85804 +/* Scheduler types. */
85805 +#define SCHED_BVT      0
85806 +#define SCHED_SEDF     4
85807 +#define SCHED_CREDIT   5
85808 +
85809 +/* Set or get info? */
85810 +#define SCHED_INFO_PUT 0
85811 +#define SCHED_INFO_GET 1
85812 +
85813 +/*
85814 + * Generic scheduler control command - used to adjust system-wide scheduler
85815 + * parameters
85816 + */
85817 +struct sched_ctl_cmd {
85818 +    uint32_t sched_id;
85819 +    uint32_t direction;
85820 +    union {
85821 +        struct bvt_ctl {
85822 +            uint32_t ctx_allow;
85823 +        } bvt;
85824 +    } u;
85825 +};
85826 +
85827 +struct sched_adjdom_cmd {
85828 +    uint32_t sched_id;
85829 +    uint32_t direction;
85830 +    domid_t  domain;
85831 +    union {
85832 +        struct bvt_adjdom {
85833 +            uint32_t mcu_adv;      /* mcu advance: inverse of weight */
85834 +            uint32_t warpback;     /* warp? */
85835 +            int32_t  warpvalue;    /* warp value */
85836 +            int64_t  warpl;        /* warp limit */
85837 +            int64_t  warpu;        /* unwarp time requirement */
85838 +        } bvt;
85839 +        struct sedf_adjdom {
85840 +            uint64_t period;
85841 +            uint64_t slice;
85842 +            uint64_t latency;
85843 +            uint32_t extratime;
85844 +            uint32_t weight;
85845 +        } sedf;
85846 +        struct sched_credit_adjdom {
85847 +            uint16_t weight;
85848 +            uint16_t cap;
85849 +        } credit;
85850 +    } u;
85851 +};
85852 +
85853 +#endif /* __XEN_PUBLIC_SCHED_CTL_H__ */
85854 +
85855 +/*
85856 + * Local variables:
85857 + * mode: C
85858 + * c-set-style: "BSD"
85859 + * c-basic-offset: 4
85860 + * tab-width: 4
85861 + * indent-tabs-mode: nil
85862 + * End:
85863 + */
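The control structures above are normally filled in by dom0 tools rather than by the guest kernel. As a purely illustrative sketch (the dom0 operation that actually carries sched_adjdom_cmd is not part of this header), this is how a credit-scheduler weight/cap adjustment for a domain would be encoded:

/* Illustrative only: how the fields of sched_adjdom_cmd fit together for the
 * credit scheduler. Submitting the command is done via the dom0 control
 * interface, which is outside this header. */
static void example_fill_credit_adjdom(struct sched_adjdom_cmd *cmd,
                                       domid_t dom,
                                       uint16_t weight, uint16_t cap)
{
        cmd->sched_id  = SCHED_CREDIT;   /* which scheduler the params are for */
        cmd->direction = SCHED_INFO_PUT; /* set (rather than get) parameters   */
        cmd->domain    = dom;
        cmd->u.credit.weight = weight;   /* relative share of CPU time         */
        cmd->u.credit.cap    = cap;      /* upper utilisation bound; 0 = none  */
}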
85864 diff -urNp linux-2.6/include/xen/interface/sched.h new/include/xen/interface/sched.h
85865 --- linux-2.6/include/xen/interface/sched.h     1970-01-01 01:00:00.000000000 +0100
85866 +++ new/include/xen/interface/sched.h   2006-05-23 18:42:17.000000000 +0200
85867 @@ -0,0 +1,103 @@
85868 +/******************************************************************************
85869 + * sched.h
85870 + * 
85871 + * Scheduler state interactions
85872 + * 
85873 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
85874 + */
85875 +
85876 +#ifndef __XEN_PUBLIC_SCHED_H__
85877 +#define __XEN_PUBLIC_SCHED_H__
85878 +
85879 +#include "event_channel.h"
85880 +
85881 +/*
85882 + * The prototype for this hypercall is:
85883 + *  long sched_op(int cmd, void *arg)
85884 + * @cmd == SCHEDOP_??? (scheduler operation).
85885 + * @arg == Operation-specific extra argument(s), as described below.
85886 + * 
85887 + * Versions of Xen prior to 3.0.2 provided only the following legacy version
85888 + * of this hypercall, supporting only the commands yield, block and shutdown:
85889 + *  long sched_op(int cmd, unsigned long arg)
85890 + * @cmd == SCHEDOP_??? (scheduler operation).
85891 + * @arg == 0               (SCHEDOP_yield and SCHEDOP_block)
85892 + *      == SHUTDOWN_* code (SCHEDOP_shutdown)
85893 + * This legacy version is available to new guests as sched_op_compat().
85894 + */
85895 +
85896 +/*
85897 + * Voluntarily yield the CPU.
85898 + * @arg == NULL.
85899 + */
85900 +#define SCHEDOP_yield       0
85901 +
85902 +/*
85903 + * Block execution of this VCPU until an event is received for processing.
85904 + * If called with event upcalls masked, this operation will atomically
85905 + * reenable event delivery and check for pending events before blocking the
85906 + * VCPU. This avoids a "wakeup waiting" race.
85907 + * @arg == NULL.
85908 + */
85909 +#define SCHEDOP_block       1
85910 +
85911 +/*
85912 + * Halt execution of this domain (all VCPUs) and notify the system controller.
85913 + * @arg == pointer to sched_shutdown structure.
85914 + */
85915 +#define SCHEDOP_shutdown    2
85916 +struct sched_shutdown {
85917 +    unsigned int reason; /* SHUTDOWN_* */
85918 +};
85919 +typedef struct sched_shutdown sched_shutdown_t;
85920 +DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t);
85921 +
85922 +/*
85923 + * Poll a set of event-channel ports. Return when one or more are pending. An
85924 + * optional timeout may be specified.
85925 + * @arg == pointer to sched_poll structure.
85926 + */
85927 +#define SCHEDOP_poll        3
85928 +struct sched_poll {
85929 +    XEN_GUEST_HANDLE(evtchn_port_t) ports;
85930 +    unsigned int nr_ports;
85931 +    uint64_t timeout;
85932 +};
85933 +typedef struct sched_poll sched_poll_t;
85934 +DEFINE_XEN_GUEST_HANDLE(sched_poll_t);
85935 +
85936 +/*
85937 + * Declare a shutdown for another domain. The main use of this function is
85938 + * in interpreting shutdown requests and reasons for fully-virtualized
85939 + * domains.  A para-virtualized domain may use SCHEDOP_shutdown directly.
85940 + * @arg == pointer to sched_remote_shutdown structure.
85941 + */
85942 +#define SCHEDOP_remote_shutdown        4
85943 +struct sched_remote_shutdown {
85944 +    domid_t domain_id;         /* Remote domain ID */
85945 +    unsigned int reason;       /* SHUTDOWN_xxx reason */
85946 +};
85947 +typedef struct sched_remote_shutdown sched_remote_shutdown_t;
85948 +DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t);
85949 +
85950 +/*
85951 + * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
85952 + * software to determine the appropriate action. For the most part, Xen does
85953 + * not care about the shutdown code.
85954 + */
85955 +#define SHUTDOWN_poweroff   0  /* Domain exited normally. Clean up and kill. */
85956 +#define SHUTDOWN_reboot     1  /* Clean up, kill, and then restart.          */
85957 +#define SHUTDOWN_suspend    2  /* Clean up, save suspend info, kill.         */
85958 +#define SHUTDOWN_crash      3  /* Tell controller we've crashed.             */
85959 +
85960 +#endif /* __XEN_PUBLIC_SCHED_H__ */
85961 +
85962 +/*
85963 + * Local variables:
85964 + * mode: C
85965 + * c-set-style: "BSD"
85966 + * c-basic-offset: 4
85967 + * tab-width: 4
85968 + * indent-tabs-mode: nil
85969 + * End:
85970 + */
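As a rough sketch of how a guest uses the commands above (assuming the HYPERVISOR_sched_op(int cmd, void *arg) wrapper and the set_xen_guest_handle() helper added elsewhere by this patch), a clean poweroff and a single-port poll would look like this:

/* Sketch only: HYPERVISOR_sched_op() is the assumed hypercall wrapper for
 * __HYPERVISOR_sched_op; it is not defined in this header. */
static void example_poweroff(void)
{
        struct sched_shutdown shutdown = { .reason = SHUTDOWN_poweroff };

        HYPERVISOR_sched_op(SCHEDOP_shutdown, &shutdown);
}

static int example_poll_port(evtchn_port_t port, uint64_t timeout)
{
        struct sched_poll poll;

        set_xen_guest_handle(poll.ports, &port); /* assumed helper macro */
        poll.nr_ports = 1;
        poll.timeout  = timeout;   /* see the Xen docs for timeout semantics */

        return HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
}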
85971 diff -urNp linux-2.6/include/xen/interface/trace.h new/include/xen/interface/trace.h
85972 --- linux-2.6/include/xen/interface/trace.h     1970-01-01 01:00:00.000000000 +0100
85973 +++ new/include/xen/interface/trace.h   2006-05-09 12:35:56.000000000 +0200
85974 @@ -0,0 +1,86 @@
85975 +/******************************************************************************
85976 + * include/public/trace.h
85977 + * 
85978 + * Mark Williamson, (C) 2004 Intel Research Cambridge
85979 + * Copyright (C) 2005 Bin Ren
85980 + */
85981 +
85982 +#ifndef __XEN_PUBLIC_TRACE_H__
85983 +#define __XEN_PUBLIC_TRACE_H__
85984 +
85985 +/* Trace classes */
85986 +#define TRC_CLS_SHIFT 16
85987 +#define TRC_GEN     0x0001f000    /* General trace            */
85988 +#define TRC_SCHED   0x0002f000    /* Xen Scheduler trace      */
85989 +#define TRC_DOM0OP  0x0004f000    /* Xen DOM0 operation trace */
85990 +#define TRC_VMX     0x0008f000    /* Xen VMX trace            */
85991 +#define TRC_MEM     0x000af000    /* Xen memory trace         */
85992 +#define TRC_ALL     0xfffff000
85993 +
85994 +/* Trace subclasses */
85995 +#define TRC_SUBCLS_SHIFT 12
85996 +/* trace subclasses for VMX */
85997 +#define TRC_VMXEXIT  0x00081000   /* VMX exit trace            */
85998 +#define TRC_VMXTIMER 0x00082000   /* VMX timer trace           */
85999 +#define TRC_VMXINT   0x00084000   /* VMX interrupt trace       */
86000 +#define TRC_VMXIO    0x00088000   /* VMX io emulation trace  */
86001 +
86002 +/* Trace events per class */
86003 +
86004 +#define TRC_SCHED_DOM_ADD       (TRC_SCHED +  1)
86005 +#define TRC_SCHED_DOM_REM       (TRC_SCHED +  2)
86006 +#define TRC_SCHED_SLEEP         (TRC_SCHED +  3)
86007 +#define TRC_SCHED_WAKE          (TRC_SCHED +  4)
86008 +#define TRC_SCHED_YIELD         (TRC_SCHED +  5)
86009 +#define TRC_SCHED_BLOCK         (TRC_SCHED +  6)
86010 +#define TRC_SCHED_SHUTDOWN      (TRC_SCHED +  7)
86011 +#define TRC_SCHED_CTL           (TRC_SCHED +  8)
86012 +#define TRC_SCHED_ADJDOM        (TRC_SCHED +  9)
86013 +#define TRC_SCHED_SWITCH        (TRC_SCHED + 10)
86014 +#define TRC_SCHED_S_TIMER_FN    (TRC_SCHED + 11)
86015 +#define TRC_SCHED_T_TIMER_FN    (TRC_SCHED + 12)
86016 +#define TRC_SCHED_DOM_TIMER_FN  (TRC_SCHED + 13)
86017 +#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED + 14)
86018 +#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED + 15)
86019 +
86020 +#define TRC_MEM_PAGE_GRANT_MAP      (TRC_MEM + 1)
86021 +#define TRC_MEM_PAGE_GRANT_UNMAP    (TRC_MEM + 2)
86022 +#define TRC_MEM_PAGE_GRANT_TRANSFER (TRC_MEM + 3)
86023 +
86024 +/* trace events per subclass */
86025 +#define TRC_VMX_VMEXIT          (TRC_VMXEXIT + 1)
86026 +#define TRC_VMX_VMENTRY         (TRC_VMXEXIT + 2)
86027 +
86028 +#define TRC_VMX_TIMER_INTR      (TRC_VMXTIMER + 1)
86029 +
86030 +#define TRC_VMX_INT             (TRC_VMXINT + 1)
86031 +
86032 +
86033 +/* This structure represents a single trace buffer record. */
86034 +struct t_rec {
86035 +    uint64_t cycles;          /* cycle counter timestamp */
86036 +    uint32_t event;           /* event ID                */
86037 +    unsigned long data[5];    /* event data items        */
86038 +};
86039 +
86040 +/*
86041 + * This structure contains the metadata for a single trace buffer.  The head
86042 + * field indexes into an array of struct t_rec's.
86043 + */
86044 +struct t_buf {
86045 +    uint32_t cons;      /* Next item to be consumed by control tools. */
86046 +    uint32_t prod;      /* Next item to be produced by Xen.           */
86047 +    /* 'nr_recs' records follow immediately after the meta-data header.    */
86048 +};
86049 +
86050 +#endif /* __XEN_PUBLIC_TRACE_H__ */
86051 +
86052 +/*
86053 + * Local variables:
86054 + * mode: C
86055 + * c-set-style: "BSD"
86056 + * c-basic-offset: 4
86057 + * tab-width: 4
86058 + * indent-tabs-mode: nil
86059 + * End:
86060 + */
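A hedged sketch of how a dom0 consumer might drain one trace buffer laid out as above: Xen advances prod as it logs records and the consumer advances cons as it drains them. The mapping of the buffer and the value of nr_recs come from the separate trace-buffer dom0 interface, not from this header, and a real consumer would also need memory barriers.

/* Illustrative consumer loop; `buf`, `recs` and `nr_recs` are assumed to come
 * from the (separate) dom0 trace-buffer setup interface. */
static void example_drain_tbuf(struct t_buf *buf, struct t_rec *recs,
                               unsigned int nr_recs)
{
        while (buf->cons != buf->prod) {
                struct t_rec *r = &recs[buf->cons % nr_recs];

                /* r->cycles, r->event and r->data[] describe one event. */
                handle_record(r);   /* hypothetical handler */

                buf->cons++;        /* hand the slot back to Xen */
        }
}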
86061 diff -urNp linux-2.6/include/xen/interface/vcpu.h new/include/xen/interface/vcpu.h
86062 --- linux-2.6/include/xen/interface/vcpu.h      1970-01-01 01:00:00.000000000 +0100
86063 +++ new/include/xen/interface/vcpu.h    2006-05-23 18:42:17.000000000 +0200
86064 @@ -0,0 +1,121 @@
86065 +/******************************************************************************
86066 + * vcpu.h
86067 + * 
86068 + * VCPU initialisation, query, and hotplug.
86069 + * 
86070 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
86071 + */
86072 +
86073 +#ifndef __XEN_PUBLIC_VCPU_H__
86074 +#define __XEN_PUBLIC_VCPU_H__
86075 +
86076 +/*
86077 + * Prototype for this hypercall is:
86078 + *  int vcpu_op(int cmd, int vcpuid, void *extra_args)
86079 + * @cmd        == VCPUOP_??? (VCPU operation).
86080 + * @vcpuid     == VCPU to operate on.
86081 + * @extra_args == Operation-specific extra arguments (NULL if none).
86082 + */
86083 +
86084 +/*
86085 + * Initialise a VCPU. Each VCPU can be initialised only once. A 
86086 + * newly-initialised VCPU will not run until it is brought up by VCPUOP_up.
86087 + * 
86088 + * @extra_arg == pointer to vcpu_guest_context structure containing initial
86089 + *               state for the VCPU.
86090 + */
86091 +#define VCPUOP_initialise           0
86092 +
86093 +/*
86094 + * Bring up a VCPU. This makes the VCPU runnable. This operation will fail
86095 + * if the VCPU has not been initialised (VCPUOP_initialise).
86096 + */
86097 +#define VCPUOP_up                   1
86098 +
86099 +/*
86100 + * Bring down a VCPU (i.e., make it non-runnable).
86101 + * There are a few caveats that callers should observe:
86102 + *  1. This operation may return, and VCPUOP_is_up may return false, before the
86103 + *     VCPU stops running (i.e., the command is asynchronous). It is a good
86104 + *     idea to ensure that the VCPU has entered a non-critical loop before
86105 + *     bringing it down. Alternatively, this operation is guaranteed
86106 + *     synchronous if invoked by the VCPU itself.
86107 + *  2. After a VCPU is initialised, there is currently no way to drop all its
86108 + *     references to domain memory. Even a VCPU that is down still holds
86109 + *     memory references via its pagetable base pointer and GDT. It is good
86110 + *     practise to move a VCPU onto an 'idle' or default page table, LDT and
86111 + *     GDT before bringing it down.
86112 + */
86113 +#define VCPUOP_down                 2
86114 +
86115 +/* Returns 1 if the given VCPU is up. */
86116 +#define VCPUOP_is_up                3
86117 +
86118 +/*
86119 + * Return information about the state and running time of a VCPU.
86120 + * @extra_arg == pointer to vcpu_runstate_info structure.
86121 + */
86122 +#define VCPUOP_get_runstate_info    4
86123 +struct vcpu_runstate_info {
86124 +    /* VCPU's current state (RUNSTATE_*). */
86125 +    int      state;
86126 +    /* When was current state entered (system time, ns)? */
86127 +    uint64_t state_entry_time;
86128 +    /*
86129 +     * Time spent in each RUNSTATE_* (ns). The sum of these times is
86130 +     * guaranteed not to drift from system time.
86131 +     */
86132 +    uint64_t time[4];
86133 +};
86134 +typedef struct vcpu_runstate_info vcpu_runstate_info_t;
86135 +
86136 +/* VCPU is currently running on a physical CPU. */
86137 +#define RUNSTATE_running  0
86138 +
86139 +/* VCPU is runnable, but not currently scheduled on any physical CPU. */
86140 +#define RUNSTATE_runnable 1
86141 +
86142 +/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
86143 +#define RUNSTATE_blocked  2
86144 +
86145 +/*
86146 + * VCPU is not runnable, but it is not blocked.
86147 + * This is a 'catch all' state for things like hotplug and pauses by the
86148 + * system administrator (or for critical sections in the hypervisor).
86149 + * RUNSTATE_blocked dominates this state (it is the preferred state).
86150 + */
86151 +#define RUNSTATE_offline  3
86152 +
86153 +/*
86154 + * Register a shared memory area from which the guest may obtain its own
86155 + * runstate information without needing to execute a hypercall.
86156 + * Notes:
86157 + *  1. The registered address may be virtual or physical, depending on the
86158 + *     platform. The virtual address should be registered on x86 systems.
86159 + *  2. Only one shared area may be registered per VCPU. The shared area is
86160 + *     updated by the hypervisor each time the VCPU is scheduled. Thus
86161 + *     runstate.state will always be RUNSTATE_running and
86162 + *     runstate.state_entry_time will indicate the system time at which the
86163 + *     VCPU was last scheduled to run.
86164 + * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
86165 + */
86166 +#define VCPUOP_register_runstate_memory_area 5
86167 +struct vcpu_register_runstate_memory_area {
86168 +    union {
86169 +        struct vcpu_runstate_info *v;
86170 +        uint64_t p;
86171 +    } addr;
86172 +};
86173 +typedef struct vcpu_register_runstate_memory_area vcpu_register_runstate_memory_area_t;
86174 +
86175 +#endif /* __XEN_PUBLIC_VCPU_H__ */
86176 +
86177 +/*
86178 + * Local variables:
86179 + * mode: C
86180 + * c-set-style: "BSD"
86181 + * c-basic-offset: 4
86182 + * tab-width: 4
86183 + * indent-tabs-mode: nil
86184 + * End:
86185 + */
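A minimal sketch of querying a VCPU's runstate with the interface above, assuming the HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args) wrapper that the rest of this patch provides:

/* Sketch only: HYPERVISOR_vcpu_op() is the assumed wrapper for
 * __HYPERVISOR_vcpu_op; it is not defined in this header. */
static uint64_t example_vcpu_running_ns(int vcpuid)
{
        struct vcpu_runstate_info info;

        if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, vcpuid, &info) != 0)
                return 0;   /* VCPU not initialised, or call failed */

        /* time[] is indexed by RUNSTATE_*; report time spent actually running. */
        return info.time[RUNSTATE_running];
}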
86186 diff -urNp linux-2.6/include/xen/interface/version.h new/include/xen/interface/version.h
86187 --- linux-2.6/include/xen/interface/version.h   1970-01-01 01:00:00.000000000 +0100
86188 +++ new/include/xen/interface/version.h 2006-07-07 15:10:03.000000000 +0200
86189 @@ -0,0 +1,70 @@
86190 +/******************************************************************************
86191 + * version.h
86192 + * 
86193 + * Xen version, type, and compile information.
86194 + * 
86195 + * Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com>
86196 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
86197 + */
86198 +
86199 +#ifndef __XEN_PUBLIC_VERSION_H__
86200 +#define __XEN_PUBLIC_VERSION_H__
86201 +
86202 +/* NB. All ops return zero on success, except XENVER_{version,pagesize} */
86203 +
86204 +/* arg == NULL; returns major:minor (16:16). */
86205 +#define XENVER_version      0
86206 +
86207 +/* arg == xen_extraversion_t. */
86208 +#define XENVER_extraversion 1
86209 +typedef char xen_extraversion_t[16];
86210 +#define XEN_EXTRAVERSION_LEN (sizeof(xen_extraversion_t))
86211 +
86212 +/* arg == xen_compile_info_t. */
86213 +#define XENVER_compile_info 2
86214 +struct xen_compile_info {
86215 +    char compiler[64];
86216 +    char compile_by[16];
86217 +    char compile_domain[32];
86218 +    char compile_date[32];
86219 +};
86220 +typedef struct xen_compile_info xen_compile_info_t;
86221 +
86222 +#define XENVER_capabilities 3
86223 +typedef char xen_capabilities_info_t[1024];
86224 +#define XEN_CAPABILITIES_INFO_LEN (sizeof(xen_capabilities_info_t))
86225 +
86226 +#define XENVER_changeset 4
86227 +typedef char xen_changeset_info_t[64];
86228 +#define XEN_CHANGESET_INFO_LEN (sizeof(xen_changeset_info_t))
86229 +
86230 +#define XENVER_platform_parameters 5
86231 +struct xen_platform_parameters {
86232 +    unsigned long virt_start;
86233 +};
86234 +typedef struct xen_platform_parameters xen_platform_parameters_t;
86235 +
86236 +#define XENVER_get_features 6
86237 +struct xen_feature_info {
86238 +    unsigned int submap_idx;    /* IN: which 32-bit submap to return */
86239 +    uint32_t     submap;        /* OUT: 32-bit submap */
86240 +};
86241 +typedef struct xen_feature_info xen_feature_info_t;
86242 +
86243 +/* Declares the features reported by XENVER_get_features. */
86244 +#include "features.h"
86245 +
86246 +/* arg == NULL; returns host memory page size. */
86247 +#define XENVER_pagesize 7
86248 +
86249 +#endif /* __XEN_PUBLIC_VERSION_H__ */
86250 +
86251 +/*
86252 + * Local variables:
86253 + * mode: C
86254 + * c-set-style: "BSD"
86255 + * c-basic-offset: 4
86256 + * tab-width: 4
86257 + * indent-tabs-mode: nil
86258 + * End:
86259 + */
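For example, the 16:16 packing of XENVER_version and the fixed-size XENVER_extraversion buffer would be used roughly as follows. HYPERVISOR_xen_version(int cmd, void *arg) is the assumed guest wrapper for __HYPERVISOR_xen_version.

/* Sketch only: HYPERVISOR_xen_version() is the assumed hypercall wrapper. */
static void example_print_xen_version(void)
{
        xen_extraversion_t extra;   /* XEN_EXTRAVERSION_LEN bytes */
        int ver = HYPERVISOR_xen_version(XENVER_version, NULL);

        /* XENVER_version returns major:minor packed as 16:16. */
        int major = ver >> 16;
        int minor = ver & 0xffff;

        HYPERVISOR_xen_version(XENVER_extraversion, extra);
        printk("Running on Xen %d.%d%s\n", major, minor, extra);
}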
86260 diff -urNp linux-2.6/include/xen/interface/xen-compat.h new/include/xen/interface/xen-compat.h
86261 --- linux-2.6/include/xen/interface/xen-compat.h        1970-01-01 01:00:00.000000000 +0100
86262 +++ new/include/xen/interface/xen-compat.h      2006-05-09 12:35:56.000000000 +0200
86263 @@ -0,0 +1,47 @@
86264 +/******************************************************************************
86265 + * xen-compat.h
86266 + * 
86267 + * Guest OS interface to Xen.  Compatibility layer.
86268 + * 
86269 + * Copyright (c) 2006, Christian Limpach
86270 + */
86271 +
86272 +#ifndef __XEN_PUBLIC_XEN_COMPAT_H__
86273 +#define __XEN_PUBLIC_XEN_COMPAT_H__
86274 +
86275 +#define __XEN_LATEST_INTERFACE_VERSION__ 0x00030202
86276 +
86277 +#if defined(__XEN__) || defined(__XEN_TOOLS__)
86278 +/* Xen is built with matching headers and implements the latest interface. */
86279 +#define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__
86280 +#elif !defined(__XEN_INTERFACE_VERSION__)
86281 +/* Guests which do not specify a version get the legacy interface. */
86282 +#define __XEN_INTERFACE_VERSION__ 0x00000000
86283 +#endif
86284 +
86285 +#if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__
86286 +#error "These header files do not support the requested interface version."
86287 +#endif
86288 +
86289 +/* New sched_op hypercall introduced in 0x00030101. */
86290 +#if __XEN_INTERFACE_VERSION__ < 0x00030101
86291 +#undef __HYPERVISOR_sched_op
86292 +#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_compat
86293 +#endif
86294 +
86295 +/* Structural guest handles introduced in 0x00030201. */
86296 +#if __XEN_INTERFACE_VERSION__ < 0x00030201
86297 +#undef __DEFINE_XEN_GUEST_HANDLE
86298 +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
86299 +    typedef type * __guest_handle_ ## name
86300 +#endif
86301 +
86302 +/* New event-channel and physdev hypercalls introduced in 0x00030202. */
86303 +#if __XEN_INTERFACE_VERSION__ < 0x00030202
86304 +#undef __HYPERVISOR_event_channel_op
86305 +#define __HYPERVISOR_event_channel_op __HYPERVISOR_event_channel_op_compat
86306 +#undef __HYPERVISOR_physdev_op
86307 +#define __HYPERVISOR_physdev_op __HYPERVISOR_physdev_op_compat
86308 +#endif
86309 +
86310 +#endif /* __XEN_PUBLIC_XEN_COMPAT_H__ */
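The net effect of the compatibility logic above is that a guest opts in to the newer hypercall numbering by defining __XEN_INTERFACE_VERSION__ before including the public headers; a minimal sketch:

/* Sketch: request the 3.0.2-era interface explicitly. Without this define,
 * a plain guest build falls back to the legacy (compat) hypercall numbers. */
#define __XEN_INTERFACE_VERSION__ 0x00030202
#include <xen/interface/xen.h>

/* With the define above, __HYPERVISOR_sched_op, __HYPERVISOR_event_channel_op
 * and __HYPERVISOR_physdev_op keep their new numbers instead of being
 * redirected to their *_compat equivalents. */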
86311 diff -urNp linux-2.6/include/xen/interface/xen.h new/include/xen/interface/xen.h
86312 --- linux-2.6/include/xen/interface/xen.h       1970-01-01 01:00:00.000000000 +0100
86313 +++ new/include/xen/interface/xen.h     2006-06-07 13:29:36.000000000 +0200
86314 @@ -0,0 +1,505 @@
86315 +/******************************************************************************
86316 + * xen.h
86317 + * 
86318 + * Guest OS interface to Xen.
86319 + * 
86320 + * Copyright (c) 2004, K A Fraser
86321 + */
86322 +
86323 +#ifndef __XEN_PUBLIC_XEN_H__
86324 +#define __XEN_PUBLIC_XEN_H__
86325 +
86326 +#if defined(__i386__)
86327 +#include "arch-x86_32.h"
86328 +#elif defined(__x86_64__)
86329 +#include "arch-x86_64.h"
86330 +#elif defined(__ia64__)
86331 +#include "arch-ia64.h"
86332 +#else
86333 +#error "Unsupported architecture"
86334 +#endif
86335 +
86336 +/*
86337 + * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
86338 + */
86339 +
86340 +/*
86341 + * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.
86342 + *         EAX = return value
86343 + *         (argument registers may be clobbered on return)
86344 + * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6. 
86345 + *         RAX = return value
86346 + *         (argument registers not clobbered on return; RCX, R11 are)
86347 + */
86348 +#define __HYPERVISOR_set_trap_table        0
86349 +#define __HYPERVISOR_mmu_update            1
86350 +#define __HYPERVISOR_set_gdt               2
86351 +#define __HYPERVISOR_stack_switch          3
86352 +#define __HYPERVISOR_set_callbacks         4
86353 +#define __HYPERVISOR_fpu_taskswitch        5
86354 +#define __HYPERVISOR_sched_op_compat       6 /* compat since 0x00030101 */
86355 +#define __HYPERVISOR_dom0_op               7
86356 +#define __HYPERVISOR_set_debugreg          8
86357 +#define __HYPERVISOR_get_debugreg          9
86358 +#define __HYPERVISOR_update_descriptor    10
86359 +#define __HYPERVISOR_memory_op            12
86360 +#define __HYPERVISOR_multicall            13
86361 +#define __HYPERVISOR_update_va_mapping    14
86362 +#define __HYPERVISOR_set_timer_op         15
86363 +#define __HYPERVISOR_event_channel_op_compat 16 /* compat since 0x00030202 */
86364 +#define __HYPERVISOR_xen_version          17
86365 +#define __HYPERVISOR_console_io           18
86366 +#define __HYPERVISOR_physdev_op_compat    19 /* compat since 0x00030202 */
86367 +#define __HYPERVISOR_grant_table_op       20
86368 +#define __HYPERVISOR_vm_assist            21
86369 +#define __HYPERVISOR_update_va_mapping_otherdomain 22
86370 +#define __HYPERVISOR_iret                 23 /* x86 only */
86371 +#define __HYPERVISOR_vcpu_op              24
86372 +#define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
86373 +#define __HYPERVISOR_mmuext_op            26
86374 +#define __HYPERVISOR_acm_op               27
86375 +#define __HYPERVISOR_nmi_op               28
86376 +#define __HYPERVISOR_sched_op             29
86377 +#define __HYPERVISOR_callback_op          30
86378 +#define __HYPERVISOR_xenoprof_op          31
86379 +#define __HYPERVISOR_event_channel_op     32
86380 +#define __HYPERVISOR_physdev_op           33
86381 +
86382 +/* Architecture-specific hypercall definitions. */
86383 +#define __HYPERVISOR_arch_0               48
86384 +#define __HYPERVISOR_arch_1               49
86385 +#define __HYPERVISOR_arch_2               50
86386 +#define __HYPERVISOR_arch_3               51
86387 +#define __HYPERVISOR_arch_4               52
86388 +#define __HYPERVISOR_arch_5               53
86389 +#define __HYPERVISOR_arch_6               54
86390 +#define __HYPERVISOR_arch_7               55
86391 +
86392 +/* 
86393 + * VIRTUAL INTERRUPTS
86394 + * 
86395 + * Virtual interrupts that a guest OS may receive from Xen.
86396 + * 
86397 + * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a
86398 + * global VIRQ. The former can be bound once per VCPU and cannot be re-bound.
86399 + * The latter can be allocated only once per guest: they must initially be
86400 + * allocated to VCPU0 but can subsequently be re-bound.
86401 + */
86402 +#define VIRQ_TIMER      0  /* V. Timebase update, and/or requested timeout.  */
86403 +#define VIRQ_DEBUG      1  /* V. Request guest to dump debug info.           */
86404 +#define VIRQ_CONSOLE    2  /* G. (DOM0) Bytes received on emergency console. */
86405 +#define VIRQ_DOM_EXC    3  /* G. (DOM0) Exceptional event for some domain.   */
86406 +#define VIRQ_TBUF       4  /* G. (DOM0) Trace buffer has records available.  */
86407 +#define VIRQ_DEBUGGER   6  /* G. (DOM0) A domain has paused for debugging.   */
86408 +#define VIRQ_XENOPROF   7  /* V. XenOprofile interrupt: new sample available */
86409 +
86410 +/* Architecture-specific VIRQ definitions. */
86411 +#define VIRQ_ARCH_0    16
86412 +#define VIRQ_ARCH_1    17
86413 +#define VIRQ_ARCH_2    18
86414 +#define VIRQ_ARCH_3    19
86415 +#define VIRQ_ARCH_4    20
86416 +#define VIRQ_ARCH_5    21
86417 +#define VIRQ_ARCH_6    22
86418 +#define VIRQ_ARCH_7    23
86419 +
86420 +#define NR_VIRQS       24
86421 +
86422 +/*
86423 + * MMU-UPDATE REQUESTS
86424 + * 
86425 + * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
86426 + * A foreigndom (FD) can be specified (or DOMID_SELF for none).
86427 + * Where the FD has some effect, it is described below.
86428 + * ptr[1:0] specifies the appropriate MMU_* command.
86429 + * 
86430 + * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
86431 + * Updates an entry in a page table. If updating an L1 table, and the new
86432 + * table entry is valid/present, the mapped frame must belong to the FD, if
86433 + * an FD has been specified. If attempting to map an I/O page then the
86434 + * caller assumes the privilege of the FD.
86435 + * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
86436 + * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
86437 + * ptr[:2]  -- Machine address of the page-table entry to modify.
86438 + * val      -- Value to write.
86439 + * 
86440 + * ptr[1:0] == MMU_MACHPHYS_UPDATE:
86441 + * Updates an entry in the machine->pseudo-physical mapping table.
86442 + * ptr[:2]  -- Machine address within the frame whose mapping to modify.
86443 + *             The frame must belong to the FD, if one is specified.
86444 + * val      -- Value to write into the mapping entry.
86445 + */
86446 +#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is MA.       */
86447 +#define MMU_MACHPHYS_UPDATE      1 /* ptr = MA of frame to modify entry for  */
86448 +
86449 +/*
86450 + * MMU EXTENDED OPERATIONS
86451 + * 
86452 + * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
86453 + * A foreigndom (FD) can be specified (or DOMID_SELF for none).
86454 + * Where the FD has some effect, it is described below.
86455 + * 
86456 + * cmd: MMUEXT_(UN)PIN_*_TABLE
86457 + * mfn: Machine frame number to be (un)pinned as a p.t. page.
86458 + *      The frame must belong to the FD, if one is specified.
86459 + * 
86460 + * cmd: MMUEXT_NEW_BASEPTR
86461 + * mfn: Machine frame number of new page-table base to install in MMU.
86462 + * 
86463 + * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
86464 + * mfn: Machine frame number of new page-table base to install in MMU
86465 + *      when in user space.
86466 + * 
86467 + * cmd: MMUEXT_TLB_FLUSH_LOCAL
86468 + * No additional arguments. Flushes local TLB.
86469 + * 
86470 + * cmd: MMUEXT_INVLPG_LOCAL
86471 + * linear_addr: Linear address to be flushed from the local TLB.
86472 + * 
86473 + * cmd: MMUEXT_TLB_FLUSH_MULTI
86474 + * vcpumask: Pointer to bitmap of VCPUs to be flushed.
86475 + * 
86476 + * cmd: MMUEXT_INVLPG_MULTI
86477 + * linear_addr: Linear address to be flushed.
86478 + * vcpumask: Pointer to bitmap of VCPUs to be flushed.
86479 + * 
86480 + * cmd: MMUEXT_TLB_FLUSH_ALL
86481 + * No additional arguments. Flushes all VCPUs' TLBs.
86482 + * 
86483 + * cmd: MMUEXT_INVLPG_ALL
86484 + * linear_addr: Linear address to be flushed from all VCPUs' TLBs.
86485 + * 
86486 + * cmd: MMUEXT_FLUSH_CACHE
86487 + * No additional arguments. Writes back and flushes cache contents.
86488 + * 
86489 + * cmd: MMUEXT_SET_LDT
86490 + * linear_addr: Linear address of LDT base (NB. must be page-aligned).
86491 + * nr_ents: Number of entries in LDT.
86492 + */
86493 +#define MMUEXT_PIN_L1_TABLE      0
86494 +#define MMUEXT_PIN_L2_TABLE      1
86495 +#define MMUEXT_PIN_L3_TABLE      2
86496 +#define MMUEXT_PIN_L4_TABLE      3
86497 +#define MMUEXT_UNPIN_TABLE       4
86498 +#define MMUEXT_NEW_BASEPTR       5
86499 +#define MMUEXT_TLB_FLUSH_LOCAL   6
86500 +#define MMUEXT_INVLPG_LOCAL      7
86501 +#define MMUEXT_TLB_FLUSH_MULTI   8
86502 +#define MMUEXT_INVLPG_MULTI      9
86503 +#define MMUEXT_TLB_FLUSH_ALL    10
86504 +#define MMUEXT_INVLPG_ALL       11
86505 +#define MMUEXT_FLUSH_CACHE      12
86506 +#define MMUEXT_SET_LDT          13
86507 +#define MMUEXT_NEW_USER_BASEPTR 15
86508 +
86509 +#ifndef __ASSEMBLY__
86510 +struct mmuext_op {
86511 +    unsigned int cmd;
86512 +    union {
86513 +        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
86514 +        xen_pfn_t     mfn;
86515 +        /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
86516 +        unsigned long linear_addr;
86517 +    } arg1;
86518 +    union {
86519 +        /* SET_LDT */
86520 +        unsigned int nr_ents;
86521 +        /* TLB_FLUSH_MULTI, INVLPG_MULTI */
86522 +        void *vcpumask;
86523 +    } arg2;
86524 +};
86525 +typedef struct mmuext_op mmuext_op_t;
86526 +DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
86527 +#endif
86528 +
86529 +/* These are passed as 'flags' to update_va_mapping. They can be ORed. */
86530 +/* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap.   */
86531 +/* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer.         */
86532 +#define UVMF_NONE               (0UL<<0) /* No flushing at all.   */
86533 +#define UVMF_TLB_FLUSH          (1UL<<0) /* Flush entire TLB(s).  */
86534 +#define UVMF_INVLPG             (2UL<<0) /* Flush only one entry. */
86535 +#define UVMF_FLUSHTYPE_MASK     (3UL<<0)
86536 +#define UVMF_MULTI              (0UL<<2) /* Flush subset of TLBs. */
86537 +#define UVMF_LOCAL              (0UL<<2) /* Flush local TLB.      */
86538 +#define UVMF_ALL                (1UL<<2) /* Flush all TLBs.       */
86539 +
86540 +/*
86541 + * Commands to HYPERVISOR_console_io().
86542 + */
86543 +#define CONSOLEIO_write         0
86544 +#define CONSOLEIO_read          1
86545 +
86546 +/*
86547 + * Commands to HYPERVISOR_vm_assist().
86548 + */
86549 +#define VMASST_CMD_enable                0
86550 +#define VMASST_CMD_disable               1
86551 +
86552 +/* x86/32 guests: simulate full 4GB segment limits. */
86553 +#define VMASST_TYPE_4gb_segments         0
86554 +
86555 +/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
86556 +#define VMASST_TYPE_4gb_segments_notify  1
86557 +
86558 +/*
86559 + * x86 guests: support writes to bottom-level PTEs.
86560 + * NB1. Page-directory entries cannot be written.
86561 + * NB2. Guest must continue to remove all writable mappings of PTEs.
86562 + */
86563 +#define VMASST_TYPE_writable_pagetables  2
86564 +
86565 +/* x86/PAE guests: support PDPTs above 4GB. */
86566 +#define VMASST_TYPE_pae_extended_cr3     3
86567 +
86568 +#define MAX_VMASST_TYPE                  3
86569 +
86570 +#ifndef __ASSEMBLY__
86571 +
86572 +typedef uint16_t domid_t;
86573 +
86574 +/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */
86575 +#define DOMID_FIRST_RESERVED (0x7FF0U)
86576 +
86577 +/* DOMID_SELF is used in certain contexts to refer to oneself. */
86578 +#define DOMID_SELF (0x7FF0U)
86579 +
86580 +/*
86581 + * DOMID_IO is used to restrict page-table updates to mapping I/O memory.
86582 + * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO
86583 + * is useful to ensure that no mappings to the OS's own heap are accidentally
86584 + * installed. (e.g., in Linux this could cause havoc as reference counts
86585 + * aren't adjusted on the I/O-mapping code path).
86586 + * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can
86587 + * be specified by any calling domain.
86588 + */
86589 +#define DOMID_IO   (0x7FF1U)
86590 +
86591 +/*
86592 + * DOMID_XEN is used to allow privileged domains to map restricted parts of
86593 + * Xen's heap space (e.g., the machine_to_phys table).
86594 + * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if
86595 + * the caller is privileged.
86596 + */
86597 +#define DOMID_XEN  (0x7FF2U)
86598 +
86599 +/*
86600 + * Send an array of these to HYPERVISOR_mmu_update().
86601 + * NB. The fields are natural pointer/address size for this architecture.
86602 + */
86603 +struct mmu_update {
86604 +    uint64_t ptr;       /* Machine address of PTE. */
86605 +    uint64_t val;       /* New contents of PTE.    */
86606 +};
86607 +typedef struct mmu_update mmu_update_t;
86608 +DEFINE_XEN_GUEST_HANDLE(mmu_update_t);
86609 +
86610 +/*
86611 + * Send an array of these to HYPERVISOR_multicall().
86612 + * NB. The fields are natural register size for this architecture.
86613 + */
86614 +struct multicall_entry {
86615 +    unsigned long op, result;
86616 +    unsigned long args[6];
86617 +};
86618 +typedef struct multicall_entry multicall_entry_t;
86619 +DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
86620 +
86621 +/*
86622 + * Event channel endpoints per domain:
86623 + *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
86624 + */
86625 +#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
86626 +
86627 +struct vcpu_time_info {
86628 +    /*
86629 +     * Updates to the following values are preceded and followed by an
86630 +     * increment of 'version'. The guest can therefore detect updates by
86631 +     * looking for changes to 'version'. If the least-significant bit of
86632 +     * the version number is set then an update is in progress and the guest
86633 +     * must wait to read a consistent set of values.
86634 +     * The correct way to interact with the version number is similar to
86635 +     * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry.
86636 +     */
86637 +    uint32_t version;
86638 +    uint32_t pad0;
86639 +    uint64_t tsc_timestamp;   /* TSC at last update of time vals.  */
86640 +    uint64_t system_time;     /* Time, in nanosecs, since boot.    */
86641 +    /*
86642 +     * Current system time:
86643 +     *   system_time +
86644 +     *   ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32)
86645 +     * CPU frequency (Hz):
86646 +     *   ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
86647 +     */
86648 +    uint32_t tsc_to_system_mul;
86649 +    int8_t   tsc_shift;
86650 +    int8_t   pad1[3];
86651 +}; /* 32 bytes */
86652 +typedef struct vcpu_time_info vcpu_time_info_t;
86653 +
86654 +struct vcpu_info {
86655 +    /*
86656 +     * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
86657 +     * a pending notification for a particular VCPU. It is then cleared 
86658 +     * by the guest OS /before/ checking for pending work, thus avoiding
86659 +     * a set-and-check race. Note that the mask is only accessed by Xen
86660 +     * on the CPU that is currently hosting the VCPU. This means that the
86661 +     * pending and mask flags can be updated by the guest without special
86662 +     * synchronisation (i.e., no need for the x86 LOCK prefix).
86663 +     * This may seem suboptimal because if the pending flag is set by
86664 +     * a different CPU then an IPI may be scheduled even when the mask
86665 +     * is set. However, note:
86666 +     *  1. The task of 'interrupt holdoff' is covered by the per-event-
86667 +     *     channel mask bits. A 'noisy' event that is continually being
86668 +     *     triggered can be masked at source at this very precise
86669 +     *     granularity.
86670 +     *  2. The main purpose of the per-VCPU mask is therefore to restrict
86671 +     *     reentrant execution: whether for concurrency control, or to
86672 +     *     prevent unbounded stack usage. Whatever the purpose, we expect
86673 +     *     that the mask will be asserted only for short periods at a time,
86674 +     *     and so the likelihood of a 'spurious' IPI is suitably small.
86675 +     * The mask is read before making an event upcall to the guest: a
86676 +     * non-zero mask therefore guarantees that the VCPU will not receive
86677 +     * an upcall activation. The mask is cleared when the VCPU requests
86678 +     * to block: this avoids wakeup-waiting races.
86679 +     */
86680 +    uint8_t evtchn_upcall_pending;
86681 +    uint8_t evtchn_upcall_mask;
86682 +    unsigned long evtchn_pending_sel;
86683 +    struct arch_vcpu_info arch;
86684 +    struct vcpu_time_info time;
86685 +}; /* 64 bytes (x86) */
86686 +typedef struct vcpu_info vcpu_info_t;
86687 +
86688 +/*
86689 + * Xen/kernel shared data -- pointer provided in start_info.
86690 + * NB. We expect that this struct is smaller than a page.
86691 + */
86692 +struct shared_info {
86693 +    struct vcpu_info vcpu_info[MAX_VIRT_CPUS];
86694 +
86695 +    /*
86696 +     * A domain can create "event channels" on which it can send and receive
86697 +     * asynchronous event notifications. There are three classes of event that
86698 +     * are delivered by this mechanism:
86699 +     *  1. Bi-directional inter- and intra-domain connections. Domains must
86700 +     *     arrange out-of-band to set up a connection (usually by allocating
86701 + *     an unbound 'listener' port and advertising that via a storage service
86702 +     *     such as xenstore).
86703 +     *  2. Physical interrupts. A domain with suitable hardware-access
86704 +     *     privileges can bind an event-channel port to a physical interrupt
86705 +     *     source.
86706 +     *  3. Virtual interrupts ('events'). A domain can bind an event-channel
86707 +     *     port to a virtual interrupt source, such as the virtual-timer
86708 +     *     device or the emergency console.
86709 +     * 
86710 +     * Event channels are addressed by a "port index". Each channel is
86711 +     * associated with two bits of information:
86712 +     *  1. PENDING -- notifies the domain that there is a pending notification
86713 +     *     to be processed. This bit is cleared by the guest.
86714 +     *  2. MASK -- if this bit is clear then a 0->1 transition of PENDING
86715 +     *     will cause an asynchronous upcall to be scheduled. This bit is only
86716 +     *     updated by the guest. It is read-only within Xen. If a channel
86717 +     *     becomes pending while the channel is masked then the 'edge' is lost
86718 +     *     (i.e., when the channel is unmasked, the guest must manually handle
86719 +     *     pending notifications as no upcall will be scheduled by Xen).
86720 +     * 
86721 +     * To expedite scanning of pending notifications, any 0->1 pending
86722 +     * transition on an unmasked channel causes a corresponding bit in a
86723 +     * per-vcpu selector word to be set. Each bit in the selector covers a
86724 +     * 'C long' in the PENDING bitfield array.
86725 +     */
86726 +    unsigned long evtchn_pending[sizeof(unsigned long) * 8];
86727 +    unsigned long evtchn_mask[sizeof(unsigned long) * 8];
86728 +
86729 +    /*
86730 +     * Wallclock time: updated only by control software. Guests should base
86731 +     * their gettimeofday() syscall on this wallclock-base value.
86732 +     */
86733 +    uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. */
86734 +    uint32_t wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
86735 +    uint32_t wc_nsec;         /* Nsecs 00:00:00 UTC, Jan 1, 1970.  */
86736 +
86737 +    struct arch_shared_info arch;
86738 +
86739 +};
86740 +typedef struct shared_info shared_info_t;
86741 +
86742 +/*
86743 + * Start-of-day memory layout for the initial domain (DOM0):
86744 + *  1. The domain is started within a contiguous virtual-memory region.
86745 + *  2. The contiguous region begins and ends on an aligned 4MB boundary.
86746 + *  3. The region start corresponds to the load address of the OS image.
86747 + *     If the load address is not 4MB aligned then the address is rounded down.
86748 + *  4. This is the order of bootstrap elements in the initial virtual region:
86749 + *      a. relocated kernel image
86750 + *      b. initial ram disk              [mod_start, mod_len]
86751 + *      c. list of allocated page frames [mfn_list, nr_pages]
86752 + *      d. start_info_t structure        [register ESI (x86)]
86753 + *      e. bootstrap page tables         [pt_base, CR3 (x86)]
86754 + *      f. bootstrap stack               [register ESP (x86)]
86755 + *  5. Bootstrap elements are packed together, but each is 4kB-aligned.
86756 + *  6. The initial ram disk may be omitted.
86757 + *  7. The list of page frames forms a contiguous 'pseudo-physical' memory
86758 + *     layout for the domain. In particular, the bootstrap virtual-memory
86759 + *     region is a 1:1 mapping to the first section of the pseudo-physical map.
86760 + *  8. All bootstrap elements are mapped read-writable for the guest OS. The
86761 + *     only exception is the bootstrap page table, which is mapped read-only.
86762 + *  9. There is guaranteed to be at least 512kB padding after the final
86763 + *     bootstrap element. If necessary, the bootstrap virtual region is
86764 + *     extended by an extra 4MB to ensure this.
86765 + */
86766 +
86767 +#define MAX_GUEST_CMDLINE 1024
86768 +struct start_info {
86769 +    /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME.    */
86770 +    char magic[32];             /* "xen-<version>-<platform>".            */
86771 +    unsigned long nr_pages;     /* Total pages allocated to this domain.  */
86772 +    unsigned long shared_info;  /* MACHINE address of shared info struct. */
86773 +    uint32_t flags;             /* SIF_xxx flags.                         */
86774 +    xen_pfn_t store_mfn;        /* MACHINE page number of shared page.    */
86775 +    uint32_t store_evtchn;      /* Event channel for store communication. */
86776 +    xen_pfn_t console_mfn;      /* MACHINE page number of console page.   */
86777 +    uint32_t console_evtchn;    /* Event channel for console messages.    */
86778 +    /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME).     */
86779 +    unsigned long pt_base;      /* VIRTUAL address of page directory.     */
86780 +    unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames.       */
86781 +    unsigned long mfn_list;     /* VIRTUAL address of page-frame list.    */
86782 +    unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
86783 +    unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
86784 +    int8_t cmd_line[MAX_GUEST_CMDLINE];
86785 +};
86786 +typedef struct start_info start_info_t;
86787 +
86788 +/* These flags are passed in the 'flags' field of start_info_t. */
86789 +#define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
86790 +#define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
86791 +
86792 +typedef uint64_t cpumap_t;
86793 +
86794 +typedef uint8_t xen_domain_handle_t[16];
86795 +
86796 +/* Turn a plain number into a C unsigned long constant. */
86797 +#define __mk_unsigned_long(x) x ## UL
86798 +#define mk_unsigned_long(x) __mk_unsigned_long(x)
86799 +
86800 +#else /* __ASSEMBLY__ */
86801 +
86802 +/* In assembly code we cannot use C numeric constant suffixes. */
86803 +#define mk_unsigned_long(x) x
86804 +
86805 +#endif /* !__ASSEMBLY__ */
86806 +
86807 +#include "xen-compat.h"
86808 +
86809 +#endif /* __XEN_PUBLIC_XEN_H__ */
86810 +
86811 +/*
86812 + * Local variables:
86813 + * mode: C
86814 + * c-set-style: "BSD"
86815 + * c-basic-offset: 4
86816 + * tab-width: 4
86817 + * indent-tabs-mode: nil
86818 + * End:
86819 + */
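The version-counter protocol and the time formula documented for vcpu_time_info above translate into roughly the following read loop. This is a simplified x86 sketch: rmb() is assumed to be the kernel's read barrier, and real code would use a widening multiply to avoid overflow on large TSC deltas.

/* Sketch only: seqlock-style read of a VCPU's time info, per the comments in
 * struct vcpu_time_info. The rdtsc inline asm is x86-specific. */
static uint64_t example_system_time_ns(volatile struct vcpu_time_info *t)
{
        uint32_t ver, mul, lo, hi;
        int8_t shift;
        uint64_t tsc, delta, base;

        do {
                /* Wait for a consistent snapshot: odd version == update in flight. */
                do {
                        ver = t->version;
                } while (ver & 1);
                rmb();                  /* assumed kernel read barrier */

                __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
                tsc   = ((uint64_t)hi << 32) | lo;
                base  = t->system_time;
                delta = tsc - t->tsc_timestamp;
                mul   = t->tsc_to_system_mul;
                shift = t->tsc_shift;

                rmb();
        } while (t->version != ver);    /* retry if Xen updated meanwhile */

        /* Apply the formula from the comment above: scale the TSC delta to ns. */
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

        /* NB: the 64x32 multiply below can overflow for very large deltas;
         * production code would use a widening multiply. */
        return base + ((delta * (uint64_t)mul) >> 32);
}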
86820 diff -urNp linux-2.6/include/xen/interface/xenoprof.h new/include/xen/interface/xenoprof.h
86821 --- linux-2.6/include/xen/interface/xenoprof.h  1970-01-01 01:00:00.000000000 +0100
86822 +++ new/include/xen/interface/xenoprof.h        2006-07-07 15:10:03.000000000 +0200
86823 @@ -0,0 +1,103 @@
86824 +/******************************************************************************
86825 + * xenoprof.h
86826 + * 
86827 + * Interface for enabling system-wide profiling based on hardware performance
86828 + * counters
86829 + * 
86830 + * Copyright (C) 2005 Hewlett-Packard Co.
86831 + * Written by Aravind Menon & Jose Renato Santos
86832 + */
86833 +
86834 +#ifndef __XEN_PUBLIC_XENOPROF_H__
86835 +#define __XEN_PUBLIC_XENOPROF_H__
86836 +
86837 +/*
86838 + * Commands to HYPERVISOR_xenoprof_op().
86839 + */
86840 +#define XENOPROF_init                0
86841 +#define XENOPROF_reset_active_list   1
86842 +#define XENOPROF_reset_passive_list  2
86843 +#define XENOPROF_set_active          3
86844 +#define XENOPROF_set_passive         4
86845 +#define XENOPROF_reserve_counters    5
86846 +#define XENOPROF_counter             6
86847 +#define XENOPROF_setup_events        7
86848 +#define XENOPROF_enable_virq         8
86849 +#define XENOPROF_start               9
86850 +#define XENOPROF_stop               10
86851 +#define XENOPROF_disable_virq       11
86852 +#define XENOPROF_release_counters   12
86853 +#define XENOPROF_shutdown           13
86854 +
86855 +#define MAX_OPROF_EVENTS    32
86856 +#define MAX_OPROF_DOMAINS   25 
86857 +#define XENOPROF_CPU_TYPE_SIZE 64
86858 +
86859 +/* Xenoprof performance events (not Xen events) */
86860 +struct event_log {
86861 +    uint64_t eip;
86862 +    uint8_t mode;
86863 +    uint8_t event;
86864 +};
86865 +
86866 +/* Xenoprof buffer shared between Xen and domain - 1 per VCPU */
86867 +struct xenoprof_buf {
86868 +    uint32_t event_head;
86869 +    uint32_t event_tail;
86870 +    uint32_t event_size;
86871 +    uint32_t vcpu_id;
86872 +    uint64_t xen_samples;
86873 +    uint64_t kernel_samples;
86874 +    uint64_t user_samples;
86875 +    uint64_t lost_samples;
86876 +    struct event_log event_log[1];
86877 +};
86878 +typedef struct xenoprof_buf xenoprof_buf_t;
86879 +DEFINE_XEN_GUEST_HANDLE(xenoprof_buf_t);
86880 +
86881 +struct xenoprof_init {
86882 +    int32_t  max_samples;
86883 +    int32_t  num_events;
86884 +    int32_t  is_primary;
86885 +    int32_t  nbuf;
86886 +    int32_t  bufsize;
86887 +    uint64_t buf_maddr;
86888 +    char cpu_type[XENOPROF_CPU_TYPE_SIZE];
86889 +};
86890 +typedef struct xenoprof_init xenoprof_init_t;
86891 +DEFINE_XEN_GUEST_HANDLE(xenoprof_init_t);
86892 +
86893 +struct xenoprof_counter {
86894 +    uint32_t ind;
86895 +    uint64_t count;
86896 +    uint32_t enabled;
86897 +    uint32_t event;
86898 +    uint32_t hypervisor;
86899 +    uint32_t kernel;
86900 +    uint32_t user;
86901 +    uint64_t unit_mask;
86902 +};
86903 +typedef struct xenoprof_counter xenoprof_counter_t;
86904 +DEFINE_XEN_GUEST_HANDLE(xenoprof_counter_t);
86905 +
86906 +typedef struct xenoprof_passive {
86907 +    uint16_t domain_id;
86908 +    int32_t  max_samples;
86909 +    int32_t  nbuf;
86910 +    int32_t  bufsize;
86911 +    uint64_t buf_maddr;
86912 +} xenoprof_passive_t;
86913 +DEFINE_XEN_GUEST_HANDLE(xenoprof_passive_t);
86914 +
86915 +
86916 +#endif /* __XEN_PUBLIC_XENOPROF_H__ */
86917 +
86918 +/*
86919 + * Local variables:
86920 + * mode: C
86921 + * c-set-style: "BSD"
86922 + * c-basic-offset: 4
86923 + * tab-width: 4
86924 + * indent-tabs-mode: nil
86925 + * End:
86926 + */
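A hedged sketch of how the per-VCPU sample buffer above is consumed: Xen advances event_head as it logs samples and the domain advances event_tail as it drains them, with event_size giving the ring capacity. Mapping the buffer in the first place (XENOPROF_init / XENOPROF_set_active) is outside the scope of this sketch, and record_sample() is a hypothetical handler.

/* Illustrative only: drain one xenoprof sample ring. `buf` is assumed to be a
 * mapped, shared struct xenoprof_buf for one VCPU. */
static void example_drain_xenoprof(struct xenoprof_buf *buf)
{
        while (buf->event_tail != buf->event_head) {
                struct event_log *s = &buf->event_log[buf->event_tail];

                /* s->eip, s->mode and s->event describe one sample. */
                record_sample(s);       /* hypothetical */

                buf->event_tail = (buf->event_tail + 1) % buf->event_size;
        }
}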
86927 diff -urNp linux-2.6/include/xen/pcifront.h new/include/xen/pcifront.h
86928 --- linux-2.6/include/xen/pcifront.h    1970-01-01 01:00:00.000000000 +0100
86929 +++ new/include/xen/pcifront.h  2006-05-09 12:35:56.000000000 +0200
86930 @@ -0,0 +1,39 @@
86931 +/*
86932 + * PCI Frontend - arch-dependent declarations
86933 + *
86934 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
86935 + */
86936 +#ifndef __XEN_ASM_PCIFRONT_H__
86937 +#define __XEN_ASM_PCIFRONT_H__
86938 +
86939 +#include <linux/config.h>
86940 +#include <linux/spinlock.h>
86941 +
86942 +#ifdef __KERNEL__
86943 +
86944 +struct pcifront_device;
86945 +
86946 +struct pcifront_sd {
86947 +       int domain;
86948 +       struct pcifront_device *pdev;
86949 +};
86950 +
86951 +struct pci_bus;
86952 +
86953 +#ifdef CONFIG_PCI_DOMAINS
86954 +static inline int pci_domain_nr(struct pci_bus *bus)
86955 +{
86956 +       struct pcifront_sd *sd = bus->sysdata;
86957 +       return sd->domain;
86958 +}
86959 +static inline int pci_proc_domain(struct pci_bus *bus)
86960 +{
86961 +       return pci_domain_nr(bus);
86962 +}
86963 +#endif /* CONFIG_PCI_DOMAINS */
86964 +
86965 +extern spinlock_t pci_bus_lock;
86966 +
86967 +#endif /* __KERNEL__ */
86968 +
86969 +#endif /* __XEN_ASM_PCIFRONT_H__ */
86970 diff -urNp linux-2.6/include/xen/public/evtchn.h new/include/xen/public/evtchn.h
86971 --- linux-2.6/include/xen/public/evtchn.h       1970-01-01 01:00:00.000000000 +0100
86972 +++ new/include/xen/public/evtchn.h     2006-05-23 18:42:17.000000000 +0200
86973 @@ -0,0 +1,91 @@
86974 +/******************************************************************************
86975 + * evtchn.h
86976 + * 
86977 + * Interface to /dev/xen/evtchn.
86978 + * 
86979 + * Copyright (c) 2003-2005, K A Fraser
86980 + * 
86981 + * This program is free software; you can redistribute it and/or
86982 + * modify it under the terms of the GNU General Public License version 2
86983 + * as published by the Free Software Foundation; or, when distributed
86984 + * separately from the Linux kernel or incorporated into other
86985 + * software packages, subject to the following license:
86986 + * 
86987 + * Permission is hereby granted, free of charge, to any person obtaining a copy
86988 + * of this source file (the "Software"), to deal in the Software without
86989 + * restriction, including without limitation the rights to use, copy, modify,
86990 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
86991 + * and to permit persons to whom the Software is furnished to do so, subject to
86992 + * the following conditions:
86993 + * 
86994 + * The above copyright notice and this permission notice shall be included in
86995 + * all copies or substantial portions of the Software.
86996 + * 
86997 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
86998 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
86999 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
87000 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
87001 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
87002 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
87003 + * IN THE SOFTWARE.
87004 + */
87005 +
87006 +#ifndef __LINUX_PUBLIC_EVTCHN_H__
87007 +#define __LINUX_PUBLIC_EVTCHN_H__
87008 +
87009 +/* /dev/xen/evtchn resides at device number major=10, minor=201 */
87010 +#define EVTCHN_MINOR 201
87011 +
87012 +/*
87013 + * Bind a fresh port to VIRQ @virq.
87014 + * Return allocated port.
87015 + */
87016 +#define IOCTL_EVTCHN_BIND_VIRQ                         \
87017 +       _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
87018 +struct ioctl_evtchn_bind_virq {
87019 +       unsigned int virq;
87020 +};
87021 +
87022 +/*
87023 + * Bind a fresh port to remote <@remote_domain, @remote_port>.
87024 + * Return allocated port.
87025 + */
87026 +#define IOCTL_EVTCHN_BIND_INTERDOMAIN                  \
87027 +       _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
87028 +struct ioctl_evtchn_bind_interdomain {
87029 +       unsigned int remote_domain, remote_port;
87030 +};
87031 +
87032 +/*
87033 + * Allocate a fresh port for binding to @remote_domain.
87034 + * Return allocated port.
87035 + */
87036 +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT                 \
87037 +       _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
87038 +struct ioctl_evtchn_bind_unbound_port {
87039 +       unsigned int remote_domain;
87040 +};
87041 +
87042 +/*
87043 + * Unbind previously allocated @port.
87044 + */
87045 +#define IOCTL_EVTCHN_UNBIND                            \
87046 +       _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
87047 +struct ioctl_evtchn_unbind {
87048 +       unsigned int port;
87049 +};
87050 +
87051 +/*
87052 + * Send an event notification to previously allocated @port.
87053 + */
87054 +#define IOCTL_EVTCHN_NOTIFY                            \
87055 +       _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
87056 +struct ioctl_evtchn_notify {
87057 +       unsigned int port;
87058 +};
87059 +
87060 +/* Clear and reinitialise the event buffer. Clear error condition. */
87061 +#define IOCTL_EVTCHN_RESET                             \
87062 +       _IOC(_IOC_NONE, 'E', 5, 0)
87063 +
87064 +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
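From userspace, the ioctls above are used against /dev/xen/evtchn roughly like this. This is a hedged sketch: the include path is assumed, and each bind ioctl returns the allocated port as its return value, as documented above.

/* Userspace sketch for /dev/xen/evtchn (major 10, minor 201). */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <xen/public/evtchn.h>      /* header path assumed; provides the IOCTLs */

int example_bind_virq(unsigned int virq)
{
        struct ioctl_evtchn_bind_virq bind = { .virq = virq };
        struct ioctl_evtchn_unbind unbind;
        int fd, port;

        fd = open("/dev/xen/evtchn", O_RDWR);
        if (fd < 0)
                return -1;

        port = ioctl(fd, IOCTL_EVTCHN_BIND_VIRQ, &bind);    /* returns port */
        if (port < 0) {
                close(fd);
                return -1;
        }

        printf("VIRQ %u bound to local port %d\n", virq, port);

        unbind.port = port;
        ioctl(fd, IOCTL_EVTCHN_UNBIND, &unbind);
        close(fd);
        return 0;
}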
87065 diff -urNp linux-2.6/include/xen/public/privcmd.h new/include/xen/public/privcmd.h
87066 --- linux-2.6/include/xen/public/privcmd.h      1970-01-01 01:00:00.000000000 +0100
87067 +++ new/include/xen/public/privcmd.h    2006-06-07 13:29:36.000000000 +0200
87068 @@ -0,0 +1,79 @@
87069 +/******************************************************************************
87070 + * privcmd.h
87071 + * 
87072 + * Interface to /proc/xen/privcmd.
87073 + * 
87074 + * Copyright (c) 2003-2005, K A Fraser
87075 + * 
87076 + * This program is free software; you can redistribute it and/or
87077 + * modify it under the terms of the GNU General Public License version 2
87078 + * as published by the Free Software Foundation; or, when distributed
87079 + * separately from the Linux kernel or incorporated into other
87080 + * software packages, subject to the following license:
87081 + * 
87082 + * Permission is hereby granted, free of charge, to any person obtaining a copy
87083 + * of this source file (the "Software"), to deal in the Software without
87084 + * restriction, including without limitation the rights to use, copy, modify,
87085 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
87086 + * and to permit persons to whom the Software is furnished to do so, subject to
87087 + * the following conditions:
87088 + * 
87089 + * The above copyright notice and this permission notice shall be included in
87090 + * all copies or substantial portions of the Software.
87091 + * 
87092 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
87093 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
87094 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
87095 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
87096 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
87097 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
87098 + * IN THE SOFTWARE.
87099 + */
87100 +
87101 +#ifndef __LINUX_PUBLIC_PRIVCMD_H__
87102 +#define __LINUX_PUBLIC_PRIVCMD_H__
87103 +
87104 +#include <linux/types.h>
87105 +
87106 +#ifndef __user
87107 +#define __user
87108 +#endif
87109 +
87110 +typedef struct privcmd_hypercall
87111 +{
87112 +       __u64 op;
87113 +       __u64 arg[5];
87114 +} privcmd_hypercall_t;
87115 +
87116 +typedef struct privcmd_mmap_entry {
87117 +       __u64 va;
87118 +       __u64 mfn;
87119 +       __u64 npages;
87120 +} privcmd_mmap_entry_t; 
87121 +
87122 +typedef struct privcmd_mmap {
87123 +       int num;
87124 +       domid_t dom; /* target domain */
87125 +       privcmd_mmap_entry_t __user *entry;
87126 +} privcmd_mmap_t; 
87127 +
87128 +typedef struct privcmd_mmapbatch {
87129 +       int num;     /* number of pages to populate */
87130 +       domid_t dom; /* target domain */
87131 +       __u64 addr;  /* virtual address */
87132 +       xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
87133 +} privcmd_mmapbatch_t; 
87134 +
87135 +/*
87136 + * @cmd: IOCTL_PRIVCMD_HYPERCALL
87137 + * @arg: &privcmd_hypercall_t
87138 + * Return: Value returned from execution of the specified hypercall.
87139 + */
87140 +#define IOCTL_PRIVCMD_HYPERCALL                                        \
87141 +       _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
87142 +#define IOCTL_PRIVCMD_MMAP                                     \
87143 +       _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
87144 +#define IOCTL_PRIVCMD_MMAPBATCH                                        \
87145 +       _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
87146 +
87147 +#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
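The privcmd interface above is how dom0 tools issue hypercalls from userspace. A minimal, hedged sketch using XENVER_version, which takes no pointer argument so no buffer pinning is needed; the include paths are assumptions.

/* Userspace sketch for /proc/xen/privcmd. __HYPERVISOR_xen_version and
 * XENVER_version come from the public headers added earlier in this patch. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <xen/public/privcmd.h>     /* header paths assumed */
#include <xen/interface/xen.h>
#include <xen/interface/version.h>

int example_xen_version(void)
{
        privcmd_hypercall_t hc = {
                .op  = __HYPERVISOR_xen_version,
                .arg = { XENVER_version, 0, 0, 0, 0 },
        };
        int fd, ret;

        fd = open("/proc/xen/privcmd", O_RDWR);
        if (fd < 0)
                return -1;

        /* Per the comment above, the ioctl returns the hypercall's return value,
         * here the major:minor version packed as 16:16. */
        ret = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &hc);
        if (ret >= 0)
                printf("Xen %d.%d\n", ret >> 16, ret & 0xffff);

        close(fd);
        return ret;
}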
87148 diff -urNp linux-2.6/include/xen/xenbus.h new/include/xen/xenbus.h
87149 --- linux-2.6/include/xen/xenbus.h      1970-01-01 01:00:00.000000000 +0100
87150 +++ new/include/xen/xenbus.h    2006-06-28 14:32:14.000000000 +0200
87151 @@ -0,0 +1,299 @@
87152 +/******************************************************************************
87153 + * xenbus.h
87154 + *
87155 + * Talks to Xen Store to figure out what devices we have.
87156 + *
87157 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
87158 + * Copyright (C) 2005 XenSource Ltd.
87159 + * 
87160 + * This program is free software; you can redistribute it and/or
87161 + * modify it under the terms of the GNU General Public License version 2
87162 + * as published by the Free Software Foundation; or, when distributed
87163 + * separately from the Linux kernel or incorporated into other
87164 + * software packages, subject to the following license:
87165 + * 
87166 + * Permission is hereby granted, free of charge, to any person obtaining a copy
87167 + * of this source file (the "Software"), to deal in the Software without
87168 + * restriction, including without limitation the rights to use, copy, modify,
87169 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
87170 + * and to permit persons to whom the Software is furnished to do so, subject to
87171 + * the following conditions:
87172 + * 
87173 + * The above copyright notice and this permission notice shall be included in
87174 + * all copies or substantial portions of the Software.
87175 + * 
87176 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
87177 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
87178 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
87179 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
87180 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
87181 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
87182 + * IN THE SOFTWARE.
87183 + */
87184 +
87185 +#ifndef _XEN_XENBUS_H
87186 +#define _XEN_XENBUS_H
87187 +
87188 +#include <linux/device.h>
87189 +#include <linux/notifier.h>
87190 +#include <linux/mutex.h>
87191 +#include <xen/interface/xen.h>
87192 +#include <xen/interface/grant_table.h>
87193 +#include <xen/interface/io/xenbus.h>
87194 +#include <xen/interface/io/xs_wire.h>
87195 +
87196 +/* Register callback to watch this node. */
87197 +struct xenbus_watch
87198 +{
87199 +       struct list_head list;
87200 +
87201 +       /* Path being watched. */
87202 +       const char *node;
87203 +
87204 +       /* Callback (executed in a process context with no locks held). */
87205 +       void (*callback)(struct xenbus_watch *,
87206 +                        const char **vec, unsigned int len);
87207 +
87208 +       /* See XBWF_ definitions below. */
87209 +       unsigned long flags;
87210 +};
87211 +
87212 +/*
87213 + * Execute callback in its own kthread. Useful if the callback is long
87214 + * running or heavily serialised, to avoid taking out the main xenwatch thread
87215 + * for a long period of time (or even unwittingly causing a deadlock).
87216 + */
87217 +#define XBWF_new_thread        1
87218 +
87219 +/* A xenbus device. */
87220 +struct xenbus_device {
87221 +       const char *devicetype;
87222 +       const char *nodename;
87223 +       const char *otherend;
87224 +       int otherend_id;
87225 +       struct xenbus_watch otherend_watch;
87226 +       struct device dev;
87227 +       enum xenbus_state state;
87228 +};
87229 +
87230 +static inline struct xenbus_device *to_xenbus_device(struct device *dev)
87231 +{
87232 +       return container_of(dev, struct xenbus_device, dev);
87233 +}
87234 +
87235 +struct xenbus_device_id
87236 +{
87237 +       /* .../device/<device_type>/<identifier> */
87238 +       char devicetype[32];    /* General class of device. */
87239 +};
87240 +
87241 +/* A xenbus driver. */
87242 +struct xenbus_driver {
87243 +       char *name;
87244 +       struct module *owner;
87245 +       const struct xenbus_device_id *ids;
87246 +       int (*probe)(struct xenbus_device *dev,
87247 +                    const struct xenbus_device_id *id);
87248 +       void (*otherend_changed)(struct xenbus_device *dev,
87249 +                                enum xenbus_state backend_state);
87250 +       int (*remove)(struct xenbus_device *dev);
87251 +       int (*suspend)(struct xenbus_device *dev);
87252 +       int (*resume)(struct xenbus_device *dev);
87253 +       int (*uevent)(struct xenbus_device *, char **, int, char *, int);
87254 +       struct device_driver driver;
87255 +       int (*read_otherend_details)(struct xenbus_device *dev);
87256 +};
87257 +
87258 +static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
87259 +{
87260 +       return container_of(drv, struct xenbus_driver, driver);
87261 +}
87262 +
87263 +int xenbus_register_frontend(struct xenbus_driver *drv);
87264 +int xenbus_register_backend(struct xenbus_driver *drv);
87265 +void xenbus_unregister_driver(struct xenbus_driver *drv);
87266 +
87267 +struct xenbus_transaction
87268 +{
87269 +       u32 id;
87270 +};
87271 +
87272 +/* Nil transaction ID. */
87273 +#define XBT_NIL ((struct xenbus_transaction) { 0 })
87274 +
87275 +char **xenbus_directory(struct xenbus_transaction t,
87276 +                       const char *dir, const char *node, unsigned int *num);
87277 +void *xenbus_read(struct xenbus_transaction t,
87278 +                 const char *dir, const char *node, unsigned int *len);
87279 +int xenbus_write(struct xenbus_transaction t,
87280 +                const char *dir, const char *node, const char *string);
87281 +int xenbus_mkdir(struct xenbus_transaction t,
87282 +                const char *dir, const char *node);
87283 +int xenbus_exists(struct xenbus_transaction t,
87284 +                 const char *dir, const char *node);
87285 +int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node);
87286 +int xenbus_transaction_start(struct xenbus_transaction *t);
87287 +int xenbus_transaction_end(struct xenbus_transaction t, int abort);
87288 +
87289 +/* Single read and scanf: returns -errno or num scanned if > 0. */
87290 +int xenbus_scanf(struct xenbus_transaction t,
87291 +                const char *dir, const char *node, const char *fmt, ...)
87292 +       __attribute__((format(scanf, 4, 5)));
87293 +
87294 +/* Single printf and write: returns -errno or 0. */
87295 +int xenbus_printf(struct xenbus_transaction t,
87296 +                 const char *dir, const char *node, const char *fmt, ...)
87297 +       __attribute__((format(printf, 4, 5)));
87298 +
87299 +/* Generic read function: NULL-terminated triples of name,
87300 + * sprintf-style type string, and pointer. Returns 0 or errno.*/
87301 +int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
87302 +
87303 +/* notifier routines for when the xenstore comes up */
87304 +int register_xenstore_notifier(struct notifier_block *nb);
87305 +void unregister_xenstore_notifier(struct notifier_block *nb);
87306 +
87307 +int register_xenbus_watch(struct xenbus_watch *watch);
87308 +void unregister_xenbus_watch(struct xenbus_watch *watch);
87309 +void xs_suspend(void);
87310 +void xs_resume(void);
87311 +
87312 +/* Used by xenbus_dev to borrow kernel's store connection. */
87313 +void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
87314 +
87315 +/* Called from xen core code. */
87316 +void xenbus_suspend(void);
87317 +void xenbus_resume(void);
87318 +
87319 +#define XENBUS_IS_ERR_READ(str) ({                     \
87320 +       if (!IS_ERR(str) && strlen(str) == 0) {         \
87321 +               kfree(str);                             \
87322 +               str = ERR_PTR(-ERANGE);                 \
87323 +       }                                               \
87324 +       IS_ERR(str);                                    \
87325 +})
87326 +
87327 +#define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
87328 +
87329 +
87330 +/**
87331 + * Register a watch on the given path, using the given xenbus_watch structure
87332 + * for storage, and the given callback function as the callback.  Return 0 on
87333 + * success, or -errno on error.  On success, the given path will be saved as
87334 + * watch->node, and remains the caller's to free.  On error, watch->node will
87335 + * be NULL, the device will switch to XenbusStateClosing, and the error will
87336 + * be saved in the store.
87337 + */
87338 +int xenbus_watch_path(struct xenbus_device *dev, const char *path,
87339 +                     struct xenbus_watch *watch,
87340 +                     void (*callback)(struct xenbus_watch *,
87341 +                                      const char **, unsigned int));
87342 +
87343 +
87344 +/**
87345 + * Register a watch on the given path/path2, using the given xenbus_watch
87346 + * structure for storage, and the given callback function as the callback.
87347 + * Return 0 on success, or -errno on error.  On success, the watched path
87348 + * (path/path2) will be saved as watch->node, and becomes the caller's to
87349 + * kfree().  On error, watch->node will be NULL, so the caller has nothing to
87350 + * free, the device will switch to XenbusStateClosing, and the error will be
87351 + * saved in the store.
87352 + */
87353 +int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
87354 +                      const char *path2, struct xenbus_watch *watch,
87355 +                      void (*callback)(struct xenbus_watch *,
87356 +                                       const char **, unsigned int));
87357 +
87358 +
87359 +/**
87360 + * Advertise in the store a change of the given driver to the given new_state.
87361 + * Return 0 on success, or -errno on error.  On error, the device will switch
87362 + * to XenbusStateClosing, and the error will be saved in the store.
87363 + */
87364 +int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
87365 +
87366 +
87367 +/**
87368 + * Grant access to the given ring_mfn to the peer of the given device.  Return
87369 + * 0 on success, or -errno on error.  On error, the device will switch to
87370 + * XenbusStateClosing, and the error will be saved in the store.
87371 + */
87372 +int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
87373 +
87374 +
87375 +/**
87376 + * Map a page of memory into this domain from another domain's grant table.
87377 + * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
87378 + * page to that address, and sets *vaddr to that address.
87379 + * xenbus_map_ring does not allocate the virtual address space (you must do
87380 + * this yourself!). It only maps in the page to the specified address.
87381 + * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
87382 + * or -ENOMEM on error. If an error is returned, the device will switch to
87383 + * XenbusStateClosing and the error message will be saved in XenStore.
87384 + */
87385 +struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev,
87386 +                                        int gnt_ref);
87387 +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
87388 +                          grant_handle_t *handle, void *vaddr);
87389 +
87390 +
87391 +/**
87392 + * Unmap a page of memory in this domain that was imported from another domain.
87393 + * Use xenbus_unmap_ring_vfree if you mapped in your memory with
87394 + * xenbus_map_ring_valloc (it will free the virtual address space).
87395 + * Returns 0 on success and returns GNTST_* on error
87396 + * (see xen/include/interface/grant_table.h).
87397 + */
87398 +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *);
87399 +int xenbus_unmap_ring(struct xenbus_device *dev,
87400 +                     grant_handle_t handle, void *vaddr);
87401 +
87402 +
87403 +/**
87404 + * Allocate an event channel for the given xenbus_device, assigning the newly
87405 + * created local port to *port.  Return 0 on success, or -errno on error.  On
87406 + * error, the device will switch to XenbusStateClosing, and the error will be
87407 + * saved in the store.
87408 + */
87409 +int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
87410 +
87411 +
87412 +/**
87413 + * Bind to an existing interdomain event channel in another domain. Returns 0
87414 + * on success and stores the local port in *port. On error, returns -errno,
87415 + * switches the device to XenbusStateClosing, and saves the error in XenStore.
87416 + */
87417 +int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
87418 +
87419 +
87420 +/**
87421 + * Free an existing event channel. Returns 0 on success or -errno on error.
87422 + */
87423 +int xenbus_free_evtchn(struct xenbus_device *dev, int port);
87424 +
87425 +
87426 +/**
87427 + * Return the state of the driver rooted at the given store path, or
87428 + * XenbusStateClosed if no state can be read.
87429 + */
87430 +enum xenbus_state xenbus_read_driver_state(const char *path);
87431 +
87432 +
87433 +/***
87434 + * Report the given negative errno into the store, along with the given
87435 + * formatted message.
87436 + */
87437 +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
87438 +                     ...);
87439 +
87440 +
87441 +/***
87442 + * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
87443 + * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
87444 + * closedown of this driver and its peer.
87445 + */
87446 +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
87447 +                     ...);
87448 +
87449 +
87450 +#endif /* _XEN_XENBUS_H */
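
Not part of the patch: a skeletal frontend driver written against the xenbus API declared above, to show how registration, state switching and store reads fit together. The device type "vdev", the store node "ring-ref" and all function bodies are placeholders.

/* Illustrative only -- not part of linux-2.6-xen.patch. */
#include <linux/module.h>
#include <linux/errno.h>
#include <xen/xenbus.h>

static int vdev_probe(struct xenbus_device *dev,
		      const struct xenbus_device_id *id)
{
	int ring_ref, err;

	/* Read a value the backend published in the store. */
	err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", "%d", &ring_ref);
	if (err != 1)
		return err < 0 ? err : -ENOENT;

	/* Advertise our progress; on failure the helper records the error. */
	return xenbus_switch_state(dev, XenbusStateInitialised);
}

static void vdev_otherend_changed(struct xenbus_device *dev,
				  enum xenbus_state backend_state)
{
	if (backend_state == XenbusStateConnected)
		xenbus_switch_state(dev, XenbusStateConnected);
}

static int vdev_remove(struct xenbus_device *dev)
{
	return 0;
}

static struct xenbus_device_id vdev_ids[] = {
	{ "vdev" },
	{ "" }
};

static struct xenbus_driver vdev_driver = {
	.name             = "vdev",
	.owner            = THIS_MODULE,
	.ids              = vdev_ids,
	.probe            = vdev_probe,
	.remove           = vdev_remove,
	.otherend_changed = vdev_otherend_changed,
};

static int __init vdev_init(void)
{
	return xenbus_register_frontend(&vdev_driver);
}
module_init(vdev_init);
MODULE_LICENSE("GPL");
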
87451 diff -urNp linux-2.6/include/xen/xencons.h new/include/xen/xencons.h
87452 --- linux-2.6/include/xen/xencons.h     1970-01-01 01:00:00.000000000 +0100
87453 +++ new/include/xen/xencons.h   2006-05-09 12:35:56.000000000 +0200
87454 @@ -0,0 +1,14 @@
87455 +#ifndef __ASM_XENCONS_H__
87456 +#define __ASM_XENCONS_H__
87457 +
87458 +void xencons_force_flush(void);
87459 +void xencons_resume(void);
87460 +
87461 +/* Interrupt work hooks. Receive data, or kick data out. */
87462 +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
87463 +void xencons_tx(void);
87464 +
87465 +int xencons_ring_init(void);
87466 +int xencons_ring_send(const char *data, unsigned len);
87467 +
87468 +#endif /* __ASM_XENCONS_H__ */
87469 diff -urNp linux-2.6/include/xen/xen_proc.h new/include/xen/xen_proc.h
87470 --- linux-2.6/include/xen/xen_proc.h    1970-01-01 01:00:00.000000000 +0100
87471 +++ new/include/xen/xen_proc.h  2006-05-23 18:42:17.000000000 +0200
87472 @@ -0,0 +1,13 @@
87473 +
87474 +#ifndef __ASM_XEN_PROC_H__
87475 +#define __ASM_XEN_PROC_H__
87476 +
87477 +#include <linux/config.h>
87478 +#include <linux/proc_fs.h>
87479 +
87480 +extern struct proc_dir_entry *create_xen_proc_entry(
87481 +       const char *name, mode_t mode);
87482 +extern void remove_xen_proc_entry(
87483 +       const char *name);
87484 +
87485 +#endif /* __ASM_XEN_PROC_H__ */
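
Not part of the patch: a short sketch of the intended use of the two helpers above. The entry name and the file_operations structure are placeholders; assigning proc_fops by hand is assumed to match how procfs entries were wired up in 2.6.16-era kernels.

/* Illustrative only -- not part of linux-2.6-xen.patch. */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/proc_fs.h>
#include <xen/xen_proc.h>

static struct file_operations example_fops;	/* assumed to be defined elsewhere */

static int __init example_proc_init(void)
{
	struct proc_dir_entry *entry;

	/* Creates the node below /proc/xen. */
	entry = create_xen_proc_entry("example", 0400);
	if (entry == NULL)
		return -ENOMEM;
	entry->proc_fops = &example_fops;
	return 0;
}

static void __exit example_proc_exit(void)
{
	remove_xen_proc_entry("example");
}

module_init(example_proc_init);
module_exit(example_proc_exit);
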
87486 diff -urNp linux-2.6/kernel/fork.c new/kernel/fork.c
87487 --- linux-2.6/kernel/fork.c     2006-07-03 14:15:20.000000000 +0200
87488 +++ new/kernel/fork.c   2006-06-28 14:32:14.000000000 +0200
87489 @@ -272,6 +272,9 @@ static inline int dup_mmap(struct mm_str
87490                 if (retval)
87491                         goto out;
87492         }
87493 +#ifdef arch_dup_mmap
87494 +       arch_dup_mmap(mm, oldmm);
87495 +#endif
87496         retval = 0;
87497  out:
87498         up_write(&mm->mmap_sem);
87499 diff -urNp linux-2.6/kernel/irq/manage.c new/kernel/irq/manage.c
87500 --- linux-2.6/kernel/irq/manage.c       2006-07-03 14:15:20.000000000 +0200
87501 +++ new/kernel/irq/manage.c     2006-05-09 12:35:57.000000000 +0200
87502 @@ -204,14 +204,10 @@ int setup_irq(unsigned int irq, struct i
87503         p = &desc->action;
87504         if ((old = *p) != NULL) {
87505                 /* Can't share interrupts unless both agree to */
87506 -               if (!(old->flags & new->flags & SA_SHIRQ))
87507 -                       goto mismatch;
87508 -
87509 -#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
87510 -               /* All handlers must agree on per-cpuness */
87511 -               if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU))
87512 -                       goto mismatch;
87513 -#endif
87514 +               if (!(old->flags & new->flags & SA_SHIRQ)) {
87515 +                       spin_unlock_irqrestore(&desc->lock,flags);
87516 +                       return -EBUSY;
87517 +               }
87518  
87519                 /* add new interrupt at end of irq queue */
87520                 do {
87521 @@ -222,10 +218,7 @@ int setup_irq(unsigned int irq, struct i
87522         }
87523  
87524         *p = new;
87525 -#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
87526 -       if (new->flags & SA_PERCPU_IRQ)
87527 -               desc->status |= IRQ_PER_CPU;
87528 -#endif
87529 +
87530         if (!shared) {
87531                 desc->depth = 0;
87532                 desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
87533 @@ -243,14 +236,6 @@ int setup_irq(unsigned int irq, struct i
87534         register_handler_proc(irq, new);
87535  
87536         return 0;
87537 -
87538 -mismatch:
87539 -       spin_unlock_irqrestore(&desc->lock, flags);
87540 -       if (!(new->flags & SA_PROBEIRQ)) {
87541 -               printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__);
87542 -               dump_stack();
87543 -       }
87544 -       return -EBUSY;
87545  }
87546  
87547  /**
87548 @@ -273,7 +258,6 @@ void free_irq(unsigned int irq, void *de
87549         struct irqaction **p;
87550         unsigned long flags;
87551  
87552 -       WARN_ON(in_interrupt());
87553         if (irq >= NR_IRQS)
87554                 return;
87555  
87556 diff -urNp linux-2.6/kernel/irq/spurious.c new/kernel/irq/spurious.c
87557 --- linux-2.6/kernel/irq/spurious.c     2006-07-03 14:15:20.000000000 +0200
87558 +++ new/kernel/irq/spurious.c   2006-06-28 14:32:14.000000000 +0200
87559 @@ -137,7 +137,8 @@ void note_interrupt(unsigned int irq, ir
87560                         struct pt_regs *regs)
87561  {
87562         if (action_ret != IRQ_HANDLED) {
87563 -               desc->irqs_unhandled++;
87564 +               if (!irq_ignore_unhandled(irq))
87565 +                       desc->irqs_unhandled++;
87566                 if (action_ret != IRQ_NONE)
87567                         report_bad_irq(irq, desc, action_ret);
87568         }
87569 diff -urNp linux-2.6/kernel/Kconfig.preempt new/kernel/Kconfig.preempt
87570 --- linux-2.6/kernel/Kconfig.preempt    2006-07-03 14:15:20.000000000 +0200
87571 +++ new/kernel/Kconfig.preempt  2006-05-09 12:35:57.000000000 +0200
87572 @@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
87573  
87574  config PREEMPT
87575         bool "Preemptible Kernel (Low-Latency Desktop)"
87576 +       depends on !XEN
87577         help
87578           This option reduces the latency of the kernel by making
87579           all kernel code (that is not executing in a critical section)
87580 diff -urNp linux-2.6/kernel/timer.c new/kernel/timer.c
87581 --- linux-2.6/kernel/timer.c    2006-07-03 14:15:20.000000000 +0200
87582 +++ new/kernel/timer.c  2006-07-07 16:12:20.000000000 +0200
87583 @@ -557,6 +557,22 @@ found:
87584         if (time_before(expires, jiffies))
87585                 return jiffies;
87586  
87587 +       /*
87588 +        * It can happen that other CPUs service timer IRQs and increment
87589 +        * jiffies, but we have not yet got a local timer tick to process
87590 +        * the timer wheels.  In that case, the expiry time can be before
87591 +        * jiffies, but since the high-resolution timer here is relative to
87592 +        * jiffies, the default expression when high-resolution timers are
87593 +        * not active,
87594 +        *
87595 +        *   time_before(MAX_JIFFY_OFFSET + jiffies, expires)
87596 +        *
87597 +        * would falsely evaluate to true.  If that is the case, just
87598 +        * return jiffies so that we can immediately fire the local timer
87599 +        */
87600 +       if (time_before(expires, jiffies))
87601 +               return jiffies;
87602 +
87603         if (time_before(hr_expires, expires))
87604                 return hr_expires;
87605  
87606 diff -urNp linux-2.6/lib/Makefile new/lib/Makefile
87607 --- linux-2.6/lib/Makefile      2006-07-03 14:15:20.000000000 +0200
87608 +++ new/lib/Makefile    2006-05-09 12:35:58.000000000 +0200
87609 @@ -48,6 +48,9 @@ obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
87610  obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
87611  
87612  obj-$(CONFIG_SWIOTLB) += swiotlb.o
87613 +ifneq ($(CONFIG_XEN_IA64_DOM0_NON_VP),y)
87614 +swiotlb-$(CONFIG_XEN) := ../arch/i386/kernel/swiotlb.o
87615 +endif
87616  
87617  hostprogs-y    := gen_crc32table
87618  clean-files    := crc32table.h
87619 diff -urNp linux-2.6/mm/highmem.c new/mm/highmem.c
87620 --- linux-2.6/mm/highmem.c      2006-07-03 14:15:20.000000000 +0200
87621 +++ new/mm/highmem.c    2006-05-09 12:35:59.000000000 +0200
87622 @@ -142,6 +142,17 @@ start:
87623         return vaddr;
87624  }
87625  
87626 +#ifdef CONFIG_XEN
87627 +void kmap_flush_unused(void)
87628 +{
87629 +       spin_lock(&kmap_lock);
87630 +       flush_all_zero_pkmaps();
87631 +       spin_unlock(&kmap_lock);
87632 +}
87633 +
87634 +EXPORT_SYMBOL(kmap_flush_unused);
87635 +#endif
87636 +
87637  void fastcall *kmap_high(struct page *page)
87638  {
87639         unsigned long vaddr;
87640 diff -urNp linux-2.6/mm/Kconfig new/mm/Kconfig
87641 --- linux-2.6/mm/Kconfig        2006-07-03 14:15:20.000000000 +0200
87642 +++ new/mm/Kconfig      2006-05-09 12:40:16.000000000 +0200
87643 @@ -126,11 +126,14 @@ comment "Memory hotplug is currently inc
87644  # Default to 4 for wider testing, though 8 might be more appropriate.
87645  # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
87646  # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
87647 +# XEN on the x86 architecture uses the mapping field on pagetable pages to store a
87648 +# pointer to the destructor. This conflicts with pte_lock_deinit().
87649  #
87650  config SPLIT_PTLOCK_CPUS
87651         int
87652         default "4096" if ARM && !CPU_CACHE_VIPT
87653         default "4096" if PARISC && !PA20
87654 +       default "4096" if X86_XEN || X86_64_XEN
87655         default "4"
87656  
87657  #
87658 diff -urNp linux-2.6/mm/memory.c new/mm/memory.c
87659 --- linux-2.6/mm/memory.c       2006-07-03 14:15:20.000000000 +0200
87660 +++ new/mm/memory.c     2006-06-28 14:32:14.000000000 +0200
87661 @@ -402,7 +402,8 @@ struct page *vm_normal_page(struct vm_ar
87662          * and that the resulting page looks ok.
87663          */
87664         if (unlikely(!pfn_valid(pfn))) {
87665 -               print_bad_pte(vma, pte, addr);
87666 +               if (!(vma->vm_flags & VM_RESERVED))
87667 +                       print_bad_pte(vma, pte, addr);
87668                 return NULL;
87669         }
87670  
87671 @@ -1017,6 +1018,26 @@ int get_user_pages(struct task_struct *t
87672                         continue;
87673                 }
87674  
87675 +#ifdef CONFIG_XEN
87676 +               if (vma && (vma->vm_flags & VM_FOREIGN)) {
87677 +                       struct page **map = vma->vm_private_data;
87678 +                       int offset = (start - vma->vm_start) >> PAGE_SHIFT;
87679 +                       if (map[offset] != NULL) {
87680 +                               if (pages) {
87681 +                                       struct page *page = map[offset];
87682 +                                       
87683 +                                       pages[i] = page;
87684 +                                       get_page(page);
87685 +                               }
87686 +                               if (vmas)
87687 +                                       vmas[i] = vma;
87688 +                               i++;
87689 +                               start += PAGE_SIZE;
87690 +                               len--;
87691 +                               continue;
87692 +                       }
87693 +               }
87694 +#endif
87695                 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
87696                                 || !(vm_flags & vma->vm_flags))
87697                         return i ? : -EFAULT;
87698 @@ -1356,6 +1377,102 @@ int remap_pfn_range(struct vm_area_struc
87699  }
87700  EXPORT_SYMBOL(remap_pfn_range);
87701  
87702 +#ifdef CONFIG_XEN
87703 +static inline int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
87704 +                                    unsigned long addr, unsigned long end,
87705 +                                    pte_fn_t fn, void *data)
87706 +{
87707 +       pte_t *pte;
87708 +       int err;
87709 +       struct page *pmd_page;
87710 +       spinlock_t *ptl;
87711 +
87712 +       pte = (mm == &init_mm) ?
87713 +               pte_alloc_kernel(pmd, addr) :
87714 +               pte_alloc_map_lock(mm, pmd, addr, &ptl);
87715 +       if (!pte)
87716 +               return -ENOMEM;
87717 +
87718 +       BUG_ON(pmd_huge(*pmd));
87719 +
87720 +       pmd_page = pmd_page(*pmd);
87721 +
87722 +       do {
87723 +               err = fn(pte, pmd_page, addr, data);
87724 +               if (err)
87725 +                       break;
87726 +       } while (pte++, addr += PAGE_SIZE, addr != end);
87727 +
87728 +       if (mm != &init_mm)
87729 +               pte_unmap_unlock(pte-1, ptl);
87730 +       return err;
87731 +}
87732 +
87733 +static inline int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
87734 +                                    unsigned long addr, unsigned long end,
87735 +                                    pte_fn_t fn, void *data)
87736 +{
87737 +       pmd_t *pmd;
87738 +       unsigned long next;
87739 +       int err;
87740 +
87741 +       pmd = pmd_alloc(mm, pud, addr);
87742 +       if (!pmd)
87743 +               return -ENOMEM;
87744 +       do {
87745 +               next = pmd_addr_end(addr, end);
87746 +               err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
87747 +               if (err)
87748 +                       break;
87749 +       } while (pmd++, addr = next, addr != end);
87750 +       return err;
87751 +}
87752 +
87753 +static inline int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
87754 +                                    unsigned long addr, unsigned long end,
87755 +                                    pte_fn_t fn, void *data)
87756 +{
87757 +       pud_t *pud;
87758 +       unsigned long next;
87759 +       int err;
87760 +
87761 +       pud = pud_alloc(mm, pgd, addr);
87762 +       if (!pud)
87763 +               return -ENOMEM;
87764 +       do {
87765 +               next = pud_addr_end(addr, end);
87766 +               err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
87767 +               if (err)
87768 +                       break;
87769 +       } while (pud++, addr = next, addr != end);
87770 +       return err;
87771 +}
87772 +
87773 +/*
87774 + * Scan a region of virtual memory, filling in page tables as necessary
87775 + * and calling a provided function on each leaf page table.
87776 + */
87777 +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
87778 +                       unsigned long size, pte_fn_t fn, void *data)
87779 +{
87780 +       pgd_t *pgd;
87781 +       unsigned long next;
87782 +       unsigned long end = addr + size;
87783 +       int err;
87784 +
87785 +       BUG_ON(addr >= end);
87786 +       pgd = pgd_offset(mm, addr);
87787 +       do {
87788 +               next = pgd_addr_end(addr, end);
87789 +               err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
87790 +               if (err)
87791 +                       break;
87792 +       } while (pgd++, addr = next, addr != end);
87793 +       return err;
87794 +}
87795 +EXPORT_SYMBOL_GPL(apply_to_page_range);
87796 +#endif
87797 +
87798  /*
87799   * handle_pte_fault chooses page fault handler according to an entry
87800   * which was read non-atomically.  Before making any commitment, on
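
Not part of the patch: a sketch of a pte_fn_t callback driven by the apply_to_page_range() helper added above, assuming a kernel with this patch applied (the pte_fn_t typedef itself is introduced elsewhere in the patch). Counting present PTEs is only a placeholder action.

/* Illustrative only -- not part of linux-2.6-xen.patch. */
#include <linux/mm.h>
#include <asm/pgtable.h>

struct count_data {
	unsigned long present;
};

/* Matches pte_fn_t: invoked once per PTE slot in the requested range. */
static int count_present_pte(pte_t *pte, struct page *pmd_page,
			     unsigned long addr, void *data)
{
	struct count_data *cd = data;

	if (pte_present(*pte))
		cd->present++;
	return 0;	/* a non-zero return aborts the walk */
}

static unsigned long count_present(struct mm_struct *mm,
				   unsigned long start, unsigned long len)
{
	struct count_data cd = { 0 };

	/* Note: missing page tables in [start, start+len) are allocated. */
	apply_to_page_range(mm, start, len, count_present_pte, &cd);
	return cd.present;
}
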
87801 diff -urNp linux-2.6/mm/mmap.c new/mm/mmap.c
87802 --- linux-2.6/mm/mmap.c 2006-07-03 14:15:20.000000000 +0200
87803 +++ new/mm/mmap.c       2006-05-09 12:35:59.000000000 +0200
87804 @@ -1942,6 +1942,10 @@ void exit_mmap(struct mm_struct *mm)
87805         unsigned long nr_accounted = 0;
87806         unsigned long end;
87807  
87808 +#ifdef arch_exit_mmap
87809 +       arch_exit_mmap(mm);
87810 +#endif
87811 +
87812         lru_add_drain();
87813         flush_cache_mm(mm);
87814         tlb = tlb_gather_mmu(mm, 1);
87815 diff -urNp linux-2.6/mm/page_alloc.c new/mm/page_alloc.c
87816 --- linux-2.6/mm/page_alloc.c   2006-07-03 14:15:20.000000000 +0200
87817 +++ new/mm/page_alloc.c 2006-05-23 18:37:12.000000000 +0200
87818 @@ -438,7 +438,8 @@ static void __free_pages_ok(struct page 
87819         int i;
87820         int reserved = 0;
87821  
87822 -       arch_free_page(page, order);
87823 +       if (arch_free_page(page, order))
87824 +               return;
87825         if (!PageHighMem(page))
87826                 mutex_debug_check_no_locks_freed(page_address(page),
87827                                                  PAGE_SIZE<<order);
87828 @@ -733,7 +734,8 @@ static void fastcall free_hot_cold_page(
87829         struct per_cpu_pages *pcp;
87830         unsigned long flags;
87831  
87832 -       arch_free_page(page, 0);
87833 +       if (arch_free_page(page, 0))
87834 +               return;
87835  
87836         if (PageAnon(page))
87837                 page->mapping = NULL;
87838 diff -urNp linux-2.6/net/atm/clip.c new/net/atm/clip.c
87839 --- linux-2.6/net/atm/clip.c    2006-07-03 14:15:20.000000000 +0200
87840 +++ new/net/atm/clip.c  2006-07-07 16:13:30.000000000 +0200
87841 @@ -98,7 +98,7 @@ static void unlink_clip_vcc(struct clip_
87842                 printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
87843                 return;
87844         }
87845 -       spin_lock_bh(&entry->neigh->dev->xmit_lock);    /* block clip_start_xmit() */
87846 +       netif_tx_lock_bh(entry->neigh->dev);    /* block clip_start_xmit() */
87847         entry->neigh->used = jiffies;
87848         for (walk = &entry->vccs; *walk; walk = &(*walk)->next)
87849                 if (*walk == clip_vcc) {
87850 @@ -120,9 +120,9 @@ static void unlink_clip_vcc(struct clip_
87851                         goto out;
87852                 }
87853         printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc "
87854 -              "0x%p)\n", entry, clip_vcc);
87855 -      out:
87856 -       spin_unlock_bh(&entry->neigh->dev->xmit_lock);
87857 +         "0x%p)\n",entry,clip_vcc);
87858 +out:
87859 +       netif_tx_unlock_bh(entry->neigh->dev);
87860  }
87861  
87862  /* The neighbour entry n->lock is held. */
87863 diff -urNp linux-2.6/net/bridge/br_device.c new/net/bridge/br_device.c
87864 --- linux-2.6/net/bridge/br_device.c    2006-07-03 14:15:21.000000000 +0200
87865 +++ new/net/bridge/br_device.c  2006-07-07 16:13:36.000000000 +0200
87866 @@ -145,9 +145,9 @@ static int br_set_tx_csum(struct net_dev
87867         struct net_bridge *br = netdev_priv(dev);
87868  
87869         if (data)
87870 -               br->feature_mask |= NETIF_F_IP_CSUM;
87871 +               br->feature_mask |= NETIF_F_NO_CSUM;
87872         else
87873 -               br->feature_mask &= ~NETIF_F_IP_CSUM;
87874 +               br->feature_mask &= ~NETIF_F_ALL_CSUM;
87875  
87876         br_features_recompute(br);
87877         return 0;
87878 @@ -184,6 +184,6 @@ void br_dev_setup(struct net_device *dev
87879         dev->set_mac_address = br_set_mac_address;
87880         dev->priv_flags = IFF_EBRIDGE;
87881  
87882 -       dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
87883 -               | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM;
87884 +       dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
87885 +                       NETIF_F_TSO | NETIF_F_NO_CSUM | NETIF_F_GSO_ROBUST;
87886  }
87887 diff -urNp linux-2.6/net/bridge/br_forward.c new/net/bridge/br_forward.c
87888 --- linux-2.6/net/bridge/br_forward.c   2006-07-03 14:15:21.000000000 +0200
87889 +++ new/net/bridge/br_forward.c 2006-07-07 16:14:07.000000000 +0200
87890 @@ -38,7 +38,7 @@ static inline unsigned packet_length(con
87891  int br_dev_queue_push_xmit(struct sk_buff *skb)
87892  {
87893         /* drop mtu oversized packets except tso */
87894 -       if (packet_length(skb) > skb->dev->mtu && !skb_shinfo(skb)->tso_size)
87895 +       if (skb->len > skb->dev->mtu && !skb_shinfo(skb)->gso_size)
87896                 kfree_skb(skb);
87897         else {
87898  #ifdef CONFIG_BRIDGE_NETFILTER
87899 diff -urNp linux-2.6/net/bridge/br_if.c new/net/bridge/br_if.c
87900 --- linux-2.6/net/bridge/br_if.c        2006-07-03 14:15:21.000000000 +0200
87901 +++ new/net/bridge/br_if.c      2006-07-07 16:14:17.000000000 +0200
87902 @@ -372,17 +372,28 @@ void br_features_recompute(struct net_br
87903         struct net_bridge_port *p;
87904         unsigned long features, checksum;
87905  
87906 -       features = br->feature_mask &~ NETIF_F_IP_CSUM;
87907 -       checksum = br->feature_mask & NETIF_F_IP_CSUM;
87908 +       checksum = br->feature_mask & NETIF_F_ALL_CSUM ? NETIF_F_NO_CSUM : 0;
87909 +       features = br->feature_mask & ~NETIF_F_ALL_CSUM;
87910  
87911         list_for_each_entry(p, &br->port_list, list) {
87912 -               if (!(p->dev->features 
87913 -                     & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)))
87914 +               unsigned long feature = p->dev->features;
87915 +
87916 +               if (checksum & NETIF_F_NO_CSUM && !(feature & NETIF_F_NO_CSUM))
87917 +                       checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
87918 +               if (checksum & NETIF_F_HW_CSUM && !(feature & NETIF_F_HW_CSUM))
87919 +                       checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
87920 +               if (!(feature & NETIF_F_IP_CSUM))
87921                         checksum = 0;
87922 -               features &= p->dev->features;
87923 +
87924 +               if (feature & NETIF_F_GSO)
87925 +                       feature |= NETIF_F_TSO;
87926 +               feature |= NETIF_F_GSO;
87927 +
87928 +               features &= feature;
87929         }
87930  
87931 -       br->dev->features = features | checksum | NETIF_F_LLTX;
87932 +       br->dev->features = features | checksum | NETIF_F_LLTX |
87933 +                           NETIF_F_GSO_ROBUST;
87934  }
87935  
87936  /* called with RTNL */
87937 diff -urNp linux-2.6/net/bridge/br_netfilter.c new/net/bridge/br_netfilter.c
87938 --- linux-2.6/net/bridge/br_netfilter.c 2006-07-03 14:15:21.000000000 +0200
87939 +++ new/net/bridge/br_netfilter.c       2006-07-07 16:17:11.000000000 +0200
87940 @@ -769,7 +769,7 @@ static int br_nf_dev_queue_xmit(struct s
87941  {
87942         if (skb->protocol == htons(ETH_P_IP) &&
87943             skb->len > skb->dev->mtu &&
87944 -           !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
87945 +           !skb_shinfo(skb)->gso_size)
87946                 return ip_fragment(skb, br_dev_queue_push_xmit);
87947         else
87948                 return br_dev_queue_push_xmit(skb);
87949 @@ -877,8 +877,9 @@ static unsigned int ip_sabotage_out(unsi
87950         struct sk_buff *skb = *pskb;
87951  
87952         if ((out->hard_start_xmit == br_dev_xmit &&
87953 -            okfn != br_nf_forward_finish &&
87954 -            okfn != br_nf_local_out_finish && okfn != br_nf_dev_queue_xmit)
87955 +           okfn != br_nf_forward_finish &&
87956 +           okfn != br_nf_local_out_finish &&
87957 +           okfn != br_nf_dev_queue_xmit)
87958  #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
87959             || ((out->priv_flags & IFF_802_1Q_VLAN) &&
87960                 VLAN_DEV_INFO(out)->real_dev->hard_start_xmit == br_dev_xmit)
87961 diff -urNp linux-2.6/net/core/dev.c new/net/core/dev.c
87962 --- linux-2.6/net/core/dev.c    2006-07-03 14:15:21.000000000 +0200
87963 +++ new/net/core/dev.c  2006-07-07 16:17:58.000000000 +0200
87964 @@ -115,6 +115,13 @@
87965  #include <asm/current.h>
87966  #include <linux/audit.h>
87967  #include <linux/vs_network.h>
87968 +#include <linux/err.h>
87969 +
87970 +#ifdef CONFIG_XEN
87971 +#include <net/ip.h>
87972 +#include <linux/tcp.h>
87973 +#include <linux/udp.h>
87974 +#endif
87975  
87976  /*
87977   *     The list of packet types we will receive (as opposed to discard)
87978 @@ -1041,7 +1048,7 @@ static inline void net_timestamp(struct 
87979   *     taps currently in use.
87980   */
87981  
87982 -void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
87983 +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
87984  {
87985         struct packet_type *ptype;
87986  
87987 @@ -1179,6 +1186,45 @@ out:     
87988         return ret;
87989  }
87990  
87991 +/**
87992 + *     skb_gso_segment - Perform segmentation on skb.
87993 + *     @skb: buffer to segment
87994 + *     @features: features for the output path (see dev->features)
87995 + *
87996 + *     This function segments the given skb and returns a list of segments.
87997 + *
87998 + *     It may return NULL if the skb requires no segmentation.  This is
87999 + *     only possible when GSO is used for verifying header integrity.
88000 + */
88001 +struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
88002 +{
88003 +       struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
88004 +       struct packet_type *ptype;
88005 +       int type = skb->protocol;
88006 +
88007 +       BUG_ON(skb_shinfo(skb)->frag_list);
88008 +       BUG_ON(skb->ip_summed != CHECKSUM_HW);
88009 +
88010 +       skb->mac.raw = skb->data;
88011 +       skb->mac_len = skb->nh.raw - skb->data;
88012 +       __skb_pull(skb, skb->mac_len);
88013 +
88014 +       rcu_read_lock();
88015 +       list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
88016 +               if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
88017 +                       segs = ptype->gso_segment(skb, features);
88018 +                       break;
88019 +               }
88020 +       }
88021 +       rcu_read_unlock();
88022 +
88023 +       __skb_push(skb, skb->data - skb->mac.raw);
88024 +
88025 +       return segs;
88026 +}
88027 +
88028 +EXPORT_SYMBOL(skb_gso_segment);
88029 +
88030  /* Take action when hardware reception checksum errors are detected. */
88031  #ifdef CONFIG_BUG
88032  void netdev_rx_csum_fault(struct net_device *dev)
88033 @@ -1215,79 +1261,148 @@ static inline int illegal_highdma(struct
88034  #define illegal_highdma(dev, skb)      (0)
88035  #endif
88036  
88037 -/* Keep head the same: replace data */
88038 -int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
88039 +struct dev_gso_cb {
88040 +       void (*destructor)(struct sk_buff *skb);
88041 +};
88042 +
88043 +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
88044 +
88045 +static void dev_gso_skb_destructor(struct sk_buff *skb)
88046  {
88047 -       unsigned int size;
88048 -       u8 *data;
88049 -       long offset;
88050 -       struct skb_shared_info *ninfo;
88051 -       int headerlen = skb->data - skb->head;
88052 -       int expand = (skb->tail + skb->data_len) - skb->end;
88053 -
88054 -       if (skb_shared(skb))
88055 -               BUG();
88056 -
88057 -       if (expand <= 0)
88058 -               expand = 0;
88059 -
88060 -       size = skb->end - skb->head + expand;
88061 -       size = SKB_DATA_ALIGN(size);
88062 -       data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
88063 -       if (!data)
88064 -               return -ENOMEM;
88065 -
88066 -       /* Copy entire thing */
88067 -       if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
88068 -               BUG();
88069 -
88070 -       /* Set up shinfo */
88071 -       ninfo = (struct skb_shared_info*)(data + size);
88072 -       atomic_set(&ninfo->dataref, 1);
88073 -       ninfo->tso_size = skb_shinfo(skb)->tso_size;
88074 -       ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
88075 -       ninfo->ufo_size = skb_shinfo(skb)->ufo_size;
88076 -       ninfo->nr_frags = 0;
88077 -       ninfo->frag_list = NULL;
88078 -
88079 -       /* Offset between the two in bytes */
88080 -       offset = data - skb->head;
88081 -
88082 -       /* Free old data. */
88083 -       skb_release_data(skb);
88084 -
88085 -       skb->head = data;
88086 -       skb->end  = data + size;
88087 -
88088 -       /* Set up new pointers */
88089 -       skb->h.raw   += offset;
88090 -       skb->nh.raw  += offset;
88091 -       skb->mac.raw += offset;
88092 -       skb->tail    += offset;
88093 -       skb->data    += offset;
88094 +       struct dev_gso_cb *cb;
88095 +
88096 +       do {
88097 +               struct sk_buff *nskb = skb->next;
88098  
88099 -       /* We are no longer a clone, even if we were. */
88100 -       skb->cloned    = 0;
88101 +               skb->next = nskb->next;
88102 +               nskb->next = NULL;
88103 +               kfree_skb(nskb);
88104 +       } while (skb->next);
88105 +
88106 +       cb = DEV_GSO_CB(skb);
88107 +       if (cb->destructor)
88108 +               cb->destructor(skb);
88109 +}
88110 +
88111 +/**
88112 + *     dev_gso_segment - Perform emulated hardware segmentation on skb.
88113 + *     @skb: buffer to segment
88114 + *
88115 + *     This function segments the given skb and stores the list of segments
88116 + *     in skb->next.
88117 + */
88118 +static int dev_gso_segment(struct sk_buff *skb)
88119 +{
88120 +       struct net_device *dev = skb->dev;
88121 +       struct sk_buff *segs;
88122 +       int features = dev->features & ~(illegal_highdma(dev, skb) ?
88123 +                                        NETIF_F_SG : 0);
88124 +
88125 +       segs = skb_gso_segment(skb, features);
88126 +
88127 +       /* Verifying header integrity only. */
88128 +       if (!segs)
88129 +               return 0;
88130 +
88131 +       if (unlikely(IS_ERR(segs)))
88132 +               return PTR_ERR(segs);
88133 +
88134 +       skb->next = segs;
88135 +       DEV_GSO_CB(skb)->destructor = skb->destructor;
88136 +       skb->destructor = dev_gso_skb_destructor;
88137  
88138 -       skb->tail     += skb->data_len;
88139 -       skb->data_len  = 0;
88140 +       return 0;
88141 +}
88142 +
88143 +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
88144 +{
88145 +       if (likely(!skb->next)) {
88146 +               if (netdev_nit)
88147 +                       dev_queue_xmit_nit(skb, dev);
88148 +
88149 +               if (netif_needs_gso(dev, skb)) {
88150 +                       if (unlikely(dev_gso_segment(skb)))
88151 +                               goto out_kfree_skb;
88152 +                       if (skb->next)
88153 +                               goto gso;
88154 +               }
88155 +
88156 +               return dev->hard_start_xmit(skb, dev);
88157 +       }
88158 +
88159 +gso:
88160 +       do {
88161 +               struct sk_buff *nskb = skb->next;
88162 +               int rc;
88163 +
88164 +               skb->next = nskb->next;
88165 +               nskb->next = NULL;
88166 +               rc = dev->hard_start_xmit(nskb, dev);
88167 +               if (unlikely(rc)) {
88168 +                       nskb->next = skb->next;
88169 +                       skb->next = nskb;
88170 +                       return rc;
88171 +               }
88172 +               if (unlikely(netif_queue_stopped(dev) && skb->next))
88173 +                       return NETDEV_TX_BUSY;
88174 +       } while (skb->next);
88175 +       
88176 +       skb->destructor = DEV_GSO_CB(skb)->destructor;
88177 +
88178 +out_kfree_skb:
88179 +       kfree_skb(skb);
88180         return 0;
88181  }
88182  
88183  #define HARD_TX_LOCK(dev, cpu) {                       \
88184         if ((dev->features & NETIF_F_LLTX) == 0) {      \
88185 -               spin_lock(&dev->xmit_lock);             \
88186 -               dev->xmit_lock_owner = cpu;             \
88187 +               netif_tx_lock(dev);                     \
88188         }                                               \
88189  }
88190  
88191  #define HARD_TX_UNLOCK(dev) {                          \
88192         if ((dev->features & NETIF_F_LLTX) == 0) {      \
88193 -               dev->xmit_lock_owner = -1;              \
88194 -               spin_unlock(&dev->xmit_lock);           \
88195 +               netif_tx_unlock(dev);                   \
88196         }                                               \
88197  }
88198  
88199 +#ifdef CONFIG_XEN
88200 +inline int skb_checksum_setup(struct sk_buff *skb)
88201 +{
88202 +       if (skb->proto_csum_blank) {
88203 +               if (skb->protocol != htons(ETH_P_IP))
88204 +                       goto out;
88205 +               skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
88206 +               if (skb->h.raw >= skb->tail)
88207 +                       goto out;
88208 +               switch (skb->nh.iph->protocol) {
88209 +               case IPPROTO_TCP:
88210 +                       skb->csum = offsetof(struct tcphdr, check);
88211 +                       break;
88212 +               case IPPROTO_UDP:
88213 +                       skb->csum = offsetof(struct udphdr, check);
88214 +                       break;
88215 +               default:
88216 +                       if (net_ratelimit())
88217 +                               printk(KERN_ERR "Attempting to checksum a non-"
88218 +                                      "TCP/UDP packet, dropping a protocol"
88219 +                                      " %d packet\n", skb->nh.iph->protocol);
88220 +                       goto out;
88221 +               }
88222 +               if ((skb->h.raw + skb->csum + 2) > skb->tail)
88223 +                       goto out;
88224 +               skb->ip_summed = CHECKSUM_HW;
88225 +               skb->proto_csum_blank = 0;
88226 +       }
88227 +       return 0;
88228 +out:
88229 +       return -EPROTO;
88230 +}
88231 +#else
88232 +inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
88233 +#endif
88234 +
88235 +
88236  /**
88237   *     dev_queue_xmit - transmit a buffer
88238   *     @skb: buffer to transmit
88239 @@ -1319,9 +1435,19 @@ int dev_queue_xmit(struct sk_buff *skb)
88240         struct Qdisc *q;
88241         int rc = -ENOMEM;
88242  
88243 +       /* If a checksum-deferred packet is forwarded to a device that needs a
88244 +        * checksum, correct the pointers and force checksumming.
88245 +        */
88246 +       if (skb_checksum_setup(skb))
88247 +               goto out_kfree_skb;
88248 +
88249 +       /* GSO will handle the following emulations directly. */
88250 +       if (netif_needs_gso(dev, skb))
88251 +               goto gso;
88252 +
88253         if (skb_shinfo(skb)->frag_list &&
88254             !(dev->features & NETIF_F_FRAGLIST) &&
88255 -           __skb_linearize(skb, GFP_ATOMIC))
88256 +           __skb_linearize(skb))
88257                 goto out_kfree_skb;
88258  
88259         /* Fragmented skb is linearized if device does not support SG,
88260 @@ -1330,25 +1456,26 @@ int dev_queue_xmit(struct sk_buff *skb)
88261          */
88262         if (skb_shinfo(skb)->nr_frags &&
88263             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
88264 -           __skb_linearize(skb, GFP_ATOMIC))
88265 +           __skb_linearize(skb))
88266                 goto out_kfree_skb;
88267  
88268         /* If packet is not checksummed and device does not support
88269          * checksumming for this protocol, complete checksumming here.
88270          */
88271         if (skb->ip_summed == CHECKSUM_HW &&
88272 -           (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
88273 +           (!(dev->features & NETIF_F_GEN_CSUM) &&
88274              (!(dev->features & NETIF_F_IP_CSUM) ||
88275               skb->protocol != htons(ETH_P_IP))))
88276                 if (skb_checksum_help(skb, 0))
88277                         goto out_kfree_skb;
88278  
88279 +gso:
88280         spin_lock_prefetch(&dev->queue_lock);
88281  
88282         /* Disable soft irqs for various locks below. Also 
88283          * stops preemption for RCU. 
88284          */
88285 -       local_bh_disable(); 
88286 +       rcu_read_lock_bh(); 
88287  
88288         /* Updates of qdisc are serialized by queue_lock. 
88289          * The struct Qdisc which is pointed to by qdisc is now a 
88290 @@ -1382,8 +1509,8 @@ int dev_queue_xmit(struct sk_buff *skb)
88291         /* The device has no queue. Common case for software devices:
88292            loopback, all the sorts of tunnels...
88293  
88294 -          Really, it is unlikely that xmit_lock protection is necessary here.
88295 -          (f.e. loopback and IP tunnels are clean ignoring statistics
88296 +          Really, it is unlikely that netif_tx_lock protection is necessary
88297 +          here.  (f.e. loopback and IP tunnels are clean ignoring statistics
88298            counters.)
88299            However, it is possible, that they rely on protection
88300            made by us here.
88301 @@ -1399,11 +1526,8 @@ int dev_queue_xmit(struct sk_buff *skb)
88302                         HARD_TX_LOCK(dev, cpu);
88303  
88304                         if (!netif_queue_stopped(dev)) {
88305 -                               if (netdev_nit)
88306 -                                       dev_queue_xmit_nit(skb, dev);
88307 -
88308                                 rc = 0;
88309 -                               if (!dev->hard_start_xmit(skb, dev)) {
88310 +                               if (!dev_hard_start_xmit(skb, dev)) {
88311                                         HARD_TX_UNLOCK(dev);
88312                                         goto out;
88313                                 }
88314 @@ -1422,13 +1546,13 @@ int dev_queue_xmit(struct sk_buff *skb)
88315         }
88316  
88317         rc = -ENETDOWN;
88318 -       local_bh_enable();
88319 +       rcu_read_unlock_bh();
88320  
88321  out_kfree_skb:
88322         kfree_skb(skb);
88323         return rc;
88324  out:
88325 -       local_bh_enable();
88326 +       rcu_read_unlock_bh();
88327         return rc;
88328  }
88329  
88330 @@ -1706,6 +1830,19 @@ int netif_receive_skb(struct sk_buff *sk
88331         }
88332  #endif
88333  
88334 +#ifdef CONFIG_XEN
88335 +       switch (skb->ip_summed) {
88336 +       case CHECKSUM_UNNECESSARY:
88337 +               skb->proto_data_valid = 1;
88338 +               break;
88339 +       case CHECKSUM_HW:
88340 +               /* XXX Implement me. */
88341 +       default:
88342 +               skb->proto_data_valid = 0;
88343 +               break;
88344 +       }
88345 +#endif
88346 +
88347         list_for_each_entry_rcu(ptype, &ptype_all, list) {
88348                 if (!ptype->dev || ptype->dev == skb->dev) {
88349                         if (pt_prev) 
88350 @@ -2785,7 +2922,7 @@ int register_netdevice(struct net_device
88351         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
88352  
88353         spin_lock_init(&dev->queue_lock);
88354 -       spin_lock_init(&dev->xmit_lock);
88355 +       spin_lock_init(&dev->_xmit_lock);
88356         dev->xmit_lock_owner = -1;
88357  #ifdef CONFIG_NET_CLS_ACT
88358         spin_lock_init(&dev->ingress_lock);
88359 @@ -2829,9 +2966,7 @@ int register_netdevice(struct net_device
88360  
88361         /* Fix illegal SG+CSUM combinations. */
88362         if ((dev->features & NETIF_F_SG) &&
88363 -           !(dev->features & (NETIF_F_IP_CSUM |
88364 -                              NETIF_F_NO_CSUM |
88365 -                              NETIF_F_HW_CSUM))) {
88366 +           !(dev->features & NETIF_F_ALL_CSUM)) {
88367                 printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
88368                        dev->name);
88369                 dev->features &= ~NETIF_F_SG;
88370 @@ -3371,7 +3506,6 @@ subsys_initcall(net_dev_init);
88371  EXPORT_SYMBOL(__dev_get_by_index);
88372  EXPORT_SYMBOL(__dev_get_by_name);
88373  EXPORT_SYMBOL(__dev_remove_pack);
88374 -EXPORT_SYMBOL(__skb_linearize);
88375  EXPORT_SYMBOL(dev_valid_name);
88376  EXPORT_SYMBOL(dev_add_pack);
88377  EXPORT_SYMBOL(dev_alloc_name);
88378 @@ -3403,6 +3537,7 @@ EXPORT_SYMBOL(unregister_netdevice_notif
88379  EXPORT_SYMBOL(net_enable_timestamp);
88380  EXPORT_SYMBOL(net_disable_timestamp);
88381  EXPORT_SYMBOL(dev_get_flags);
88382 +EXPORT_SYMBOL(skb_checksum_setup);
88383  
88384  #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
88385  EXPORT_SYMBOL(br_handle_frame_hook);
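
Not part of the patch: a sketch of how a paravirtual network driver's receive path might use the skb_checksum_setup() helper exported above before handing a guest-supplied frame to the stack. The driver and function names are placeholders, and the prototype is assumed to come from the rest of the patch.

/* Illustrative only -- not part of linux-2.6-xen.patch. */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical receive completion for a paravirtual NIC. */
static int pvnet_rx_one(struct net_device *dev, struct sk_buff *skb)
{
	skb->dev = dev;
	skb->protocol = eth_type_trans(skb, dev);

	/*
	 * Frames from the peer domain may arrive with a blank checksum
	 * (proto_csum_blank set); repair the checksum pointers before
	 * netif_rx(), or drop the frame if the protocol is not handled.
	 */
	if (skb_checksum_setup(skb)) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	return netif_rx(skb);
}
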
88386 diff -urNp linux-2.6/net/core/dev_mcast.c new/net/core/dev_mcast.c
88387 --- linux-2.6/net/core/dev_mcast.c      2006-07-03 14:15:21.000000000 +0200
88388 +++ new/net/core/dev_mcast.c    2006-07-07 15:10:03.000000000 +0200
88389 @@ -62,7 +62,7 @@
88390   *     Device mc lists are changed by bh at least if IPv6 is enabled,
88391   *     so that it must be bh protected.
88392   *
88393 - *     We block accesses to device mc filters with dev->xmit_lock.
88394 + *     We block accesses to device mc filters with netif_tx_lock.
88395   */
88396  
88397  /*
88398 @@ -93,9 +93,9 @@ static void __dev_mc_upload(struct net_d
88399  
88400  void dev_mc_upload(struct net_device *dev)
88401  {
88402 -       spin_lock_bh(&dev->xmit_lock);
88403 +       netif_tx_lock_bh(dev);
88404         __dev_mc_upload(dev);
88405 -       spin_unlock_bh(&dev->xmit_lock);
88406 +       netif_tx_unlock_bh(dev);
88407  }
88408  
88409  /*
88410 @@ -107,7 +107,7 @@ int dev_mc_delete(struct net_device *dev
88411         int err = 0;
88412         struct dev_mc_list *dmi, **dmip;
88413  
88414 -       spin_lock_bh(&dev->xmit_lock);
88415 +       netif_tx_lock_bh(dev);
88416  
88417         for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
88418                 /*
88419 @@ -139,13 +139,13 @@ int dev_mc_delete(struct net_device *dev
88420                          */
88421                         __dev_mc_upload(dev);
88422                         
88423 -                       spin_unlock_bh(&dev->xmit_lock);
88424 +                       netif_tx_unlock_bh(dev);
88425                         return 0;
88426                 }
88427         }
88428         err = -ENOENT;
88429  done:
88430 -       spin_unlock_bh(&dev->xmit_lock);
88431 +       netif_tx_unlock_bh(dev);
88432         return err;
88433  }
88434  
88435 @@ -160,7 +160,7 @@ int dev_mc_add(struct net_device *dev, v
88436  
88437         dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC);
88438  
88439 -       spin_lock_bh(&dev->xmit_lock);
88440 +       netif_tx_lock_bh(dev);
88441         for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
88442                 if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
88443                     dmi->dmi_addrlen == alen) {
88444 @@ -176,7 +176,7 @@ int dev_mc_add(struct net_device *dev, v
88445         }
88446  
88447         if ((dmi = dmi1) == NULL) {
88448 -               spin_unlock_bh(&dev->xmit_lock);
88449 +               netif_tx_unlock_bh(dev);
88450                 return -ENOMEM;
88451         }
88452         memcpy(dmi->dmi_addr, addr, alen);
88453 @@ -189,11 +189,11 @@ int dev_mc_add(struct net_device *dev, v
88454  
88455         __dev_mc_upload(dev);
88456         
88457 -       spin_unlock_bh(&dev->xmit_lock);
88458 +       netif_tx_unlock_bh(dev);
88459         return 0;
88460  
88461  done:
88462 -       spin_unlock_bh(&dev->xmit_lock);
88463 +       netif_tx_unlock_bh(dev);
88464         kfree(dmi1);
88465         return err;
88466  }
88467 @@ -204,7 +204,7 @@ done:
88468  
88469  void dev_mc_discard(struct net_device *dev)
88470  {
88471 -       spin_lock_bh(&dev->xmit_lock);
88472 +       netif_tx_lock_bh(dev);
88473         
88474         while (dev->mc_list != NULL) {
88475                 struct dev_mc_list *tmp = dev->mc_list;
88476 @@ -215,7 +215,7 @@ void dev_mc_discard(struct net_device *d
88477         }
88478         dev->mc_count = 0;
88479  
88480 -       spin_unlock_bh(&dev->xmit_lock);
88481 +       netif_tx_unlock_bh(dev);
88482  }
88483  
88484  #ifdef CONFIG_PROC_FS
88485 @@ -250,7 +250,7 @@ static int dev_mc_seq_show(struct seq_fi
88486         struct dev_mc_list *m;
88487         struct net_device *dev = v;
88488  
88489 -       spin_lock_bh(&dev->xmit_lock);
88490 +       netif_tx_lock_bh(dev);
88491         for (m = dev->mc_list; m; m = m->next) {
88492                 int i;
88493  
88494 @@ -262,7 +262,7 @@ static int dev_mc_seq_show(struct seq_fi
88495  
88496                 seq_putc(seq, '\n');
88497         }
88498 -       spin_unlock_bh(&dev->xmit_lock);
88499 +       netif_tx_unlock_bh(dev);
88500         return 0;
88501  }
88502  
88503 diff -urNp linux-2.6/net/core/ethtool.c new/net/core/ethtool.c
88504 --- linux-2.6/net/core/ethtool.c        2006-07-03 14:15:21.000000000 +0200
88505 +++ new/net/core/ethtool.c      2006-07-07 15:10:03.000000000 +0200
88506 @@ -30,7 +30,7 @@ u32 ethtool_op_get_link(struct net_devic
88507  
88508  u32 ethtool_op_get_tx_csum(struct net_device *dev)
88509  {
88510 -       return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0;
88511 +       return (dev->features & NETIF_F_ALL_CSUM) != 0;
88512  }
88513  
88514  int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
88515 @@ -551,9 +551,7 @@ static int ethtool_set_sg(struct net_dev
88516                 return -EFAULT;
88517  
88518         if (edata.data && 
88519 -           !(dev->features & (NETIF_F_IP_CSUM |
88520 -                              NETIF_F_NO_CSUM |
88521 -                              NETIF_F_HW_CSUM)))
88522 +           !(dev->features & NETIF_F_ALL_CSUM))
88523                 return -EINVAL;
88524  
88525         return __ethtool_set_sg(dev, edata.data);
88526 @@ -591,7 +589,7 @@ static int ethtool_set_tso(struct net_de
88527  
88528  static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr)
88529  {
88530 -       struct ethtool_value edata = { ETHTOOL_GTSO };
88531 +       struct ethtool_value edata = { ETHTOOL_GUFO };
88532  
88533         if (!dev->ethtool_ops->get_ufo)
88534                 return -EOPNOTSUPP;
88535 @@ -615,6 +614,29 @@ static int ethtool_set_ufo(struct net_de
88536         return dev->ethtool_ops->set_ufo(dev, edata.data);
88537  }
88538  
88539 +static int ethtool_get_gso(struct net_device *dev, char __user *useraddr)
88540 +{
88541 +       struct ethtool_value edata = { ETHTOOL_GGSO };
88542 +
88543 +       edata.data = dev->features & NETIF_F_GSO;
88544 +       if (copy_to_user(useraddr, &edata, sizeof(edata)))
88545 +                return -EFAULT;
88546 +       return 0;
88547 +}
88548 +
88549 +static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
88550 +{
88551 +       struct ethtool_value edata;
88552 +
88553 +       if (copy_from_user(&edata, useraddr, sizeof(edata)))
88554 +               return -EFAULT;
88555 +       if (edata.data)
88556 +               dev->features |= NETIF_F_GSO;
88557 +       else
88558 +               dev->features &= ~NETIF_F_GSO;
88559 +       return 0;
88560 +}
88561 +
88562  static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
88563  {
88564         struct ethtool_test test;
88565 @@ -906,6 +928,12 @@ int dev_ethtool(struct ifreq *ifr)
88566         case ETHTOOL_SUFO:
88567                 rc = ethtool_set_ufo(dev, useraddr);
88568                 break;
88569 +       case ETHTOOL_GGSO:
88570 +               rc = ethtool_get_gso(dev, useraddr);
88571 +               break;
88572 +       case ETHTOOL_SGSO:
88573 +               rc = ethtool_set_gso(dev, useraddr);
88574 +               break;
88575         default:
88576                 rc =  -EOPNOTSUPP;
88577         }
88578 diff -urNp linux-2.6/net/core/netpoll.c new/net/core/netpoll.c
88579 --- linux-2.6/net/core/netpoll.c        2006-07-03 14:15:21.000000000 +0200
88580 +++ new/net/core/netpoll.c      2006-07-07 16:18:04.000000000 +0200
88581 @@ -273,24 +273,21 @@ static void netpoll_send_skb(struct netp
88582  
88583         do {
88584                 npinfo->tries--;
88585 -               spin_lock(&np->dev->xmit_lock);
88586 -               np->dev->xmit_lock_owner = smp_processor_id();
88587 +               netif_tx_lock(np->dev);
88588  
88589                 /*
88590                  * network drivers do not expect to be called if the queue is
88591                  * stopped.
88592                  */
88593                 if (netif_queue_stopped(np->dev)) {
88594 -                       np->dev->xmit_lock_owner = -1;
88595 -                       spin_unlock(&np->dev->xmit_lock);
88596 +                       netif_tx_unlock(np->dev);
88597                         netpoll_poll(np);
88598                         udelay(50);
88599                         continue;
88600                 }
88601  
88602                 status = np->dev->hard_start_xmit(skb, np->dev);
88603 -               np->dev->xmit_lock_owner = -1;
88604 -               spin_unlock(&np->dev->xmit_lock);
88605 +               netif_tx_unlock(np->dev);
88606  
88607                 /* success */
88608                 if(!status) {
88609 diff -urNp linux-2.6/net/core/pktgen.c new/net/core/pktgen.c
88610 --- linux-2.6/net/core/pktgen.c 2006-07-03 14:15:21.000000000 +0200
88611 +++ new/net/core/pktgen.c       2006-07-07 16:18:55.000000000 +0200
88612 @@ -2896,8 +2896,8 @@ static __inline__ void pktgen_xmit(struc
88613                         pkt_dev->clone_count = 0;       /* reset counter */
88614                 }
88615         }
88616 -
88617 -       spin_lock_bh(&odev->xmit_lock);
88618 +       
88619 +       netif_tx_lock_bh(odev);
88620         if (!netif_queue_stopped(odev)) {
88621  
88622                 atomic_inc(&(pkt_dev->skb->users));
88623 @@ -2942,8 +2942,8 @@ static __inline__ void pktgen_xmit(struc
88624                 pkt_dev->next_tx_ns = 0;
88625         }
88626  
88627 -       spin_unlock_bh(&odev->xmit_lock);
88628 -
88629 +       netif_tx_unlock_bh(odev);
88630 +       
88631         /* If pkt_dev->count is zero, then run forever */
88632         if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
88633                 if (atomic_read(&(pkt_dev->skb->users)) != 1) {
88634 diff -urNp linux-2.6/net/core/skbuff.c new/net/core/skbuff.c
88635 --- linux-2.6/net/core/skbuff.c 2006-07-03 14:15:21.000000000 +0200
88636 +++ new/net/core/skbuff.c       2006-07-07 16:20:30.000000000 +0200
88637 @@ -140,6 +140,7 @@ EXPORT_SYMBOL(skb_truesize_bug);
88638   *     Buffers may only be allocated from interrupts using a @gfp_mask of
88639   *     %GFP_ATOMIC.
88640   */
88641 +#ifndef CONFIG_HAVE_ARCH_ALLOC_SKB
88642  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
88643                             int fclone)
88644  {
88645 @@ -172,9 +173,9 @@ struct sk_buff *__alloc_skb(unsigned int
88646         shinfo = skb_shinfo(skb);
88647         atomic_set(&shinfo->dataref, 1);
88648         shinfo->nr_frags  = 0;
88649 -       shinfo->tso_size = 0;
88650 -       shinfo->tso_segs = 0;
88651 -       shinfo->ufo_size = 0;
88652 +       shinfo->gso_size = 0;
88653 +       shinfo->gso_segs = 0;
88654 +       shinfo->gso_type = 0;
88655         shinfo->ip6_frag_id = 0;
88656         shinfo->frag_list = NULL;
88657  
88658 @@ -194,6 +195,7 @@ nodata:
88659         skb = NULL;
88660         goto out;
88661  }
88662 +#endif /* !CONFIG_HAVE_ARCH_ALLOC_SKB */
88663  
88664  /**
88665   *     alloc_skb_from_cache    -       allocate a network buffer
88666 @@ -211,14 +213,18 @@ nodata:
88667   */
88668  struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
88669                                      unsigned int size,
88670 -                                    gfp_t gfp_mask)
88671 +                                    gfp_t gfp_mask,
88672 +                                    int fclone)
88673  {
88674 +       kmem_cache_t *cache;
88675 +       struct skb_shared_info *shinfo;
88676         struct sk_buff *skb;
88677         u8 *data;
88678  
88679 +       cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
88680 +
88681         /* Get the HEAD */
88682 -       skb = kmem_cache_alloc(skbuff_head_cache,
88683 -                              gfp_mask & ~__GFP_DMA);
88684 +       skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
88685         if (!skb)
88686                 goto out;
88687  
88688 @@ -235,17 +241,29 @@ struct sk_buff *alloc_skb_from_cache(kme
88689         skb->data = data;
88690         skb->tail = data;
88691         skb->end  = data + size;
88692 +       /* make sure we initialize shinfo sequentially */
88693 +       shinfo = skb_shinfo(skb);
88694 +       atomic_set(&shinfo->dataref, 1);
88695 +       shinfo->nr_frags  = 0;
88696 +       shinfo->gso_size = 0;
88697 +       shinfo->gso_segs = 0;
88698 +       shinfo->gso_type = 0;
88699 +       shinfo->ip6_frag_id = 0;
88700 +       shinfo->frag_list = NULL;
88701  
88702 -       atomic_set(&(skb_shinfo(skb)->dataref), 1);
88703 -       skb_shinfo(skb)->nr_frags  = 0;
88704 -       skb_shinfo(skb)->tso_size = 0;
88705 -       skb_shinfo(skb)->tso_segs = 0;
88706 -       skb_shinfo(skb)->ufo_size = 0;
88707 -       skb_shinfo(skb)->frag_list = NULL;
88708 +       if (fclone) {
88709 +               struct sk_buff *child = skb + 1;
88710 +               atomic_t *fclone_ref = (atomic_t *) (child + 1);
88711 +
88712 +               skb->fclone = SKB_FCLONE_ORIG;
88713 +               atomic_set(fclone_ref, 1);
88714 +
88715 +               child->fclone = SKB_FCLONE_UNAVAILABLE;
88716 +       }
88717  out:
88718         return skb;
88719  nodata:
88720 -       kmem_cache_free(skbuff_head_cache, skb);
88721 +       kmem_cache_free(cache, skb);
88722         skb = NULL;
88723         goto out;
88724  }
88725 @@ -434,6 +453,10 @@ struct sk_buff *skb_clone(struct sk_buff
88726         C(local_df);
88727         n->cloned = 1;
88728         n->nohdr = 0;
88729 +#ifdef CONFIG_XEN
88730 +       C(proto_data_valid);
88731 +       C(proto_csum_blank);
88732 +#endif
88733         C(pkt_type);
88734         C(ip_summed);
88735         C(priority);
88736 @@ -527,9 +550,9 @@ static void copy_skb_header(struct sk_bu
88737         new->tc_index   = old->tc_index;
88738  #endif
88739         atomic_set(&new->users, 1);
88740 -       skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size;
88741 -       skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs;
88742 -       skb_shinfo(new)->ufo_size = skb_shinfo(old)->ufo_size;
88743 +       skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
88744 +       skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
88745 +       skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
88746  }
88747  
88748  /**
88749 @@ -1826,6 +1850,133 @@ unsigned char *skb_pull_rcsum(struct sk_
88750  
88751  EXPORT_SYMBOL_GPL(skb_pull_rcsum);
88752  
88753 +/**
88754 + *     skb_segment - Perform protocol segmentation on skb.
88755 + *     @skb: buffer to segment
88756 + *     @features: features for the output path (see dev->features)
88757 + *
88758 + *     This function performs segmentation on the given skb.  It returns
88759 + *     the segment at the given position.  It returns NULL if there are
88760 + *     no more segments to generate, or when an error is encountered.
88761 + */
88762 +struct sk_buff *skb_segment(struct sk_buff *skb, int features)
88763 +{
88764 +       struct sk_buff *segs = NULL;
88765 +       struct sk_buff *tail = NULL;
88766 +       unsigned int mss = skb_shinfo(skb)->gso_size;
88767 +       unsigned int doffset = skb->data - skb->mac.raw;
88768 +       unsigned int offset = doffset;
88769 +       unsigned int headroom;
88770 +       unsigned int len;
88771 +       int sg = features & NETIF_F_SG;
88772 +       int nfrags = skb_shinfo(skb)->nr_frags;
88773 +       int err = -ENOMEM;
88774 +       int i = 0;
88775 +       int pos;
88776 +
88777 +       __skb_push(skb, doffset);
88778 +       headroom = skb_headroom(skb);
88779 +       pos = skb_headlen(skb);
88780 +
88781 +       do {
88782 +               struct sk_buff *nskb;
88783 +               skb_frag_t *frag;
88784 +               int hsize, nsize;
88785 +               int k;
88786 +               int size;
88787 +
88788 +               len = skb->len - offset;
88789 +               if (len > mss)
88790 +                       len = mss;
88791 +
88792 +               hsize = skb_headlen(skb) - offset;
88793 +               if (hsize < 0)
88794 +                       hsize = 0;
88795 +               nsize = hsize + doffset;
88796 +               if (nsize > len + doffset || !sg)
88797 +                       nsize = len + doffset;
88798 +
88799 +               nskb = alloc_skb(nsize + headroom, GFP_ATOMIC);
88800 +               if (unlikely(!nskb))
88801 +                       goto err;
88802 +
88803 +               if (segs)
88804 +                       tail->next = nskb;
88805 +               else
88806 +                       segs = nskb;
88807 +               tail = nskb;
88808 +
88809 +               nskb->dev = skb->dev;
88810 +               nskb->priority = skb->priority;
88811 +               nskb->protocol = skb->protocol;
88812 +               nskb->dst = dst_clone(skb->dst);
88813 +               memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
88814 +               nskb->pkt_type = skb->pkt_type;
88815 +               nskb->mac_len = skb->mac_len;
88816 +
88817 +               skb_reserve(nskb, headroom);
88818 +               nskb->mac.raw = nskb->data;
88819 +               nskb->nh.raw = nskb->data + skb->mac_len;
88820 +               nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw);
88821 +               memcpy(skb_put(nskb, doffset), skb->data, doffset);
88822 +
88823 +               if (!sg) {
88824 +                       nskb->csum = skb_copy_and_csum_bits(skb, offset,
88825 +                                                           skb_put(nskb, len),
88826 +                                                           len, 0);
88827 +                       continue;
88828 +               }
88829 +
88830 +               frag = skb_shinfo(nskb)->frags;
88831 +               k = 0;
88832 +
88833 +               nskb->ip_summed = CHECKSUM_HW;
88834 +               nskb->csum = skb->csum;
88835 +               memcpy(skb_put(nskb, hsize), skb->data + offset, hsize);
88836 +
88837 +               while (pos < offset + len) {
88838 +                       BUG_ON(i >= nfrags);
88839 +
88840 +                       *frag = skb_shinfo(skb)->frags[i];
88841 +                       get_page(frag->page);
88842 +                       size = frag->size;
88843 +
88844 +                       if (pos < offset) {
88845 +                               frag->page_offset += offset - pos;
88846 +                               frag->size -= offset - pos;
88847 +                       }
88848 +
88849 +                       k++;
88850 +
88851 +                       if (pos + size <= offset + len) {
88852 +                               i++;
88853 +                               pos += size;
88854 +                       } else {
88855 +                               frag->size -= pos + size - (offset + len);
88856 +                               break;
88857 +                       }
88858 +
88859 +                       frag++;
88860 +               }
88861 +
88862 +               skb_shinfo(nskb)->nr_frags = k;
88863 +               nskb->data_len = len - hsize;
88864 +               nskb->len += nskb->data_len;
88865 +               nskb->truesize += nskb->data_len;
88866 +       } while ((offset += len) < skb->len);
88867 +
88868 +       return segs;
88869 +
88870 +err:
88871 +       while ((skb = segs)) {
88872 +               segs = skb->next;
88873 +               kfree(skb);
88874 +       }
88875 +       return ERR_PTR(err);
88876 +}
88877 +
88878 +EXPORT_SYMBOL_GPL(skb_segment);
88879 +
88880  void __init skb_init(void)
88881  {
88882         skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
88883 diff -urNp linux-2.6/net/decnet/dn_nsp_in.c new/net/decnet/dn_nsp_in.c
88884 --- linux-2.6/net/decnet/dn_nsp_in.c    2006-07-03 14:15:21.000000000 +0200
88885 +++ new/net/decnet/dn_nsp_in.c  2006-07-07 16:20:38.000000000 +0200
88886 @@ -801,8 +801,7 @@ got_it:
88887                  * We linearize everything except data segments here.
88888                  */
88889                 if (cb->nsp_flags & ~0x60) {
88890 -                       if (unlikely(skb_is_nonlinear(skb)) &&
88891 -                           skb_linearize(skb, GFP_ATOMIC) != 0)
88892 +                       if (unlikely(skb_linearize(skb)))
88893                                 goto free_out;
88894                 }
88895  
88896 diff -urNp linux-2.6/net/decnet/dn_route.c new/net/decnet/dn_route.c
88897 --- linux-2.6/net/decnet/dn_route.c     2006-07-03 14:15:21.000000000 +0200
88898 +++ new/net/decnet/dn_route.c   2006-07-07 16:20:38.000000000 +0200
88899 @@ -629,8 +629,7 @@ int dn_route_rcv(struct sk_buff *skb, st
88900                         padlen);
88901  
88902          if (flags & DN_RT_PKT_CNTL) {
88903 -               if (unlikely(skb_is_nonlinear(skb)) &&
88904 -                   skb_linearize(skb, GFP_ATOMIC) != 0)
88905 +               if (unlikely(skb_linearize(skb)))
88906                         goto dump_it;
88907  
88908                  switch(flags & DN_RT_CNTL_MSK) {
88909 diff -urNp linux-2.6/net/ipv4/af_inet.c new/net/ipv4/af_inet.c
88910 --- linux-2.6/net/ipv4/af_inet.c        2006-07-03 14:15:21.000000000 +0200
88911 +++ new/net/ipv4/af_inet.c      2006-07-07 16:20:38.000000000 +0200
88912 @@ -68,6 +68,7 @@
88913   */
88914  
88915  #include <linux/config.h>
88916 +#include <linux/err.h>
88917  #include <linux/errno.h>
88918  #include <linux/types.h>
88919  #include <linux/socket.h>
88920 @@ -1096,6 +1097,54 @@ int inet_sk_rebuild_header(struct sock *
88921  
88922  EXPORT_SYMBOL(inet_sk_rebuild_header);
88923  
88924 +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
88925 +{
88926 +       struct sk_buff *segs = ERR_PTR(-EINVAL);
88927 +       struct iphdr *iph;
88928 +       struct net_protocol *ops;
88929 +       int proto;
88930 +       int ihl;
88931 +       int id;
88932 +
88933 +       if (!pskb_may_pull(skb, sizeof(*iph)))
88934 +               goto out;
88935 +
88936 +       iph = skb->nh.iph;
88937 +       ihl = iph->ihl * 4;
88938 +       if (ihl < sizeof(*iph))
88939 +               goto out;
88940 +
88941 +       if (!pskb_may_pull(skb, ihl))
88942 +               goto out;
88943 +
88944 +       skb->h.raw = __skb_pull(skb, ihl);
88945 +       iph = skb->nh.iph;
88946 +       id = ntohs(iph->id);
88947 +       proto = iph->protocol & (MAX_INET_PROTOS - 1);
88948 +       segs = ERR_PTR(-EPROTONOSUPPORT);
88949 +
88950 +       rcu_read_lock();
88951 +       ops = rcu_dereference(inet_protos[proto]);
88952 +       if (ops && ops->gso_segment)
88953 +               segs = ops->gso_segment(skb, features);
88954 +       rcu_read_unlock();
88955 +
88956 +       if (!segs || unlikely(IS_ERR(segs)))
88957 +               goto out;
88958 +
88959 +       skb = segs;
88960 +       do {
88961 +               iph = skb->nh.iph;
88962 +               iph->id = htons(id++);
88963 +               iph->tot_len = htons(skb->len - skb->mac_len);
88964 +               iph->check = 0;
88965 +               iph->check = ip_fast_csum(skb->nh.raw, iph->ihl);
88966 +       } while ((skb = skb->next));
88967 +
88968 +out:
88969 +       return segs;
88970 +}
88971 +
88972  #ifdef CONFIG_IP_MULTICAST
88973  static struct net_protocol igmp_protocol = {
88974         .handler =      igmp_rcv,
88975 @@ -1105,6 +1154,7 @@ static struct net_protocol igmp_protocol
88976  static struct net_protocol tcp_protocol = {
88977         .handler =      tcp_v4_rcv,
88978         .err_handler =  tcp_v4_err,
88979 +       .gso_segment =  tcp_tso_segment,
88980         .no_policy =    1,
88981  };
88982  
88983 @@ -1150,6 +1200,7 @@ static int ipv4_proc_init(void);
88984  static struct packet_type ip_packet_type = {
88985         .type = __constant_htons(ETH_P_IP),
88986         .func = ip_rcv,
88987 +       .gso_segment = inet_gso_segment,
88988  };
88989  
88990  static int __init inet_init(void)
88991 diff -urNp linux-2.6/net/ipv4/ipcomp.c new/net/ipv4/ipcomp.c
88992 --- linux-2.6/net/ipv4/ipcomp.c 2006-07-03 14:15:21.000000000 +0200
88993 +++ new/net/ipv4/ipcomp.c       2006-07-07 16:20:39.000000000 +0200
88994 @@ -84,7 +84,7 @@ out:  
88995  static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb)
88996  {
88997         u8 nexthdr;
88998 -       int err = 0;
88999 +       int err = -ENOMEM;
89000         struct iphdr *iph;
89001         union {
89002                 struct iphdr    iph;
89003 @@ -92,11 +92,8 @@ static int ipcomp_input(struct xfrm_stat
89004         } tmp_iph;
89005  
89006  
89007 -       if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
89008 -           skb_linearize(skb, GFP_ATOMIC) != 0) {
89009 -               err = -ENOMEM;
89010 +       if (skb_linearize_cow(skb))
89011                 goto out;
89012 -       }
89013  
89014         skb->ip_summed = CHECKSUM_NONE;
89015  
89016 @@ -171,10 +168,8 @@ static int ipcomp_output(struct xfrm_sta
89017                 goto out_ok;
89018         }
89019  
89020 -       if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
89021 -           skb_linearize(skb, GFP_ATOMIC) != 0) {
89022 +       if (skb_linearize_cow(skb))
89023                 goto out_ok;
89024 -       }
89025         
89026         err = ipcomp_compress(x, skb);
89027         iph = skb->nh.iph;
89028 diff -urNp linux-2.6/net/ipv4/ip_output.c new/net/ipv4/ip_output.c
89029 --- linux-2.6/net/ipv4/ip_output.c      2006-07-03 14:15:21.000000000 +0200
89030 +++ new/net/ipv4/ip_output.c    2006-07-07 16:20:39.000000000 +0200
89031 @@ -210,8 +210,7 @@ static inline int ip_finish_output(struc
89032                 return dst_output(skb);
89033         }
89034  #endif
89035 -       if (skb->len > dst_mtu(skb->dst) &&
89036 -           !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
89037 +       if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size)
89038                 return ip_fragment(skb, ip_finish_output2);
89039         else
89040                 return ip_finish_output2(skb);
89041 @@ -362,7 +361,7 @@ packet_routed:
89042         }
89043  
89044         ip_select_ident_more(iph, &rt->u.dst, sk,
89045 -                            (skb_shinfo(skb)->tso_segs ?: 1) - 1);
89046 +                            (skb_shinfo(skb)->gso_segs ?: 1) - 1);
89047  
89048         /* Add an IP checksum. */
89049         ip_send_check(iph);
89050 @@ -743,7 +742,8 @@ static inline int ip_ufo_append_data(str
89051                                (length - transhdrlen));
89052         if (!err) {
89053                 /* specify the length of each IP datagram fragment*/
89054 -               skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
89055 +               skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
89056 +               skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
89057                 __skb_queue_tail(&sk->sk_write_queue, skb);
89058  
89059                 return 0;
89060 @@ -839,7 +839,7 @@ int ip_append_data(struct sock *sk,
89061          */
89062         if (transhdrlen &&
89063             length + fragheaderlen <= mtu &&
89064 -           rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
89065 +           rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
89066             !exthdrlen)
89067                 csummode = CHECKSUM_HW;
89068  
89069 @@ -1086,14 +1086,16 @@ ssize_t ip_append_page(struct sock *sk, 
89070  
89071         inet->cork.length += size;
89072         if ((sk->sk_protocol == IPPROTO_UDP) &&
89073 -           (rt->u.dst.dev->features & NETIF_F_UFO))
89074 -               skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
89075 +           (rt->u.dst.dev->features & NETIF_F_UFO)) {
89076 +               skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
89077 +               skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
89078 +       }
89079  
89080  
89081         while (size > 0) {
89082                 int i;
89083  
89084 -               if (skb_shinfo(skb)->ufo_size)
89085 +               if (skb_shinfo(skb)->gso_size)
89086                         len = size;
89087                 else {
89088  
89089 diff -urNp linux-2.6/net/ipv4/netfilter/ip_nat_proto_tcp.c new/net/ipv4/netfilter/ip_nat_proto_tcp.c
89090 --- linux-2.6/net/ipv4/netfilter/ip_nat_proto_tcp.c     2006-07-03 14:15:22.000000000 +0200
89091 +++ new/net/ipv4/netfilter/ip_nat_proto_tcp.c   2006-05-23 18:37:13.000000000 +0200
89092 @@ -129,10 +129,17 @@ tcp_manip_pkt(struct sk_buff **pskb,
89093         if (hdrsize < sizeof(*hdr))
89094                 return 1;
89095  
89096 -       hdr->check = ip_nat_cheat_check(~oldip, newip,
89097 +#ifdef CONFIG_XEN
89098 +       if ((*pskb)->proto_csum_blank) {
89099 +               hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
89100 +       } else
89101 +#endif
89102 +       {
89103 +               hdr->check = ip_nat_cheat_check(~oldip, newip,
89104                                         ip_nat_cheat_check(oldport ^ 0xFFFF,
89105                                                            newport,
89106                                                            hdr->check));
89107 +       }
89108         return 1;
89109  }
89110  
89111 diff -urNp linux-2.6/net/ipv4/netfilter/ip_nat_proto_udp.c new/net/ipv4/netfilter/ip_nat_proto_udp.c
89112 --- linux-2.6/net/ipv4/netfilter/ip_nat_proto_udp.c     2006-07-03 14:15:22.000000000 +0200
89113 +++ new/net/ipv4/netfilter/ip_nat_proto_udp.c   2006-05-23 18:37:13.000000000 +0200
89114 @@ -113,11 +113,19 @@ udp_manip_pkt(struct sk_buff **pskb,
89115                 newport = tuple->dst.u.udp.port;
89116                 portptr = &hdr->dest;
89117         }
89118 -       if (hdr->check) /* 0 is a special case meaning no checksum */
89119 -               hdr->check = ip_nat_cheat_check(~oldip, newip,
89120 +       if (hdr->check) { /* 0 is a special case meaning no checksum */
89121 +#ifdef CONFIG_XEN
89122 +               if ((*pskb)->proto_csum_blank) {
89123 +                       hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
89124 +               } else
89125 +#endif
89126 +               {
89127 +                       hdr->check = ip_nat_cheat_check(~oldip, newip,
89128                                         ip_nat_cheat_check(*portptr ^ 0xFFFF,
89129                                                            newport,
89130                                                            hdr->check));
89131 +               }
89132 +       }
89133         *portptr = newport;
89134         return 1;
89135  }
89136 diff -urNp linux-2.6/net/ipv4/tcp.c new/net/ipv4/tcp.c
89137 --- linux-2.6/net/ipv4/tcp.c    2006-07-03 14:15:22.000000000 +0200
89138 +++ new/net/ipv4/tcp.c  2006-07-07 16:20:59.000000000 +0200
89139 @@ -258,6 +258,7 @@
89140  #include <linux/bootmem.h>
89141  #include <linux/cache.h>
89142  #include <linux/in.h>
89143 +#include <linux/err.h>
89144  
89145  #include <net/icmp.h>
89146  #include <net/tcp.h>
89147 @@ -571,7 +572,7 @@ new_segment:
89148                 skb->ip_summed = CHECKSUM_HW;
89149                 tp->write_seq += copy;
89150                 TCP_SKB_CB(skb)->end_seq += copy;
89151 -               skb_shinfo(skb)->tso_segs = 0;
89152 +               skb_shinfo(skb)->gso_segs = 0;
89153  
89154                 if (!copied)
89155                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
89156 @@ -622,14 +623,10 @@ ssize_t tcp_sendpage(struct socket *sock
89157         ssize_t res;
89158         struct sock *sk = sock->sk;
89159  
89160 -#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
89161 -
89162         if (!(sk->sk_route_caps & NETIF_F_SG) ||
89163 -           !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
89164 +           !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
89165                 return sock_no_sendpage(sock, page, offset, size, flags);
89166  
89167 -#undef TCP_ZC_CSUM_FLAGS
89168 -
89169         lock_sock(sk);
89170         TCP_CHECK_TIMER(sk);
89171         res = do_tcp_sendpages(sk, &page, offset, size, flags);
89172 @@ -726,9 +723,7 @@ new_segment:
89173                                 /*
89174                                  * Check whether we can use HW checksum.
89175                                  */
89176 -                               if (sk->sk_route_caps &
89177 -                                   (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
89178 -                                    NETIF_F_HW_CSUM))
89179 +                               if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
89180                                         skb->ip_summed = CHECKSUM_HW;
89181  
89182                                 skb_entail(sk, tp, skb);
89183 @@ -824,7 +819,7 @@ new_segment:
89184  
89185                         tp->write_seq += copy;
89186                         TCP_SKB_CB(skb)->end_seq += copy;
89187 -                       skb_shinfo(skb)->tso_segs = 0;
89188 +                       skb_shinfo(skb)->gso_segs = 0;
89189  
89190                         from += copy;
89191                         copied += copy;
89192 @@ -2071,6 +2066,77 @@ int compat_tcp_getsockopt(struct sock *s
89193  EXPORT_SYMBOL(compat_tcp_getsockopt);
89194  #endif
89195  
89196 +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
89197 +{
89198 +       struct sk_buff *segs = ERR_PTR(-EINVAL);
89199 +       struct tcphdr *th;
89200 +       unsigned thlen;
89201 +       unsigned int seq;
89202 +       unsigned int delta;
89203 +       unsigned int oldlen;
89204 +       unsigned int len;
89205 +
89206 +       if (!pskb_may_pull(skb, sizeof(*th)))
89207 +               goto out;
89208 +
89209 +       th = skb->h.th;
89210 +       thlen = th->doff * 4;
89211 +       if (thlen < sizeof(*th))
89212 +               goto out;
89213 +
89214 +       if (!pskb_may_pull(skb, thlen))
89215 +               goto out;
89216 +
89217 +       oldlen = (u16)~skb->len;
89218 +       __skb_pull(skb, thlen);
89219 +
89220 +       if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
89221 +               /* Packet is from an untrusted source, reset gso_segs. */
89222 +               int mss = skb_shinfo(skb)->gso_size;
89223 +
89224 +               skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss;
89225 +
89226 +               segs = NULL;
89227 +               goto out;
89228 +       }
89229 +
89230 +       segs = skb_segment(skb, features);
89231 +       if (IS_ERR(segs))
89232 +               goto out;
89233 +
89234 +       len = skb_shinfo(skb)->gso_size;
89235 +       delta = htonl(oldlen + (thlen + len));
89236 +
89237 +       skb = segs;
89238 +       th = skb->h.th;
89239 +       seq = ntohl(th->seq);
89240 +
89241 +       do {
89242 +               th->fin = th->psh = 0;
89243 +
89244 +               th->check = ~csum_fold(th->check + delta);
89245 +               if (skb->ip_summed != CHECKSUM_HW)
89246 +                       th->check = csum_fold(csum_partial(skb->h.raw, thlen,
89247 +                                                          skb->csum));
89248 +
89249 +               seq += len;
89250 +               skb = skb->next;
89251 +               th = skb->h.th;
89252 +
89253 +               th->seq = htonl(seq);
89254 +               th->cwr = 0;
89255 +       } while (skb->next);
89256 +
89257 +       delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len);
89258 +       th->check = ~csum_fold(th->check + delta);
89259 +       if (skb->ip_summed != CHECKSUM_HW)
89260 +               th->check = csum_fold(csum_partial(skb->h.raw, thlen,
89261 +                                                  skb->csum));
89262 +
89263 +out:
89264 +       return segs;
89265 +}
89266 +
89267  extern void __skb_cb_too_small_for_tcp(int, int);
89268  extern struct tcp_congestion_ops tcp_reno;
89269  
89270 diff -urNp linux-2.6/net/ipv4/tcp_input.c new/net/ipv4/tcp_input.c
89271 --- linux-2.6/net/ipv4/tcp_input.c      2006-07-03 14:15:22.000000000 +0200
89272 +++ new/net/ipv4/tcp_input.c    2006-07-07 16:21:07.000000000 +0200
89273 @@ -1072,7 +1072,7 @@ tcp_sacktag_write_queue(struct sock *sk,
89274                                 else
89275                                         pkt_len = (end_seq -
89276                                                    TCP_SKB_CB(skb)->seq);
89277 -                               if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size))
89278 +                               if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
89279                                         break;
89280                                 pcount = tcp_skb_pcount(skb);
89281                         }
89282 diff -urNp linux-2.6/net/ipv4/tcp_output.c new/net/ipv4/tcp_output.c
89283 --- linux-2.6/net/ipv4/tcp_output.c     2006-07-03 14:15:22.000000000 +0200
89284 +++ new/net/ipv4/tcp_output.c   2006-07-07 16:21:08.000000000 +0200
89285 @@ -511,15 +511,17 @@ static void tcp_set_skb_tso_segs(struct 
89286                 /* Avoid the costly divide in the normal
89287                  * non-TSO case.
89288                  */
89289 -               skb_shinfo(skb)->tso_segs = 1;
89290 -               skb_shinfo(skb)->tso_size = 0;
89291 +               skb_shinfo(skb)->gso_segs = 1;
89292 +               skb_shinfo(skb)->gso_size = 0;
89293 +               skb_shinfo(skb)->gso_type = 0;
89294         } else {
89295                 unsigned int factor;
89296  
89297                 factor = skb->len + (mss_now - 1);
89298                 factor /= mss_now;
89299 -               skb_shinfo(skb)->tso_segs = factor;
89300 -               skb_shinfo(skb)->tso_size = mss_now;
89301 +               skb_shinfo(skb)->gso_segs = factor;
89302 +               skb_shinfo(skb)->gso_size = mss_now;
89303 +               skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
89304         }
89305  }
89306  
89307 @@ -910,7 +912,7 @@ static int tcp_init_tso_segs(struct sock
89308  
89309         if (!tso_segs ||
89310             (tso_segs > 1 &&
89311 -            skb_shinfo(skb)->tso_size != mss_now)) {
89312 +            tcp_skb_mss(skb) != mss_now)) {
89313                 tcp_set_skb_tso_segs(sk, skb, mss_now);
89314                 tso_segs = tcp_skb_pcount(skb);
89315         }
89316 @@ -1720,8 +1722,9 @@ int tcp_retransmit_skb(struct sock *sk, 
89317            tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
89318                 if (!pskb_trim(skb, 0)) {
89319                         TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
89320 -                       skb_shinfo(skb)->tso_segs = 1;
89321 -                       skb_shinfo(skb)->tso_size = 0;
89322 +                       skb_shinfo(skb)->gso_segs = 1;
89323 +                       skb_shinfo(skb)->gso_size = 0;
89324 +                       skb_shinfo(skb)->gso_type = 0;
89325                         skb->ip_summed = CHECKSUM_NONE;
89326                         skb->csum = 0;
89327                 }
89328 @@ -1926,8 +1929,9 @@ void tcp_send_fin(struct sock *sk)
89329                 skb->csum = 0;
89330                 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
89331                 TCP_SKB_CB(skb)->sacked = 0;
89332 -               skb_shinfo(skb)->tso_segs = 1;
89333 -               skb_shinfo(skb)->tso_size = 0;
89334 +               skb_shinfo(skb)->gso_segs = 1;
89335 +               skb_shinfo(skb)->gso_size = 0;
89336 +               skb_shinfo(skb)->gso_type = 0;
89337  
89338                 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
89339                 TCP_SKB_CB(skb)->seq = tp->write_seq;
89340 @@ -1959,8 +1963,9 @@ void tcp_send_active_reset(struct sock *
89341         skb->csum = 0;
89342         TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
89343         TCP_SKB_CB(skb)->sacked = 0;
89344 -       skb_shinfo(skb)->tso_segs = 1;
89345 -       skb_shinfo(skb)->tso_size = 0;
89346 +       skb_shinfo(skb)->gso_segs = 1;
89347 +       skb_shinfo(skb)->gso_size = 0;
89348 +       skb_shinfo(skb)->gso_type = 0;
89349  
89350         /* Send it off. */
89351         TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
89352 @@ -2043,8 +2048,9 @@ struct sk_buff * tcp_make_synack(struct 
89353         TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
89354         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
89355         TCP_SKB_CB(skb)->sacked = 0;
89356 -       skb_shinfo(skb)->tso_segs = 1;
89357 -       skb_shinfo(skb)->tso_size = 0;
89358 +       skb_shinfo(skb)->gso_segs = 1;
89359 +       skb_shinfo(skb)->gso_size = 0;
89360 +       skb_shinfo(skb)->gso_type = 0;
89361         th->seq = htonl(TCP_SKB_CB(skb)->seq);
89362         th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
89363         if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
89364 @@ -2148,8 +2154,9 @@ int tcp_connect(struct sock *sk)
89365         TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
89366         TCP_ECN_send_syn(sk, tp, buff);
89367         TCP_SKB_CB(buff)->sacked = 0;
89368 -       skb_shinfo(buff)->tso_segs = 1;
89369 -       skb_shinfo(buff)->tso_size = 0;
89370 +       skb_shinfo(buff)->gso_segs = 1;
89371 +       skb_shinfo(buff)->gso_size = 0;
89372 +       skb_shinfo(buff)->gso_type = 0;
89373         buff->csum = 0;
89374         TCP_SKB_CB(buff)->seq = tp->write_seq++;
89375         TCP_SKB_CB(buff)->end_seq = tp->write_seq;
89376 @@ -2253,8 +2260,9 @@ void tcp_send_ack(struct sock *sk)
89377                 buff->csum = 0;
89378                 TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
89379                 TCP_SKB_CB(buff)->sacked = 0;
89380 -               skb_shinfo(buff)->tso_segs = 1;
89381 -               skb_shinfo(buff)->tso_size = 0;
89382 +               skb_shinfo(buff)->gso_segs = 1;
89383 +               skb_shinfo(buff)->gso_size = 0;
89384 +               skb_shinfo(buff)->gso_type = 0;
89385  
89386                 /* Send it off, this clears delayed acks for us. */
89387                 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
89388 @@ -2289,8 +2297,9 @@ static int tcp_xmit_probe_skb(struct soc
89389         skb->csum = 0;
89390         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
89391         TCP_SKB_CB(skb)->sacked = urgent;
89392 -       skb_shinfo(skb)->tso_segs = 1;
89393 -       skb_shinfo(skb)->tso_size = 0;
89394 +       skb_shinfo(skb)->gso_segs = 1;
89395 +       skb_shinfo(skb)->gso_size = 0;
89396 +       skb_shinfo(skb)->gso_type = 0;
89397  
89398         /* Use a previous sequence.  This should cause the other
89399          * end to send an ack.  Don't queue or clone SKB, just
89400 diff -urNp linux-2.6/net/ipv4/xfrm4_output.c new/net/ipv4/xfrm4_output.c
89401 --- linux-2.6/net/ipv4/xfrm4_output.c   2006-07-03 14:15:22.000000000 +0200
89402 +++ new/net/ipv4/xfrm4_output.c 2006-07-07 16:21:08.000000000 +0200
89403 @@ -9,6 +9,8 @@
89404   */
89405  
89406  #include <linux/compiler.h>
89407 +#include <linux/if_ether.h>
89408 +#include <linux/kernel.h>
89409  #include <linux/skbuff.h>
89410  #include <linux/spinlock.h>
89411  #include <linux/netfilter_ipv4.h>
89412 @@ -17,6 +19,8 @@
89413  #include <net/xfrm.h>
89414  #include <net/icmp.h>
89415  
89416 +extern int skb_checksum_setup(struct sk_buff *skb);
89417 +
89418  /* Add encapsulation header.
89419   *
89420   * In transport mode, the IP header will be moved forward to make space
89421 @@ -103,6 +107,10 @@ static int xfrm4_output_one(struct sk_bu
89422         struct xfrm_state *x = dst->xfrm;
89423         int err;
89424         
89425 +       err = skb_checksum_setup(skb);
89426 +       if (err)
89427 +               goto error_nolock;
89428 +
89429         if (skb->ip_summed == CHECKSUM_HW) {
89430                 err = skb_checksum_help(skb, 0);
89431                 if (err)
89432 @@ -152,16 +160,10 @@ error_nolock:
89433         goto out_exit;
89434  }
89435  
89436 -static int xfrm4_output_finish(struct sk_buff *skb)
89437 +static int xfrm4_output_finish2(struct sk_buff *skb)
89438  {
89439         int err;
89440  
89441 -#ifdef CONFIG_NETFILTER
89442 -       if (!skb->dst->xfrm) {
89443 -               IPCB(skb)->flags |= IPSKB_REROUTED;
89444 -               return dst_output(skb);
89445 -       }
89446 -#endif
89447         while (likely((err = xfrm4_output_one(skb)) == 0)) {
89448                 nf_reset(skb);
89449  
89450 @@ -174,7 +176,7 @@ static int xfrm4_output_finish(struct sk
89451                         return dst_output(skb);
89452  
89453                 err = nf_hook(PF_INET, NF_IP_POST_ROUTING, &skb, NULL,
89454 -                             skb->dst->dev, xfrm4_output_finish);
89455 +                             skb->dst->dev, xfrm4_output_finish2);
89456                 if (unlikely(err != 1))
89457                         break;
89458         }
89459 @@ -182,6 +184,48 @@ static int xfrm4_output_finish(struct sk
89460         return err;
89461  }
89462  
89463 +static int xfrm4_output_finish(struct sk_buff *skb)
89464 +{
89465 +       struct sk_buff *segs;
89466 +
89467 +#ifdef CONFIG_NETFILTER
89468 +       if (!skb->dst->xfrm) {
89469 +               IPCB(skb)->flags |= IPSKB_REROUTED;
89470 +               return dst_output(skb);
89471 +       }
89472 +#endif
89473 +
89474 +       if (!skb_shinfo(skb)->gso_size)
89475 +               return xfrm4_output_finish2(skb);
89476 +
89477 +       skb->protocol = htons(ETH_P_IP);
89478 +       segs = skb_gso_segment(skb, 0);
89479 +       kfree_skb(skb);
89480 +       if (unlikely(IS_ERR(segs)))
89481 +               return PTR_ERR(segs);
89482 +
89483 +       do {
89484 +               struct sk_buff *nskb = segs->next;
89485 +               int err;
89486 +
89487 +               segs->next = NULL;
89488 +               err = xfrm4_output_finish2(segs);
89489 +
89490 +               if (unlikely(err)) {
89491 +                       while ((segs = nskb)) {
89492 +                               nskb = segs->next;
89493 +                               segs->next = NULL;
89494 +                               kfree_skb(segs);
89495 +                       }
89496 +                       return err;
89497 +               }
89498 +
89499 +               segs = nskb;
89500 +       } while (segs);
89501 +
89502 +       return 0;
89503 +}
89504 +
89505  int xfrm4_output(struct sk_buff *skb)
89506  {
89507         return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev,
89508 diff -urNp linux-2.6/net/ipv6/addrconf.c new/net/ipv6/addrconf.c
89509 --- linux-2.6/net/ipv6/addrconf.c       2006-07-03 14:15:22.000000000 +0200
89510 +++ new/net/ipv6/addrconf.c     2006-06-28 14:32:14.000000000 +0200
89511 @@ -2461,6 +2461,7 @@ static void addrconf_dad_start(struct in
89512         spin_lock_bh(&ifp->lock);
89513  
89514         if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
89515 +           !(dev->flags&IFF_MULTICAST) ||
89516             !(ifp->flags&IFA_F_TENTATIVE)) {
89517                 ifp->flags &= ~IFA_F_TENTATIVE;
89518                 spin_unlock_bh(&ifp->lock);
89519 @@ -2545,6 +2546,7 @@ static void addrconf_dad_completed(struc
89520         if (ifp->idev->cnf.forwarding == 0 &&
89521             ifp->idev->cnf.rtr_solicits > 0 &&
89522             (dev->flags&IFF_LOOPBACK) == 0 &&
89523 +           (dev->flags & IFF_MULTICAST) &&
89524             (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
89525                 struct in6_addr all_routers;
89526  
89527 diff -urNp linux-2.6/net/ipv6/ip6_output.c new/net/ipv6/ip6_output.c
89528 --- linux-2.6/net/ipv6/ip6_output.c     2006-07-03 14:15:22.000000000 +0200
89529 +++ new/net/ipv6/ip6_output.c   2006-07-07 16:21:08.000000000 +0200
89530 @@ -147,7 +147,7 @@ static int ip6_output2(struct sk_buff *s
89531  
89532  int ip6_output(struct sk_buff *skb)
89533  {
89534 -       if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
89535 +       if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size) ||
89536                                 dst_allfrag(skb->dst))
89537                 return ip6_fragment(skb, ip6_output2);
89538         else
89539 @@ -830,8 +830,9 @@ static inline int ip6_ufo_append_data(st
89540                 struct frag_hdr fhdr;
89541  
89542                 /* specify the length of each IP datagram fragment*/
89543 -               skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) - 
89544 -                                               sizeof(struct frag_hdr);
89545 +               skb_shinfo(skb)->gso_size = mtu - fragheaderlen - 
89546 +                                           sizeof(struct frag_hdr);
89547 +               skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4;
89548                 ipv6_select_ident(skb, &fhdr);
89549                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
89550                 __skb_queue_tail(&sk->sk_write_queue, skb);
89551 diff -urNp linux-2.6/net/ipv6/ipcomp6.c new/net/ipv6/ipcomp6.c
89552 --- linux-2.6/net/ipv6/ipcomp6.c        2006-07-03 14:15:22.000000000 +0200
89553 +++ new/net/ipv6/ipcomp6.c      2006-07-07 16:21:08.000000000 +0200
89554 @@ -65,7 +65,7 @@ static LIST_HEAD(ipcomp6_tfms_list);
89555  
89556  static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb)
89557  {
89558 -       int err = 0;
89559 +       int err = -ENOMEM;
89560         u8 nexthdr = 0;
89561         int hdr_len = skb->h.raw - skb->nh.raw;
89562         unsigned char *tmp_hdr = NULL;
89563 @@ -76,11 +76,8 @@ static int ipcomp6_input(struct xfrm_sta
89564         struct crypto_tfm *tfm;
89565         int cpu;
89566  
89567 -       if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
89568 -               skb_linearize(skb, GFP_ATOMIC) != 0) {
89569 -               err = -ENOMEM;
89570 +       if (skb_linearize_cow(skb))
89571                 goto out;
89572 -       }
89573  
89574         skb->ip_summed = CHECKSUM_NONE;
89575  
89576 @@ -159,10 +156,8 @@ static int ipcomp6_output(struct xfrm_st
89577                 goto out_ok;
89578         }
89579  
89580 -       if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
89581 -               skb_linearize(skb, GFP_ATOMIC) != 0) {
89582 +       if (skb_linearize_cow(skb))
89583                 goto out_ok;
89584 -       }
89585  
89586         /* compression */
89587         plen = skb->len - hdr_len;
89588 diff -urNp linux-2.6/net/ipv6/xfrm6_output.c new/net/ipv6/xfrm6_output.c
89589 --- linux-2.6/net/ipv6/xfrm6_output.c   2006-07-03 14:15:22.000000000 +0200
89590 +++ new/net/ipv6/xfrm6_output.c 2006-07-07 15:10:03.000000000 +0200
89591 @@ -151,7 +151,7 @@ error_nolock:
89592         goto out_exit;
89593  }
89594  
89595 -static int xfrm6_output_finish(struct sk_buff *skb)
89596 +static int xfrm6_output_finish2(struct sk_buff *skb)
89597  {
89598         int err;
89599  
89600 @@ -167,7 +167,7 @@ static int xfrm6_output_finish(struct sk
89601                         return dst_output(skb);
89602  
89603                 err = nf_hook(PF_INET6, NF_IP6_POST_ROUTING, &skb, NULL,
89604 -                             skb->dst->dev, xfrm6_output_finish);
89605 +                             skb->dst->dev, xfrm6_output_finish2);
89606                 if (unlikely(err != 1))
89607                         break;
89608         }
89609 @@ -175,6 +175,41 @@ static int xfrm6_output_finish(struct sk
89610         return err;
89611  }
89612  
89613 +static int xfrm6_output_finish(struct sk_buff *skb)
89614 +{
89615 +       struct sk_buff *segs;
89616 +
89617 +       if (!skb_shinfo(skb)->gso_size)
89618 +               return xfrm6_output_finish2(skb);
89619 +
89620 +       skb->protocol = htons(ETH_P_IP);
89621 +       segs = skb_gso_segment(skb, 0);
89622 +       kfree_skb(skb);
89623 +       if (unlikely(IS_ERR(segs)))
89624 +               return PTR_ERR(segs);
89625 +
89626 +       do {
89627 +               struct sk_buff *nskb = segs->next;
89628 +               int err;
89629 +
89630 +               segs->next = NULL;
89631 +               err = xfrm6_output_finish2(segs);
89632 +
89633 +               if (unlikely(err)) {
89634 +                       while ((segs = nskb)) {
89635 +                               nskb = segs->next;
89636 +                               segs->next = NULL;
89637 +                               kfree_skb(segs);
89638 +                       }
89639 +                       return err;
89640 +               }
89641 +
89642 +               segs = nskb;
89643 +       } while (segs);
89644 +
89645 +       return 0;
89646 +}
89647 +
89648  int xfrm6_output(struct sk_buff *skb)
89649  {
89650         return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dst->dev,
89651 diff -urNp linux-2.6/net/sched/sch_teql.c new/net/sched/sch_teql.c
89652 --- linux-2.6/net/sched/sch_teql.c      2006-07-03 14:15:23.000000000 +0200
89653 +++ new/net/sched/sch_teql.c    2006-07-07 15:10:03.000000000 +0200
89654 @@ -302,20 +302,17 @@ restart:
89655  
89656                 switch (teql_resolve(skb, skb_res, slave)) {
89657                 case 0:
89658 -                       if (spin_trylock(&slave->xmit_lock)) {
89659 -                               slave->xmit_lock_owner = smp_processor_id();
89660 +                       if (netif_tx_trylock(slave)) {
89661                                 if (!netif_queue_stopped(slave) &&
89662                                     slave->hard_start_xmit(skb, slave) == 0) {
89663 -                                       slave->xmit_lock_owner = -1;
89664 -                                       spin_unlock(&slave->xmit_lock);
89665 +                                       netif_tx_unlock(slave);
89666                                         master->slaves = NEXT_SLAVE(q);
89667                                         netif_wake_queue(dev);
89668                                         master->stats.tx_packets++;
89669                                         master->stats.tx_bytes += len;
89670                                         return 0;
89671                                 }
89672 -                               slave->xmit_lock_owner = -1;
89673 -                               spin_unlock(&slave->xmit_lock);
89674 +                               netif_tx_unlock(slave);
89675                         }
89676                         if (netif_queue_stopped(dev))
89677                                 busy = 1;
89678 diff -urNp linux-2.6/scripts/Makefile.xen new/scripts/Makefile.xen
89679 --- linux-2.6/scripts/Makefile.xen      1970-01-01 01:00:00.000000000 +0100
89680 +++ new/scripts/Makefile.xen    2006-05-09 12:40:15.000000000 +0200
89681 @@ -0,0 +1,14 @@
89682 +
89683 +# cherrypickxen($1 = allobj)
89684 +cherrypickxen = $(foreach var, $(1), \
89685 +               $(shell o=$(var); \
89686 +                       c=$${o%.o}-xen.c; \
89687 +                       s=$${o%.o}-xen.S; \
89688 +                       oxen=$${o%.o}-xen.o; \
89689 +                       [ -f $(srctree)/$(src)/$${c} ] || \
89690 +                          [ -f $(srctree)/$(src)/$${s} ] \
89691 +                               && echo $$oxen \
89692 +                               || echo $(var) ) \
89693 +         )
89694 +# filterxen($1 = allobj, $2 = noobjs)
89695 +filterxen = $(filter-out $(2), $(1))
89696 --- linux-2.6.17/drivers/block/aoe/aoenet.c~    2006-06-18 03:49:35.000000000 +0200
89697 +++ linux-2.6.17/drivers/block/aoe/aoenet.c     2006-09-17 03:44:30.000000000 +0200
89698 @@ -117,7 +117,7 @@
89699         if (skb == NULL)
89700                 return 0;
89701         if (skb_is_nonlinear(skb))
89702 -       if (skb_linearize(skb, GFP_ATOMIC) < 0)
89703 +       if (skb_linearize(skb) < 0)
89704                 goto exit;
89705         if (!is_aoe_netif(ifp))
89706                 goto exit;
89707 --- linux-2.6.17/drivers/net/sk98lin/sky2.c~    2006-09-17 04:09:56.000000000 +0200
89708 +++ linux-2.6.17/drivers/net/sk98lin/sky2.c     2006-09-17 04:42:45.000000000 +0200
89709 @@ -990,7 +990,7 @@
89710                         SK_DBG_MSG(pAC, SK_DBGMOD_DRV, SK_DBGCAT_DRV_TX_PROGRESS,
89711                                 ("\tGet LE\n"));
89712  #ifdef NETIF_F_TSO
89713 -                       Mss = skb_shinfo(pSkPacket->pMBuf)->tso_size;
89714 +                       Mss = skb_shinfo(pSkPacket->pMBuf)->gso_size;
89715                         if (Mss) {
89716                                 TcpOptLen = ((pSkPacket->pMBuf->h.th->doff - 5) * 4);
89717                                 IpTcpLen  = ((pSkPacket->pMBuf->nh.iph->ihl * 4) + 
89718 --- linux-2.6.17/drivers/net/imq.c~     2006-09-17 04:46:00.000000000 +0200
89719 +++ linux-2.6.17/drivers/net/imq.c      2006-09-17 12:34:36.000000000 +0200
89720 @@ -201,7 +201,7 @@
89721                         ret = 0;
89722                 }
89723         }
89724 -       if (spin_is_locked(&dev->xmit_lock))
89725 +       if (spin_is_locked(&dev->_xmit_lock))
89726                 netif_schedule(dev);
89727         else
89728  
89729 --- linux-2.6.17/net/sched/sch_generic.c.orig   2006-09-17 19:56:31.000000000 +0200
89730 +++ linux-2.6.17/net/sched/sch_generic.c        2006-09-17 20:03:04.000000000 +0200
89731 @@ -75,9 +75,9 @@
89732     dev->queue_lock serializes queue accesses for this device
89733     AND dev->qdisc pointer itself.
89734  
89735 -   dev->xmit_lock serializes accesses to device driver.
89736 +   netif_tx_lock serializes accesses to device driver.
89737  
89738 -   dev->queue_lock and dev->xmit_lock are mutually exclusive,
89739 +   dev->queue_lock and netif_tx_lock are mutually exclusive,
89740     if one is grabbed, another must be free.
89741   */
89742  
89743 @@ -93,14 +93,17 @@
89744     NOTE: Called under dev->queue_lock with locally disabled BH.
89745  */
89746  
89747 -int qdisc_restart(struct net_device *dev)
89748 +static inline int qdisc_restart(struct net_device *dev)
89749  {
89750         struct Qdisc *q = dev->qdisc;
89751         struct sk_buff *skb;
89752  
89753         /* Dequeue packet */
89754 -       if ((skb = q->dequeue(q)) != NULL) {
89755 +       if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
89756                 unsigned nolock = (dev->features & NETIF_F_LLTX);
89757 +
89758 +               dev->gso_skb = NULL;
89759 +
89760                 /*
89761                  * When the driver has LLTX set it does its own locking
89762                  * in start_xmit. No need to add additional overhead by
89763 @@ -111,7 +114,7 @@
89764                  * will be requeued.
89765                  */
89766                 if (!nolock) {
89767 -                       if (!spin_trylock(&dev->xmit_lock)) {
89768 +                       if (!netif_tx_trylock(dev)) {
89769                         collision:
89770                                 /* So, someone grabbed the driver. */
89771                                 
89772 @@ -129,8 +132,6 @@
89773                                 __get_cpu_var(netdev_rx_stat).cpu_collision++;
89774                                 goto requeue;
89775                         }
89776 -                       /* Remember that the driver is grabbed by us. */
89777 -                       dev->xmit_lock_owner = smp_processor_id();
89778                 }
89779                 
89780                 {
89781 @@ -139,20 +140,10 @@
89782  
89783                         if (!netif_queue_stopped(dev)) {
89784                                 int ret;
89785 -
89786 -                                if (netdev_nit
89787 -#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
89788 -                                  && !(skb->imq_flags & IMQ_F_ENQUEUE)
89789 -#endif
89790 -                                  )
89791 -
89792 -                                       dev_queue_xmit_nit(skb, dev);
89793 -
89794 -                               ret = dev->hard_start_xmit(skb, dev);
89795 +                               ret = dev_hard_start_xmit(skb, dev);
89796                                 if (ret == NETDEV_TX_OK) { 
89797                                         if (!nolock) {
89798 -                                               dev->xmit_lock_owner = -1;
89799 -                                               spin_unlock(&dev->xmit_lock);
89800 +                                               netif_tx_unlock(dev);
89801                                         }
89802                                         spin_lock(&dev->queue_lock);
89803                                         return -1;
89804 @@ -166,8 +157,7 @@
89805                         /* NETDEV_TX_BUSY - we need to requeue */
89806                         /* Release the driver */
89807                         if (!nolock) { 
89808 -                               dev->xmit_lock_owner = -1;
89809 -                               spin_unlock(&dev->xmit_lock);
89810 +                               netif_tx_unlock(dev);
89811                         } 
89812                         spin_lock(&dev->queue_lock);
89813                         q = dev->qdisc;
89814 @@ -184,7 +174,10 @@
89815                  */
89816  
89817  requeue:
89818 -               q->ops->requeue(skb, q);
89819 +               if (skb->next)
89820 +                       dev->gso_skb = skb;
89821 +               else
89822 +                       q->ops->requeue(skb, q);
89823                 netif_schedule(dev);
89824                 return 1;
89825         }
89826 @@ -192,11 +185,23 @@
89827         return q->q.qlen;
89828  }
89829  
89830 +void __qdisc_run(struct net_device *dev)
89831 +{
89832 +       if (unlikely(dev->qdisc == &noop_qdisc))
89833 +               goto out;
89834 +
89835 +       while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
89836 +               /* NOTHING */;
89837 +
89838 +out:
89839 +       clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
89840 +}
89841 +
89842  static void dev_watchdog(unsigned long arg)
89843  {
89844         struct net_device *dev = (struct net_device *)arg;
89845  
89846 -       spin_lock(&dev->xmit_lock);
89847 +       netif_tx_lock(dev);
89848         if (dev->qdisc != &noop_qdisc) {
89849                 if (netif_device_present(dev) &&
89850                     netif_running(dev) &&
89851 @@ -212,7 +217,7 @@
89852                                 dev_hold(dev);
89853                 }
89854         }
89855 -       spin_unlock(&dev->xmit_lock);
89856 +       netif_tx_unlock(dev);
89857  
89858         dev_put(dev);
89859  }
89860 @@ -236,17 +241,17 @@
89861  
89862  static void dev_watchdog_up(struct net_device *dev)
89863  {
89864 -       spin_lock_bh(&dev->xmit_lock);
89865 +       netif_tx_lock_bh(dev);
89866         __netdev_watchdog_up(dev);
89867 -       spin_unlock_bh(&dev->xmit_lock);
89868 +       netif_tx_unlock_bh(dev);
89869  }
89870  
89871  static void dev_watchdog_down(struct net_device *dev)
89872  {
89873 -       spin_lock_bh(&dev->xmit_lock);
89874 +       netif_tx_lock_bh(dev);
89875         if (del_timer(&dev->watchdog_timer))
89876                 dev_put(dev);
89877 -       spin_unlock_bh(&dev->xmit_lock);
89878 +       netif_tx_unlock_bh(dev);
89879  }
89880  
89881  void netif_carrier_on(struct net_device *dev)
89882 @@ -588,10 +593,17 @@
89883  
89884         dev_watchdog_down(dev);
89885  
89886 -       while (test_bit(__LINK_STATE_SCHED, &dev->state))
89887 +       /* Wait for outstanding dev_queue_xmit calls. */
89888 +       synchronize_rcu();
89889 +
89890 +       /* Wait for outstanding qdisc_run calls. */
89891 +       while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
89892                 yield();
89893  
89894 -       spin_unlock_wait(&dev->xmit_lock);
89895 +       if (dev->gso_skb) {
89896 +               kfree_skb(dev->gso_skb);
89897 +               dev->gso_skb = NULL;
89898 +       }
89899  }
89900  
89901  void dev_init_scheduler(struct net_device *dev)
89902 @@ -633,6 +645,5 @@
89903  EXPORT_SYMBOL(qdisc_alloc);
89904  EXPORT_SYMBOL(qdisc_destroy);
89905  EXPORT_SYMBOL(qdisc_reset);
89906 -EXPORT_SYMBOL(qdisc_restart);
89907  EXPORT_SYMBOL(qdisc_lock_tree);
89908  EXPORT_SYMBOL(qdisc_unlock_tree);