1 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/Kconfig linux-2.6.16/arch/i386/Kconfig
2 --- linux-2.6.16.orig/arch/i386/Kconfig 2006-06-26 09:49:46.000000000 +0200
3 +++ linux-2.6.16/arch/i386/Kconfig      2006-06-26 09:51:32.000000000 +0200
4 @@ -58,6 +58,15 @@
5         help
6           Choose this option if your computer is a standard PC or compatible.
7  
8 +config X86_XEN
9 +       bool "Xen-compatible"
10 +       select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST
11 +       select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST
12 +       select SWIOTLB
13 +       help
14 +         Choose this option if you plan to run this kernel on top of the
15 +         Xen Hypervisor.
16 +
17  config X86_ELAN
18         bool "AMD Elan"
19         help
20 @@ -159,6 +168,7 @@
21  
22  config HPET_TIMER
23         bool "HPET Timer Support"
24 +       depends on !X86_XEN
25         help
26           This enables the use of the HPET for the kernel's internal timer.
27           HPET is the next generation timer replacing legacy 8254s.
28 @@ -202,6 +212,19 @@
29  
30           If you don't know what to do here, say N.
31  
32 +config SMP_ALTERNATIVES
33 +       bool "SMP alternatives support (EXPERIMENTAL)"
34 +       depends on SMP && EXPERIMENTAL
35 +       help
36 +         Try to reduce the overhead of running an SMP kernel on a uniprocessor
37 +         host slightly by replacing certain key instruction sequences
38 +         according to whether we currently have more than one CPU available.
39 +         This should provide a noticeable boost to performance when
40 +         running SMP kernels on UP machines, and have negligible impact
41 +         when running on a true SMP host.
42 +
43 +         If unsure, say N.
44 +
45  config NR_CPUS
46         int "Maximum number of CPUs (2-255)"
47         range 2 255
48 @@ -218,7 +241,7 @@
49  
50  config SCHED_SMT
51         bool "SMT (Hyperthreading) scheduler support"
52 -       depends on SMP
53 +       depends on SMP && !X86_XEN
54         default off
55         help
56           SMT scheduler support improves the CPU scheduler's decision making
57 @@ -230,7 +253,7 @@
58  
59  config X86_UP_APIC
60         bool "Local APIC support on uniprocessors"
61 -       depends on !SMP && !(X86_VISWS || X86_VOYAGER)
62 +       depends on !SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)
63         help
64           A local APIC (Advanced Programmable Interrupt Controller) is an
65           integrated interrupt controller in the CPU. If you have a single-CPU
66 @@ -255,12 +278,12 @@
67  
68  config X86_LOCAL_APIC
69         bool
70 -       depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER)
71 +       depends on X86_UP_APIC || ((X86_VISWS || SMP) && !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST))
72         default y
73  
74  config X86_IO_APIC
75         bool
76 -       depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))
77 +       depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST))
78         default y
79  
80  config X86_VISWS_APIC
81 @@ -268,9 +291,14 @@
82         depends on X86_VISWS
83         default y
84  
85 +config X86_TSC
86 +       bool
87 +       depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ && !X86_XEN
88 +       default y
89 +
90  config X86_MCE
91         bool "Machine Check Exception"
92 -       depends on !X86_VOYAGER
93 +       depends on !(X86_VOYAGER || X86_XEN)
94         ---help---
95           Machine Check Exception support allows the processor to notify the
96           kernel if it detects a problem (e.g. overheating, component failure).
97 @@ -360,6 +388,7 @@
98  
99  config MICROCODE
100         tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
101 +       depends on !XEN_UNPRIVILEGED_GUEST
102         ---help---
103           If you say Y here and also to "/dev file system support" in the
104           'File systems' section, you will be able to update the microcode on
105 @@ -377,6 +406,7 @@
106  
107  config X86_MSR
108         tristate "/dev/cpu/*/msr - Model-specific register support"
109 +       depends on !X86_XEN
110         help
111           This device gives privileged processes access to the x86
112           Model-Specific Registers (MSRs).  It is a character device with
113 @@ -392,6 +422,10 @@
114           with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
115           /dev/cpu/31/cpuid.
116  
117 +config SWIOTLB
118 +       bool
119 +       default n
120 +
121  source "drivers/firmware/Kconfig"
122  
123  choice
124 @@ -580,7 +614,7 @@
125  
126  config HIGHPTE
127         bool "Allocate 3rd-level pagetables from highmem"
128 -       depends on HIGHMEM4G || HIGHMEM64G
129 +       depends on (HIGHMEM4G || HIGHMEM64G) && !X86_XEN
130         help
131           The VM uses one page table entry for each page of physical memory.
132           For systems with a lot of RAM, this can be wasteful of precious
133 @@ -589,6 +623,7 @@
134  
135  config MATH_EMULATION
136         bool "Math emulation"
137 +       depends on !X86_XEN
138         ---help---
139           Linux can emulate a math coprocessor (used for floating point
140           operations) if you don't have one. 486DX and Pentium processors have
141 @@ -614,6 +649,8 @@
142  
143  config MTRR
144         bool "MTRR (Memory Type Range Register) support"
145 +       depends on !XEN_UNPRIVILEGED_GUEST
146 +       default y if X86_XEN
147         ---help---
148           On Intel P6 family processors (Pentium Pro, Pentium II and later)
149           the Memory Type Range Registers (MTRRs) may be used to control
150 @@ -648,7 +685,7 @@
151  
152  config EFI
153         bool "Boot from EFI support (EXPERIMENTAL)"
154 -       depends on ACPI
155 +       depends on ACPI && !X86_XEN
156         default n
157         ---help---
158       This enables the kernel to boot on EFI platforms using
159 @@ -666,7 +703,7 @@
160  
161  config IRQBALANCE
162         bool "Enable kernel irq balancing"
163 -       depends on SMP && X86_IO_APIC
164 +       depends on SMP && X86_IO_APIC && !X86_XEN
165         default y
166         help
167           The default yes will allow the kernel to do irq load balancing.
168 @@ -709,7 +746,7 @@
169  
170  config KEXEC
171         bool "kexec system call (EXPERIMENTAL)"
172 -       depends on EXPERIMENTAL
173 +       depends on EXPERIMENTAL && !X86_XEN
174         help
175           kexec is a system call that implements the ability to shutdown your
176           current kernel, and to start another kernel.  It is like a reboot
177 @@ -763,6 +800,7 @@
178  config DOUBLEFAULT
179         default y
180         bool "Enable doublefault exception handler" if EMBEDDED
181 +       depends on !X86_NO_TSS
182         help
183            This option allows trapping of rare doublefault exceptions that
184            would otherwise cause a system to silently reboot. Disabling this
185 @@ -773,18 +811,20 @@
186  
187  
188  menu "Power management options (ACPI, APM)"
189 -       depends on !X86_VOYAGER
190 +       depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)
191  
192 +if !X86_XEN
193  source kernel/power/Kconfig
194 +endif
195  
196  source "drivers/acpi/Kconfig"
197  
198  menu "APM (Advanced Power Management) BIOS Support"
199 -depends on PM && !X86_VISWS
200 +depends on PM && !(X86_VISWS || X86_XEN)
201  
202  config APM
203         tristate "APM (Advanced Power Management) BIOS support"
204 -       depends on PM
205 +       depends on PM && PM_LEGACY
206         ---help---
207           APM is a BIOS specification for saving power using several different
208           techniques. This is mostly useful for battery powered laptops with
209 @@ -969,6 +1009,7 @@
210  
211  config PCI_GOBIOS
212         bool "BIOS"
213 +       depends on !X86_XEN
214  
215  config PCI_GOMMCONFIG
216         bool "MMConfig"
217 @@ -976,6 +1017,13 @@
218  config PCI_GODIRECT
219         bool "Direct"
220  
221 +config PCI_GOXEN_FE
222 +       bool "Xen PCI Frontend"
223 +       depends on X86_XEN
224 +       help
225 +         The PCI device frontend driver allows the kernel to import arbitrary
226 +         PCI devices from a PCI backend to support PCI driver domains.
227 +
228  config PCI_GOANY
229         bool "Any"
230  
231 @@ -983,7 +1031,7 @@
232  
233  config PCI_BIOS
234         bool
235 -       depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY)
236 +       depends on !(X86_VISWS || X86_XEN) && PCI && (PCI_GOBIOS || PCI_GOANY)
237         default y
238  
239  config PCI_DIRECT
240 @@ -996,6 +1044,18 @@
241         depends on PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
242         default y
243  
244 +config XEN_PCIDEV_FRONTEND
245 +       bool
246 +       depends on PCI && X86_XEN && (PCI_GOXEN_FE || PCI_GOANY)
247 +       default y
248 +
249 +config XEN_PCIDEV_FE_DEBUG
250 +       bool "Xen PCI Frontend Debugging"
251 +       depends on XEN_PCIDEV_FRONTEND
252 +       default n
253 +       help
254 +         Enables some debug statements within the PCI Frontend.
255 +
256  source "drivers/pci/pcie/Kconfig"
257  
258  source "drivers/pci/Kconfig"
259 @@ -1006,7 +1066,7 @@
260  
261  config ISA
262         bool "ISA support"
263 -       depends on !(X86_VOYAGER || X86_VISWS)
264 +       depends on !(X86_VOYAGER || X86_VISWS || X86_XEN)
265         help
266           Find out whether you have ISA slots on your motherboard.  ISA is the
267           name of a bus system, i.e. the way the CPU talks to the other stuff
268 @@ -1033,7 +1093,7 @@
269  source "drivers/eisa/Kconfig"
270  
271  config MCA
272 -       bool "MCA support" if !(X86_VISWS || X86_VOYAGER)
273 +       bool "MCA support" if !(X86_VISWS || X86_VOYAGER || X86_XEN)
274         default y if X86_VOYAGER
275         help
276           MicroChannel Architecture is found in some IBM PS/2 machines and
277 @@ -1076,7 +1136,9 @@
278  menu "Instrumentation Support"
279         depends on EXPERIMENTAL
280  
281 +if !X86_XEN
282  source "arch/i386/oprofile/Kconfig"
283 +endif
284  
285  config KPROBES
286         bool "Kprobes (EXPERIMENTAL)"
287 @@ -1097,6 +1159,8 @@
288  
289  source "crypto/Kconfig"
290  
291 +source "drivers/xen/Kconfig"
292 +
293  source "lib/Kconfig"
294  
295  #
296 @@ -1122,7 +1186,7 @@
297  
298  config X86_HT
299         bool
300 -       depends on SMP && !(X86_VISWS || X86_VOYAGER)
301 +       depends on SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN)
302         default y
303  
304  config X86_BIOS_REBOOT
305 @@ -1135,6 +1199,21 @@
306         depends on X86_SMP || (X86_VOYAGER && SMP)
307         default y
308  
309 +config X86_NO_TSS
310 +       bool
311 +       depends on X86_XEN
312 +       default y
313 +
314 +config X86_SYSENTER
315 +       bool
316 +       depends on !X86_NO_TSS
317 +       default y
318 +
319 +config X86_NO_IDT
320 +       bool
321 +       depends on X86_XEN
322 +       default y
323 +
324  config KTIME_SCALAR
325         bool
326         default y
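
Taken together, the Kconfig changes above split the i386 tree three ways: native builds (X86_XEN unset), privileged dom0 guests, and unprivileged domU guests. The fragment below is an illustrative domU .config; note that XEN_PRIVILEGED_GUEST and XEN_UNPRIVILEGED_GUEST come from drivers/xen/Kconfig, which is sourced above but not shown in this excerpt, so the exact symbol spellings are assumptions.

    # Illustrative unprivileged (domU) configuration -- a sketch,
    # not a generated .config:
    CONFIG_X86_XEN=y
    CONFIG_XEN_UNPRIVILEGED_GUEST=y
    # forced by "select SWIOTLB" under X86_XEN
    CONFIG_SWIOTLB=y
    # derived symbols, default y whenever X86_XEN is set
    CONFIG_X86_NO_TSS=y
    CONFIG_X86_NO_IDT=y
    # hardware-facing options masked out by the new dependencies
    # CONFIG_HPET_TIMER is not set
    # CONFIG_X86_MSR is not set
    # CONFIG_MICROCODE is not set
    # CONFIG_MTRR is not set
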
327 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/Kconfig.cpu linux-2.6.16/arch/i386/Kconfig.cpu
328 --- linux-2.6.16.orig/arch/i386/Kconfig.cpu     2006-03-20 06:53:29.000000000 +0100
329 +++ linux-2.6.16/arch/i386/Kconfig.cpu  2006-06-26 09:51:32.000000000 +0200
330 @@ -251,7 +251,7 @@
331  
332  config X86_F00F_BUG
333         bool
334 -       depends on M586MMX || M586TSC || M586 || M486 || M386
335 +       depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT
336         default y
337  
338  config X86_WP_WORKS_OK
339 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/Makefile linux-2.6.16/arch/i386/Makefile
340 --- linux-2.6.16.orig/arch/i386/Makefile        2006-03-20 06:53:29.000000000 +0100
341 +++ linux-2.6.16/arch/i386/Makefile     2006-06-26 09:51:32.000000000 +0200
342 @@ -45,6 +45,11 @@
343  
344  CFLAGS += $(cflags-y)
345  
346 +cppflags-$(CONFIG_XEN) += \
347 +       -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
348 +
349 +CPPFLAGS += $(cppflags-y)
350 +
351  # Default subarch .c files
352  mcore-y  := mach-default
353  
354 @@ -68,6 +73,10 @@
355  mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit
356  mcore-$(CONFIG_X86_SUMMIT)  := mach-default
357  
358 +# Xen subarch support
359 +mflags-$(CONFIG_X86_XEN)       := -Iinclude/asm-i386/mach-xen
360 +mcore-$(CONFIG_X86_XEN)                := mach-xen
361 +
362  # generic subarchitecture
363  mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic
364  mcore-$(CONFIG_X86_GENERICARCH) := mach-default
365 @@ -102,6 +111,19 @@
366  .PHONY: zImage bzImage compressed zlilo bzlilo \
367         zdisk bzdisk fdimage fdimage144 fdimage288 install
368  
369 +ifdef CONFIG_XEN
370 +CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS)
371 +head-y := arch/i386/kernel/head-xen.o arch/i386/kernel/init_task-xen.o
372 +boot := arch/i386/boot-xen
373 +.PHONY: vmlinuz
374 +all: vmlinuz
375 +
376 +vmlinuz: vmlinux
377 +       $(Q)$(MAKE) $(build)=$(boot) $@
378 +
379 +install:
380 +       $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@
381 +else
382  all: bzImage
383  
384  # KBUILD_IMAGE specify target image being built
385 @@ -124,6 +146,7 @@
386  
387  install:
388         $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
389 +endif
390  
391  archclean:
392         $(Q)$(MAKE) $(clean)=arch/i386/boot
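
With these hooks in place, a CONFIG_XEN build no longer produces bzImage: 'all' resolves to the new vmlinuz target, which defers to arch/i386/boot-xen (added below), and 'install' is routed there as well. A hypothetical session (the XENGUEST suffix is the caller's choice; CONFIG_XEN itself is defined in drivers/xen/Kconfig, outside this excerpt):

    make ARCH=i386 oldconfig                  # answer y to X86_XEN
    make ARCH=i386                            # 'all' now builds vmlinuz
    make ARCH=i386 INSTALL_PATH=/boot XENGUEST=-xenU install
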
393 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/boot-xen/Makefile linux-2.6.16/arch/i386/boot-xen/Makefile
394 --- linux-2.6.16.orig/arch/i386/boot-xen/Makefile       1970-01-01 01:00:00.000000000 +0100
395 +++ linux-2.6.16/arch/i386/boot-xen/Makefile    2006-06-26 09:51:32.000000000 +0200
396 @@ -0,0 +1,21 @@
397 +
398 +OBJCOPYFLAGS := -g --strip-unneeded
399 +
400 +vmlinuz: vmlinux-stripped FORCE
401 +       $(call if_changed,gzip)
402 +
403 +vmlinux-stripped: vmlinux FORCE
404 +       $(call if_changed,objcopy)
405 +
406 +INSTALL_ROOT := $(patsubst %/boot,%,$(INSTALL_PATH))
407 +
408 +XINSTALL_NAME ?= $(KERNELRELEASE)
409 +install:
410 +       mkdir -p $(INSTALL_ROOT)/boot
411 +       ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
412 +       rm -f $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
413 +       install -m0644 vmlinuz $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
414 +       install -m0644 vmlinux $(INSTALL_ROOT)/boot/vmlinux-syms-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
415 +       install -m0664 .config $(INSTALL_ROOT)/boot/config-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
416 +       install -m0664 System.map $(INSTALL_ROOT)/boot/System.map-$(XINSTALL_NAME)$(INSTALL_SUFFIX)
417 +       ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(XENGUEST)$(INSTALL_SUFFIX)
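
The two kbuild rules at the top of this Makefile amount to a strip-then-compress pipeline. Expanded by hand (OBJCOPYFLAGS supplies the objcopy switches; kbuild's stock cmd_gzip is 'gzip -f -9 < $< > $@'), the equivalent shell is roughly:

    objcopy -g --strip-unneeded vmlinux vmlinux-stripped
    gzip -f -9 < vmlinux-stripped > vmlinuz

Xen paravirtual guests are started from a (possibly gzipped) ELF image loaded by the domain builder rather than the real-mode bzImage loader, which is why the boot protocol machinery of arch/i386/boot is bypassed entirely.
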
418 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/Makefile linux-2.6.16/arch/i386/kernel/Makefile
419 --- linux-2.6.16.orig/arch/i386/kernel/Makefile 2006-03-20 06:53:29.000000000 +0100
420 +++ linux-2.6.16/arch/i386/kernel/Makefile      2006-06-26 09:51:32.000000000 +0200
421 @@ -37,17 +37,26 @@
422  obj-$(CONFIG_DOUBLEFAULT)      += doublefault.o
423  obj-$(CONFIG_VM86)             += vm86.o
424  obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
425 +obj-$(CONFIG_SMP_ALTERNATIVES) += smpalts.o
426  
427  EXTRA_AFLAGS   := -traditional
428  
429  obj-$(CONFIG_SCx200)           += scx200.o
430  
431 +ifdef CONFIG_XEN
432 +vsyscall_note := vsyscall-note-xen.o
433 +else
434 +vsyscall_note := vsyscall-note.o
435 +endif
436 +
437 +VSYSCALL_TYPES-y                       := int80
438 +VSYSCALL_TYPES-$(CONFIG_X86_SYSENTER)  += sysenter
439  # vsyscall.o contains the vsyscall DSO images as __initdata.
440  # We must build both images before we can assemble it.
441  # Note: kbuild does not track this dependency due to usage of .incbin
442 -$(obj)/vsyscall.o: $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so
443 -targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so)
444 -targets += vsyscall-note.o vsyscall.lds
445 +$(obj)/vsyscall.o: $(foreach F,$(VSYSCALL_TYPES-y),$(obj)/vsyscall-$F.so)
446 +targets += $(foreach F,$(VSYSCALL_TYPES-y),vsyscall-$F.o vsyscall-$F.so)
447 +targets += $(vsyscall_note) vsyscall.lds
448  
449  # The DSO images are built using a special linker script.
450  quiet_cmd_syscall = SYSCALL $@
451 @@ -62,7 +71,7 @@
452  
453  $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \
454  $(obj)/vsyscall-%.so: $(src)/vsyscall.lds \
455 -                     $(obj)/vsyscall-%.o $(obj)/vsyscall-note.o FORCE
456 +                     $(obj)/vsyscall-%.o $(obj)/$(vsyscall_note) FORCE
457         $(call if_changed,syscall)
458  
459  # We also create a special relocatable object that should mirror the symbol
460 @@ -74,5 +83,18 @@
461  
462  SYSCFLAGS_vsyscall-syms.o = -r
463  $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
464 -                       $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE
465 +                       $(foreach F,$(VSYSCALL_TYPES-y),$(obj)/vsyscall-$F.o) \
466 +                       $(obj)/$(vsyscall_note) FORCE
467         $(call if_changed,syscall)
468 +
469 +ifdef CONFIG_XEN
470 +include $(srctree)/scripts/Makefile.xen
471 +
472 +obj-y += fixup.o
473 +microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o
474 +n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o
475 +
476 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
477 +obj-y := $(call cherrypickxen, $(obj-y))
478 +extra-y := $(call cherrypickxen, $(extra-y))
479 +endif
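
filterxen and cherrypickxen are defined in scripts/Makefile.xen, which this patch references but this excerpt does not show. A minimal sketch of the intent, assuming the usual foo.o -> foo-xen.o substitution convention (the real definitions may differ in detail):

    # filterxen: drop objects with no Xen-side counterpart at all
    filterxen = $(filter-out $(2),$(1))

    # cherrypickxen: swap foo.o for foo-xen.o when a Xen variant
    # source file exists next to the original
    cherrypickxen = $(foreach f,$(1),\
        $(if $(strip $(wildcard $(srctree)/$(src)/$(f:.o=-xen.c)) \
                     $(wildcard $(srctree)/$(src)/$(f:.o=-xen.S))),\
            $(f:.o=-xen.o),$(f)))

So the block above first removes objects Xen cannot use (the PIC, timer, reboot and SMP trampoline code listed in n-obj-xen), then lets cherrypickxen reroute the remainder -- e.g. traps.o would become traps-xen.o wherever a traps-xen.c exists.
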
480 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/acpi/Makefile linux-2.6.16/arch/i386/kernel/acpi/Makefile
481 --- linux-2.6.16.orig/arch/i386/kernel/acpi/Makefile    2006-03-20 06:53:29.000000000 +0100
482 +++ linux-2.6.16/arch/i386/kernel/acpi/Makefile 2006-06-26 09:51:32.000000000 +0200
483 @@ -6,3 +6,7 @@
484  obj-y                          += cstate.o processor.o
485  endif
486  
487 +ifdef CONFIG_XEN
488 +include $(srctree)/scripts/Makefile.xen
489 +obj-y := $(call cherrypickxen, $(obj-y), $(src))
490 +endif
491 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/acpi/boot-xen.c linux-2.6.16/arch/i386/kernel/acpi/boot-xen.c
492 --- linux-2.6.16.orig/arch/i386/kernel/acpi/boot-xen.c  1970-01-01 01:00:00.000000000 +0100
493 +++ linux-2.6.16/arch/i386/kernel/acpi/boot-xen.c       2006-06-26 09:51:32.000000000 +0200
494 @@ -0,0 +1,1161 @@
495 +/*
496 + *  boot.c - Architecture-Specific Low-Level ACPI Boot Support
497 + *
498 + *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
499 + *  Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
500 + *
501 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
502 + *
503 + *  This program is free software; you can redistribute it and/or modify
504 + *  it under the terms of the GNU General Public License as published by
505 + *  the Free Software Foundation; either version 2 of the License, or
506 + *  (at your option) any later version.
507 + *
508 + *  This program is distributed in the hope that it will be useful,
509 + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
510 + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
511 + *  GNU General Public License for more details.
512 + *
513 + *  You should have received a copy of the GNU General Public License
514 + *  along with this program; if not, write to the Free Software
515 + *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
516 + *
517 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
518 + */
519 +
520 +#include <linux/init.h>
521 +#include <linux/config.h>
522 +#include <linux/acpi.h>
523 +#include <linux/efi.h>
524 +#include <linux/module.h>
525 +#include <linux/dmi.h>
526 +#include <linux/irq.h>
527 +
528 +#include <asm/pgtable.h>
529 +#include <asm/io_apic.h>
530 +#include <asm/apic.h>
531 +#include <asm/io.h>
532 +#include <asm/mpspec.h>
533 +
534 +#ifdef CONFIG_X86_64
535 +
536 +extern void __init clustered_apic_check(void);
537 +
538 +extern int gsi_irq_sharing(int gsi);
539 +#include <asm/proto.h>
540 +
541 +static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
542 +
543 +
544 +#else                          /* X86 */
545 +
546 +#ifdef CONFIG_X86_LOCAL_APIC
547 +#include <mach_apic.h>
548 +#include <mach_mpparse.h>
549 +#endif                         /* CONFIG_X86_LOCAL_APIC */
550 +
551 +static inline int gsi_irq_sharing(int gsi) { return gsi; }
552 +
553 +#endif                         /* X86 */
554 +
555 +#define BAD_MADT_ENTRY(entry, end) (                                       \
556 +               (!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \
557 +               ((acpi_table_entry_header *)entry)->length != sizeof(*entry))
558 +
559 +#define PREFIX                 "ACPI: "
560 +
561 +int acpi_noirq __initdata;     /* skip ACPI IRQ initialization */
562 +int acpi_pci_disabled __initdata;      /* skip ACPI PCI scan and IRQ initialization */
563 +int acpi_ht __initdata = 1;    /* enable HT */
564 +
565 +int acpi_lapic;
566 +int acpi_ioapic;
567 +int acpi_strict;
568 +EXPORT_SYMBOL(acpi_strict);
569 +
570 +acpi_interrupt_flags acpi_sci_flags __initdata;
571 +int acpi_sci_override_gsi __initdata;
572 +int acpi_skip_timer_override __initdata;
573 +
574 +#ifdef CONFIG_X86_LOCAL_APIC
575 +static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
576 +#endif
577 +
578 +#ifndef __HAVE_ARCH_CMPXCHG
579 +#warning ACPI uses CMPXCHG, i486 and later hardware
580 +#endif
581 +
582 +#define MAX_MADT_ENTRIES       256
583 +u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] =
584 +    {[0 ... MAX_MADT_ENTRIES - 1] = 0xff };
585 +EXPORT_SYMBOL(x86_acpiid_to_apicid);
586 +
587 +/* --------------------------------------------------------------------------
588 +                              Boot-time Configuration
589 +   -------------------------------------------------------------------------- */
590 +
591 +/*
592 + * The default interrupt routing model is PIC (8259).  This gets
593 + * overridden if IOAPICs are enumerated (below).
594 + */
595 +enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
596 +
597 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
598 +
599 +/* rely on all ACPI tables being in the direct mapping */
600 +char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
601 +{
602 +       if (!phys_addr || !size)
603 +               return NULL;
604 +
605 +       if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE)
606 +               return __va(phys_addr);
607 +
608 +       return NULL;
609 +}
610 +
611 +#else
612 +
613 +/*
614 + * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
615 + * to map the target physical address. The problem is that set_fixmap()
616 + * provides a single page, and it is possible that the page is not
617 + * sufficient.
618 + * By using this area, we can map up to MAX_IO_APICS pages temporarily,
619 + * i.e. until the next __acpi_map_table() call.
620 + *
621 + * Important Safety Note:  The fixed I/O APIC page numbers are *subtracted*
622 + * from the fixed base.  That's why we start at FIX_IO_APIC_BASE_END and
623 + * count idx down while incrementing the phys address.
624 + */
625 +char *__acpi_map_table(unsigned long phys, unsigned long size)
626 +{
627 +       unsigned long base, offset, mapped_size;
628 +       int idx;
629 +
630 +#ifndef CONFIG_XEN
631 +       if (phys + size < 8 * 1024 * 1024)
632 +               return __va(phys);
633 +#endif
634 +
635 +       offset = phys & (PAGE_SIZE - 1);
636 +       mapped_size = PAGE_SIZE - offset;
637 +       set_fixmap(FIX_ACPI_END, phys);
638 +       base = fix_to_virt(FIX_ACPI_END);
639 +
640 +       /*
641 +        * Most cases can be covered by the below.
642 +        */
643 +       idx = FIX_ACPI_END;
644 +       while (mapped_size < size) {
645 +               if (--idx < FIX_ACPI_BEGIN)
646 +                       return NULL;    /* cannot handle this */
647 +               phys += PAGE_SIZE;
648 +               set_fixmap(idx, phys);
649 +               mapped_size += PAGE_SIZE;
650 +       }
651 +
652 +       return ((unsigned char *)base + offset);
653 +}
654 +#endif
655 +
656 +#ifdef CONFIG_PCI_MMCONFIG
657 +/* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
658 +struct acpi_table_mcfg_config *pci_mmcfg_config;
659 +int pci_mmcfg_config_num;
660 +
661 +int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
662 +{
663 +       struct acpi_table_mcfg *mcfg;
664 +       unsigned long i;
665 +       int config_size;
666 +
667 +       if (!phys_addr || !size)
668 +               return -EINVAL;
669 +
670 +       mcfg = (struct acpi_table_mcfg *)__acpi_map_table(phys_addr, size);
671 +       if (!mcfg) {
672 +               printk(KERN_WARNING PREFIX "Unable to map MCFG\n");
673 +               return -ENODEV;
674 +       }
675 +
676 +       /* how many config structures do we have */
677 +       pci_mmcfg_config_num = 0;
678 +       i = size - sizeof(struct acpi_table_mcfg);
679 +       while (i >= sizeof(struct acpi_table_mcfg_config)) {
680 +               ++pci_mmcfg_config_num;
681 +               i -= sizeof(struct acpi_table_mcfg_config);
682 +       }
683 +       if (pci_mmcfg_config_num == 0) {
684 +               printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
685 +               return -ENODEV;
686 +       }
687 +
688 +       config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
689 +       pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
690 +       if (!pci_mmcfg_config) {
691 +               printk(KERN_WARNING PREFIX
692 +                      "No memory for MCFG config tables\n");
693 +               return -ENOMEM;
694 +       }
695 +
696 +       memcpy(pci_mmcfg_config, &mcfg->config, config_size);
697 +       for (i = 0; i < pci_mmcfg_config_num; ++i) {
698 +               if (mcfg->config[i].base_reserved) {
699 +                       printk(KERN_ERR PREFIX
700 +                              "MMCONFIG not in low 4GB of memory\n");
701 +                       return -ENODEV;
702 +               }
703 +       }
704 +
705 +       return 0;
706 +}
707 +#endif                         /* CONFIG_PCI_MMCONFIG */
708 +
709 +#ifdef CONFIG_X86_LOCAL_APIC
710 +static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size)
711 +{
712 +       struct acpi_table_madt *madt = NULL;
713 +
714 +       if (!phys_addr || !size)
715 +               return -EINVAL;
716 +
717 +       madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size);
718 +       if (!madt) {
719 +               printk(KERN_WARNING PREFIX "Unable to map MADT\n");
720 +               return -ENODEV;
721 +       }
722 +
723 +       if (madt->lapic_address) {
724 +               acpi_lapic_addr = (u64) madt->lapic_address;
725 +
726 +               printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
727 +                      madt->lapic_address);
728 +       }
729 +
730 +       acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
731 +
732 +       return 0;
733 +}
734 +
735 +static int __init
736 +acpi_parse_lapic(acpi_table_entry_header * header, const unsigned long end)
737 +{
738 +       struct acpi_table_lapic *processor = NULL;
739 +
740 +       processor = (struct acpi_table_lapic *)header;
741 +
742 +       if (BAD_MADT_ENTRY(processor, end))
743 +               return -EINVAL;
744 +
745 +       acpi_table_print_madt_entry(header);
746 +
747 +       /* Record local apic id only when enabled */
748 +       if (processor->flags.enabled)
749 +               x86_acpiid_to_apicid[processor->acpi_id] = processor->id;
750 +
751 +       /*
752 +        * We need to register disabled CPUs as well, to permit
753 +        * counting them. This allows us to size
754 +        * cpus_possible_map more accurately, so that we need not
755 +        * preallocate memory for all NR_CPUS
756 +        * when we use CPU hotplug.
757 +        */
758 +       mp_register_lapic(processor->id,        /* APIC ID */
759 +                         processor->flags.enabled);    /* Enabled? */
760 +
761 +       return 0;
762 +}
763 +
764 +static int __init
765 +acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header,
766 +                         const unsigned long end)
767 +{
768 +       struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL;
769 +
770 +       lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr *)header;
771 +
772 +       if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
773 +               return -EINVAL;
774 +
775 +       acpi_lapic_addr = lapic_addr_ovr->address;
776 +
777 +       return 0;
778 +}
779 +
780 +static int __init
781 +acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end)
782 +{
783 +       struct acpi_table_lapic_nmi *lapic_nmi = NULL;
784 +
785 +       lapic_nmi = (struct acpi_table_lapic_nmi *)header;
786 +
787 +       if (BAD_MADT_ENTRY(lapic_nmi, end))
788 +               return -EINVAL;
789 +
790 +       acpi_table_print_madt_entry(header);
791 +
792 +       if (lapic_nmi->lint != 1)
793 +               printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
794 +
795 +       return 0;
796 +}
797 +
798 +#endif                         /*CONFIG_X86_LOCAL_APIC */
799 +
800 +#ifdef CONFIG_X86_IO_APIC
801 +
802 +static int __init
803 +acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end)
804 +{
805 +       struct acpi_table_ioapic *ioapic = NULL;
806 +
807 +       ioapic = (struct acpi_table_ioapic *)header;
808 +
809 +       if (BAD_MADT_ENTRY(ioapic, end))
810 +               return -EINVAL;
811 +
812 +       acpi_table_print_madt_entry(header);
813 +
814 +       mp_register_ioapic(ioapic->id,
815 +                          ioapic->address, ioapic->global_irq_base);
816 +
817 +       return 0;
818 +}
819 +
820 +/*
821 + * Parse Interrupt Source Override for the ACPI SCI
822 + */
823 +static void acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
824 +{
825 +       if (trigger == 0)       /* compatible SCI trigger is level */
826 +               trigger = 3;
827 +
828 +       if (polarity == 0)      /* compatible SCI polarity is low */
829 +               polarity = 3;
830 +
831 +       /* Command-line override via acpi_sci= */
832 +       if (acpi_sci_flags.trigger)
833 +               trigger = acpi_sci_flags.trigger;
834 +
835 +       if (acpi_sci_flags.polarity)
836 +               polarity = acpi_sci_flags.polarity;
837 +
838 +       /*
839 +        * mp_config_acpi_legacy_irqs() already setup IRQs < 16
840 +        * If GSI is < 16, this will update its flags,
841 +        * else it will create a new mp_irqs[] entry.
842 +        */
843 +       mp_override_legacy_irq(gsi, polarity, trigger, gsi);
844 +
845 +       /*
846 +        * stash the override to indicate we've been here
847 +        * and for later update of acpi_fadt
848 +        */
849 +       acpi_sci_override_gsi = gsi;
850 +       return;
851 +}
852 +
853 +static int __init
854 +acpi_parse_int_src_ovr(acpi_table_entry_header * header,
855 +                      const unsigned long end)
856 +{
857 +       struct acpi_table_int_src_ovr *intsrc = NULL;
858 +
859 +       intsrc = (struct acpi_table_int_src_ovr *)header;
860 +
861 +       if (BAD_MADT_ENTRY(intsrc, end))
862 +               return -EINVAL;
863 +
864 +       acpi_table_print_madt_entry(header);
865 +
866 +       if (intsrc->bus_irq == acpi_fadt.sci_int) {
867 +               acpi_sci_ioapic_setup(intsrc->global_irq,
868 +                                     intsrc->flags.polarity,
869 +                                     intsrc->flags.trigger);
870 +               return 0;
871 +       }
872 +
873 +       if (acpi_skip_timer_override &&
874 +           intsrc->bus_irq == 0 && intsrc->global_irq == 2) {
875 +               printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
876 +               return 0;
877 +       }
878 +
879 +       mp_override_legacy_irq(intsrc->bus_irq,
880 +                              intsrc->flags.polarity,
881 +                              intsrc->flags.trigger, intsrc->global_irq);
882 +
883 +       return 0;
884 +}
885 +
886 +static int __init
887 +acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end)
888 +{
889 +       struct acpi_table_nmi_src *nmi_src = NULL;
890 +
891 +       nmi_src = (struct acpi_table_nmi_src *)header;
892 +
893 +       if (BAD_MADT_ENTRY(nmi_src, end))
894 +               return -EINVAL;
895 +
896 +       acpi_table_print_madt_entry(header);
897 +
898 +       /* TBD: Support nmi_src entries? */
899 +
900 +       return 0;
901 +}
902 +
903 +#endif                         /* CONFIG_X86_IO_APIC */
904 +
905 +/*
906 + * acpi_pic_sci_set_trigger()
907 + * 
908 + * use ELCR to set PIC-mode trigger type for SCI
909 + *
910 + * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
911 + * it may require Edge Trigger -- use "acpi_sci=edge"
912 + *
913 + * Ports 0x4d0-0x4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
914 + * for the 8259 PIC.  bit[n] = 1 means irq[n] is level-triggered, otherwise edge.
915 + * ELCR1 covers IRQs 0-7 (IRQ 0, 1, 2 must be 0)
916 + * ELCR2 covers IRQs 8-15 (IRQ 8, 13 must be 0)
917 + */
918 +
919 +void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
920 +{
921 +       unsigned int mask = 1 << irq;
922 +       unsigned int old, new;
923 +
924 +       /* Real old ELCR mask */
925 +       old = inb(0x4d0) | (inb(0x4d1) << 8);
926 +
927 +       /*
928 +        * If we use ACPI to set PCI IRQs, then we should clear the ELCR,
929 +        * since we will set it correctly as we enable PCI IRQ
930 +        * routing.
931 +        */
932 +       new = acpi_noirq ? old : 0;
933 +
934 +       /*
935 +        * Update the SCI information in the ELCR; it isn't in the PCI
936 +        * routing tables.
937 +        */
938 +       switch (trigger) {
939 +       case 1:         /* Edge - clear */
940 +               new &= ~mask;
941 +               break;
942 +       case 3:         /* Level - set */
943 +               new |= mask;
944 +               break;
945 +       }
946 +
947 +       if (old == new)
948 +               return;
949 +
950 +       printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
951 +       outb(new, 0x4d0);
952 +       outb(new >> 8, 0x4d1);
953 +}
954 +
955 +int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
956 +{
957 +#ifdef CONFIG_X86_IO_APIC
958 +       if (use_pci_vector() && !platform_legacy_irq(gsi))
959 +               *irq = IO_APIC_VECTOR(gsi);
960 +       else
961 +#endif
962 +               *irq = gsi_irq_sharing(gsi);
963 +       return 0;
964 +}
965 +
966 +/*
967 + * success: return IRQ number (>=0)
968 + * failure: return < 0
969 + */
970 +int acpi_register_gsi(u32 gsi, int triggering, int polarity)
971 +{
972 +       unsigned int irq;
973 +       unsigned int plat_gsi = gsi;
974 +
975 +#ifdef CONFIG_PCI
976 +       /*
977 +        * Make sure all (legacy) PCI IRQs are set as level-triggered.
978 +        */
979 +       if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
980 +               extern void eisa_set_level_irq(unsigned int irq);
981 +
982 +               if (triggering == ACPI_LEVEL_SENSITIVE)
983 +                       eisa_set_level_irq(gsi);
984 +       }
985 +#endif
986 +
987 +#ifdef CONFIG_X86_IO_APIC
988 +       if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
989 +               plat_gsi = mp_register_gsi(gsi, triggering, polarity);
990 +       }
991 +#endif
992 +       acpi_gsi_to_irq(plat_gsi, &irq);
993 +       return irq;
994 +}
995 +
996 +EXPORT_SYMBOL(acpi_register_gsi);
997 +
998 +/*
999 + *  ACPI based hotplug support for CPU
1000 + */
1001 +#ifdef CONFIG_ACPI_HOTPLUG_CPU
1002 +int acpi_map_lsapic(acpi_handle handle, int *pcpu)
1003 +{
1004 +       /* TBD */
1005 +       return -EINVAL;
1006 +}
1007 +
1008 +EXPORT_SYMBOL(acpi_map_lsapic);
1009 +
1010 +int acpi_unmap_lsapic(int cpu)
1011 +{
1012 +       /* TBD */
1013 +       return -EINVAL;
1014 +}
1015 +
1016 +EXPORT_SYMBOL(acpi_unmap_lsapic);
1017 +#endif                         /* CONFIG_ACPI_HOTPLUG_CPU */
1018 +
1019 +int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
1020 +{
1021 +       /* TBD */
1022 +       return -EINVAL;
1023 +}
1024 +
1025 +EXPORT_SYMBOL(acpi_register_ioapic);
1026 +
1027 +int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
1028 +{
1029 +       /* TBD */
1030 +       return -EINVAL;
1031 +}
1032 +
1033 +EXPORT_SYMBOL(acpi_unregister_ioapic);
1034 +
1035 +static unsigned long __init
1036 +acpi_scan_rsdp(unsigned long start, unsigned long length)
1037 +{
1038 +       unsigned long offset = 0;
1039 +       unsigned long sig_len = sizeof("RSD PTR ") - 1;
1040 +       unsigned long vstart = (unsigned long)isa_bus_to_virt(start);
1041 +
1042 +       /*
1043 +        * Scan all 16-byte boundaries of the physical memory region for the
1044 +        * RSDP signature.
1045 +        */
1046 +       for (offset = 0; offset < length; offset += 16) {
1047 +               if (strncmp((char *)(vstart + offset), "RSD PTR ", sig_len))
1048 +                       continue;
1049 +               return (start + offset);
1050 +       }
1051 +
1052 +       return 0;
1053 +}
1054 +
1055 +static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
1056 +{
1057 +       struct acpi_table_sbf *sb;
1058 +
1059 +       if (!phys_addr || !size)
1060 +               return -EINVAL;
1061 +
1062 +       sb = (struct acpi_table_sbf *)__acpi_map_table(phys_addr, size);
1063 +       if (!sb) {
1064 +               printk(KERN_WARNING PREFIX "Unable to map SBF\n");
1065 +               return -ENODEV;
1066 +       }
1067 +
1068 +       sbf_port = sb->sbf_cmos;        /* Save CMOS port */
1069 +
1070 +       return 0;
1071 +}
1072 +
1073 +#ifdef CONFIG_HPET_TIMER
1074 +
1075 +static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
1076 +{
1077 +       struct acpi_table_hpet *hpet_tbl;
1078 +
1079 +       if (!phys || !size)
1080 +               return -EINVAL;
1081 +
1082 +       hpet_tbl = (struct acpi_table_hpet *)__acpi_map_table(phys, size);
1083 +       if (!hpet_tbl) {
1084 +               printk(KERN_WARNING PREFIX "Unable to map HPET\n");
1085 +               return -ENODEV;
1086 +       }
1087 +
1088 +       if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) {
1089 +               printk(KERN_WARNING PREFIX "HPET timers must be located in "
1090 +                      "memory.\n");
1091 +               return -1;
1092 +       }
1093 +#ifdef CONFIG_X86_64
1094 +       vxtime.hpet_address = hpet_tbl->addr.addrl |
1095 +           ((long)hpet_tbl->addr.addrh << 32);
1096 +
1097 +       printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
1098 +              hpet_tbl->id, vxtime.hpet_address);
1099 +#else                          /* X86 */
1100 +       {
1101 +               extern unsigned long hpet_address;
1102 +
1103 +               hpet_address = hpet_tbl->addr.addrl;
1104 +               printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
1105 +                      hpet_tbl->id, hpet_address);
1106 +       }
1107 +#endif                         /* X86 */
1108 +
1109 +       return 0;
1110 +}
1111 +#else
1112 +#define        acpi_parse_hpet NULL
1113 +#endif
1114 +
1115 +#ifdef CONFIG_X86_PM_TIMER
1116 +extern u32 pmtmr_ioport;
1117 +#endif
1118 +
1119 +static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
1120 +{
1121 +       struct fadt_descriptor_rev2 *fadt = NULL;
1122 +
1123 +       fadt = (struct fadt_descriptor_rev2 *)__acpi_map_table(phys, size);
1124 +       if (!fadt) {
1125 +               printk(KERN_WARNING PREFIX "Unable to map FADT\n");
1126 +               return 0;
1127 +       }
1128 +       /* initialize sci_int early for INT_SRC_OVR MADT parsing */
1129 +       acpi_fadt.sci_int = fadt->sci_int;
1130 +
1131 +       /* initialize rev and apic_phys_dest_mode for x86_64 genapic */
1132 +       acpi_fadt.revision = fadt->revision;
1133 +       acpi_fadt.force_apic_physical_destination_mode =
1134 +           fadt->force_apic_physical_destination_mode;
1135 +
1136 +#if defined(CONFIG_X86_PM_TIMER) && !defined(CONFIG_XEN)
1137 +       /* detect the location of the ACPI PM Timer */
1138 +       if (fadt->revision >= FADT2_REVISION_ID) {
1139 +               /* FADT rev. 2 */
1140 +               if (fadt->xpm_tmr_blk.address_space_id !=
1141 +                   ACPI_ADR_SPACE_SYSTEM_IO)
1142 +                       return 0;
1143 +
1144 +               pmtmr_ioport = fadt->xpm_tmr_blk.address;
1145 +               /*
1146 +                * "X" fields are optional extensions to the original V1.0
1147 +                * fields, so we must selectively expand V1.0 fields if the
1148 +                * corresponding X field is zero.
1149 +                */
1150 +               if (!pmtmr_ioport)
1151 +                       pmtmr_ioport = fadt->V1_pm_tmr_blk;
1152 +       } else {
1153 +               /* FADT rev. 1 */
1154 +               pmtmr_ioport = fadt->V1_pm_tmr_blk;
1155 +       }
1156 +       if (pmtmr_ioport)
1157 +               printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
1158 +                      pmtmr_ioport);
1159 +#endif
1160 +       return 0;
1161 +}
1162 +
1163 +unsigned long __init acpi_find_rsdp(void)
1164 +{
1165 +       unsigned long rsdp_phys = 0;
1166 +
1167 +       if (efi_enabled) {
1168 +               if (efi.acpi20)
1169 +                       return __pa(efi.acpi20);
1170 +               else if (efi.acpi)
1171 +                       return __pa(efi.acpi);
1172 +       }
1173 +       /*
1174 +        * Scan memory looking for the RSDP signature. First search EBDA (low
1175 +        * memory) paragraphs and then search upper memory (E0000-FFFFF).
1176 +        */
1177 +       rsdp_phys = acpi_scan_rsdp(0, 0x400);
1178 +       if (!rsdp_phys)
1179 +               rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000);
1180 +
1181 +       return rsdp_phys;
1182 +}
1183 +
1184 +#ifdef CONFIG_X86_LOCAL_APIC
1185 +/*
1186 + * Parse LAPIC entries in MADT
1187 + * returns 0 on success, < 0 on error
1188 + */
1189 +static int __init acpi_parse_madt_lapic_entries(void)
1190 +{
1191 +       int count;
1192 +
1193 +       /* 
1194 +        * Note that the LAPIC address is obtained from the MADT (32-bit value)
1195 +        * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
1196 +        */
1197 +
1198 +       count =
1199 +           acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR,
1200 +                                 acpi_parse_lapic_addr_ovr, 0);
1201 +       if (count < 0) {
1202 +               printk(KERN_ERR PREFIX
1203 +                      "Error parsing LAPIC address override entry\n");
1204 +               return count;
1205 +       }
1206 +
1207 +       mp_register_lapic_address(acpi_lapic_addr);
1208 +
1209 +       count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic,
1210 +                                     MAX_APICS);
1211 +       if (!count) {
1212 +               printk(KERN_ERR PREFIX "No LAPIC entries present\n");
1213 +               /* TBD: Cleanup to allow fallback to MPS */
1214 +               return -ENODEV;
1215 +       } else if (count < 0) {
1216 +               printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
1217 +               /* TBD: Cleanup to allow fallback to MPS */
1218 +               return count;
1219 +       }
1220 +
1221 +       count =
1222 +           acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0);
1223 +       if (count < 0) {
1224 +               printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
1225 +               /* TBD: Cleanup to allow fallback to MPS */
1226 +               return count;
1227 +       }
1228 +       return 0;
1229 +}
1230 +#endif                         /* CONFIG_X86_LOCAL_APIC */
1231 +
1232 +#ifdef CONFIG_X86_IO_APIC
1233 +/*
1234 + * Parse IOAPIC related entries in MADT
1235 + * returns 0 on success, < 0 on error
1236 + */
1237 +static int __init acpi_parse_madt_ioapic_entries(void)
1238 +{
1239 +       int count;
1240 +
1241 +       /*
1242 +        * The ACPI interpreter is required to complete interrupt setup,
1243 +        * so if it is off, don't enumerate the IO-APICs with ACPI.
1244 +        * If MPS is present, it will handle them;
1245 +        * otherwise the system will stay in PIC mode.
1246 +        */
1247 +       if (acpi_disabled || acpi_noirq) {
1248 +               return -ENODEV;
1249 +       }
1250 +
1251 +       /*
1252 +        * if "noapic" boot option, don't look for IO-APICs
1253 +        */
1254 +       if (skip_ioapic_setup) {
1255 +               printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
1256 +                      "due to 'noapic' option.\n");
1257 +               return -ENODEV;
1258 +       }
1259 +
1260 +       count =
1261 +           acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic,
1262 +                                 MAX_IO_APICS);
1263 +       if (!count) {
1264 +               printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
1265 +               return -ENODEV;
1266 +       } else if (count < 0) {
1267 +               printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
1268 +               return count;
1269 +       }
1270 +
1271 +       count =
1272 +           acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr,
1273 +                                 NR_IRQ_VECTORS);
1274 +       if (count < 0) {
1275 +               printk(KERN_ERR PREFIX
1276 +                      "Error parsing interrupt source overrides entry\n");
1277 +               /* TBD: Cleanup to allow fallback to MPS */
1278 +               return count;
1279 +       }
1280 +
1281 +       /*
1282 +        * If the BIOS did not supply an INT_SRC_OVR for the SCI,
1283 +        * pretend we got one so we can set the SCI flags.
1284 +        */
1285 +       if (!acpi_sci_override_gsi)
1286 +               acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
1287 +
1288 +       /* Fill in identity legacy mappings where there is no override */
1289 +       mp_config_acpi_legacy_irqs();
1290 +
1291 +       count =
1292 +           acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src,
1293 +                                 NR_IRQ_VECTORS);
1294 +       if (count < 0) {
1295 +               printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
1296 +               /* TBD: Cleanup to allow fallback to MPS */
1297 +               return count;
1298 +       }
1299 +
1300 +       return 0;
1301 +}
1302 +#else
1303 +static inline int acpi_parse_madt_ioapic_entries(void)
1304 +{
1305 +       return -1;
1306 +}
1307 +#endif /* !CONFIG_X86_IO_APIC */
1308 +
1309 +static void __init acpi_process_madt(void)
1310 +{
1311 +#ifdef CONFIG_X86_LOCAL_APIC
1312 +       int count, error;
1313 +
1314 +       count = acpi_table_parse(ACPI_APIC, acpi_parse_madt);
1315 +       if (count >= 1) {
1316 +
1317 +               /*
1318 +                * Parse MADT LAPIC entries
1319 +                */
1320 +               error = acpi_parse_madt_lapic_entries();
1321 +               if (!error) {
1322 +                       acpi_lapic = 1;
1323 +
1324 +#ifdef CONFIG_X86_GENERICARCH
1325 +                       generic_bigsmp_probe();
1326 +#endif
1327 +                       /*
1328 +                        * Parse MADT IO-APIC entries
1329 +                        */
1330 +                       error = acpi_parse_madt_ioapic_entries();
1331 +                       if (!error) {
1332 +                               acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
1333 +                               acpi_irq_balance_set(NULL);
1334 +                               acpi_ioapic = 1;
1335 +
1336 +                               smp_found_config = 1;
1337 +                               clustered_apic_check();
1338 +                       }
1339 +               }
1340 +               if (error == -EINVAL) {
1341 +                       /*
1342 +                        * Dell Precision Workstation 410, 610 come here.
1343 +                        */
1344 +                       printk(KERN_ERR PREFIX
1345 +                              "Invalid BIOS MADT, disabling ACPI\n");
1346 +                       disable_acpi();
1347 +               }
1348 +       }
1349 +#endif
1350 +       return;
1351 +}
1352 +
1353 +extern int acpi_force;
1354 +
1355 +#ifdef __i386__
1356 +
1357 +static int __init disable_acpi_irq(struct dmi_system_id *d)
1358 +{
1359 +       if (!acpi_force) {
1360 +               printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
1361 +                      d->ident);
1362 +               acpi_noirq_set();
1363 +       }
1364 +       return 0;
1365 +}
1366 +
1367 +static int __init disable_acpi_pci(struct dmi_system_id *d)
1368 +{
1369 +       if (!acpi_force) {
1370 +               printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
1371 +                      d->ident);
1372 +               acpi_disable_pci();
1373 +       }
1374 +       return 0;
1375 +}
1376 +
1377 +static int __init dmi_disable_acpi(struct dmi_system_id *d)
1378 +{
1379 +       if (!acpi_force) {
1380 +               printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
1381 +               disable_acpi();
1382 +       } else {
1383 +               printk(KERN_NOTICE
1384 +                      "Warning: DMI blacklist says broken, but acpi forced\n");
1385 +       }
1386 +       return 0;
1387 +}
1388 +
1389 +/*
1390 + * Limit ACPI to CPU enumeration for HT
1391 + */
1392 +static int __init force_acpi_ht(struct dmi_system_id *d)
1393 +{
1394 +       if (!acpi_force) {
1395 +               printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
1396 +                      d->ident);
1397 +               disable_acpi();
1398 +               acpi_ht = 1;
1399 +       } else {
1400 +               printk(KERN_NOTICE
1401 +                      "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
1402 +       }
1403 +       return 0;
1404 +}
1405 +
1406 +/*
1407 + * If your system is blacklisted here, but you find that acpi=force
1408 + * works for you, please contact acpi-devel@sourceforge.net
1409 + */
1410 +static struct dmi_system_id __initdata acpi_dmi_table[] = {
1411 +       /*
1412 +        * Boxes that need ACPI disabled
1413 +        */
1414 +       {
1415 +        .callback = dmi_disable_acpi,
1416 +        .ident = "IBM Thinkpad",
1417 +        .matches = {
1418 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1419 +                    DMI_MATCH(DMI_BOARD_NAME, "2629H1G"),
1420 +                    },
1421 +        },
1422 +
1423 +       /*
1424 +        * Boxes that need acpi=ht
1425 +        */
1426 +       {
1427 +        .callback = force_acpi_ht,
1428 +        .ident = "FSC Primergy T850",
1429 +        .matches = {
1430 +                    DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
1431 +                    DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
1432 +                    },
1433 +        },
1434 +       {
1435 +        .callback = force_acpi_ht,
1436 +        .ident = "DELL GX240",
1437 +        .matches = {
1438 +                    DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"),
1439 +                    DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"),
1440 +                    },
1441 +        },
1442 +       {
1443 +        .callback = force_acpi_ht,
1444 +        .ident = "HP VISUALIZE NT Workstation",
1445 +        .matches = {
1446 +                    DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
1447 +                    DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
1448 +                    },
1449 +        },
1450 +       {
1451 +        .callback = force_acpi_ht,
1452 +        .ident = "Compaq Workstation W8000",
1453 +        .matches = {
1454 +                    DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
1455 +                    DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
1456 +                    },
1457 +        },
1458 +       {
1459 +        .callback = force_acpi_ht,
1460 +        .ident = "ASUS P4B266",
1461 +        .matches = {
1462 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1463 +                    DMI_MATCH(DMI_BOARD_NAME, "P4B266"),
1464 +                    },
1465 +        },
1466 +       {
1467 +        .callback = force_acpi_ht,
1468 +        .ident = "ASUS P2B-DS",
1469 +        .matches = {
1470 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1471 +                    DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1472 +                    },
1473 +        },
1474 +       {
1475 +        .callback = force_acpi_ht,
1476 +        .ident = "ASUS CUR-DLS",
1477 +        .matches = {
1478 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1479 +                    DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
1480 +                    },
1481 +        },
1482 +       {
1483 +        .callback = force_acpi_ht,
1484 +        .ident = "ABIT i440BX-W83977",
1485 +        .matches = {
1486 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
1487 +                    DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
1488 +                    },
1489 +        },
1490 +       {
1491 +        .callback = force_acpi_ht,
1492 +        .ident = "IBM Bladecenter",
1493 +        .matches = {
1494 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1495 +                    DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
1496 +                    },
1497 +        },
1498 +       {
1499 +        .callback = force_acpi_ht,
1500 +        .ident = "IBM eServer xSeries 360",
1501 +        .matches = {
1502 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1503 +                    DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
1504 +                    },
1505 +        },
1506 +       {
1507 +        .callback = force_acpi_ht,
1508 +        .ident = "IBM eserver xSeries 330",
1509 +        .matches = {
1510 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1511 +                    DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
1512 +                    },
1513 +        },
1514 +       {
1515 +        .callback = force_acpi_ht,
1516 +        .ident = "IBM eserver xSeries 440",
1517 +        .matches = {
1518 +                    DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1519 +                    DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
1520 +                    },
1521 +        },
1522 +
1523 +       /*
1524 +        * Boxes that need ACPI PCI IRQ routing disabled
1525 +        */
1526 +       {
1527 +        .callback = disable_acpi_irq,
1528 +        .ident = "ASUS A7V",
1529 +        .matches = {
1530 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
1531 +                    DMI_MATCH(DMI_BOARD_NAME, "<A7V>"),
1532 +                    /* newer BIOS, Revision 1011, does work */
1533 +                    DMI_MATCH(DMI_BIOS_VERSION,
1534 +                              "ASUS A7V ACPI BIOS Revision 1007"),
1535 +                    },
1536 +        },
1537 +
1538 +       /*
1539 +        * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
1540 +        */
1541 +       {                       /* _BBN 0 bug */
1542 +        .callback = disable_acpi_pci,
1543 +        .ident = "ASUS PR-DLS",
1544 +        .matches = {
1545 +                    DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1546 +                    DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"),
1547 +                    DMI_MATCH(DMI_BIOS_VERSION,
1548 +                              "ASUS PR-DLS ACPI BIOS Revision 1010"),
1549 +                    DMI_MATCH(DMI_BIOS_DATE, "03/21/2003")
1550 +                    },
1551 +        },
1552 +       {
1553 +        .callback = disable_acpi_pci,
1554 +        .ident = "Acer TravelMate 36x Laptop",
1555 +        .matches = {
1556 +                    DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
1557 +                    DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1558 +                    },
1559 +        },
1560 +       {}
1561 +};
1562 +
1563 +#endif                         /* __i386__ */
1564 +
1565 +/*
1566 + * acpi_boot_table_init() and acpi_boot_init()
1567 + *  called from setup_arch(), always.
1568 + *     1. checksums all tables
1569 + *     2. enumerates lapics
1570 + *     3. enumerates io-apics
1571 + *
1572 + * acpi_table_init() is separate to allow reading SRAT without
1573 + * other side effects.
1574 + *
1575 + * side effects of acpi_boot_init:
1576 + *     acpi_lapic = 1 if LAPIC found
1577 + *     acpi_ioapic = 1 if IOAPIC found
1578 + *     if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
1579 + *     if acpi_blacklisted() acpi_disabled = 1;
1580 + *     acpi_irq_model=...
1581 + *     ...
1582 + *
1583 + * return value: (currently ignored)
1584 + *     0: success
1585 + *     !0: failure
1586 + */
1587 +
1588 +int __init acpi_boot_table_init(void)
1589 +{
1590 +       int error;
1591 +
1592 +#ifdef __i386__
1593 +       dmi_check_system(acpi_dmi_table);
1594 +#endif
1595 +
1596 +       /*
1597 +        * If acpi_disabled, bail out
1598 +        * One exception: acpi=ht continues far enough to enumerate LAPICs
1599 +        */
1600 +       if (acpi_disabled && !acpi_ht)
1601 +               return 1;
1602 +
1603 +       /* 
1604 +        * Initialize the ACPI boot-time table parser.
1605 +        */
1606 +       error = acpi_table_init();
1607 +       if (error) {
1608 +               disable_acpi();
1609 +               return error;
1610 +       }
1611 +
1612 +       acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
1613 +
1614 +       /*
1615 +        * blacklist may disable ACPI entirely
1616 +        */
1617 +       error = acpi_blacklisted();
1618 +       if (error) {
1619 +               if (acpi_force) {
1620 +                       printk(KERN_WARNING PREFIX "acpi=force override\n");
1621 +               } else {
1622 +                       printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
1623 +                       disable_acpi();
1624 +                       return error;
1625 +               }
1626 +       }
1627 +
1628 +       return 0;
1629 +}
1630 +
1631 +int __init acpi_boot_init(void)
1632 +{
1633 +       /*
1634 +        * If acpi_disabled, bail out
1635 +        * One exception: acpi=ht continues far enough to enumerate LAPICs
1636 +        */
1637 +       if (acpi_disabled && !acpi_ht)
1638 +               return 1;
1639 +
1640 +       acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
1641 +
1642 +       /*
1643 +        * set sci_int and PM timer address
1644 +        */
1645 +       acpi_table_parse(ACPI_FADT, acpi_parse_fadt);
1646 +
1647 +       /*
1648 +        * Process the Multiple APIC Description Table (MADT), if present
1649 +        */
1650 +       acpi_process_madt();
1651 +
1652 +       acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
1653 +
1654 +       return 0;
1655 +}
1656 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/apic-xen.c linux-2.6.16/arch/i386/kernel/apic-xen.c
1657 --- linux-2.6.16.orig/arch/i386/kernel/apic-xen.c       1970-01-01 01:00:00.000000000 +0100
1658 +++ linux-2.6.16/arch/i386/kernel/apic-xen.c    2006-06-26 09:51:32.000000000 +0200
1659 @@ -0,0 +1,140 @@
1660 +/*
1661 + *     Local APIC handling, local APIC timers
1662 + *
1663 + *     (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
1664 + *
1665 + *     Fixes
1666 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
1667 + *                                     thanks to Eric Gilmore
1668 + *                                     and Rolf G. Tews
1669 + *                                     for testing these extensively.
1670 + *     Maciej W. Rozycki       :       Various updates and fixes.
1671 + *     Mikael Pettersson       :       Power Management for UP-APIC.
1672 + *     Pavel Machek and
1673 + *     Mikael Pettersson       :       PM converted to driver model.
1674 + */
1675 +
1676 +#include <linux/config.h>
1677 +#include <linux/init.h>
1678 +
1679 +#include <linux/mm.h>
1680 +#include <linux/delay.h>
1681 +#include <linux/bootmem.h>
1682 +#include <linux/smp_lock.h>
1683 +#include <linux/interrupt.h>
1684 +#include <linux/mc146818rtc.h>
1685 +#include <linux/kernel_stat.h>
1686 +#include <linux/sysdev.h>
1687 +#include <linux/cpu.h>
1688 +#include <linux/module.h>
1689 +
1690 +#include <asm/atomic.h>
1691 +#include <asm/smp.h>
1692 +#include <asm/mtrr.h>
1693 +#include <asm/mpspec.h>
1694 +#include <asm/desc.h>
1695 +#include <asm/arch_hooks.h>
1696 +#include <asm/hpet.h>
1697 +#include <asm/i8253.h>
1698 +
1699 +#include <mach_apic.h>
1700 +#include <mach_ipi.h>
1701 +
1702 +#include "io_ports.h"
1703 +
1704 +#ifndef CONFIG_XEN
1705 +/*
1706 + * cpu_mask denoting the CPUs that need the timer interrupt delivered as
1707 + * IPIs in place of local APIC timers
1708 + */
1709 +static cpumask_t timer_bcast_ipi;
1710 +#endif
1711 +
1712 +/*
1713 + * Knob to control our willingness to enable the local APIC.
1714 + */
1715 +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
1716 +
1717 +/*
1718 + * Debug level
1719 + */
1720 +int apic_verbosity;
1721 +
1722 +/*
1723 + * 'what should we do if we get a hw irq event on an illegal vector'.
1724 + * Each architecture has to answer this itself.
1725 + */
1726 +void ack_bad_irq(unsigned int irq)
1727 +{
1728 +       printk("unexpected IRQ trap at vector %02x\n", irq);
1729 +       /*
1730 +        * Currently unexpected vectors happen only on SMP and APIC.
1731 +        * We _must_ ack these because every local APIC has only N
1732 +        * irq slots per priority level, and a 'hanging, unacked' IRQ
1733 +        * holds up an irq slot - in excessive cases (when multiple
1734 +        * unexpected vectors occur) that might lock up the APIC
1735 +        * completely.
1736 +        * But only ack when the APIC is enabled -AK
1737 +        */
1738 +       if (cpu_has_apic)
1739 +               ack_APIC_irq();
1740 +}
1741 +
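     +/* Xen stub: the hypervisor owns the real APIC, so presumably the
     +   widest physical broadcast mask is always safe to report. */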
1742 +int get_physical_broadcast(void)
1743 +{
1744 +        return 0xff;
1745 +}
1746 +
1747 +#ifndef CONFIG_XEN
1748 +#ifndef CONFIG_SMP
1749 +static void up_apic_timer_interrupt_call(struct pt_regs *regs)
1750 +{
1751 +       int cpu = smp_processor_id();
1752 +
1753 +       /*
1754 +        * the NMI deadlock-detector uses this.
1755 +        */
1756 +       per_cpu(irq_stat, cpu).apic_timer_irqs++;
1757 +
1758 +       smp_local_timer_interrupt(regs);
1759 +}
1760 +#endif
1761 +
1762 +void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
1763 +{
1764 +       cpumask_t mask;
1765 +
1766 +       cpus_and(mask, cpu_online_map, timer_bcast_ipi);
1767 +       if (!cpus_empty(mask)) {
1768 +#ifdef CONFIG_SMP
1769 +               send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
1770 +#else
1771 +               /*
1772 +                * We can call the local APIC timer interrupt handler
1773 +                * directly in the UP case, skipping all the irq-related machinery
1774 +                */
1775 +               up_apic_timer_interrupt_call(regs);
1776 +#endif
1777 +       }
1778 +}
1779 +#endif
1780 +
1781 +int setup_profiling_timer(unsigned int multiplier)
1782 +{
1783 +       return -EINVAL;
1784 +}
1785 +
1786 +/*
1787 + * This initializes the IO-APIC and APIC hardware if this is
1788 + * a UP kernel.
1789 + */
1790 +int __init APIC_init_uniprocessor (void)
1791 +{
1792 +#ifdef CONFIG_X86_IO_APIC
1793 +       if (smp_found_config)
1794 +               if (!skip_ioapic_setup && nr_ioapics)
1795 +                       setup_IO_APIC();
1796 +#endif
1797 +
1798 +       return 0;
1799 +}
1800 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/asm-offsets.c linux-2.6.16/arch/i386/kernel/asm-offsets.c
1801 --- linux-2.6.16.orig/arch/i386/kernel/asm-offsets.c    2006-03-20 06:53:29.000000000 +0100
1802 +++ linux-2.6.16/arch/i386/kernel/asm-offsets.c 2006-06-26 09:51:32.000000000 +0200
1803 @@ -13,6 +13,7 @@
1804  #include <asm/fixmap.h>
1805  #include <asm/processor.h>
1806  #include <asm/thread_info.h>
1807 +#include <asm/elf.h>
1808  
1809  #define DEFINE(sym, val) \
1810          asm volatile("\n->" #sym " %0 " #val : : "i" (val))
1811 @@ -63,10 +64,12 @@
1812         OFFSET(pbe_orig_address, pbe, orig_address);
1813         OFFSET(pbe_next, pbe, next);
1814  
1815 +#ifdef CONFIG_X86_SYSENTER
1816         /* Offset from the sysenter stack to tss.esp0 */
1817         DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) -
1818                  sizeof(struct tss_struct));
1819 +#endif
1820  
1821         DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
1822 -       DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
1823 +       DEFINE(VSYSCALL_BASE, VSYSCALL_BASE);
1824  }
1825 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/cpu/Makefile linux-2.6.16/arch/i386/kernel/cpu/Makefile
1826 --- linux-2.6.16.orig/arch/i386/kernel/cpu/Makefile     2006-03-20 06:53:29.000000000 +0100
1827 +++ linux-2.6.16/arch/i386/kernel/cpu/Makefile  2006-06-26 09:51:32.000000000 +0200
1828 @@ -17,3 +17,8 @@
1829  
1830  obj-$(CONFIG_MTRR)     +=      mtrr/
1831  obj-$(CONFIG_CPU_FREQ) +=      cpufreq/
1832 +
1833 +ifdef CONFIG_XEN
1834 +include $(srctree)/scripts/Makefile.xen
1835 +obj-y := $(call cherrypickxen, $(obj-y), $(src))
1836 +endif
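     +# Note: cherrypickxen (from scripts/Makefile.xen) presumably rewrites each
     +# obj to its foo-xen.o variant wherever a Xen-specific source file exists.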
1837 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/cpu/common-xen.c linux-2.6.16/arch/i386/kernel/cpu/common-xen.c
1838 --- linux-2.6.16.orig/arch/i386/kernel/cpu/common-xen.c 1970-01-01 01:00:00.000000000 +0100
1839 +++ linux-2.6.16/arch/i386/kernel/cpu/common-xen.c      2006-06-26 09:51:32.000000000 +0200
1840 @@ -0,0 +1,715 @@
1841 +#include <linux/init.h>
1842 +#include <linux/string.h>
1843 +#include <linux/delay.h>
1844 +#include <linux/smp.h>
1845 +#include <linux/module.h>
1846 +#include <linux/percpu.h>
1847 +#include <linux/bootmem.h>
1848 +#include <asm/semaphore.h>
1849 +#include <asm/processor.h>
1850 +#include <asm/i387.h>
1851 +#include <asm/msr.h>
1852 +#include <asm/io.h>
1853 +#include <asm/mmu_context.h>
1854 +#ifdef CONFIG_X86_LOCAL_APIC
1855 +#include <asm/mpspec.h>
1856 +#include <asm/apic.h>
1857 +#include <mach_apic.h>
1858 +#endif
1859 +#include <asm/hypervisor.h>
1860 +
1861 +#include "cpu.h"
1862 +
1863 +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
1864 +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
1865 +
1866 +#ifndef CONFIG_XEN
1867 +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
1868 +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
1869 +#endif
1870 +
1871 +static int cachesize_override __devinitdata = -1;
1872 +static int disable_x86_fxsr __devinitdata = 0;
1873 +static int disable_x86_serial_nr __devinitdata = 1;
1874 +
1875 +struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
1876 +
1877 +extern int disable_pse;
1878 +
1879 +static void default_init(struct cpuinfo_x86 * c)
1880 +{
1881 +       /* Not much we can do here... */
1882 +       /* Check if at least it has cpuid */
1883 +       if (c->cpuid_level == -1) {
1884 +               /* No cpuid. It must be an ancient CPU */
1885 +               if (c->x86 == 4)
1886 +                       strcpy(c->x86_model_id, "486");
1887 +               else if (c->x86 == 3)
1888 +                       strcpy(c->x86_model_id, "386");
1889 +       }
1890 +}
1891 +
1892 +static struct cpu_dev default_cpu = {
1893 +       .c_init = default_init,
1894 +       .c_vendor = "Unknown",
1895 +};
1896 +static struct cpu_dev * this_cpu = &default_cpu;
1897 +
1898 +static int __init cachesize_setup(char *str)
1899 +{
1900 +       get_option (&str, &cachesize_override);
1901 +       return 1;
1902 +}
1903 +__setup("cachesize=", cachesize_setup);
1904 +
1905 +int __devinit get_model_name(struct cpuinfo_x86 *c)
1906 +{
1907 +       unsigned int *v;
1908 +       char *p, *q;
1909 +
1910 +       if (cpuid_eax(0x80000000) < 0x80000004)
1911 +               return 0;
1912 +
1913 +       v = (unsigned int *) c->x86_model_id;
1914 +       cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
1915 +       cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
1916 +       cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
1917 +       c->x86_model_id[48] = 0;
1918 +
1919 +       /* Intel chips right-justify this string for some dumb reason;
1920 +          undo that brain damage */
1921 +       p = q = &c->x86_model_id[0];
1922 +       while ( *p == ' ' )
1923 +            p++;
1924 +       if ( p != q ) {
1925 +            while ( *p )
1926 +                 *q++ = *p++;
1927 +            while ( q <= &c->x86_model_id[48] )
1928 +                 *q++ = '\0';  /* Zero-pad the rest */
1929 +       }
1930 +
1931 +       return 1;
1932 +}
1933 +
1934 +
1935 +void __devinit display_cacheinfo(struct cpuinfo_x86 *c)
1936 +{
1937 +       unsigned int n, dummy, ecx, edx, l2size;
1938 +
1939 +       n = cpuid_eax(0x80000000);
1940 +
1941 +       if (n >= 0x80000005) {
1942 +               cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
1943 +               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
1944 +                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
1945 +               c->x86_cache_size=(ecx>>24)+(edx>>24);  
1946 +       }
1947 +
1948 +       if (n < 0x80000006)     /* Some chips just have a large L1. */
1949 +               return;
1950 +
1951 +       ecx = cpuid_ecx(0x80000006);
1952 +       l2size = ecx >> 16;
1953 +       
1954 +       /* do processor-specific cache resizing */
1955 +       if (this_cpu->c_size_cache)
1956 +               l2size = this_cpu->c_size_cache(c,l2size);
1957 +
1958 +       /* Allow user to override all this if necessary. */
1959 +       if (cachesize_override != -1)
1960 +               l2size = cachesize_override;
1961 +
1962 +       if ( l2size == 0 )
1963 +               return;         /* Again, no L2 cache is possible */
1964 +
1965 +       c->x86_cache_size = l2size;
1966 +
1967 +       printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
1968 +              l2size, ecx & 0xFF);
1969 +}
1970 +
1971 +/* Naming convention should be: <Name> [(<Codename>)] */
1972 +/* This table is only used if init_<vendor>() below doesn't set the name; */
1973 +/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
1974 +
1975 +/* Look up CPU names by table lookup. */
1976 +static char __devinit *table_lookup_model(struct cpuinfo_x86 *c)
1977 +{
1978 +       struct cpu_model_info *info;
1979 +
1980 +       if ( c->x86_model >= 16 )
1981 +               return NULL;    /* Range check */
1982 +
1983 +       if (!this_cpu)
1984 +               return NULL;
1985 +
1986 +       info = this_cpu->c_models;
1987 +
1988 +       while (info && info->family) {
1989 +               if (info->family == c->x86)
1990 +                       return info->model_names[c->x86_model];
1991 +               info++;
1992 +       }
1993 +       return NULL;            /* Not found */
1994 +}
1995 +
1996 +
1997 +static void __devinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
1998 +{
1999 +       char *v = c->x86_vendor_id;
2000 +       int i;
2001 +       static int printed;
2002 +
2003 +       for (i = 0; i < X86_VENDOR_NUM; i++) {
2004 +               if (cpu_devs[i]) {
2005 +                       if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
2006 +                           (cpu_devs[i]->c_ident[1] && 
2007 +                            !strcmp(v,cpu_devs[i]->c_ident[1]))) {
2008 +                               c->x86_vendor = i;
2009 +                               if (!early)
2010 +                                       this_cpu = cpu_devs[i];
2011 +                               return;
2012 +                       }
2013 +               }
2014 +       }
2015 +       if (!printed) {
2016 +               printed++;
2017 +               printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
2018 +               printk(KERN_ERR "CPU: Your system may be unstable.\n");
2019 +       }
2020 +       c->x86_vendor = X86_VENDOR_UNKNOWN;
2021 +       this_cpu = &default_cpu;
2022 +}
2023 +
2024 +
2025 +static int __init x86_fxsr_setup(char * s)
2026 +{
2027 +       disable_x86_fxsr = 1;
2028 +       return 1;
2029 +}
2030 +__setup("nofxsr", x86_fxsr_setup);
2031 +
2032 +
2033 +/* Standard macro to see if a specific flag is changeable */
2034 +static inline int flag_is_changeable_p(u32 flag)
2035 +{
2036 +       u32 f1, f2;
2037 +
2038 +       asm("pushfl\n\t"
2039 +           "pushfl\n\t"
2040 +           "popl %0\n\t"
2041 +           "movl %0,%1\n\t"
2042 +           "xorl %2,%0\n\t"
2043 +           "pushl %0\n\t"
2044 +           "popfl\n\t"
2045 +           "pushfl\n\t"
2046 +           "popl %0\n\t"
2047 +           "popfl\n\t"
2048 +           : "=&r" (f1), "=&r" (f2)
2049 +           : "ir" (flag));
2050 +
2051 +       return ((f1^f2) & flag) != 0;
2052 +}
2053 +
2054 +
2055 +/* Probe for the CPUID instruction */
2056 +static int __devinit have_cpuid_p(void)
2057 +{
2058 +       return flag_is_changeable_p(X86_EFLAGS_ID);
2059 +}
2060 +
2061 +/* Do minimum CPU detection early.
2062 +   Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
2063 +   The others are not touched to avoid unwanted side effects.
2064 +
2065 +   WARNING: this function is only called on the BP.  Don't add code here
2066 +   that is supposed to run on all CPUs. */
2067 +static void __init early_cpu_detect(void)
2068 +{
2069 +       struct cpuinfo_x86 *c = &boot_cpu_data;
2070 +
2071 +       c->x86_cache_alignment = 32;
2072 +
2073 +       if (!have_cpuid_p())
2074 +               return;
2075 +
2076 +       /* Get vendor name */
2077 +       cpuid(0x00000000, &c->cpuid_level,
2078 +             (int *)&c->x86_vendor_id[0],
2079 +             (int *)&c->x86_vendor_id[8],
2080 +             (int *)&c->x86_vendor_id[4]);
2081 +
2082 +       get_cpu_vendor(c, 1);
2083 +
2084 +       c->x86 = 4;
2085 +       if (c->cpuid_level >= 0x00000001) {
2086 +               u32 junk, tfms, cap0, misc;
2087 +               cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
2088 +               c->x86 = (tfms >> 8) & 15;
2089 +               c->x86_model = (tfms >> 4) & 15;
2090 +               if (c->x86 == 0xf)
2091 +                       c->x86 += (tfms >> 20) & 0xff;
2092 +               if (c->x86 >= 0x6)
2093 +                       c->x86_model += ((tfms >> 16) & 0xF) << 4;
2094 +               c->x86_mask = tfms & 15;
2095 +               if (cap0 & (1<<19))
2096 +                       c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
2097 +       }
2098 +}
2099 +
2100 +void __devinit generic_identify(struct cpuinfo_x86 * c)
2101 +{
2102 +       u32 tfms, xlvl;
2103 +       int junk;
2104 +
2105 +       if (have_cpuid_p()) {
2106 +               /* Get vendor name */
2107 +               cpuid(0x00000000, &c->cpuid_level,
2108 +                     (int *)&c->x86_vendor_id[0],
2109 +                     (int *)&c->x86_vendor_id[8],
2110 +                     (int *)&c->x86_vendor_id[4]);
2111 +               
2112 +               get_cpu_vendor(c, 0);
2113 +               /* Initialize the standard set of capabilities */
2114 +               /* Note that the vendor-specific code below might override */
2115 +       
2116 +               /* Intel-defined flags: level 0x00000001 */
2117 +               if ( c->cpuid_level >= 0x00000001 ) {
2118 +                       u32 capability, excap;
2119 +                       cpuid(0x00000001, &tfms, &junk, &excap, &capability);
2120 +                       c->x86_capability[0] = capability;
2121 +                       c->x86_capability[4] = excap;
2122 +                       c->x86 = (tfms >> 8) & 15;
2123 +                       c->x86_model = (tfms >> 4) & 15;
2124 +                       if (c->x86 == 0xf)
2125 +                               c->x86 += (tfms >> 20) & 0xff;
2126 +                       if (c->x86 >= 0x6)
2127 +                               c->x86_model += ((tfms >> 16) & 0xF) << 4;
2128 +                       c->x86_mask = tfms & 15;
2129 +               } else {
2130 +                       /* Have CPUID level 0 only - unheard of */
2131 +                       c->x86 = 4;
2132 +               }
2133 +
2134 +               /* AMD-defined flags: level 0x80000001 */
2135 +               xlvl = cpuid_eax(0x80000000);
2136 +               if ( (xlvl & 0xffff0000) == 0x80000000 ) {
2137 +                       if ( xlvl >= 0x80000001 ) {
2138 +                               c->x86_capability[1] = cpuid_edx(0x80000001);
2139 +                               c->x86_capability[6] = cpuid_ecx(0x80000001);
2140 +                       }
2141 +                       if ( xlvl >= 0x80000004 )
2142 +                               get_model_name(c); /* Default name */
2143 +               }
2144 +       }
2145 +
2146 +       early_intel_workaround(c);
2147 +
2148 +#ifdef CONFIG_X86_HT
2149 +       phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
2150 +#endif
2151 +}
2152 +
2153 +static void __devinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
2154 +{
2155 +       if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
2156 +               /* Disable processor serial number */
2157 +               unsigned long lo,hi;
2158 +               rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
2159 +               lo |= 0x200000;
2160 +               wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
2161 +               printk(KERN_NOTICE "CPU serial number disabled.\n");
2162 +               clear_bit(X86_FEATURE_PN, c->x86_capability);
2163 +
2164 +               /* Disabling the serial number may affect the cpuid level */
2165 +               c->cpuid_level = cpuid_eax(0);
2166 +       }
2167 +}
2168 +
2169 +static int __init x86_serial_nr_setup(char *s)
2170 +{
2171 +       disable_x86_serial_nr = 0;
2172 +       return 1;
2173 +}
2174 +__setup("serialnumber", x86_serial_nr_setup);
2175 +
2176 +
2177 +
2178 +/*
2179 + * This does the hard work of actually picking apart the CPU stuff...
2180 + */
2181 +void __devinit identify_cpu(struct cpuinfo_x86 *c)
2182 +{
2183 +       int i;
2184 +
2185 +       c->loops_per_jiffy = loops_per_jiffy;
2186 +       c->x86_cache_size = -1;
2187 +       c->x86_vendor = X86_VENDOR_UNKNOWN;
2188 +       c->cpuid_level = -1;    /* CPUID not detected */
2189 +       c->x86_model = c->x86_mask = 0; /* So far unknown... */
2190 +       c->x86_vendor_id[0] = '\0'; /* Unset */
2191 +       c->x86_model_id[0] = '\0';  /* Unset */
2192 +       c->x86_max_cores = 1;
2193 +       memset(&c->x86_capability, 0, sizeof c->x86_capability);
2194 +
2195 +       if (!have_cpuid_p()) {
2196 +               /* First of all, decide if this is a 486 or higher */
2197 +               /* It's a 486 if we can modify the AC flag */
2198 +               if ( flag_is_changeable_p(X86_EFLAGS_AC) )
2199 +                       c->x86 = 4;
2200 +               else
2201 +                       c->x86 = 3;
2202 +       }
2203 +
2204 +       generic_identify(c);
2205 +
2206 +       printk(KERN_DEBUG "CPU: After generic identify, caps:");
2207 +       for (i = 0; i < NCAPINTS; i++)
2208 +               printk(" %08lx", c->x86_capability[i]);
2209 +       printk("\n");
2210 +
2211 +       if (this_cpu->c_identify) {
2212 +               this_cpu->c_identify(c);
2213 +
2214 +               printk(KERN_DEBUG "CPU: After vendor identify, caps:");
2215 +               for (i = 0; i < NCAPINTS; i++)
2216 +                       printk(" %08lx", c->x86_capability[i]);
2217 +               printk("\n");
2218 +       }
2219 +
2220 +       /*
2221 +        * Vendor-specific initialization.  In this section we
2222 +        * canonicalize the feature flags, meaning if there are
2223 +        * features a certain CPU supports which CPUID doesn't
2224 +        * tell us, CPUID claiming incorrect flags, or other bugs,
2225 +        * we handle them here.
2226 +        *
2227 +        * At the end of this section, c->x86_capability better
2228 +        * indicate the features this CPU genuinely supports!
2229 +        */
2230 +       if (this_cpu->c_init)
2231 +               this_cpu->c_init(c);
2232 +
2233 +       /* Disable the PN if appropriate */
2234 +       squash_the_stupid_serial_number(c);
2235 +
2236 +       /*
2237 +        * The vendor-specific functions might have changed features.  Now
2238 +        * we do "generic changes."
2239 +        */
2240 +
2241 +       /* TSC disabled? */
2242 +       if ( tsc_disable )
2243 +               clear_bit(X86_FEATURE_TSC, c->x86_capability);
2244 +
2245 +       /* FXSR disabled? */
2246 +       if (disable_x86_fxsr) {
2247 +               clear_bit(X86_FEATURE_FXSR, c->x86_capability);
2248 +               clear_bit(X86_FEATURE_XMM, c->x86_capability);
2249 +       }
2250 +
2251 +       if (disable_pse)
2252 +               clear_bit(X86_FEATURE_PSE, c->x86_capability);
2253 +
2254 +       /* If the model name is still unset, do table lookup. */
2255 +       if ( !c->x86_model_id[0] ) {
2256 +               char *p;
2257 +               p = table_lookup_model(c);
2258 +               if ( p )
2259 +                       strcpy(c->x86_model_id, p);
2260 +               else
2261 +                       /* Last resort... */
2262 +                       sprintf(c->x86_model_id, "%02x/%02x",
2263 +                               c->x86_vendor, c->x86_model);
2264 +       }
2265 +
2266 +       /* Now the feature flags better reflect actual CPU features! */
2267 +
2268 +       printk(KERN_DEBUG "CPU: After all inits, caps:");
2269 +       for (i = 0; i < NCAPINTS; i++)
2270 +               printk(" %08lx", c->x86_capability[i]);
2271 +       printk("\n");
2272 +
2273 +       /*
2274 +        * On SMP, boot_cpu_data holds the common feature set between
2275 +        * all CPUs; so make sure that we indicate which features are
2276 +        * common between the CPUs.  The first time this routine gets
2277 +        * executed, c == &boot_cpu_data.
2278 +        */
2279 +       if ( c != &boot_cpu_data ) {
2280 +               /* AND the already accumulated flags with these */
2281 +               for ( i = 0 ; i < NCAPINTS ; i++ )
2282 +                       boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
2283 +       }
2284 +
2285 +       /* Init Machine Check Exception if available. */
2286 +       mcheck_init(c);
2287 +
2288 +       if (c == &boot_cpu_data)
2289 +               sysenter_setup();
2290 +       enable_sep_cpu();
2291 +
2292 +       if (c == &boot_cpu_data)
2293 +               mtrr_bp_init();
2294 +       else
2295 +               mtrr_ap_init();
2296 +}
2297 +
2298 +#ifdef CONFIG_X86_HT
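     +/* Derive the physical package and core IDs for this CPU from the
     +   sibling count reported by CPUID leaf 1. */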
2299 +void __devinit detect_ht(struct cpuinfo_x86 *c)
2300 +{
2301 +       u32     eax, ebx, ecx, edx;
2302 +       int     index_msb, core_bits;
2303 +       int     cpu = smp_processor_id();
2304 +
2305 +       cpuid(1, &eax, &ebx, &ecx, &edx);
2306 +
2307 +       c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
2308 +
2309 +       if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
2310 +               return;
2311 +
2312 +       smp_num_siblings = (ebx & 0xff0000) >> 16;
2313 +
2314 +       if (smp_num_siblings == 1) {
2315 +               printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
2316 +       } else if (smp_num_siblings > 1 ) {
2317 +
2318 +               if (smp_num_siblings > NR_CPUS) {
2319 +                       printk(KERN_WARNING "CPU: Unsupported number of siblings %d\n", smp_num_siblings);
2320 +                       smp_num_siblings = 1;
2321 +                       return;
2322 +               }
2323 +
2324 +               index_msb = get_count_order(smp_num_siblings);
2325 +               phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
2326 +
2327 +               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
2328 +                      phys_proc_id[cpu]);
2329 +
2330 +               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
2331 +
2332 +               index_msb = get_count_order(smp_num_siblings);
2333 +
2334 +               core_bits = get_count_order(c->x86_max_cores);
2335 +
2336 +               cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
2337 +                                              ((1 << core_bits) - 1);
2338 +
2339 +               if (c->x86_max_cores > 1)
2340 +                       printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
2341 +                              cpu_core_id[cpu]);
2342 +       }
2343 +}
2344 +#endif
2345 +
2346 +void __devinit print_cpu_info(struct cpuinfo_x86 *c)
2347 +{
2348 +       char *vendor = NULL;
2349 +
2350 +       if (c->x86_vendor < X86_VENDOR_NUM)
2351 +               vendor = this_cpu->c_vendor;
2352 +       else if (c->cpuid_level >= 0)
2353 +               vendor = c->x86_vendor_id;
2354 +
2355 +       if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
2356 +               printk("%s ", vendor);
2357 +
2358 +       if (!c->x86_model_id[0])
2359 +               printk("%d86", c->x86);
2360 +       else
2361 +               printk("%s", c->x86_model_id);
2362 +
2363 +       if (c->x86_mask || c->cpuid_level >= 0) 
2364 +               printk(" stepping %02x\n", c->x86_mask);
2365 +       else
2366 +               printk("\n");
2367 +}
2368 +
2369 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
2370 +
2371 +/* This is hacky. :)
2372 + * We're emulating future behavior.
2373 + * In the future, the cpu-specific init functions will be called implicitly
2374 + * via the magic of initcalls.
2375 + * They will insert themselves into the cpu_devs structure.
2376 + * Then, when cpu_init() is called, we can just iterate over that array.
2377 + */
2378 +
2379 +extern int intel_cpu_init(void);
2380 +extern int cyrix_init_cpu(void);
2381 +extern int nsc_init_cpu(void);
2382 +extern int amd_init_cpu(void);
2383 +extern int centaur_init_cpu(void);
2384 +extern int transmeta_init_cpu(void);
2385 +extern int rise_init_cpu(void);
2386 +extern int nexgen_init_cpu(void);
2387 +extern int umc_init_cpu(void);
2388 +
2389 +void __init early_cpu_init(void)
2390 +{
2391 +       intel_cpu_init();
2392 +       cyrix_init_cpu();
2393 +       nsc_init_cpu();
2394 +       amd_init_cpu();
2395 +       centaur_init_cpu();
2396 +       transmeta_init_cpu();
2397 +       rise_init_cpu();
2398 +       nexgen_init_cpu();
2399 +       umc_init_cpu();
2400 +       early_cpu_detect();
2401 +
2402 +#ifdef CONFIG_DEBUG_PAGEALLOC
2403 +       /* pse is not compatible with on-the-fly unmapping,
2404 +        * disable it even if the cpus claim to support it.
2405 +        */
2406 +       clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
2407 +       disable_pse = 1;
2408 +#endif
2409 +}
2410 +
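     +/*
     + * Xen requires descriptor-table pages to be read-only: pin each GDT
     + * frame before registering the list via HYPERVISOR_set_gdt.
     + */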
2411 +void __cpuinit cpu_gdt_init(struct Xgt_desc_struct *gdt_descr)
2412 +{
2413 +       unsigned long frames[16];
2414 +       unsigned long va;
2415 +       int f;
2416 +
2417 +       for (va = gdt_descr->address, f = 0;
2418 +            va < gdt_descr->address + gdt_descr->size;
2419 +            va += PAGE_SIZE, f++) {
2420 +               frames[f] = virt_to_mfn(va);
2421 +               make_lowmem_page_readonly(
2422 +                       (void *)va, XENFEAT_writable_descriptor_tables);
2423 +       }
2424 +       if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))
2425 +               BUG();
2426 +}
2427 +
2428 +/*
2429 + * cpu_init() initializes state that is per-CPU. Some data is already
2430 + * initialized (naturally) in the bootstrap process, such as the GDT
2431 + * and IDT. We reload them nevertheless, this function acts as a
2432 + * 'CPU state barrier', nothing should get across.
2433 + */
2434 +void __cpuinit cpu_init(void)
2435 +{
2436 +       int cpu = smp_processor_id();
2437 +#ifndef CONFIG_X86_NO_TSS
2438 +       struct tss_struct * t = &per_cpu(init_tss, cpu);
2439 +#endif
2440 +       struct thread_struct *thread = &current->thread;
2441 +       struct desc_struct *gdt;
     +#ifndef CONFIG_XEN
     +       /* used by the native ESPFIX setup below (as in vanilla 2.6.16) */
     +       __u64 stk16_off = (__u64)&per_cpu(cpu_16bit_stack, cpu);
     +#endif
2442 +       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
2443 +
2444 +       if (cpu_test_and_set(cpu, cpu_initialized)) {
2445 +               printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
2446 +               for (;;) local_irq_enable();
2447 +       }
2448 +       printk(KERN_INFO "Initializing CPU#%d\n", cpu);
2449 +
2450 +       if (cpu_has_vme || cpu_has_de)
2451 +               clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
2452 +       if (tsc_disable && cpu_has_tsc) {
2453 +               printk(KERN_NOTICE "Disabling TSC...\n");
2454 +               /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
2455 +               clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
2456 +               set_in_cr4(X86_CR4_TSD);
2457 +       }
2458 +
2459 +#ifndef CONFIG_XEN
2460 +       /*
2461 +        * This is a horrible hack to allocate the GDT.  The problem
2462 +        * is that cpu_init() is called really early for the boot CPU
2463 +        * (and hence needs bootmem) but much later for the secondary
2464 +        * CPUs, when bootmem will have gone away
2465 +        */
2466 +       if (NODE_DATA(0)->bdata->node_bootmem_map) {
2467 +               gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
2468 +               /* alloc_bootmem_pages panics on failure, so no check */
2469 +               memset(gdt, 0, PAGE_SIZE);
2470 +       } else {
2471 +               gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
2472 +               if (unlikely(!gdt)) {
2473 +                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
2474 +                       for (;;)
2475 +                               local_irq_enable();
2476 +               }
2477 +       }
2478 +
2479 +       /*
2480 +        * Initialize the per-CPU GDT with the boot GDT,
2481 +        * and set up the GDT descriptor:
2482 +        */
2483 +       memcpy(gdt, cpu_gdt_table, GDT_SIZE);
2484 +
2485 +       /* Set up GDT entry for 16bit stack */
2486 +       *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
2487 +               ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
2488 +               ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
2489 +               (CPU_16BIT_STACK_SIZE - 1);
2490 +
2491 +       cpu_gdt_descr->size = GDT_SIZE - 1;
2492 +       cpu_gdt_descr->address = (unsigned long)gdt;
2493 +#else
2494 +       if (cpu == 0 && cpu_gdt_descr->address == 0) {
2495 +               gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
2496 +               /* alloc_bootmem_pages panics on failure, so no check */
2497 +               memset(gdt, 0, PAGE_SIZE);
2498 +
2499 +               memcpy(gdt, cpu_gdt_table, GDT_SIZE);
2500 +               
2501 +               cpu_gdt_descr->size = GDT_SIZE;
2502 +               cpu_gdt_descr->address = (unsigned long)gdt;
2503 +       }
2504 +#endif
2505 +
2506 +       cpu_gdt_init(cpu_gdt_descr);
2507 +
2508 +       /*
2509 +        * Set up and load the per-CPU TSS and LDT
2510 +        */
2511 +       atomic_inc(&init_mm.mm_count);
2512 +       current->active_mm = &init_mm;
2513 +       if (current->mm)
2514 +               BUG();
2515 +       enter_lazy_tlb(&init_mm, current);
2516 +
2517 +       load_esp0(t, thread);
2518 +
2519 +       load_LDT(&init_mm.context);
2520 +
2521 +#ifdef CONFIG_DOUBLEFAULT
2522 +       /* Set up doublefault TSS pointer in the GDT */
2523 +       __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
2524 +#endif
2525 +
2526 +       /* Clear %fs and %gs. */
2527 +       asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
2528 +
2529 +       /* Clear all 6 debug registers: */
2530 +       set_debugreg(0, 0);
2531 +       set_debugreg(0, 1);
2532 +       set_debugreg(0, 2);
2533 +       set_debugreg(0, 3);
2534 +       set_debugreg(0, 6);
2535 +       set_debugreg(0, 7);
2536 +
2537 +       /*
2538 +        * Force FPU initialization:
2539 +        */
2540 +       current_thread_info()->status = 0;
2541 +       clear_used_math();
2542 +       mxcsr_feature_mask_init();
2543 +}
2544 +
2545 +#ifdef CONFIG_HOTPLUG_CPU
2546 +void __devinit cpu_uninit(void)
2547 +{
2548 +       int cpu = raw_smp_processor_id();
2549 +       cpu_clear(cpu, cpu_initialized);
2550 +
2551 +       /* lazy TLB state */
2552 +       per_cpu(cpu_tlbstate, cpu).state = 0;
2553 +       per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
2554 +}
2555 +#endif
2556 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/cpu/mtrr/Makefile linux-2.6.16/arch/i386/kernel/cpu/mtrr/Makefile
2557 --- linux-2.6.16.orig/arch/i386/kernel/cpu/mtrr/Makefile        2006-03-20 06:53:29.000000000 +0100
2558 +++ linux-2.6.16/arch/i386/kernel/cpu/mtrr/Makefile     2006-06-26 09:51:32.000000000 +0200
2559 @@ -3,3 +3,10 @@
2560  obj-y          += cyrix.o
2561  obj-y          += centaur.o
2562  
2563 +ifdef CONFIG_XEN
2564 +include $(srctree)/scripts/Makefile.xen
2565 +n-obj-xen := generic.o state.o amd.o cyrix.o centaur.o
2566 +
2567 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
2568 +obj-y := $(call cherrypickxen, $(obj-y))
2569 +endif
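     +# Note: filterxen presumably strips the native MTRR implementations listed
     +# in n-obj-xen, and cherrypickxen then substitutes their -xen variants.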
2570 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/cpu/mtrr/main-xen.c linux-2.6.16/arch/i386/kernel/cpu/mtrr/main-xen.c
2571 --- linux-2.6.16.orig/arch/i386/kernel/cpu/mtrr/main-xen.c      1970-01-01 01:00:00.000000000 +0100
2572 +++ linux-2.6.16/arch/i386/kernel/cpu/mtrr/main-xen.c   2006-06-26 09:51:32.000000000 +0200
2573 @@ -0,0 +1,196 @@
2574 +#include <linux/init.h>
2575 +#include <linux/proc_fs.h>
2576 +#include <linux/ctype.h>
2577 +#include <linux/module.h>
2578 +#include <linux/seq_file.h>
2579 +#include <asm/uaccess.h>
2580 +
2581 +#include <asm/mtrr.h>
2582 +#include "mtrr.h"
2583 +
2584 +static DECLARE_MUTEX(mtrr_sem);
2585 +
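     +/*
     + * Under Xen the MTRRs are owned by the hypervisor; a privileged guest
     + * reads and updates them through DOM0_{READ,ADD,DEL}_MEMTYPE dom0_ops.
     + */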
2586 +void generic_get_mtrr(unsigned int reg, unsigned long *base,
2587 +                     unsigned int *size, mtrr_type * type)
2588 +{
2589 +       dom0_op_t op;
2590 +
2591 +       op.cmd = DOM0_READ_MEMTYPE;
2592 +       op.u.read_memtype.reg = reg;
2593 +       (void)HYPERVISOR_dom0_op(&op);
2594 +
2595 +       *size = op.u.read_memtype.nr_mfns;
2596 +       *base = op.u.read_memtype.mfn;
2597 +       *type = op.u.read_memtype.type;
2598 +}
2599 +
2600 +struct mtrr_ops generic_mtrr_ops = {
2601 +       .use_intel_if      = 1,
2602 +       .get               = generic_get_mtrr,
2603 +};
2604 +
2605 +struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
2606 +unsigned int num_var_ranges;
2607 +unsigned int *usage_table;
2608 +
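     +/* Probe Xen for the number of variable MTRRs by reading registers
     +   until the hypercall fails. */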
2609 +static void __init set_num_var_ranges(void)
2610 +{
2611 +       dom0_op_t op;
2612 +
2613 +       for (num_var_ranges = 0; ; num_var_ranges++) {
2614 +               op.cmd = DOM0_READ_MEMTYPE;
2615 +               op.u.read_memtype.reg = num_var_ranges;
2616 +               if (HYPERVISOR_dom0_op(&op) != 0)
2617 +                       break;
2618 +       }
2619 +}
2620 +
2621 +static void __init init_table(void)
2622 +{
2623 +       int i, max;
2624 +
2625 +       max = num_var_ranges;
2626 +       if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
2627 +           == NULL) {
2628 +               printk(KERN_ERR "mtrr: could not allocate\n");
2629 +               return;
2630 +       }
2631 +       for (i = 0; i < max; i++)
2632 +               usage_table[i] = 0;
2633 +}
2634 +
2635 +int mtrr_add_page(unsigned long base, unsigned long size, 
2636 +                 unsigned int type, char increment)
2637 +{
2638 +       int error;
2639 +       dom0_op_t op;
2640 +
2641 +       down(&mtrr_sem);
2642 +
2643 +       op.cmd = DOM0_ADD_MEMTYPE;
2644 +       op.u.add_memtype.mfn     = base;
2645 +       op.u.add_memtype.nr_mfns = size;
2646 +       op.u.add_memtype.type    = type;
2647 +       error = HYPERVISOR_dom0_op(&op);
2648 +       if (error) {
2649 +               up(&mtrr_sem);
2650 +               BUG_ON(error > 0);
2651 +               return error;
2652 +       }
2653 +
2654 +       if (increment)
2655 +               ++usage_table[op.u.add_memtype.reg];
2656 +
2657 +       up(&mtrr_sem);
2658 +
2659 +       return op.u.add_memtype.reg;
2660 +}
2661 +
2662 +static int mtrr_check(unsigned long base, unsigned long size)
2663 +{
2664 +       if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
2665 +               printk(KERN_WARNING
2666 +                       "mtrr: size and base must be multiples of 4 kiB\n");
2667 +               printk(KERN_DEBUG
2668 +                       "mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
2669 +               dump_stack();
2670 +               return -1;
2671 +       }
2672 +       return 0;
2673 +}
2674 +
2675 +int
2676 +mtrr_add(unsigned long base, unsigned long size, unsigned int type,
2677 +        char increment)
2678 +{
2679 +       if (mtrr_check(base, size))
2680 +               return -EINVAL;
2681 +       return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
2682 +                            increment);
2683 +}
2684 +
2685 +int mtrr_del_page(int reg, unsigned long base, unsigned long size)
2686 +{
2687 +       unsigned i;
2688 +       mtrr_type ltype;
2689 +       unsigned long lbase;
2690 +       unsigned int lsize;
2691 +       int error = -EINVAL;
2692 +       dom0_op_t op;
2693 +
2694 +       down(&mtrr_sem);
2695 +
2696 +       if (reg < 0) {
2697 +               /*  Search for existing MTRR  */
2698 +               for (i = 0; i < num_var_ranges; ++i) {
2699 +                       mtrr_if->get(i, &lbase, &lsize, &ltype);
2700 +                       if (lbase == base && lsize == size) {
2701 +                               reg = i;
2702 +                               break;
2703 +                       }
2704 +               }
2705 +               if (reg < 0) {
2706 +                       printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
2707 +                              size);
2708 +                       goto out;
2709 +               }
2710 +       }
2711 +       if (usage_table[reg] < 1) {
2712 +               printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
2713 +               goto out;
2714 +       }
2715 +       if (--usage_table[reg] < 1) {
2716 +               op.cmd = DOM0_DEL_MEMTYPE;
2717 +               op.u.del_memtype.handle = 0;
2718 +               op.u.del_memtype.reg    = reg;
2719 +               error = HYPERVISOR_dom0_op(&op);
2720 +               if (error) {
2721 +                       BUG_ON(error > 0);
2722 +                       goto out;
2723 +               }
2724 +       }
2725 +       error = reg;
2726 + out:
2727 +       up(&mtrr_sem);
2728 +       return error;
2729 +}
2730 +
2731 +int
2732 +mtrr_del(int reg, unsigned long base, unsigned long size)
2733 +{
2734 +       if (mtrr_check(base, size))
2735 +               return -EINVAL;
2736 +       return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
2737 +}
2738 +
2739 +EXPORT_SYMBOL(mtrr_add);
2740 +EXPORT_SYMBOL(mtrr_del);
2741 +
2742 +void __init mtrr_bp_init(void)
2743 +{
2744 +}
2745 +
2746 +void mtrr_ap_init(void)
2747 +{
2748 +}
2749 +
2750 +static int __init mtrr_init(void)
2751 +{
2752 +       struct cpuinfo_x86 *c = &boot_cpu_data;
2753 +
2754 +       if (!(xen_start_info->flags & SIF_PRIVILEGED))
2755 +               return -ENODEV;
2756 +
2757 +       if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
2758 +           (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
2759 +           (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
2760 +           (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
2761 +               return -ENODEV;
2762 +
2763 +       set_num_var_ranges();
2764 +       init_table();
2765 +
2766 +       return 0;
2767 +}
2768 +
2769 +subsys_initcall(mtrr_init);
2770 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/early_printk-xen.c linux-2.6.16/arch/i386/kernel/early_printk-xen.c
2771 --- linux-2.6.16.orig/arch/i386/kernel/early_printk-xen.c       1970-01-01 01:00:00.000000000 +0100
2772 +++ linux-2.6.16/arch/i386/kernel/early_printk-xen.c    2006-06-26 09:51:32.000000000 +0200
2773 @@ -0,0 +1,2 @@
2774 +
2775 +#include "../../x86_64/kernel/early_printk-xen.c"
2776 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/entry-xen.S linux-2.6.16/arch/i386/kernel/entry-xen.S
2777 --- linux-2.6.16.orig/arch/i386/kernel/entry-xen.S      1970-01-01 01:00:00.000000000 +0100
2778 +++ linux-2.6.16/arch/i386/kernel/entry-xen.S   2006-06-26 09:51:32.000000000 +0200
2779 @@ -0,0 +1,876 @@
2780 +/*
2781 + *  linux/arch/i386/entry.S
2782 + *
2783 + *  Copyright (C) 1991, 1992  Linus Torvalds
2784 + */
2785 +
2786 +/*
2787 + * entry.S contains the system-call and fault low-level handling routines.
2788 + * This also contains the timer-interrupt handler, as well as all interrupts
2789 + * and faults that can result in a task-switch.
2790 + *
2791 + * NOTE: This code handles signal-recognition, which happens every time
2792 + * after a timer-interrupt and after each system call.
2793 + *
2794 + * I changed all the .align's to 4 (16 byte alignment), as that's faster
2795 + * on a 486.
2796 + *
2797 + * Stack layout in 'ret_from_system_call':
2798 + *     ptrace needs to have all regs on the stack.
2799 + *     if the order here is changed, it needs to be
2800 + *     updated in fork.c:copy_process, signal.c:do_signal,
2801 + *     ptrace.c and ptrace.h
2802 + *
2803 + *      0(%esp) - %ebx
2804 + *      4(%esp) - %ecx
2805 + *      8(%esp) - %edx
2806 + *       C(%esp) - %esi
2807 + *     10(%esp) - %edi
2808 + *     14(%esp) - %ebp
2809 + *     18(%esp) - %eax
2810 + *     1C(%esp) - %ds
2811 + *     20(%esp) - %es
2812 + *     24(%esp) - orig_eax
2813 + *     28(%esp) - %eip
2814 + *     2C(%esp) - %cs
2815 + *     30(%esp) - %eflags
2816 + *     34(%esp) - %oldesp
2817 + *     38(%esp) - %oldss
2818 + *
2819 + * "current" is in register %ebx during any slow entries.
2820 + */
2821 +
2822 +#include <linux/config.h>
2823 +#include <linux/linkage.h>
2824 +#include <asm/thread_info.h>
2825 +#include <asm/errno.h>
2826 +#include <asm/segment.h>
2827 +#include <asm/smp.h>
2828 +#include <asm/page.h>
2829 +#include <asm/desc.h>
2830 +#include "irq_vectors.h"
2831 +#include <xen/interface/xen.h>
2832 +
2833 +#define nr_syscalls ((syscall_table_size)/4)
2834 +
2835 +EBX            = 0x00
2836 +ECX            = 0x04
2837 +EDX            = 0x08
2838 +ESI            = 0x0C
2839 +EDI            = 0x10
2840 +EBP            = 0x14
2841 +EAX            = 0x18
2842 +DS             = 0x1C
2843 +ES             = 0x20
2844 +ORIG_EAX       = 0x24
2845 +EIP            = 0x28
2846 +CS             = 0x2C
2847 +EFLAGS         = 0x30
2848 +OLDESP         = 0x34
2849 +OLDSS          = 0x38
2850 +
2851 +CF_MASK                = 0x00000001
2852 +TF_MASK                = 0x00000100
2853 +IF_MASK                = 0x00000200
2854 +DF_MASK                = 0x00000400 
2855 +NT_MASK                = 0x00004000
2856 +VM_MASK                = 0x00020000
2857 +/* Pseudo-eflags. */
2858 +NMI_MASK       = 0x80000000
2859 +
2860 +#ifndef CONFIG_XEN
2861 +#define DISABLE_INTERRUPTS     cli
2862 +#define ENABLE_INTERRUPTS      sti
2863 +#else
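     +/* Xen: instead of cli/sti, (un)mask event-channel upcalls via the
     +   per-vcpu flags in the shared info page. */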
2864 +/* Offsets into shared_info_t. */
2865 +#define evtchn_upcall_pending          /* 0 */
2866 +#define evtchn_upcall_mask             1
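     +/* upcall_pending sits at offset 0, so its define expands to nothing. */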
2867 +
2868 +#define sizeof_vcpu_shift              6
2869 +
2870 +#ifdef CONFIG_SMP
2871 +#define GET_VCPU_INFO          movl TI_cpu(%ebp),%esi                  ; \
2872 +                               shl  $sizeof_vcpu_shift,%esi            ; \
2873 +                               addl HYPERVISOR_shared_info,%esi
2874 +#else
2875 +#define GET_VCPU_INFO          movl HYPERVISOR_shared_info,%esi
2876 +#endif
2877 +
2878 +#define __DISABLE_INTERRUPTS   movb $1,evtchn_upcall_mask(%esi)
2879 +#define __ENABLE_INTERRUPTS    movb $0,evtchn_upcall_mask(%esi)
2880 +#define DISABLE_INTERRUPTS     GET_VCPU_INFO                           ; \
2881 +                               __DISABLE_INTERRUPTS
2882 +#define ENABLE_INTERRUPTS      GET_VCPU_INFO                           ; \
2883 +                               __ENABLE_INTERRUPTS
2884 +#define __TEST_PENDING         testb $0xFF,evtchn_upcall_pending(%esi)
2885 +#endif
2886 +
2887 +#ifdef CONFIG_PREEMPT
2888 +#define preempt_stop           cli
2889 +#else
2890 +#define preempt_stop
2891 +#define resume_kernel          restore_nocheck
2892 +#endif
2893 +
2894 +#define SAVE_ALL \
2895 +       cld; \
2896 +       pushl %es; \
2897 +       pushl %ds; \
2898 +       pushl %eax; \
2899 +       pushl %ebp; \
2900 +       pushl %edi; \
2901 +       pushl %esi; \
2902 +       pushl %edx; \
2903 +       pushl %ecx; \
2904 +       pushl %ebx; \
2905 +       movl $(__USER_DS), %edx; \
2906 +       movl %edx, %ds; \
2907 +       movl %edx, %es;
2908 +
2909 +#define RESTORE_INT_REGS \
2910 +       popl %ebx;      \
2911 +       popl %ecx;      \
2912 +       popl %edx;      \
2913 +       popl %esi;      \
2914 +       popl %edi;      \
2915 +       popl %ebp;      \
2916 +       popl %eax
2917 +
2918 +#define RESTORE_REGS   \
2919 +       RESTORE_INT_REGS; \
2920 +1:     popl %ds;       \
2921 +2:     popl %es;       \
2922 +.section .fixup,"ax";  \
2923 +3:     movl $0,(%esp); \
2924 +       jmp 1b;         \
2925 +4:     movl $0,(%esp); \
2926 +       jmp 2b;         \
2927 +.previous;             \
2928 +.section __ex_table,"a";\
2929 +       .align 4;       \
2930 +       .long 1b,3b;    \
2931 +       .long 2b,4b;    \
2932 +.previous
2933 +
2934 +
2935 +ENTRY(ret_from_fork)
2936 +       pushl %eax
2937 +       call schedule_tail
2938 +       GET_THREAD_INFO(%ebp)
2939 +       popl %eax
2940 +       jmp syscall_exit
2941 +
2942 +/*
2943 + * Return to user mode is not as complex as all this looks,
2944 + * but we want the default path for a system call return to
2945 + * go as quickly as possible, which is why some of this is
2946 + * less clear than it otherwise should be.
2947 + */
2948 +
2949 +       # userspace resumption stub bypassing syscall exit tracing
2950 +       ALIGN
2951 +ret_from_exception:
2952 +       preempt_stop
2953 +ret_from_intr:
2954 +       GET_THREAD_INFO(%ebp)
2955 +       movl EFLAGS(%esp), %eax         # mix EFLAGS and CS
2956 +       movb CS(%esp), %al
2957 +       testl $(VM_MASK | 2), %eax
2958 +       jz resume_kernel
2959 +ENTRY(resume_userspace)
2960 +       DISABLE_INTERRUPTS              # make sure we don't miss an interrupt
2961 +                                       # setting need_resched or sigpending
2962 +                                       # between sampling and the iret
2963 +       movl TI_flags(%ebp), %ecx
2964 +       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
2965 +                                       # int/exception return?
2966 +       jne work_pending
2967 +       jmp restore_all
2968 +
2969 +#ifdef CONFIG_PREEMPT
2970 +ENTRY(resume_kernel)
2971 +       cli
2972 +       cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
2973 +       jnz restore_nocheck
2974 +need_resched:
2975 +       movl TI_flags(%ebp), %ecx       # need_resched set ?
2976 +       testb $_TIF_NEED_RESCHED, %cl
2977 +       jz restore_all
2978 +       testl $IF_MASK,EFLAGS(%esp)     # interrupts off (exception path) ?
2979 +       jz restore_all
2980 +       call preempt_schedule_irq
2981 +       jmp need_resched
2982 +#endif
2983 +
2984 +#ifdef CONFIG_X86_SYSENTER
2985 +/* SYSENTER_RETURN points to after the "sysenter" instruction in
2986 +   the vsyscall page.  See vsyscall-sysenter.S, which defines the symbol.  */
2987 +
2988 +       # sysenter call handler stub
2989 +ENTRY(sysenter_entry)
2990 +       movl TSS_sysenter_esp0(%esp),%esp
2991 +sysenter_past_esp:
2992 +       sti
2993 +       pushl $(__USER_DS)
2994 +       pushl %ebp
2995 +       pushfl
2996 +       pushl $(__USER_CS)
2997 +       pushl $SYSENTER_RETURN
2998 +
2999 +/*
3000 + * Load the potential sixth argument from user stack.
3001 + * Careful about security.
3002 + */
3003 +       cmpl $__PAGE_OFFSET-3,%ebp
3004 +       jae syscall_fault
3005 +1:     movl (%ebp),%ebp
3006 +.section __ex_table,"a"
3007 +       .align 4
3008 +       .long 1b,syscall_fault
3009 +.previous
3010 +
3011 +       pushl %eax
3012 +       SAVE_ALL
3013 +       GET_THREAD_INFO(%ebp)
3014 +
3015 +       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
3016 +       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
3017 +       jnz syscall_trace_entry
3018 +       cmpl $(nr_syscalls), %eax
3019 +       jae syscall_badsys
3020 +       call *sys_call_table(,%eax,4)
3021 +       movl %eax,EAX(%esp)
3022 +       cli
3023 +       movl TI_flags(%ebp), %ecx
3024 +       testw $_TIF_ALLWORK_MASK, %cx
3025 +       jne syscall_exit_work
3026 +/* if something modifies registers it must also disable sysexit */
3027 +       movl EIP(%esp), %edx
3028 +       movl OLDESP(%esp), %ecx
3029 +       xorl %ebp,%ebp
3030 +       sti
3031 +       sysexit
3032 +#endif /* CONFIG_X86_SYSENTER */
3033 +
3034 +
3035 +       # system call handler stub
3036 +ENTRY(system_call)
3037 +       pushl %eax                      # save orig_eax
3038 +       SAVE_ALL
3039 +       GET_THREAD_INFO(%ebp)
3040 +                                       # system call tracing in operation / emulation
3041 +       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
3042 +       testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
3043 +       jnz syscall_trace_entry
3044 +       cmpl $(nr_syscalls), %eax
3045 +       jae syscall_badsys
3046 +syscall_call:
3047 +       call *sys_call_table(,%eax,4)
3048 +       movl %eax,EAX(%esp)             # store the return value
3049 +syscall_exit:
3050 +       DISABLE_INTERRUPTS              # make sure we don't miss an interrupt
3051 +                                       # setting need_resched or sigpending
3052 +                                       # between sampling and the iret
3053 +       movl TI_flags(%ebp), %ecx
3054 +       testw $_TIF_ALLWORK_MASK, %cx   # current->work
3055 +       jne syscall_exit_work
3056 +
3057 +restore_all:
3058 +#ifndef CONFIG_XEN
3059 +       movl EFLAGS(%esp), %eax         # mix EFLAGS, SS and CS
3060 +       # Warning: OLDSS(%esp) contains the wrong/random values if we
3061 +       # are returning to the kernel.
3062 +       # See comments in process.c:copy_thread() for details.
3063 +       movb OLDSS(%esp), %ah
3064 +       movb CS(%esp), %al
3065 +       andl $(VM_MASK | (4 << 8) | 3), %eax
3066 +       cmpl $((4 << 8) | 3), %eax
3067 +       je ldt_ss                       # returning to user-space with LDT SS
3068 +restore_nocheck:
3069 +#else
3070 +restore_nocheck:
3071 +       movl EFLAGS(%esp), %eax
3072 +       testl $(VM_MASK|NMI_MASK), %eax
3073 +       jnz hypervisor_iret
3074 +       shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
3075 +       GET_VCPU_INFO
3076 +       andb evtchn_upcall_mask(%esi),%al
3077 +       andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
3078 +       jnz restore_all_enable_events   #        != 0 => enable event delivery
3079 +#endif
3080 +       RESTORE_REGS
3081 +       addl $4, %esp
3082 +1:     iret
3083 +.section .fixup,"ax"
3084 +iret_exc:
3085 +#ifndef CONFIG_XEN
3086 +       sti
3087 +#endif
3088 +       pushl $0                        # no error code
3089 +       pushl $do_iret_error
3090 +       jmp error_code
3091 +.previous
3092 +.section __ex_table,"a"
3093 +       .align 4
3094 +       .long 1b,iret_exc
3095 +.previous
3096 +
3097 +#ifndef CONFIG_XEN
3098 +ldt_ss:
3099 +       larl OLDSS(%esp), %eax
3100 +       jnz restore_nocheck
3101 +       testl $0x00400000, %eax         # returning to 32bit stack?
3102 +       jnz restore_nocheck             # all right, normal return
3103 +       /* If returning to userspace with 16bit stack,
3104 +        * try to fix the higher word of ESP, as the CPU
3105 +        * won't restore it.
3106 +        * This is an "official" bug of all the x86-compatible
3107 +        * CPUs, which we can try to work around to make
3108 +        * dosemu and wine happy. */
3109 +       subl $8, %esp           # reserve space for switch16 pointer
3110 +       cli
3111 +       movl %esp, %eax
3112 +       /* Set up the 16bit stack frame with switch32 pointer on top,
3113 +        * and a switch16 pointer on top of the current frame. */
3114 +       call setup_x86_bogus_stack
3115 +       RESTORE_REGS
3116 +       lss 20+4(%esp), %esp    # switch to 16bit stack
3117 +1:     iret
3118 +.section __ex_table,"a"
3119 +       .align 4
3120 +       .long 1b,iret_exc
3121 +.previous
3122 +#else
3123 +hypervisor_iret:
3124 +       andl $~NMI_MASK, EFLAGS(%esp)
3125 +       RESTORE_REGS
3126 +       addl $4, %esp
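     +       # each hypercall stub occupies 32 bytes in the shared hypercall page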
3127 +       jmp  hypercall_page + (__HYPERVISOR_iret * 32)
3128 +#endif
3129 +
3130 +       # perform work that needs to be done immediately before resumption
3131 +       ALIGN
3132 +work_pending:
3133 +       testb $_TIF_NEED_RESCHED, %cl
3134 +       jz work_notifysig
3135 +work_resched:
3136 +       call schedule
3137 +       DISABLE_INTERRUPTS              # make sure we don't miss an interrupt
3138 +                                       # setting need_resched or sigpending
3139 +                                       # between sampling and the iret
3140 +       movl TI_flags(%ebp), %ecx
3141 +       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done other
3142 +                                       # than syscall tracing?
3143 +       jz restore_all
3144 +       testb $_TIF_NEED_RESCHED, %cl
3145 +       jnz work_resched
3146 +
3147 +work_notifysig:                                # deal with pending signals and
3148 +                                       # notify-resume requests
3149 +       testl $VM_MASK, EFLAGS(%esp)
3150 +       movl %esp, %eax
3151 +       jne work_notifysig_v86          # returning to kernel-space or
3152 +                                       # vm86-space
3153 +       xorl %edx, %edx
3154 +       call do_notify_resume
3155 +       jmp resume_userspace
3156 +
3157 +       ALIGN
3158 +work_notifysig_v86:
3159 +#ifdef CONFIG_VM86
3160 +       pushl %ecx                      # save ti_flags for do_notify_resume
3161 +       call save_v86_state             # %eax contains pt_regs pointer
3162 +       popl %ecx
3163 +       movl %eax, %esp
3164 +       xorl %edx, %edx
3165 +       call do_notify_resume
3166 +       jmp resume_userspace
3167 +#endif
3168 +
3169 +       # perform syscall entry tracing
3170 +       ALIGN
3171 +syscall_trace_entry:
3172 +       movl $-ENOSYS,EAX(%esp)
3173 +       movl %esp, %eax
3174 +       xorl %edx,%edx
3175 +       call do_syscall_trace
3176 +       cmpl $0, %eax
3177 +       jne resume_userspace            # ret != 0 -> running under PTRACE_SYSEMU,
3178 +                                       # so must skip actual syscall
3179 +       movl ORIG_EAX(%esp), %eax
3180 +       cmpl $(nr_syscalls), %eax
3181 +       jnae syscall_call
3182 +       jmp syscall_exit
3183 +
3184 +       # perform syscall exit tracing
3185 +       ALIGN
3186 +syscall_exit_work:
3187 +       testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
3188 +       jz work_pending
3189 +       ENABLE_INTERRUPTS               # could let do_syscall_trace() call
3190 +                                       # schedule() instead
3191 +       movl %esp, %eax
3192 +       movl $1, %edx
3193 +       call do_syscall_trace
3194 +       jmp resume_userspace
3195 +
3196 +       ALIGN
3197 +syscall_fault:
3198 +       pushl %eax                      # save orig_eax
3199 +       SAVE_ALL
3200 +       GET_THREAD_INFO(%ebp)
3201 +       movl $-EFAULT,EAX(%esp)
3202 +       jmp resume_userspace
3203 +
3204 +       ALIGN
3205 +syscall_badsys:
3206 +       movl $-ENOSYS,EAX(%esp)
3207 +       jmp resume_userspace
3208 +
3209 +#ifndef CONFIG_XEN
3210 +#define FIXUP_ESPFIX_STACK \
3211 +       movl %esp, %eax; \
3212 +       /* switch to 32bit stack using the pointer on top of 16bit stack */ \
3213 +       lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
3214 +       /* copy data from 16bit stack to 32bit stack */ \
3215 +       call fixup_x86_bogus_stack; \
3216 +       /* put ESP to the proper location */ \
3217 +       movl %eax, %esp;
3218 +#define UNWIND_ESPFIX_STACK \
3219 +       pushl %eax; \
3220 +       movl %ss, %eax; \
3221 +       /* see if on 16bit stack */ \
3222 +       cmpw $__ESPFIX_SS, %ax; \
3223 +       jne 28f; \
3224 +       movl $__KERNEL_DS, %edx; \
3225 +       movl %edx, %ds; \
3226 +       movl %edx, %es; \
3227 +       /* switch to 32bit stack */ \
3228 +       FIXUP_ESPFIX_STACK \
3229 +28:    popl %eax;
3230 +
3231 +/*
3232 + * Build the entry stubs and pointer table with
3233 + * some assembler magic.
3234 + */
3235 +.data
3236 +ENTRY(interrupt)
3237 +.text
3238 +
3239 +vector=0
3240 +ENTRY(irq_entries_start)
3241 +.rept NR_IRQS
3242 +       ALIGN
3243 +1:     pushl $~(vector)
3244 +       jmp common_interrupt
3245 +.data
3246 +       .long 1b
3247 +.text
3248 +vector=vector+1
3249 +.endr
3250 +
3251 +       ALIGN
3252 +common_interrupt:
3253 +       SAVE_ALL
3254 +       movl %esp,%eax
3255 +       call do_IRQ
3256 +       jmp ret_from_intr
3257 +
3258 +#define BUILD_INTERRUPT(name, nr)      \
3259 +ENTRY(name)                            \
3260 +       pushl $~(nr);                   \
3261 +       SAVE_ALL                        \
3262 +       movl %esp,%eax;                 \
3263 +       call smp_/**/name;              \
3264 +       jmp ret_from_intr;
3265 +
3266 +/* The include is where all of the SMP etc. interrupts come from */
3267 +#include "entry_arch.h"
3268 +#else
3269 +#define UNWIND_ESPFIX_STACK
3270 +#endif
3271 +
3272 +ENTRY(divide_error)
3273 +       pushl $0                        # no error code
3274 +       pushl $do_divide_error
3275 +       ALIGN
3276 +error_code:
3277 +       pushl %ds
3278 +       pushl %eax
3279 +       xorl %eax, %eax
3280 +       pushl %ebp
3281 +       pushl %edi
3282 +       pushl %esi
3283 +       pushl %edx
3284 +       decl %eax                       # eax = -1
3285 +       pushl %ecx
3286 +       pushl %ebx
3287 +       cld
3288 +       pushl %es
3289 +       UNWIND_ESPFIX_STACK
3290 +       popl %ecx
3291 +       movl ES(%esp), %edi             # get the function address
3292 +       movl ORIG_EAX(%esp), %edx       # get the error code
3293 +       movl %eax, ORIG_EAX(%esp)
3294 +       movl %ecx, ES(%esp)
3295 +       movl $(__USER_DS), %ecx
3296 +       movl %ecx, %ds
3297 +       movl %ecx, %es
3298 +       movl %esp,%eax                  # pt_regs pointer
3299 +       call *%edi
3300 +       jmp ret_from_exception
3301 +
3302 +#ifdef CONFIG_XEN
3303 +# A note on the "critical region" in our callback handler.
3304 +# We want to avoid stacking callback handlers due to events occurring
3305 +# during handling of the last event. To do this, we keep events disabled
3306 +# until we've done all processing. HOWEVER, we must enable events before
3307 +# popping the stack frame (can't be done atomically) and so it would still
3308 +# be possible to get enough handler activations to overflow the stack.
3309 +# Although unlikely, bugs of that kind are hard to track down, so we'd
3310 +# like to avoid the possibility.
3311 +# So, on entry to the handler we detect whether we interrupted an
3312 +# existing activation in its critical region -- if so, we pop the current
3313 +# activation and restart the handler using the previous one.
3314 +ENTRY(hypervisor_callback)
3315 +       pushl %eax
3316 +       SAVE_ALL
3317 +       movl EIP(%esp),%eax
3318 +       cmpl $scrit,%eax
3319 +       jb   11f
3320 +       cmpl $ecrit,%eax
3321 +       jb   critical_region_fixup
3322 +11:    push %esp
3323 +       call evtchn_do_upcall
3324 +       add  $4,%esp
3325 +       jmp  ret_from_intr
3326 +
3327 +        ALIGN
3328 +restore_all_enable_events:
3329 +       __ENABLE_INTERRUPTS
3330 +scrit: /**** START OF CRITICAL REGION ****/
3331 +       __TEST_PENDING
3332 +       jnz  14f                        # process more events if necessary...
3333 +       RESTORE_REGS
3334 +       addl $4, %esp
3335 +1:     iret
3336 +.section __ex_table,"a"
3337 +       .align 4
3338 +       .long 1b,iret_exc
3339 +.previous
3340 +14:    __DISABLE_INTERRUPTS
3341 +       jmp  11b
3342 +ecrit:  /**** END OF CRITICAL REGION ****/
3343 +# [How we do the fixup]. We want to merge the current stack frame with the
3344 +# just-interrupted frame. How we do this depends on where in the critical
3345 +# region the interrupted handler was executing, and so how many saved
3346 +# registers are in each frame. We do this quickly using the lookup table
3347 +# 'critical_fixup_table'. For each byte offset in the critical region, it
3348 +# provides the number of bytes which have already been popped from the
3349 +# interrupted stack frame.
3350 +critical_region_fixup:
3351 +       addl $critical_fixup_table-scrit,%eax
3352 +       movzbl (%eax),%eax              # %eax contains num bytes popped
3353 +       cmpb $0xff,%al                  # 0xff => vcpu_info critical region
3354 +       jne  15f
3355 +       GET_THREAD_INFO(%ebp)
3356 +        xorl %eax,%eax
3357 +15:    mov  %esp,%esi
3358 +       add  %eax,%esi                  # %esi points at end of src region
3359 +       mov  %esp,%edi
3360 +       add  $0x34,%edi                 # %edi points at end of dst region
3361 +       mov  %eax,%ecx
3362 +       shr  $2,%ecx                    # convert bytes to words
3363 +       je   17f                        # skip loop if nothing to copy
3364 +16:    subl $4,%esi                    # pre-decrementing copy loop
3365 +       subl $4,%edi
3366 +       movl (%esi),%eax
3367 +       movl %eax,(%edi)
3368 +       loop 16b
3369 +17:    movl %edi,%esp                  # final %edi is top of merged stack
3370 +       jmp  11b
3371 +
3372 +critical_fixup_table:
3373 +       .byte 0xff,0xff,0xff            # testb $0xff,(%esi) = __TEST_PENDING
3374 +       .byte 0xff,0xff                 # jnz  14f
3375 +       .byte 0x00                      # pop  %ebx
3376 +       .byte 0x04                      # pop  %ecx
3377 +       .byte 0x08                      # pop  %edx
3378 +       .byte 0x0c                      # pop  %esi
3379 +       .byte 0x10                      # pop  %edi
3380 +       .byte 0x14                      # pop  %ebp
3381 +       .byte 0x18                      # pop  %eax
3382 +       .byte 0x1c                      # pop  %ds
3383 +       .byte 0x20                      # pop  %es
3384 +       .byte 0x24,0x24,0x24            # add  $4,%esp
3385 +       .byte 0x28                      # iret
3386 +       .byte 0xff,0xff,0xff,0xff       # movb $1,1(%esi)
3387 +       .byte 0x00,0x00                 # jmp  11b
3388 +
3389 +# Hypervisor uses this for application faults while it executes.
3390 +# We get here for two reasons:
3391 +#  1. Fault while reloading DS, ES, FS or GS
3392 +#  2. Fault while executing IRET
3393 +# Category 1 we fix up by reattempting the load, and zeroing the segment
3394 +# register if the load fails.
3395 +# Category 2 we fix up by jumping to do_iret_error. We cannot use the
3396 +# normal Linux return path in this case because if we use the IRET hypercall
3397 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
3398 +# We distinguish between categories by maintaining a status value in EAX.
3399 +ENTRY(failsafe_callback)
3400 +       pushl %eax
3401 +       movl $1,%eax
3402 +1:     mov 4(%esp),%ds
3403 +2:     mov 8(%esp),%es
3404 +3:     mov 12(%esp),%fs
3405 +4:     mov 16(%esp),%gs
3406 +       testl %eax,%eax
3407 +       popl %eax
3408 +       jz 5f
3409 +       addl $16,%esp           # EAX != 0 => Category 2 (Bad IRET)
3410 +       jmp iret_exc
3411 +5:     addl $16,%esp           # EAX == 0 => Category 1 (Bad segment)
3412 +       pushl $0
3413 +       SAVE_ALL
3414 +       jmp ret_from_exception
3415 +.section .fixup,"ax";          \
3416 +6:     xorl %eax,%eax;         \
3417 +       movl %eax,4(%esp);      \
3418 +       jmp 1b;                 \
3419 +7:     xorl %eax,%eax;         \
3420 +       movl %eax,8(%esp);      \
3421 +       jmp 2b;                 \
3422 +8:     xorl %eax,%eax;         \
3423 +       movl %eax,12(%esp);     \
3424 +       jmp 3b;                 \
3425 +9:     xorl %eax,%eax;         \
3426 +       movl %eax,16(%esp);     \
3427 +       jmp 4b;                 \
3428 +.previous;                     \
3429 +.section __ex_table,"a";       \
3430 +       .align 4;               \
3431 +       .long 1b,6b;            \
3432 +       .long 2b,7b;            \
3433 +       .long 3b,8b;            \
3434 +       .long 4b,9b;            \
3435 +.previous
3436 +#endif
3437 +
3438 +ENTRY(coprocessor_error)
3439 +       pushl $0
3440 +       pushl $do_coprocessor_error
3441 +       jmp error_code
3442 +
3443 +ENTRY(simd_coprocessor_error)
3444 +       pushl $0
3445 +       pushl $do_simd_coprocessor_error
3446 +       jmp error_code
3447 +
3448 +ENTRY(device_not_available)
3449 +       pushl $-1                       # mark this as an int
3450 +       SAVE_ALL
3451 +#ifndef CONFIG_XEN
3452 +       movl %cr0, %eax
3453 +       testl $0x4, %eax                # EM (math emulation bit)
3454 +       je device_available_emulate
3455 +       pushl $0                        # temporary storage for ORIG_EIP
3456 +       call math_emulate
3457 +       addl $4, %esp
3458 +       jmp ret_from_exception
3459 +device_available_emulate:
3460 +#endif
3461 +       preempt_stop
3462 +       call math_state_restore
3463 +       jmp ret_from_exception
3464 +
3465 +#ifdef CONFIG_X86_SYSENTER
3466 +/*
3467 + * Debug traps and NMI can happen at the one SYSENTER instruction
3468 + * that sets up the real kernel stack. Check here, since we can't
3469 + * allow the wrong stack to be used.
3470 + *
3471 + * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
3472 + * already pushed 3 words if it hits on the sysenter instruction:
3473 + * eflags, cs and eip.
3474 + *
3475 + * We just load the right stack, and push the three (known) values
3476 + * by hand onto the new stack - while updating the return eip past
3477 + * the instruction that would have done it for sysenter.
3478 + */
3479 +#define FIX_STACK(offset, ok, label)           \
3480 +       cmpw $__KERNEL_CS,4(%esp);              \
3481 +       jne ok;                                 \
3482 +label:                                         \
3483 +       movl TSS_sysenter_esp0+offset(%esp),%esp;       \
3484 +       pushfl;                                 \
3485 +       pushl $__KERNEL_CS;                     \
3486 +       pushl $sysenter_past_esp
3487 +#endif /* CONFIG_X86_SYSENTER */
3488 +
3489 +KPROBE_ENTRY(debug)
3490 +#ifdef CONFIG_X86_SYSENTER
3491 +       cmpl $sysenter_entry,(%esp)
3492 +       jne debug_stack_correct
3493 +       FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
3494 +debug_stack_correct:
3495 +#endif /* CONFIG_X86_SYSENTER */
3496 +       pushl $-1                       # mark this as an int
3497 +       SAVE_ALL
3498 +       xorl %edx,%edx                  # error code 0
3499 +       movl %esp,%eax                  # pt_regs pointer
3500 +       call do_debug
3501 +       jmp ret_from_exception
3502 +       .previous .text
3503 +
3504 +#ifndef CONFIG_XEN
3505 +/*
3506 + * NMI is doubly nasty. It can happen _while_ we're handling
3507 + * a debug fault, and the debug fault hasn't yet been able to
3508 + * clear up the stack. So we first check whether we got  an
3509 + * NMI on the sysenter entry path, but after that we need to
3510 + * check whether we got an NMI on the debug path where the debug
3511 + * fault happened on the sysenter path.
3512 + */
3513 +ENTRY(nmi)
3514 +       pushl %eax
3515 +       movl %ss, %eax
3516 +       cmpw $__ESPFIX_SS, %ax
3517 +       popl %eax
3518 +       je nmi_16bit_stack
3519 +       cmpl $sysenter_entry,(%esp)
3520 +       je nmi_stack_fixup
3521 +       pushl %eax
3522 +       movl %esp,%eax
3523 +       /* Do not access memory above the end of our stack page,
3524 +        * it might not exist.
3525 +        */
3526 +       andl $(THREAD_SIZE-1),%eax
3527 +       cmpl $(THREAD_SIZE-20),%eax
3528 +       popl %eax
3529 +       jae nmi_stack_correct
3530 +       cmpl $sysenter_entry,12(%esp)
3531 +       je nmi_debug_stack_check
3532 +nmi_stack_correct:
3533 +       pushl %eax
3534 +       SAVE_ALL
3535 +       xorl %edx,%edx          # zero error code
3536 +       movl %esp,%eax          # pt_regs pointer
3537 +       call do_nmi
3538 +       jmp restore_all
3539 +
3540 +nmi_stack_fixup:
3541 +       FIX_STACK(12,nmi_stack_correct, 1)
3542 +       jmp nmi_stack_correct
3543 +nmi_debug_stack_check:
3544 +       cmpw $__KERNEL_CS,16(%esp)
3545 +       jne nmi_stack_correct
3546 +       cmpl $debug,(%esp)
3547 +       jb nmi_stack_correct
3548 +       cmpl $debug_esp_fix_insn,(%esp)
3549 +       ja nmi_stack_correct
3550 +       FIX_STACK(24,nmi_stack_correct, 1)
3551 +       jmp nmi_stack_correct
3552 +
3553 +nmi_16bit_stack:
3554 +       /* create the pointer to lss back */
3555 +       pushl %ss
3556 +       pushl %esp
3557 +       movzwl %sp, %esp
3558 +       addw $4, (%esp)
3559 +       /* copy the iret frame of 12 bytes */
3560 +       .rept 3
3561 +       pushl 16(%esp)
3562 +       .endr
3563 +       pushl %eax
3564 +       SAVE_ALL
3565 +       FIXUP_ESPFIX_STACK              # %eax == %esp
3566 +       xorl %edx,%edx                  # zero error code
3567 +       call do_nmi
3568 +       RESTORE_REGS
3569 +       lss 12+4(%esp), %esp            # back to 16bit stack
3570 +1:     iret
3571 +.section __ex_table,"a"
3572 +       .align 4
3573 +       .long 1b,iret_exc
3574 +.previous
3575 +#else
3576 +ENTRY(nmi)
3577 +       pushl %eax
3578 +       SAVE_ALL
3579 +       xorl %edx,%edx          # zero error code
3580 +       movl %esp,%eax          # pt_regs pointer
3581 +       call do_nmi
3582 +       orl  $NMI_MASK, EFLAGS(%esp)
3583 +       jmp restore_all
3584 +#endif
3585 +
3586 +KPROBE_ENTRY(int3)
3587 +       pushl $-1                       # mark this as an int
3588 +       SAVE_ALL
3589 +       xorl %edx,%edx          # zero error code
3590 +       movl %esp,%eax          # pt_regs pointer
3591 +       call do_int3
3592 +       jmp ret_from_exception
3593 +       .previous .text
3594 +
3595 +ENTRY(overflow)
3596 +       pushl $0
3597 +       pushl $do_overflow
3598 +       jmp error_code
3599 +
3600 +ENTRY(bounds)
3601 +       pushl $0
3602 +       pushl $do_bounds
3603 +       jmp error_code
3604 +
3605 +ENTRY(invalid_op)
3606 +       pushl $0
3607 +       pushl $do_invalid_op
3608 +       jmp error_code
3609 +
3610 +ENTRY(coprocessor_segment_overrun)
3611 +       pushl $0
3612 +       pushl $do_coprocessor_segment_overrun
3613 +       jmp error_code
3614 +
3615 +ENTRY(invalid_TSS)
3616 +       pushl $do_invalid_TSS
3617 +       jmp error_code
3618 +
3619 +ENTRY(segment_not_present)
3620 +       pushl $do_segment_not_present
3621 +       jmp error_code
3622 +
3623 +ENTRY(stack_segment)
3624 +       pushl $do_stack_segment
3625 +       jmp error_code
3626 +
3627 +KPROBE_ENTRY(general_protection)
3628 +       pushl $do_general_protection
3629 +       jmp error_code
3630 +       .previous .text
3631 +
3632 +ENTRY(alignment_check)
3633 +       pushl $do_alignment_check
3634 +       jmp error_code
3635 +
3636 +KPROBE_ENTRY(page_fault)
3637 +       pushl $do_page_fault
3638 +       jmp error_code
3639 +       .previous .text
3640 +
3641 +#ifdef CONFIG_X86_MCE
3642 +ENTRY(machine_check)
3643 +       pushl $0
3644 +       pushl machine_check_vector
3645 +       jmp error_code
3646 +#endif
3647 +
3648 +ENTRY(fixup_4gb_segment)
3649 +       pushl $do_fixup_4gb_segment
3650 +       jmp error_code
3651 +
3652 +.section .rodata,"a"
3653 +#include "syscall_table.S"
3654 +
3655 +syscall_table_size=(.-sys_call_table)
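
Editor's note: the critical_region_fixup path above merges the new callback frame with the partially popped interrupted frame via critical_fixup_table. The same merge expressed in C, as a hedged sketch (merge_frames is a hypothetical name, not something this patch defines; 'popped' is the table entry for the faulting EIP and 0x34 is the size of the saved-register frame):

/*
 * Editor's sketch, not part of the patch: the frame merge performed by
 * critical_region_fixup.  'popped' is critical_fixup_table[EIP - scrit],
 * i.e. how many bytes the interrupted activation had already popped from
 * its 0x34-byte saved-register frame.  The unpopped words at the bottom
 * of the new frame are slid up so the two frames collapse into one, and
 * the returned pointer becomes the new ESP (cf. labels 15:-17: above).
 */
static unsigned long *merge_frames(unsigned long *esp, unsigned int popped)
{
        unsigned long *src = esp + popped / 4;  /* end of src region (%esi) */
        unsigned long *dst = esp + 0x34 / 4;    /* end of dst region (%edi) */
        unsigned int words = popped / 4;

        while (words--)                         /* pre-decrementing copy loop */
                *--dst = *--src;
        return dst;                             /* final %edi = merged stack top */
}
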
3656 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/fixup.c linux-2.6.16/arch/i386/kernel/fixup.c
3657 --- linux-2.6.16.orig/arch/i386/kernel/fixup.c  1970-01-01 01:00:00.000000000 +0100
3658 +++ linux-2.6.16/arch/i386/kernel/fixup.c       2006-06-26 09:51:32.000000000 +0200
3659 @@ -0,0 +1,95 @@
3660 +/******************************************************************************
3661 + * fixup.c
3662 + * 
3663 + * Binary-rewriting of certain IA32 instructions, on notification by Xen.
3664 + * Used to avoid repeated slow emulation of common instructions used by the
3665 + * user-space TLS (Thread-Local Storage) libraries.
3666 + * 
3667 + * **** NOTE ****
3668 + *  Issues with the binary rewriting have caused it to be removed. Instead
3669 + *  we rely on Xen's emulator to boot the kernel, and then print a banner
3670 + *  message recommending that the user disables /lib/tls.
3671 + * 
3672 + * Copyright (c) 2004, K A Fraser
3673 + * 
3674 + * This program is free software; you can redistribute it and/or modify
3675 + * it under the terms of the GNU General Public License as published by
3676 + * the Free Software Foundation; either version 2 of the License, or
3677 + * (at your option) any later version.
3678 + * 
3679 + * This program is distributed in the hope that it will be useful,
3680 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
3681 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
3682 + * GNU General Public License for more details.
3683 + * 
3684 + * You should have received a copy of the GNU General Public License
3685 + * along with this program; if not, write to the Free Software
3686 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
3687 + */
3688 +
3689 +#include <linux/config.h>
3690 +#include <linux/init.h>
3691 +#include <linux/sched.h>
3692 +#include <linux/slab.h>
3693 +#include <linux/kernel.h>
3694 +#include <linux/delay.h>
3695 +#include <linux/version.h>
3696 +
3697 +#define DP(_f, _args...) printk(KERN_ALERT "  " _f "\n" , ## _args )
3698 +
3699 +fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
3700 +{
3701 +       static unsigned long printed = 0;
3702 +       char info[100];
3703 +       int i;
3704 +
3705 +       if (test_and_set_bit(0, &printed))
3706 +               return;
3707 +
3708 +       HYPERVISOR_vm_assist(
3709 +               VMASST_CMD_disable, VMASST_TYPE_4gb_segments_notify);
3710 +
3711 +       sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
3712 +
3713 +
3714 +       DP("");
3715 +       DP("***************************************************************");
3716 +       DP("***************************************************************");
3717 +       DP("** WARNING: Currently emulating unsupported memory accesses  **");
3718 +       DP("**          in /lib/tls glibc libraries. The emulation is    **");
3719 +       DP("**          slow. To ensure full performance you should      **");
3720 +       DP("**          install a 'xen-friendly' (nosegneg) version of   **");
3721 +       DP("**          the library, or disable tls support by executing **");
3722 +       DP("**          the following as root:                           **");
3723 +       DP("**          mv /lib/tls /lib/tls.disabled                    **");
3724 +       DP("** Offending process: %-38.38s **", info);
3725 +       DP("***************************************************************");
3726 +       DP("***************************************************************");
3727 +       DP("");
3728 +
3729 +       for (i = 5; i > 0; i--) {
3730 +               printk("Pausing... %d", i);
3731 +               mdelay(1000);
3732 +               printk("\b\b\b\b\b\b\b\b\b\b\b\b");
3733 +       }
3734 +
3735 +       printk("Continuing...\n\n");
3736 +}
3737 +
3738 +static int __init fixup_init(void)
3739 +{
3740 +       HYPERVISOR_vm_assist(
3741 +               VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify);
3742 +       return 0;
3743 +}
3744 +__initcall(fixup_init);
3745 +
3746 +/*
3747 + * Local variables:
3748 + *  c-file-style: "linux"
3749 + *  indent-tabs-mode: t
3750 + *  c-indent-level: 8
3751 + *  c-basic-offset: 8
3752 + *  tab-width: 8
3753 + * End:
3754 + */
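
Editor's note: do_fixup_4gb_segment() above guards its banner with test_and_set_bit() on a static word, so the warning fires at most once system-wide no matter how many processes trip the trap. A minimal sketch of that idiom, assuming ordinary kernel context (warn_once is a hypothetical name, not defined by this patch):

/* Editor's sketch of the print-once idiom used by do_fixup_4gb_segment().
 * test_and_set_bit() is atomic, so the guard is safe against concurrent
 * callers: only the first one gets past it. */
static void warn_once(const char *msg)
{
        static unsigned long printed;

        if (test_and_set_bit(0, &printed))
                return;
        printk(KERN_ALERT "%s\n", msg);
}
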
3755 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/head-xen.S linux-2.6.16/arch/i386/kernel/head-xen.S
3756 --- linux-2.6.16.orig/arch/i386/kernel/head-xen.S       1970-01-01 01:00:00.000000000 +0100
3757 +++ linux-2.6.16/arch/i386/kernel/head-xen.S    2006-06-26 09:51:32.000000000 +0200
3758 @@ -0,0 +1,171 @@
3759 +
3760 +
3761 +.text
3762 +#include <linux/config.h>
3763 +#include <linux/threads.h>
3764 +#include <linux/linkage.h>
3765 +#include <asm/segment.h>
3766 +#include <asm/page.h>
3767 +#include <asm/thread_info.h>
3768 +#include <asm/asm-offsets.h>
3769 +#include <xen/interface/arch-x86_32.h>
3770 +
3771 +/*
3772 + * References to members of the new_cpu_data structure.
3773 + */
3774 +
3775 +#define X86            new_cpu_data+CPUINFO_x86
3776 +#define X86_VENDOR     new_cpu_data+CPUINFO_x86_vendor
3777 +#define X86_MODEL      new_cpu_data+CPUINFO_x86_model
3778 +#define X86_MASK       new_cpu_data+CPUINFO_x86_mask
3779 +#define X86_HARD_MATH  new_cpu_data+CPUINFO_hard_math
3780 +#define X86_CPUID      new_cpu_data+CPUINFO_cpuid_level
3781 +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
3782 +#define X86_VENDOR_ID  new_cpu_data+CPUINFO_x86_vendor_id
3783 +
3784 +ENTRY(startup_32)
3785 +       movl %esi,xen_start_info
3786 +       cld
3787 +
3788 +       /* Set up the stack pointer */
3789 +       movl $(init_thread_union+THREAD_SIZE),%esp
3790 +
3791 +       /* get vendor info */
3792 +       xorl %eax,%eax                  # call CPUID with 0 -> return vendor ID
3793 +       XEN_CPUID
3794 +       movl %eax,X86_CPUID             # save CPUID level
3795 +       movl %ebx,X86_VENDOR_ID         # lo 4 chars
3796 +       movl %edx,X86_VENDOR_ID+4       # next 4 chars
3797 +       movl %ecx,X86_VENDOR_ID+8       # last 4 chars
3798 +
3799 +       movl $1,%eax            # Use the CPUID instruction to get CPU type
3800 +       XEN_CPUID
3801 +       movb %al,%cl            # save reg for future use
3802 +       andb $0x0f,%ah          # mask processor family
3803 +       movb %ah,X86
3804 +       andb $0xf0,%al          # mask model
3805 +       shrb $4,%al
3806 +       movb %al,X86_MODEL
3807 +       andb $0x0f,%cl          # mask stepping (mask revision)
3808 +       movb %cl,X86_MASK
3809 +       movl %edx,X86_CAPABILITY
3810 +
3811 +       movb $1,X86_HARD_MATH
3812 +
3813 +       xorl %eax,%eax                  # Clear FS/GS and LDT
3814 +       movl %eax,%fs
3815 +       movl %eax,%gs
3816 +       cld                     # gcc2 wants the direction flag cleared at all times
3817 +
3818 +       call start_kernel
3819 +L6:
3820 +       jmp L6                  # main should never return here, but
3821 +                               # just in case, we know what happens.
3822 +
3823 +#define HYPERCALL_PAGE_OFFSET 0x1000
3824 +.org HYPERCALL_PAGE_OFFSET
3825 +ENTRY(hypercall_page)
3826 +.skip 0x1000
3827 +
3828 +/*
3829 + * Real beginning of normal "text" segment
3830 + */
3831 +ENTRY(stext)
3832 +ENTRY(_stext)
3833 +
3834 +/*
3835 + * BSS section
3836 + */
3837 +.section ".bss.page_aligned","w"
3838 +ENTRY(empty_zero_page)
3839 +       .fill 4096,1,0
3840 +
3841 +/*
3842 + * This starts the data section.
3843 + */
3844 +.data
3845 +
3846 +/*
3847 + * The Global Descriptor Table contains 32 quadwords, per-CPU.
3848 + */
3849 +ENTRY(cpu_gdt_table)
3850 +       .quad 0x0000000000000000        /* NULL descriptor */
3851 +       .quad 0x0000000000000000        /* 0x0b reserved */
3852 +       .quad 0x0000000000000000        /* 0x13 reserved */
3853 +       .quad 0x0000000000000000        /* 0x1b reserved */
3854 +       .quad 0x0000000000000000        /* 0x20 unused */
3855 +       .quad 0x0000000000000000        /* 0x28 unused */
3856 +       .quad 0x0000000000000000        /* 0x33 TLS entry 1 */
3857 +       .quad 0x0000000000000000        /* 0x3b TLS entry 2 */
3858 +       .quad 0x0000000000000000        /* 0x43 TLS entry 3 */
3859 +       .quad 0x0000000000000000        /* 0x4b reserved */
3860 +       .quad 0x0000000000000000        /* 0x53 reserved */
3861 +       .quad 0x0000000000000000        /* 0x5b reserved */
3862 +
3863 +       .quad 0x00cf9a000000ffff        /* 0x60 kernel 4GB code at 0x00000000 */
3864 +       .quad 0x00cf92000000ffff        /* 0x68 kernel 4GB data at 0x00000000 */
3865 +       .quad 0x00cffa000000ffff        /* 0x73 user 4GB code at 0x00000000 */
3866 +       .quad 0x00cff2000000ffff        /* 0x7b user 4GB data at 0x00000000 */
3867 +
3868 +       .quad 0x0000000000000000        /* 0x80 TSS descriptor */
3869 +       .quad 0x0000000000000000        /* 0x88 LDT descriptor */
3870 +
3871 +       /*
3872 +        * Segments used for calling PnP BIOS have byte granularity.
3873 +        * Their code and data segments have fixed 64k limits;
3874 +        * the transfer segment sizes are set at run time.
3875 +        */
3876 +       .quad 0x0000000000000000        /* 0x90 32-bit code */
3877 +       .quad 0x0000000000000000        /* 0x98 16-bit code */
3878 +       .quad 0x0000000000000000        /* 0xa0 16-bit data */
3879 +       .quad 0x0000000000000000        /* 0xa8 16-bit data */
3880 +       .quad 0x0000000000000000        /* 0xb0 16-bit data */
3881 +
3882 +       /*
3883 +        * The APM segments have byte granularity and their bases
3884 +        * are set at run time.  All have 64k limits.
3885 +        */
3886 +       .quad 0x0000000000000000        /* 0xb8 APM CS    code */
3887 +       .quad 0x0000000000000000        /* 0xc0 APM CS 16 code (16 bit) */
3888 +       .quad 0x0000000000000000        /* 0xc8 APM DS    data */
3889 +
3890 +       .quad 0x0000000000000000        /* 0xd0 - ESPFIX 16-bit SS */
3891 +       .quad 0x0000000000000000        /* 0xd8 - unused */
3892 +       .quad 0x0000000000000000        /* 0xe0 - unused */
3893 +       .quad 0x0000000000000000        /* 0xe8 - unused */
3894 +       .quad 0x0000000000000000        /* 0xf0 - unused */
3895 +       .quad 0x0000000000000000        /* 0xf8 - GDT entry 31: double-fault TSS */
3896 +
3897 +/*
3898 + * __xen_guest information
3899 + */
3900 +.macro utoa value
3901 + .if (\value) < 0 || (\value) >= 0x10
3902 +       utoa (((\value)>>4)&0x0fffffff)
3903 + .endif
3904 + .if ((\value) & 0xf) < 10
3905 +  .byte '0' + ((\value) & 0xf)
3906 + .else
3907 +  .byte 'A' + ((\value) & 0xf) - 10
3908 + .endif
3909 +.endm
3910 +
3911 +.section __xen_guest
3912 +       .ascii  "GUEST_OS=linux,GUEST_VER=2.6"
3913 +       .ascii  ",XEN_VER=xen-3.0"
3914 +       .ascii  ",VIRT_BASE=0x"
3915 +               utoa __PAGE_OFFSET
3916 +       .ascii  ",HYPERCALL_PAGE=0x"
3917 +               utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
3918 +       .ascii  ",FEATURES=writable_page_tables"
3919 +       .ascii           "|writable_descriptor_tables"
3920 +       .ascii           "|auto_translated_physmap"
3921 +       .ascii           "|pae_pgdir_above_4gb"
3922 +       .ascii           "|supervisor_mode_kernel"
3923 +#ifdef CONFIG_X86_PAE
3924 +       .ascii  ",PAE=yes"
3925 +#else
3926 +       .ascii  ",PAE=no"
3927 +#endif
3928 +       .ascii  ",LOADER=generic"
3929 +       .byte   0
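
Editor's note: the recursive utoa assembler macro above converts a constant to upper-case hex text at assembly time, most significant nibble first. The same algorithm in C, as a sketch (utoa_hex is a hypothetical name):

/* Editor's sketch: the utoa macro's algorithm in C.  Recurses on the high
 * nibbles first, then appends this nibble as '0'-'9'/'A'-'F', exactly
 * mirroring the .if/.byte recursion above.  Returns the digit count. */
static int utoa_hex(unsigned long value, char *out)
{
        int n = 0;

        if (value >= 0x10)
                n = utoa_hex(value >> 4, out);
        out[n] = "0123456789ABCDEF"[value & 0xf];
        return n + 1;
}
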
3930 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/init_task-xen.c linux-2.6.16/arch/i386/kernel/init_task-xen.c
3931 --- linux-2.6.16.orig/arch/i386/kernel/init_task-xen.c  1970-01-01 01:00:00.000000000 +0100
3932 +++ linux-2.6.16/arch/i386/kernel/init_task-xen.c       2006-06-26 09:51:32.000000000 +0200
3933 @@ -0,0 +1,51 @@
3934 +#include <linux/mm.h>
3935 +#include <linux/module.h>
3936 +#include <linux/sched.h>
3937 +#include <linux/init.h>
3938 +#include <linux/init_task.h>
3939 +#include <linux/fs.h>
3940 +#include <linux/mqueue.h>
3941 +
3942 +#include <asm/uaccess.h>
3943 +#include <asm/pgtable.h>
3944 +#include <asm/desc.h>
3945 +
3946 +static struct fs_struct init_fs = INIT_FS;
3947 +static struct files_struct init_files = INIT_FILES;
3948 +static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
3949 +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
3950 +
3951 +#define swapper_pg_dir ((pgd_t *)NULL)
3952 +struct mm_struct init_mm = INIT_MM(init_mm);
3953 +#undef swapper_pg_dir
3954 +
3955 +EXPORT_SYMBOL(init_mm);
3956 +
3957 +/*
3958 + * Initial thread structure.
3959 + *
3960 + * We need to make sure that this is THREAD_SIZE aligned due to the
3961 + * way process stacks are handled. This is done by having a special
3962 + * "init_task" linker map entry..
3963 + */
3964 +union thread_union init_thread_union 
3965 +       __attribute__((__section__(".data.init_task"))) =
3966 +               { INIT_THREAD_INFO(init_task) };
3967 +
3968 +/*
3969 + * Initial task structure.
3970 + *
3971 + * All other task structs will be allocated on slabs in fork.c
3972 + */
3973 +struct task_struct init_task = INIT_TASK(init_task);
3974 +
3975 +EXPORT_SYMBOL(init_task);
3976 +
3977 +#ifndef CONFIG_X86_NO_TSS
3978 +/*
3979 + * per-CPU TSS segments. Threads are completely 'soft' on Linux,
3980 + * no more per-task TSS's.
3981 + */ 
3982 +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
3983 +#endif
3984 +
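
Editor's note: the THREAD_SIZE alignment that the init_thread_union comment above insists on is what lets the kernel recover thread_info by masking any stack address, as GET_THREAD_INFO does in the entry code earlier in this patch. A sketch of the lookup, assuming the stock 2.6.16 thread_info layout (ti_from_sp is a hypothetical name):

/* Editor's sketch: with THREAD_SIZE-aligned stacks, thread_info sits at
 * the base of the stack and is found by clearing the low bits of ESP. */
static inline struct thread_info *ti_from_sp(unsigned long sp)
{
        return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
}
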
3985 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/io_apic-xen.c linux-2.6.16/arch/i386/kernel/io_apic-xen.c
3986 --- linux-2.6.16.orig/arch/i386/kernel/io_apic-xen.c    1970-01-01 01:00:00.000000000 +0100
3987 +++ linux-2.6.16/arch/i386/kernel/io_apic-xen.c 2006-06-26 09:51:32.000000000 +0200
3988 @@ -0,0 +1,2747 @@
3989 +/*
3990 + *     Intel IO-APIC support for multi-Pentium hosts.
3991 + *
3992 + *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
3993 + *
3994 + *     Many thanks to Stig Venaas for trying out countless experimental
3995 + *     patches and reporting/debugging problems patiently!
3996 + *
3997 + *     (c) 1999, Multiple IO-APIC support, developed by
3998 + *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
3999 + *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
4000 + *     further tested and cleaned up by Zach Brown <zab@redhat.com>
4001 + *     and Ingo Molnar <mingo@redhat.com>
4002 + *
4003 + *     Fixes
4004 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
4005 + *                                     thanks to Eric Gilmore
4006 + *                                     and Rolf G. Tews
4007 + *                                     for testing these extensively
4008 + *     Paul Diefenbaugh        :       Added full ACPI support
4009 + */
4010 +
4011 +#include <linux/mm.h>
4012 +#include <linux/interrupt.h>
4013 +#include <linux/init.h>
4014 +#include <linux/delay.h>
4015 +#include <linux/sched.h>
4016 +#include <linux/config.h>
4017 +#include <linux/smp_lock.h>
4018 +#include <linux/mc146818rtc.h>
4019 +#include <linux/compiler.h>
4020 +#include <linux/acpi.h>
4021 +#include <linux/module.h>
4022 +#include <linux/sysdev.h>
4023 +
4024 +#include <asm/io.h>
4025 +#include <asm/smp.h>
4026 +#include <asm/desc.h>
4027 +#include <asm/timer.h>
4028 +#include <asm/i8259.h>
4029 +
4030 +#include <mach_apic.h>
4031 +
4032 +#include "io_ports.h"
4033 +
4034 +#ifdef CONFIG_XEN
4035 +
4036 +#include <xen/interface/xen.h>
4037 +#include <xen/interface/physdev.h>
4038 +
4039 +/* Fake i8259 */
4040 +#define make_8259A_irq(_irq)     (io_apic_irqs &= ~(1UL<<(_irq)))
4041 +#define disable_8259A_irq(_irq)  ((void)0)
4042 +#define i8259A_irq_pending(_irq) (0)
4043 +
4044 +unsigned long io_apic_irqs;
4045 +
4046 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
4047 +{
4048 +       physdev_op_t op;
4049 +       int ret;
4050 +
4051 +       op.cmd = PHYSDEVOP_APIC_READ;
4052 +       op.u.apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
4053 +       op.u.apic_op.reg = reg;
4054 +       ret = HYPERVISOR_physdev_op(&op);
4055 +       if (ret)
4056 +               return ret;
4057 +       return op.u.apic_op.value;
4058 +}
4059 +
4060 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
4061 +{
4062 +       physdev_op_t op;
4063 +
4064 +       op.cmd = PHYSDEVOP_APIC_WRITE;
4065 +       op.u.apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
4066 +       op.u.apic_op.reg = reg;
4067 +       op.u.apic_op.value = value;
4068 +       HYPERVISOR_physdev_op(&op);
4069 +}
4070 +
4071 +#define io_apic_read(a,r)    xen_io_apic_read(a,r)
4072 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
4073 +
4074 +#endif /* CONFIG_XEN */
4075 +
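
Editor's note: under CONFIG_XEN the io_apic_read()/io_apic_write() macros above turn every IO-APIC register access into a PHYSDEVOP hypercall instead of an MMIO access. A usage sketch, reading the version field of IO-APIC register 0x01 through the wrapper (ioapic_version is a hypothetical helper, not defined by this patch):

/* Editor's sketch: register 0x01 of an IO-APIC holds its version in the
 * low byte; under Xen this read is serviced by PHYSDEVOP_APIC_READ. */
static unsigned int ioapic_version(unsigned int apic)
{
        return io_apic_read(apic, 0x01) & 0xff;
}
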
4076 +int (*ioapic_renumber_irq)(int ioapic, int irq);
4077 +atomic_t irq_mis_count;
4078 +
4079 +/* Where if anywhere is the i8259 connect in external int mode */
4080 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
4081 +
4082 +static DEFINE_SPINLOCK(ioapic_lock);
4083 +
4084 +int timer_over_8254 __initdata = 1;
4085 +
4086 +/*
4087 + *     Is the SiS APIC rmw bug present ?
4088 + *     -1 = don't know, 0 = no, 1 = yes
4089 + */
4090 +int sis_apic_bug = -1;
4091 +
4092 +/*
4093 + * # of IRQ routing registers
4094 + */
4095 +int nr_ioapic_registers[MAX_IO_APICS];
4096 +
4097 +int disable_timer_pin_1 __initdata;
4098 +
4099 +/*
4100 + * Rough estimate of how many shared IRQs there are; this can
4101 + * be changed at any time.
4102 + */
4103 +#define MAX_PLUS_SHARED_IRQS NR_IRQS
4104 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
4105 +
4106 +/*
4107 + * This is performance-critical, we want to do it O(1)
4108 + *
4109 + * the indexing order of this array favors 1:1 mappings
4110 + * between pins and IRQs.
4111 + */
4112 +
4113 +static struct irq_pin_list {
4114 +       int apic, pin, next;
4115 +} irq_2_pin[PIN_MAP_SIZE];
4116 +
4117 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
4118 +#ifdef CONFIG_PCI_MSI
4119 +#define vector_to_irq(vector)  \
4120 +       (platform_legacy_irq(vector) ? vector : vector_irq[vector])
4121 +#else
4122 +#define vector_to_irq(vector)  (vector)
4123 +#endif
4124 +
4125 +/*
4126 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
4127 + * shared ISA-space IRQs, so we have to support them. We are super
4128 + * fast in the common case, and fast for shared ISA-space IRQs.
4129 + */
4130 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
4131 +{
4132 +       static int first_free_entry = NR_IRQS;
4133 +       struct irq_pin_list *entry = irq_2_pin + irq;
4134 +
4135 +       while (entry->next)
4136 +               entry = irq_2_pin + entry->next;
4137 +
4138 +       if (entry->pin != -1) {
4139 +               entry->next = first_free_entry;
4140 +               entry = irq_2_pin + entry->next;
4141 +               if (++first_free_entry >= PIN_MAP_SIZE)
4142 +                       panic("io_apic.c: whoops");
4143 +       }
4144 +       entry->apic = apic;
4145 +       entry->pin = pin;
4146 +}
4147 +
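
Editor's note: add_pin_to_irq() above links additional (apic, pin) pairs for an IRQ through irq_2_pin[] by array index rather than by pointer, and the routines further down walk that chain the same way. A sketch of the traversal (for_each_pin is a hypothetical name):

/* Editor's sketch: walking the irq_2_pin chain built by add_pin_to_irq().
 * Entries are linked by array index via ->next; pin == -1 marks an
 * unused head entry.  Same loop shape as __modify_IO_APIC_irq() below. */
static void for_each_pin(unsigned int irq, void (*fn)(int apic, int pin))
{
        struct irq_pin_list *entry = irq_2_pin + irq;

        for (;;) {
                if (entry->pin != -1)
                        fn(entry->apic, entry->pin);
                if (!entry->next)
                        break;
                entry = irq_2_pin + entry->next;
        }
}
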
4148 +#ifdef CONFIG_XEN
4149 +#define clear_IO_APIC() ((void)0)
4150 +#else
4151 +/*
4152 + * Reroute an IRQ to a different pin.
4153 + */
4154 +static void __init replace_pin_at_irq(unsigned int irq,
4155 +                                     int oldapic, int oldpin,
4156 +                                     int newapic, int newpin)
4157 +{
4158 +       struct irq_pin_list *entry = irq_2_pin + irq;
4159 +
4160 +       while (1) {
4161 +               if (entry->apic == oldapic && entry->pin == oldpin) {
4162 +                       entry->apic = newapic;
4163 +                       entry->pin = newpin;
4164 +               }
4165 +               if (!entry->next)
4166 +                       break;
4167 +               entry = irq_2_pin + entry->next;
4168 +       }
4169 +}
4170 +
4171 +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
4172 +{
4173 +       struct irq_pin_list *entry = irq_2_pin + irq;
4174 +       unsigned int pin, reg;
4175 +
4176 +       for (;;) {
4177 +               pin = entry->pin;
4178 +               if (pin == -1)
4179 +                       break;
4180 +               reg = io_apic_read(entry->apic, 0x10 + pin*2);
4181 +               reg &= ~disable;
4182 +               reg |= enable;
4183 +               io_apic_modify(entry->apic, 0x10 + pin*2, reg);
4184 +               if (!entry->next)
4185 +                       break;
4186 +               entry = irq_2_pin + entry->next;
4187 +       }
4188 +}
4189 +
4190 +/* mask = 1 */
4191 +static void __mask_IO_APIC_irq (unsigned int irq)
4192 +{
4193 +       __modify_IO_APIC_irq(irq, 0x00010000, 0);
4194 +}
4195 +
4196 +/* mask = 0 */
4197 +static void __unmask_IO_APIC_irq (unsigned int irq)
4198 +{
4199 +       __modify_IO_APIC_irq(irq, 0, 0x00010000);
4200 +}
4201 +
4202 +/* mask = 1, trigger = 0 */
4203 +static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
4204 +{
4205 +       __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
4206 +}
4207 +
4208 +/* mask = 0, trigger = 1 */
4209 +static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
4210 +{
4211 +       __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
4212 +}
4213 +
4214 +static void mask_IO_APIC_irq (unsigned int irq)
4215 +{
4216 +       unsigned long flags;
4217 +
4218 +       spin_lock_irqsave(&ioapic_lock, flags);
4219 +       __mask_IO_APIC_irq(irq);
4220 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4221 +}
4222 +
4223 +static void unmask_IO_APIC_irq (unsigned int irq)
4224 +{
4225 +       unsigned long flags;
4226 +
4227 +       spin_lock_irqsave(&ioapic_lock, flags);
4228 +       __unmask_IO_APIC_irq(irq);
4229 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4230 +}
4231 +
4232 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
4233 +{
4234 +       struct IO_APIC_route_entry entry;
4235 +       unsigned long flags;
4236 +       
4237 +       /* Check delivery_mode to be sure we're not clearing an SMI pin */
4238 +       spin_lock_irqsave(&ioapic_lock, flags);
4239 +       *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
4240 +       *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
4241 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4242 +       if (entry.delivery_mode == dest_SMI)
4243 +               return;
4244 +
4245 +       /*
4246 +        * Disable it in the IO-APIC irq-routing table:
4247 +        */
4248 +       memset(&entry, 0, sizeof(entry));
4249 +       entry.mask = 1;
4250 +       spin_lock_irqsave(&ioapic_lock, flags);
4251 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
4252 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
4253 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4254 +}
4255 +
4256 +static void clear_IO_APIC (void)
4257 +{
4258 +       int apic, pin;
4259 +
4260 +       for (apic = 0; apic < nr_ioapics; apic++)
4261 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
4262 +                       clear_IO_APIC_pin(apic, pin);
4263 +}
4264 +
4265 +#ifdef CONFIG_SMP
4266 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
4267 +{
4268 +       unsigned long flags;
4269 +       int pin;
4270 +       struct irq_pin_list *entry = irq_2_pin + irq;
4271 +       unsigned int apicid_value;
4272 +       cpumask_t tmp;
4273 +       
4274 +       cpus_and(tmp, cpumask, cpu_online_map);
4275 +       if (cpus_empty(tmp))
4276 +               tmp = TARGET_CPUS;
4277 +
4278 +       cpus_and(cpumask, tmp, CPU_MASK_ALL);
4279 +
4280 +       apicid_value = cpu_mask_to_apicid(cpumask);
4281 +       /* Prepare to do the io_apic_write */
4282 +       apicid_value = apicid_value << 24;
4283 +       spin_lock_irqsave(&ioapic_lock, flags);
4284 +       for (;;) {
4285 +               pin = entry->pin;
4286 +               if (pin == -1)
4287 +                       break;
4288 +               io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
4289 +               if (!entry->next)
4290 +                       break;
4291 +               entry = irq_2_pin + entry->next;
4292 +       }
4293 +       set_irq_info(irq, cpumask);
4294 +       spin_unlock_irqrestore(&ioapic_lock, flags);
4295 +}
4296 +
4297 +#if defined(CONFIG_IRQBALANCE)
4298 +# include <asm/processor.h>    /* kernel_thread() */
4299 +# include <linux/kernel_stat.h>        /* kstat */
4300 +# include <linux/slab.h>               /* kmalloc() */
4301 +# include <linux/timer.h>      /* time_after() */
4302 +
4303 +# ifdef CONFIG_BALANCED_IRQ_DEBUG
4304 +#  define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
4305 +#  define Dprintk(x...) do { TDprintk(x); } while (0)
4306 +# else
4307 +#  define TDprintk(x...) 
4308 +#  define Dprintk(x...) 
4309 +# endif
4310 +
4311 +
4312 +#define IRQBALANCE_CHECK_ARCH -999
4313 +static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
4314 +static int physical_balance = 0;
4315 +
4316 +static struct irq_cpu_info {
4317 +       unsigned long * last_irq;
4318 +       unsigned long * irq_delta;
4319 +       unsigned long irq;
4320 +} irq_cpu_data[NR_CPUS];
4321 +
4322 +#define CPU_IRQ(cpu)           (irq_cpu_data[cpu].irq)
4323 +#define LAST_CPU_IRQ(cpu,irq)   (irq_cpu_data[cpu].last_irq[irq])
4324 +#define IRQ_DELTA(cpu,irq)     (irq_cpu_data[cpu].irq_delta[irq])
4325 +
4326 +#define IDLE_ENOUGH(cpu,now) \
4327 +       (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
4328 +
4329 +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
4330 +
4331 +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
4332 +
4333 +#define MAX_BALANCED_IRQ_INTERVAL      (5*HZ)
4334 +#define MIN_BALANCED_IRQ_INTERVAL      (HZ/2)
4335 +#define BALANCED_IRQ_MORE_DELTA                (HZ/10)
4336 +#define BALANCED_IRQ_LESS_DELTA                (HZ)
4337 +
4338 +static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
4339 +
4340 +static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
4341 +                       unsigned long now, int direction)
4342 +{
4343 +       int search_idle = 1;
4344 +       int cpu = curr_cpu;
4345 +
4346 +       goto inside;
4347 +
4348 +       do {
4349 +               if (unlikely(cpu == curr_cpu))
4350 +                       search_idle = 0;
4351 +inside:
4352 +               if (direction == 1) {
4353 +                       cpu++;
4354 +                       if (cpu >= NR_CPUS)
4355 +                               cpu = 0;
4356 +               } else {
4357 +                       cpu--;
4358 +                       if (cpu == -1)
4359 +                               cpu = NR_CPUS-1;
4360 +               }
4361 +       } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
4362 +                       (search_idle && !IDLE_ENOUGH(cpu,now)));
4363 +
4364 +       return cpu;
4365 +}
4366 +
4367 +static inline void balance_irq(int cpu, int irq)
4368 +{
4369 +       unsigned long now = jiffies;
4370 +       cpumask_t allowed_mask;
4371 +       unsigned int new_cpu;
4372 +               
4373 +       if (irqbalance_disabled)
4374 +               return; 
4375 +
4376 +       cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
4377 +       new_cpu = move(cpu, allowed_mask, now, 1);
4378 +       if (cpu != new_cpu) {
4379 +               set_pending_irq(irq, cpumask_of_cpu(new_cpu));
4380 +       }
4381 +}
4382 +
4383 +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
4384 +{
4385 +       int i, j;
4386 +       Dprintk("Rotating IRQs among CPUs.\n");
4387 +       for (i = 0; i < NR_CPUS; i++) {
4388 +               for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) {
4389 +                       if (!irq_desc[j].action)
4390 +                               continue;
4391 +                       /* Is it a significant load ?  */
4392 +                       if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
4393 +                                               useful_load_threshold)
4394 +                               continue;
4395 +                       balance_irq(i, j);
4396 +               }
4397 +       }
4398 +       balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
4399 +               balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
4400 +       return;
4401 +}
4402 +
4403 +static void do_irq_balance(void)
4404 +{
4405 +       int i, j;
4406 +       unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
4407 +       unsigned long move_this_load = 0;
4408 +       int max_loaded = 0, min_loaded = 0;
4409 +       int load;
4410 +       unsigned long useful_load_threshold = balanced_irq_interval + 10;
4411 +       int selected_irq;
4412 +       int tmp_loaded, first_attempt = 1;
4413 +       unsigned long tmp_cpu_irq;
4414 +       unsigned long imbalance = 0;
4415 +       cpumask_t allowed_mask, target_cpu_mask, tmp;
4416 +
4417 +       for (i = 0; i < NR_CPUS; i++) {
4418 +               int package_index;
4419 +               CPU_IRQ(i) = 0;
4420 +               if (!cpu_online(i))
4421 +                       continue;
4422 +               package_index = CPU_TO_PACKAGEINDEX(i);
4423 +               for (j = 0; j < NR_IRQS; j++) {
4424 +                       unsigned long value_now, delta;
4425 +                       /* Is this an active IRQ? */
4426 +                       if (!irq_desc[j].action)
4427 +                               continue;
4428 +                       if ( package_index == i )
4429 +                               IRQ_DELTA(package_index,j) = 0;
4430 +                       /* Determine the total count per processor per IRQ */
4431 +                       value_now = (unsigned long) kstat_cpu(i).irqs[j];
4432 +
4433 +                       /* Determine the activity per processor per IRQ */
4434 +                       delta = value_now - LAST_CPU_IRQ(i,j);
4435 +
4436 +                       /* Update last_cpu_irq[][] for the next time */
4437 +                       LAST_CPU_IRQ(i,j) = value_now;
4438 +
4439 +                       /* Ignore IRQs whose rate is less than the clock */
4440 +                       if (delta < useful_load_threshold)
4441 +                               continue;
4442 +                       /* update the load for the processor or package total */
4443 +                       IRQ_DELTA(package_index,j) += delta;
4444 +
4445 +                       /* Keep track of the higher numbered sibling as well */
4446 +                       if (i != package_index)
4447 +                               CPU_IRQ(i) += delta;
4448 +                       /*
4449 +                        * We have sibling A and sibling B in the package
4450 +                        *
4451 +                        * cpu_irq[A] = load for cpu A + load for cpu B
4452 +                        * cpu_irq[B] = load for cpu B
4453 +                        */
4454 +                       CPU_IRQ(package_index) += delta;
4455 +               }
4456 +       }
4457 +       /* Find the least loaded processor package */
4458 +       for (i = 0; i < NR_CPUS; i++) {
4459 +               if (!cpu_online(i))
4460 +                       continue;
4461 +               if (i != CPU_TO_PACKAGEINDEX(i))
4462 +                       continue;
4463 +               if (min_cpu_irq > CPU_IRQ(i)) {
4464 +                       min_cpu_irq = CPU_IRQ(i);
4465 +                       min_loaded = i;
4466 +               }
4467 +       }
4468 +       max_cpu_irq = ULONG_MAX;
4469 +
4470 +tryanothercpu:
4471 +       /* Look for heaviest loaded processor.
4472 +        * We may come back to get the next heaviest loaded processor.
4473 +        * Skip processors with trivial loads.
4474 +        */
4475 +       tmp_cpu_irq = 0;
4476 +       tmp_loaded = -1;
4477 +       for (i = 0; i < NR_CPUS; i++) {
4478 +               if (!cpu_online(i))
4479 +                       continue;
4480 +               if (i != CPU_TO_PACKAGEINDEX(i))
4481 +                       continue;
4482 +               if (max_cpu_irq <= CPU_IRQ(i)) 
4483 +                       continue;
4484 +               if (tmp_cpu_irq < CPU_IRQ(i)) {
4485 +                       tmp_cpu_irq = CPU_IRQ(i);
4486 +                       tmp_loaded = i;
4487 +               }
4488 +       }
4489 +
4490 +       if (tmp_loaded == -1) {
4491 +        /* A small number of heavy interrupt sources can end up
4492 +         * loading some of the cpus too much. We use Ingo's original
4493 +         * approach to rotate them around.
4494 +         */
4495 +               if (!first_attempt && imbalance >= useful_load_threshold) {
4496 +                       rotate_irqs_among_cpus(useful_load_threshold);
4497 +                       return;
4498 +               }
4499 +               goto not_worth_the_effort;
4500 +       }
4501 +       
4502 +       first_attempt = 0;              /* heaviest search */
4503 +       max_cpu_irq = tmp_cpu_irq;      /* load */
4504 +       max_loaded = tmp_loaded;        /* processor */
4505 +       imbalance = (max_cpu_irq - min_cpu_irq) / 2;
4506 +       
4507 +       Dprintk("max_loaded cpu = %d\n", max_loaded);
4508 +       Dprintk("min_loaded cpu = %d\n", min_loaded);
4509 +       Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
4510 +       Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
4511 +       Dprintk("load imbalance = %lu\n", imbalance);
4512 +
4513 +       /* If the imbalance is less than one eighth (~12%) of the max
4514 +        * load, we are into diminishing returns - quit.
4515 +        */
4516 +       if (imbalance < (max_cpu_irq >> 3)) {
4517 +               Dprintk("Imbalance too trivial\n");
4518 +               goto not_worth_the_effort;
4519 +       }
4520 +
4521 +tryanotherirq:
4522 +       /* if we select an IRQ to move that can't go where we want, then
4523 +        * see if there is another one to try.
4524 +        */
4525 +       move_this_load = 0;
4526 +       selected_irq = -1;
4527 +       for (j = 0; j < NR_IRQS; j++) {
4528 +               /* Is this an active IRQ? */
4529 +               if (!irq_desc[j].action)
4530 +                       continue;
4531 +               if (imbalance <= IRQ_DELTA(max_loaded,j))
4532 +                       continue;
4533 +               /* Try to find the IRQ that is closest to the imbalance
4534 +                * without going over.
4535 +                */
4536 +               if (move_this_load < IRQ_DELTA(max_loaded,j)) {
4537 +                       move_this_load = IRQ_DELTA(max_loaded,j);
4538 +                       selected_irq = j;
4539 +               }
4540 +       }
4541 +       if (selected_irq == -1) {
4542 +               goto tryanothercpu;
4543 +       }
4544 +
4545 +       imbalance = move_this_load;
4546 +       
4547 +       /* For physical_balance case, we accumulated both load
4548 +        * values in the one of the siblings cpu_irq[],
4549 +        * to use the same code for physical and logical processors
4550 +        * as much as possible. 
4551 +        *
4552 +        * NOTE: the cpu_irq[] array holds the sum of the load for
4553 +        * sibling A and sibling B in the slot for the lowest numbered
4554 +        * sibling (A), _AND_ the load for sibling B in the slot for
4555 +        * the higher numbered sibling.
4556 +        *
4557 +        * We seek the least loaded sibling by making the comparison
4558 +        * (A+B)/2 vs B (worked example after this function).
4559 +        */
4560 +       load = CPU_IRQ(min_loaded) >> 1;
4561 +       for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
4562 +               if (load > CPU_IRQ(j)) {
4563 +                       /* This won't change cpu_sibling_map[min_loaded] */
4564 +                       load = CPU_IRQ(j);
4565 +                       min_loaded = j;
4566 +               }
4567 +       }
4568 +
4569 +       cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
4570 +       target_cpu_mask = cpumask_of_cpu(min_loaded);
4571 +       cpus_and(tmp, target_cpu_mask, allowed_mask);
4572 +
4573 +       if (!cpus_empty(tmp)) {
4574 +
4575 +               Dprintk("irq = %d moved to cpu = %d\n",
4576 +                               selected_irq, min_loaded);
4577 +               /* mark for change destination */
4578 +               set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
4579 +
4580 +               /* Since we made a change, come back sooner to 
4581 +                * check for more variation.
4582 +                */
4583 +               balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
4584 +                       balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
4585 +               return;
4586 +       }
4587 +       goto tryanotherirq;
4588 +
4589 +not_worth_the_effort:
4590 +       /*
4591 +        * if we did not find an IRQ to move, then adjust the time interval
4592 +        * upward
4593 +        */
4594 +       balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
4595 +               balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);       
4596 +       Dprintk("IRQ worth rotating not found\n");
4597 +       return;
4598 +}
4599 +
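
Editor's worked example for the (A+B)/2 vs B comparison documented inside do_irq_balance() above, under the slot convention described there (sibling_b_less_loaded is a hypothetical helper, not part of the patch):

/* Editor's sketch: slot_a = A+B (lowest-numbered sibling's CPU_IRQ slot),
 * slot_b = B.  e.g. with A = 400 and B = 200, slot_a = 600 and slot_b =
 * 200; slot_a >> 1 = 300 > 200, so sibling B is the less loaded target. */
static int sibling_b_less_loaded(unsigned long slot_a, unsigned long slot_b)
{
        return (slot_a >> 1) > slot_b;
}
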
4600 +static int balanced_irq(void *unused)
4601 +{
4602 +       int i;
4603 +       unsigned long prev_balance_time = jiffies;
4604 +       long time_remaining = balanced_irq_interval;
4605 +
4606 +       daemonize("kirqd");
4607 +       
4608 +       /* push everything to CPU 0 to give us a starting point.  */
4609 +       for (i = 0 ; i < NR_IRQS ; i++) {
4610 +               pending_irq_cpumask[i] = cpumask_of_cpu(0);
4611 +               set_pending_irq(i, cpumask_of_cpu(0));
4612 +       }
4613 +
4614 +       for ( ; ; ) {
4615 +               time_remaining = schedule_timeout_interruptible(time_remaining);
4616 +               try_to_freeze();
4617 +               if (time_after(jiffies,
4618 +                               prev_balance_time+balanced_irq_interval)) {
4619 +                       preempt_disable();
4620 +                       do_irq_balance();
4621 +                       prev_balance_time = jiffies;
4622 +                       time_remaining = balanced_irq_interval;
4623 +                       preempt_enable();
4624 +               }
4625 +       }
4626 +       return 0;
4627 +}
4628 +
4629 +static int __init balanced_irq_init(void)
4630 +{
4631 +       int i;
4632 +       struct cpuinfo_x86 *c;
4633 +       cpumask_t tmp;
4634 +
4635 +       cpus_shift_right(tmp, cpu_online_map, 2);
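+       /* tmp is non-empty iff some CPU with id >= 2 is online, i.e. (with
+        * two siblings per package) more than one physical package. */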
4636 +        c = &boot_cpu_data;
4637 +       /* When not overridden on the command line, ask the subarchitecture. */
4638 +       if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
4639 +               irqbalance_disabled = NO_BALANCE_IRQ;
4640 +       if (irqbalance_disabled)
4641 +               return 0;
4642 +       
4643 +        /* disable irqbalance completely if there is only one processor online */
4644 +       if (num_online_cpus() < 2) {
4645 +               irqbalance_disabled = 1;
4646 +               return 0;
4647 +       }
4648 +       /*
4649 +        * Enable physical balance only if more than 1 physical processor
4650 +        * is present
4651 +        */
4652 +       if (smp_num_siblings > 1 && !cpus_empty(tmp))
4653 +               physical_balance = 1;
4654 +
4655 +       for (i = 0; i < NR_CPUS; i++) {
4656 +               if (!cpu_online(i))
4657 +                       continue;
4658 +               irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
4659 +               irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
4660 +               if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
4661 +                       printk(KERN_ERR "balanced_irq_init: out of memory\n");
4662 +                       goto failed;
4663 +               }
4664 +               memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
4665 +               memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
4666 +       }
4667 +       
4668 +       printk(KERN_INFO "Starting balanced_irq\n");
4669 +       if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) 
4670 +               return 0;
4671 +       else 
4672 +               printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq\n");
4673 +failed:
4674 +       for (i = 0; i < NR_CPUS; i++) {
4675 +               kfree(irq_cpu_data[i].irq_delta);
4676 +               kfree(irq_cpu_data[i].last_irq);
4677 +       }
4678 +       return 0;
4679 +}
4680 +
4681 +int __init irqbalance_disable(char *str)
4682 +{
4683 +       irqbalance_disabled = 1;
4684 +       return 0;
4685 +}
4686 +
4687 +__setup("noirqbalance", irqbalance_disable);
4688 +
4689 +late_initcall(balanced_irq_init);
4690 +#endif /* CONFIG_IRQBALANCE */
4691 +#endif /* CONFIG_SMP */
4692 +#endif
4693 +
4694 +#ifndef CONFIG_SMP
4695 +void fastcall send_IPI_self(int vector)
4696 +{
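+       /* Under CONFIG_XEN this is a no-op: a paravirtualized guest has no
+        * direct access to the local APIC ICR written below. */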
4697 +#ifndef CONFIG_XEN
4698 +       unsigned int cfg;
4699 +
4700 +       /*
4701 +        * Wait for idle.
4702 +        */
4703 +       apic_wait_icr_idle();
4704 +       cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
4705 +       /*
4706 +        * Send the IPI. The write to APIC_ICR fires this off.
4707 +        */
4708 +       apic_write_around(APIC_ICR, cfg);
4709 +#endif
4710 +}
4711 +#endif /* !CONFIG_SMP */
4712 +
4713 +
4714 +/*
4715 + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
4716 + * specific CPU-side IRQs.
4717 + */
4718 +
4719 +#define MAX_PIRQS 8
4720 +static int pirq_entries [MAX_PIRQS];
4721 +static int pirqs_enabled;
4722 +int skip_ioapic_setup;
4723 +
4724 +static int __init ioapic_setup(char *str)
4725 +{
4726 +       skip_ioapic_setup = 1;
4727 +       return 1;
4728 +}
4729 +
4730 +__setup("noapic", ioapic_setup);
4731 +
4732 +static int __init ioapic_pirq_setup(char *str)
4733 +{
4734 +       int i, max;
4735 +       int ints[MAX_PIRQS+1];
4736 +
4737 +       get_options(str, ARRAY_SIZE(ints), ints);
4738 +
4739 +       for (i = 0; i < MAX_PIRQS; i++)
4740 +               pirq_entries[i] = -1;
4741 +
4742 +       pirqs_enabled = 1;
4743 +       apic_printk(APIC_VERBOSE, KERN_INFO
4744 +                       "PIRQ redirection, working around broken MP-BIOS.\n");
4745 +       max = MAX_PIRQS;
4746 +       if (ints[0] < MAX_PIRQS)
4747 +               max = ints[0];
4748 +
4749 +       for (i = 0; i < max; i++) {
4750 +               apic_printk(APIC_VERBOSE, KERN_DEBUG
4751 +                               "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
4752 +               /*
4753 +                * PIRQs are mapped upside down, usually.
4754 +                */
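+               /* e.g. booting with "pirq=10,11" sets PIRQ7 -> IRQ 10 and
+                * PIRQ6 -> IRQ 11. */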
4755 +               pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
4756 +       }
4757 +       return 1;
4758 +}
4759 +
4760 +__setup("pirq=", ioapic_pirq_setup);
4761 +
4762 +/*
4763 + * Find the IRQ entry number of a certain pin.
4764 + */
4765 +static int find_irq_entry(int apic, int pin, int type)
4766 +{
4767 +       int i;
4768 +
4769 +       for (i = 0; i < mp_irq_entries; i++)
4770 +               if (mp_irqs[i].mpc_irqtype == type &&
4771 +                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
4772 +                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
4773 +                   mp_irqs[i].mpc_dstirq == pin)
4774 +                       return i;
4775 +
4776 +       return -1;
4777 +}
4778 +
4779 +/*
4780 + * Find the pin to which IRQ[irq] (ISA) is connected
4781 + */
4782 +static int __init find_isa_irq_pin(int irq, int type)
4783 +{
4784 +       int i;
4785 +
4786 +       for (i = 0; i < mp_irq_entries; i++) {
4787 +               int lbus = mp_irqs[i].mpc_srcbus;
4788 +
4789 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
4790 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
4791 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
4792 +                    mp_bus_id_to_type[lbus] == MP_BUS_NEC98
4793 +                   ) &&
4794 +                   (mp_irqs[i].mpc_irqtype == type) &&
4795 +                   (mp_irqs[i].mpc_srcbusirq == irq))
4796 +
4797 +                       return mp_irqs[i].mpc_dstirq;
4798 +       }
4799 +       return -1;
4800 +}
4801 +
4802 +static int __init find_isa_irq_apic(int irq, int type)
4803 +{
4804 +       int i;
4805 +
4806 +       for (i = 0; i < mp_irq_entries; i++) {
4807 +               int lbus = mp_irqs[i].mpc_srcbus;
4808 +
4809 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
4810 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
4811 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
4812 +                    mp_bus_id_to_type[lbus] == MP_BUS_NEC98
4813 +                   ) &&
4814 +                   (mp_irqs[i].mpc_irqtype == type) &&
4815 +                   (mp_irqs[i].mpc_srcbusirq == irq))
4816 +                       break;
4817 +       }
4818 +       if (i < mp_irq_entries) {
4819 +               int apic;
4820 +               for(apic = 0; apic < nr_ioapics; apic++) {
4821 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
4822 +                               return apic;
4823 +               }
4824 +       }
4825 +
4826 +       return -1;
4827 +}
4828 +
4829 +/*
4830 + * Find a specific PCI IRQ entry.
4831 + * Not an __init, possibly needed by modules
4832 + */
4833 +static int pin_2_irq(int idx, int apic, int pin);
4834 +
4835 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
4836 +{
4837 +       int apic, i, best_guess = -1;
4838 +
4839 +       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
4840 +               "slot:%d, pin:%d.\n", bus, slot, pin);
4841 +       if (mp_bus_id_to_pci_bus[bus] == -1) {
4842 +               printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
4843 +               return -1;
4844 +       }
4845 +       for (i = 0; i < mp_irq_entries; i++) {
4846 +               int lbus = mp_irqs[i].mpc_srcbus;
4847 +
4848 +               for (apic = 0; apic < nr_ioapics; apic++)
4849 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
4850 +                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
4851 +                               break;
4852 +
4853 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
4854 +                   !mp_irqs[i].mpc_irqtype &&
4855 +                   (bus == lbus) &&
4856 +                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
4857 +                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
4858 +
4859 +                       if (!(apic || IO_APIC_IRQ(irq)))
4860 +                               continue;
4861 +
4862 +                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
4863 +                               return irq;
4864 +                       /*
4865 +                        * Use the first all-but-pin matching entry as a
4866 +                        * best-guess fuzzy result for broken mptables.
4867 +                        */
4868 +                       if (best_guess < 0)
4869 +                               best_guess = irq;
4870 +               }
4871 +       }
4872 +       return best_guess;
4873 +}
4874 +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
4875 +
4876 +/*
4877 + * This function is currently only a helper for the i386 SMP boot process, where
4878 + * we need to reprogram the ioredtbls to cater for the CPUs which have come online,
4879 + * so the mask in all cases should simply be TARGET_CPUS.
4880 + */
4881 +#ifdef CONFIG_SMP
4882 +#ifndef CONFIG_XEN
4883 +void __init setup_ioapic_dest(void)
4884 +{
4885 +       int pin, ioapic, irq, irq_entry;
4886 +
4887 +       if (skip_ioapic_setup == 1)
4888 +               return;
4889 +
4890 +       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
4891 +               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4892 +                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4893 +                       if (irq_entry == -1)
4894 +                               continue;
4895 +                       irq = pin_2_irq(irq_entry, ioapic, pin);
4896 +                       set_ioapic_affinity_irq(irq, TARGET_CPUS);
4897 +               }
4898 +
4899 +       }
4900 +}
4901 +#endif /* !CONFIG_XEN */
4902 +#endif
4903 +
4904 +/*
4905 + * EISA Edge/Level control register, ELCR
4906 + */
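+/* The two ELCR ports (0x4d0 for IRQs 0-7, 0x4d1 for IRQs 8-15) hold one
+ * trigger bit per IRQ; e.g. IRQ 10 reads port 0x4d0 + (10 >> 3) = 0x4d1,
+ * bit 10 & 7 = 2. */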
4907 +static int EISA_ELCR(unsigned int irq)
4908 +{
4909 +       if (irq < 16) {
4910 +               unsigned int port = 0x4d0 + (irq >> 3);
4911 +               return (inb(port) >> (irq & 7)) & 1;
4912 +       }
4913 +       apic_printk(APIC_VERBOSE, KERN_INFO
4914 +                       "Broken MPtable reports ISA irq %d\n", irq);
4915 +       return 0;
4916 +}
4917 +
4918 +/* EISA interrupts are always polarity zero and can be edge or level
4919 + * trigger depending on the ELCR value.  If an interrupt is listed as
4920 + * EISA conforming in the MP table, that means its trigger type must
4921 + * be read in from the ELCR */
4922 +
4923 +#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
4924 +#define default_EISA_polarity(idx)     (0)
4925 +
4926 +/* ISA interrupts are always polarity zero edge triggered,
4927 + * when listed as conforming in the MP table. */
4928 +
4929 +#define default_ISA_trigger(idx)       (0)
4930 +#define default_ISA_polarity(idx)      (0)
4931 +
4932 +/* PCI interrupts are always polarity one level triggered,
4933 + * when listed as conforming in the MP table. */
4934 +
4935 +#define default_PCI_trigger(idx)       (1)
4936 +#define default_PCI_polarity(idx)      (1)
4937 +
4938 +/* MCA interrupts are always polarity zero level triggered,
4939 + * when listed as conforming in the MP table. */
4940 +
4941 +#define default_MCA_trigger(idx)       (1)
4942 +#define default_MCA_polarity(idx)      (0)
4943 +
4944 +/* NEC98 interrupts are always polarity zero edge triggered,
4945 + * when listed as conforming in the MP table. */
4946 +
4947 +#define default_NEC98_trigger(idx)     (0)
4948 +#define default_NEC98_polarity(idx)    (0)
4949 +
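+/* In the MP table, mpc_irqflag bits 0-1 encode the polarity and bits 2-3
+ * the trigger mode; a value of 0 in either field means "conforms to the
+ * bus", handled via the default_*() macros above. */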
4950 +static int __init MPBIOS_polarity(int idx)
4951 +{
4952 +       int bus = mp_irqs[idx].mpc_srcbus;
4953 +       int polarity;
4954 +
4955 +       /*
4956 +        * Determine IRQ line polarity (high active or low active):
4957 +        */
4958 +       switch (mp_irqs[idx].mpc_irqflag & 3)
4959 +       {
4960 +               case 0: /* conforms, ie. bus-type dependent polarity */
4961 +               {
4962 +                       switch (mp_bus_id_to_type[bus])
4963 +                       {
4964 +                               case MP_BUS_ISA: /* ISA pin */
4965 +                               {
4966 +                                       polarity = default_ISA_polarity(idx);
4967 +                                       break;
4968 +                               }
4969 +                               case MP_BUS_EISA: /* EISA pin */
4970 +                               {
4971 +                                       polarity = default_EISA_polarity(idx);
4972 +                                       break;
4973 +                               }
4974 +                               case MP_BUS_PCI: /* PCI pin */
4975 +                               {
4976 +                                       polarity = default_PCI_polarity(idx);
4977 +                                       break;
4978 +                               }
4979 +                               case MP_BUS_MCA: /* MCA pin */
4980 +                               {
4981 +                                       polarity = default_MCA_polarity(idx);
4982 +                                       break;
4983 +                               }
4984 +                               case MP_BUS_NEC98: /* NEC 98 pin */
4985 +                               {
4986 +                                       polarity = default_NEC98_polarity(idx);
4987 +                                       break;
4988 +                               }
4989 +                               default:
4990 +                               {
4991 +                                       printk(KERN_WARNING "broken BIOS!!\n");
4992 +                                       polarity = 1;
4993 +                                       break;
4994 +                               }
4995 +                       }
4996 +                       break;
4997 +               }
4998 +               case 1: /* high active */
4999 +               {
5000 +                       polarity = 0;
5001 +                       break;
5002 +               }
5003 +               case 2: /* reserved */
5004 +               {
5005 +                       printk(KERN_WARNING "broken BIOS!!\n");
5006 +                       polarity = 1;
5007 +                       break;
5008 +               }
5009 +               case 3: /* low active */
5010 +               {
5011 +                       polarity = 1;
5012 +                       break;
5013 +               }
5014 +               default: /* invalid */
5015 +               {
5016 +                       printk(KERN_WARNING "broken BIOS!!\n");
5017 +                       polarity = 1;
5018 +                       break;
5019 +               }
5020 +       }
5021 +       return polarity;
5022 +}
5023 +
5024 +static int MPBIOS_trigger(int idx)
5025 +{
5026 +       int bus = mp_irqs[idx].mpc_srcbus;
5027 +       int trigger;
5028 +
5029 +       /*
5030 +        * Determine IRQ trigger mode (edge or level sensitive):
5031 +        */
5032 +       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
5033 +       {
5034 +               case 0: /* conforms, ie. bus-type dependent */
5035 +               {
5036 +                       switch (mp_bus_id_to_type[bus])
5037 +                       {
5038 +                               case MP_BUS_ISA: /* ISA pin */
5039 +                               {
5040 +                                       trigger = default_ISA_trigger(idx);
5041 +                                       break;
5042 +                               }
5043 +                               case MP_BUS_EISA: /* EISA pin */
5044 +                               {
5045 +                                       trigger = default_EISA_trigger(idx);
5046 +                                       break;
5047 +                               }
5048 +                               case MP_BUS_PCI: /* PCI pin */
5049 +                               {
5050 +                                       trigger = default_PCI_trigger(idx);
5051 +                                       break;
5052 +                               }
5053 +                               case MP_BUS_MCA: /* MCA pin */
5054 +                               {
5055 +                                       trigger = default_MCA_trigger(idx);
5056 +                                       break;
5057 +                               }
5058 +                               case MP_BUS_NEC98: /* NEC 98 pin */
5059 +                               {
5060 +                                       trigger = default_NEC98_trigger(idx);
5061 +                                       break;
5062 +                               }
5063 +                               default:
5064 +                               {
5065 +                                       printk(KERN_WARNING "broken BIOS!!\n");
5066 +                                       trigger = 1;
5067 +                                       break;
5068 +                               }
5069 +                       }
5070 +                       break;
5071 +               }
5072 +               case 1: /* edge */
5073 +               {
5074 +                       trigger = 0;
5075 +                       break;
5076 +               }
5077 +               case 2: /* reserved */
5078 +               {
5079 +                       printk(KERN_WARNING "broken BIOS!!\n");
5080 +                       trigger = 1;
5081 +                       break;
5082 +               }
5083 +               case 3: /* level */
5084 +               {
5085 +                       trigger = 1;
5086 +                       break;
5087 +               }
5088 +               default: /* invalid */
5089 +               {
5090 +                       printk(KERN_WARNING "broken BIOS!!\n");
5091 +                       trigger = 0;
5092 +                       break;
5093 +               }
5094 +       }
5095 +       return trigger;
5096 +}
5097 +
5098 +static inline int irq_polarity(int idx)
5099 +{
5100 +       return MPBIOS_polarity(idx);
5101 +}
5102 +
5103 +static inline int irq_trigger(int idx)
5104 +{
5105 +       return MPBIOS_trigger(idx);
5106 +}
5107 +
5108 +static int pin_2_irq(int idx, int apic, int pin)
5109 +{
5110 +       int irq, i;
5111 +       int bus = mp_irqs[idx].mpc_srcbus;
5112 +
5113 +       /*
5114 +        * Debugging check, we are in big trouble if this message pops up!
5115 +        */
5116 +       if (mp_irqs[idx].mpc_dstirq != pin)
5117 +               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
5118 +
5119 +       switch (mp_bus_id_to_type[bus])
5120 +       {
5121 +               case MP_BUS_ISA: /* ISA pin */
5122 +               case MP_BUS_EISA:
5123 +               case MP_BUS_MCA:
5124 +               case MP_BUS_NEC98:
5125 +               {
5126 +                       irq = mp_irqs[idx].mpc_srcbusirq;
5127 +                       break;
5128 +               }
5129 +               case MP_BUS_PCI: /* PCI pin */
5130 +               {
5131 +                       /*
5132 +                        * PCI IRQs are mapped in order
5133 +                        */
5134 +                       i = irq = 0;
5135 +                       while (i < apic)
5136 +                               irq += nr_ioapic_registers[i++];
5137 +                       irq += pin;
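+                       /* e.g. with a 24-pin IO-APIC 0, apic 1 / pin 3
+                        * yields irq = 24 + 3 = 27. */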
5138 +
5139 +                       /*
5140 +                        * For MPS mode, so far only needed by ES7000 platform
5141 +                        */
5142 +                       if (ioapic_renumber_irq)
5143 +                               irq = ioapic_renumber_irq(apic, irq);
5144 +
5145 +                       break;
5146 +               }
5147 +               default:
5148 +               {
5149 +                       printk(KERN_ERR "unknown bus type %d.\n",bus); 
5150 +                       irq = 0;
5151 +                       break;
5152 +               }
5153 +       }
5154 +
5155 +       /*
5156 +        * PCI IRQ command line redirection. Yes, limits are hardcoded.
5157 +        */
5158 +       if ((pin >= 16) && (pin <= 23)) {
5159 +               if (pirq_entries[pin-16] != -1) {
5160 +                       if (!pirq_entries[pin-16]) {
5161 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG
5162 +                                               "disabling PIRQ%d\n", pin-16);
5163 +                       } else {
5164 +                               irq = pirq_entries[pin-16];
5165 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG
5166 +                                               "using PIRQ%d -> IRQ %d\n",
5167 +                                               pin-16, irq);
5168 +                       }
5169 +               }
5170 +       }
5171 +       return irq;
5172 +}
5173 +
5174 +static inline int IO_APIC_irq_trigger(int irq)
5175 +{
5176 +       int apic, idx, pin;
5177 +
5178 +       for (apic = 0; apic < nr_ioapics; apic++) {
5179 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5180 +                       idx = find_irq_entry(apic,pin,mp_INT);
5181 +                       if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
5182 +                               return irq_trigger(idx);
5183 +               }
5184 +       }
5185 +       /*
5186 +        * nonexistent IRQs are edge default
5187 +        */
5188 +       return 0;
5189 +}
5190 +
5191 +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
5192 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
5193 +
5194 +int assign_irq_vector(int irq)
5195 +{
5196 +       static int current_vector = FIRST_DEVICE_VECTOR;
5197 +       physdev_op_t op;
5198 +
5199 +       BUG_ON(irq >= NR_IRQ_VECTORS);
5200 +       if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
5201 +               return IO_APIC_VECTOR(irq);
5202 +
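+       /* Under Xen the vector is allocated by the hypervisor via
+        * PHYSDEVOP_ASSIGN_VECTOR rather than picked locally. */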
5203 +       op.cmd = PHYSDEVOP_ASSIGN_VECTOR;
5204 +       op.u.irq_op.irq = irq;
5205 +       if (HYPERVISOR_physdev_op(&op))
5206 +               return -ENOSPC;
5207 +       current_vector = op.u.irq_op.vector;
5208 +
5209 +       vector_irq[current_vector] = irq;
5210 +       if (irq != AUTO_ASSIGN)
5211 +               IO_APIC_VECTOR(irq) = current_vector;
5212 +
5213 +       return current_vector;
5214 +}
5215 +
5216 +#ifndef CONFIG_XEN
5217 +static struct hw_interrupt_type ioapic_level_type;
5218 +static struct hw_interrupt_type ioapic_edge_type;
5219 +
5220 +#define IOAPIC_AUTO    -1
5221 +#define IOAPIC_EDGE    0
5222 +#define IOAPIC_LEVEL   1
5223 +
5224 +static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
5225 +{
5226 +       if (use_pci_vector() && !platform_legacy_irq(irq)) {
5227 +               if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
5228 +                               trigger == IOAPIC_LEVEL)
5229 +                       irq_desc[vector].handler = &ioapic_level_type;
5230 +               else
5231 +                       irq_desc[vector].handler = &ioapic_edge_type;
5232 +               set_intr_gate(vector, interrupt[vector]);
5233 +       } else  {
5234 +               if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
5235 +                               trigger == IOAPIC_LEVEL)
5236 +                       irq_desc[irq].handler = &ioapic_level_type;
5237 +               else
5238 +                       irq_desc[irq].handler = &ioapic_edge_type;
5239 +               set_intr_gate(vector, interrupt[irq]);
5240 +       }
5241 +}
5242 +#else
5243 +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
5244 +#endif
5245 +
5246 +static void __init setup_IO_APIC_irqs(void)
5247 +{
5248 +       struct IO_APIC_route_entry entry;
5249 +       int apic, pin, idx, irq, first_notcon = 1, vector;
5250 +       unsigned long flags;
5251 +
5252 +       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
5253 +
5254 +       for (apic = 0; apic < nr_ioapics; apic++) {
5255 +       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5256 +
5257 +               /*
5258 +                * add it to the IO-APIC irq-routing table:
5259 +                */
5260 +               memset(&entry,0,sizeof(entry));
5261 +
5262 +               entry.delivery_mode = INT_DELIVERY_MODE;
5263 +               entry.dest_mode = INT_DEST_MODE;
5264 +               entry.mask = 0;                         /* enable IRQ */
5265 +               entry.dest.logical.logical_dest = 
5266 +                                       cpu_mask_to_apicid(TARGET_CPUS);
5267 +
5268 +               idx = find_irq_entry(apic,pin,mp_INT);
5269 +               if (idx == -1) {
5270 +                       if (first_notcon) {
5271 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG
5272 +                                               " IO-APIC (apicid-pin) %d-%d",
5273 +                                               mp_ioapics[apic].mpc_apicid,
5274 +                                               pin);
5275 +                               first_notcon = 0;
5276 +                       } else
5277 +                               apic_printk(APIC_VERBOSE, ", %d-%d",
5278 +                                       mp_ioapics[apic].mpc_apicid, pin);
5279 +                       continue;
5280 +               }
5281 +
5282 +               entry.trigger = irq_trigger(idx);
5283 +               entry.polarity = irq_polarity(idx);
5284 +
5285 +               if (irq_trigger(idx)) {
5286 +                       entry.trigger = 1;
5287 +                       entry.mask = 1;
5288 +               }
5289 +
5290 +               irq = pin_2_irq(idx, apic, pin);
5291 +               /*
5292 +                * skip adding the timer int on secondary nodes, which causes
5293 +                * a small but painful rift in the time-space continuum
5294 +                */
5295 +               if (multi_timer_check(apic, irq))
5296 +                       continue;
5297 +               else
5298 +                       add_pin_to_irq(irq, apic, pin);
5299 +
5300 +               if (/*!apic &&*/ !IO_APIC_IRQ(irq))
5301 +                       continue;
5302 +
5303 +               if (IO_APIC_IRQ(irq)) {
5304 +                       vector = assign_irq_vector(irq);
5305 +                       entry.vector = vector;
5306 +                       ioapic_register_intr(irq, vector, IOAPIC_AUTO);
5307 +               
5308 +                       if (!apic && (irq < 16))
5309 +                               disable_8259A_irq(irq);
5310 +               }
5311 +               spin_lock_irqsave(&ioapic_lock, flags);
5312 +               io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
5313 +               io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
5314 +               set_native_irq_info(irq, TARGET_CPUS);
5315 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5316 +       }
5317 +       }
5318 +
5319 +       if (!first_notcon)
5320 +               apic_printk(APIC_VERBOSE, " not connected.\n");
5321 +}
5322 +
5323 +/*
5324 + * Set up the 8259A-master output pin:
5325 + */
5326 +#ifndef CONFIG_XEN
5327 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
5328 +{
5329 +       struct IO_APIC_route_entry entry;
5330 +       unsigned long flags;
5331 +
5332 +       memset(&entry,0,sizeof(entry));
5333 +
5334 +       disable_8259A_irq(0);
5335 +
5336 +       /* mask LVT0 */
5337 +       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
5338 +
5339 +       /*
5340 +        * We use logical delivery to get the timer IRQ
5341 +        * to the first CPU.
5342 +        */
5343 +       entry.dest_mode = INT_DEST_MODE;
5344 +       entry.mask = 0;                                 /* unmask IRQ now */
5345 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
5346 +       entry.delivery_mode = INT_DELIVERY_MODE;
5347 +       entry.polarity = 0;
5348 +       entry.trigger = 0;
5349 +       entry.vector = vector;
5350 +
5351 +       /*
5352 +        * The timer IRQ doesn't have to know that behind the
5353 +        * scenes we have an 8259A-master in AEOI mode ...
5354 +        */
5355 +       irq_desc[0].handler = &ioapic_edge_type;
5356 +
5357 +       /*
5358 +        * Add it to the IO-APIC irq-routing table:
5359 +        */
5360 +       spin_lock_irqsave(&ioapic_lock, flags);
5361 +       io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
5362 +       io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
5363 +       spin_unlock_irqrestore(&ioapic_lock, flags);
5364 +
5365 +       enable_8259A_irq(0);
5366 +}
5367 +
5368 +static inline void UNEXPECTED_IO_APIC(void)
5369 +{
5370 +}
5371 +
5372 +void __init print_IO_APIC(void)
5373 +{
5374 +       int apic, i;
5375 +       union IO_APIC_reg_00 reg_00;
5376 +       union IO_APIC_reg_01 reg_01;
5377 +       union IO_APIC_reg_02 reg_02;
5378 +       union IO_APIC_reg_03 reg_03;
5379 +       unsigned long flags;
5380 +
5381 +       if (apic_verbosity == APIC_QUIET)
5382 +               return;
5383 +
5384 +       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
5385 +       for (i = 0; i < nr_ioapics; i++)
5386 +               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
5387 +                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
5388 +
5389 +       /*
5390 +        * We are a bit conservative about what we expect.  We have to
5391 +        * know about every hardware change ASAP.
5392 +        */
5393 +       printk(KERN_INFO "testing the IO APIC.......................\n");
5394 +
5395 +       for (apic = 0; apic < nr_ioapics; apic++) {
5396 +
5397 +       spin_lock_irqsave(&ioapic_lock, flags);
5398 +       reg_00.raw = io_apic_read(apic, 0);
5399 +       reg_01.raw = io_apic_read(apic, 1);
5400 +       if (reg_01.bits.version >= 0x10)
5401 +               reg_02.raw = io_apic_read(apic, 2);
5402 +       if (reg_01.bits.version >= 0x20)
5403 +               reg_03.raw = io_apic_read(apic, 3);
5404 +       spin_unlock_irqrestore(&ioapic_lock, flags);
5405 +
5406 +       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
5407 +       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
5408 +       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
5409 +       printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
5410 +       printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
5411 +       if (reg_00.bits.ID >= get_physical_broadcast())
5412 +               UNEXPECTED_IO_APIC();
5413 +       if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
5414 +               UNEXPECTED_IO_APIC();
5415 +
5416 +       printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
5417 +       printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
5418 +       if (    (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
5419 +               (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
5420 +               (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
5421 +               (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
5422 +               (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
5423 +               (reg_01.bits.entries != 0x2E) &&
5424 +               (reg_01.bits.entries != 0x3F)
5425 +       )
5426 +               UNEXPECTED_IO_APIC();
5427 +
5428 +       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
5429 +       printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
5430 +       if (    (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
5431 +               (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
5432 +               (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
5433 +               (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
5434 +               (reg_01.bits.version != 0x20)    /* Intel P64H (82806 AA) */
5435 +       )
5436 +               UNEXPECTED_IO_APIC();
5437 +       if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
5438 +               UNEXPECTED_IO_APIC();
5439 +
5440 +       /*
5441 +        * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
5442 +        * but the value of reg_02 is read as the previous read register
5443 +        * value, so ignore it if reg_02 == reg_01.
5444 +        */
5445 +       if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
5446 +               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
5447 +               printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
5448 +               if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
5449 +                       UNEXPECTED_IO_APIC();
5450 +       }
5451 +
5452 +       /*
5453 +        * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
5454 +        * or reg_03, but the value of reg_0[23] is read as the previous read
5455 +        * register value, so ignore it if reg_03 == reg_0[12].
5456 +        */
5457 +       if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
5458 +           reg_03.raw != reg_01.raw) {
5459 +               printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
5460 +               printk(KERN_DEBUG ".......     : Boot DT    : %X\n", reg_03.bits.boot_DT);
5461 +               if (reg_03.bits.__reserved_1)
5462 +                       UNEXPECTED_IO_APIC();
5463 +       }
5464 +
5465 +       printk(KERN_DEBUG ".... IRQ redirection table:\n");
5466 +
5467 +       printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
5468 +                         " Stat Dest Deli Vect:   \n");
5469 +
5470 +       for (i = 0; i <= reg_01.bits.entries; i++) {
5471 +               struct IO_APIC_route_entry entry;
5472 +
5473 +               spin_lock_irqsave(&ioapic_lock, flags);
5474 +               *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
5475 +               *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
5476 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5477 +
5478 +               printk(KERN_DEBUG " %02x %03X %02X  ",
5479 +                       i,
5480 +                       entry.dest.logical.logical_dest,
5481 +                       entry.dest.physical.physical_dest
5482 +               );
5483 +
5484 +               printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
5485 +                       entry.mask,
5486 +                       entry.trigger,
5487 +                       entry.irr,
5488 +                       entry.polarity,
5489 +                       entry.delivery_status,
5490 +                       entry.dest_mode,
5491 +                       entry.delivery_mode,
5492 +                       entry.vector
5493 +               );
5494 +       }
5495 +       }
5496 +       if (use_pci_vector())
5497 +               printk(KERN_INFO "Using vector-based indexing\n");
5498 +       printk(KERN_DEBUG "IRQ to pin mappings:\n");
5499 +       for (i = 0; i < NR_IRQS; i++) {
5500 +               struct irq_pin_list *entry = irq_2_pin + i;
5501 +               if (entry->pin < 0)
5502 +                       continue;
5503 +               if (use_pci_vector() && !platform_legacy_irq(i))
5504 +                       printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
5505 +               else
5506 +                       printk(KERN_DEBUG "IRQ%d ", i);
5507 +               for (;;) {
5508 +                       printk("-> %d:%d", entry->apic, entry->pin);
5509 +                       if (!entry->next)
5510 +                               break;
5511 +                       entry = irq_2_pin + entry->next;
5512 +               }
5513 +               printk("\n");
5514 +       }
5515 +
5516 +       printk(KERN_INFO ".................................... done.\n");
5517 +
5518 +       return;
5519 +}
5520 +
5521 +#if 0
5522 +
5523 +static void print_APIC_bitfield (int base)
5524 +{
5525 +       unsigned int v;
5526 +       int i, j;
5527 +
5528 +       if (apic_verbosity == APIC_QUIET)
5529 +               return;
5530 +
5531 +       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
5532 +       for (i = 0; i < 8; i++) {
5533 +               v = apic_read(base + i*0x10);
5534 +               for (j = 0; j < 32; j++) {
5535 +                       if (v & (1<<j))
5536 +                               printk("1");
5537 +                       else
5538 +                               printk("0");
5539 +               }
5540 +               printk("\n");
5541 +       }
5542 +}
5543 +
5544 +void /*__init*/ print_local_APIC(void * dummy)
5545 +{
5546 +       unsigned int v, ver, maxlvt;
5547 +
5548 +       if (apic_verbosity == APIC_QUIET)
5549 +               return;
5550 +
5551 +       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
5552 +               smp_processor_id(), hard_smp_processor_id());
5553 +       v = apic_read(APIC_ID);
5554 +       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
5555 +       v = apic_read(APIC_LVR);
5556 +       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
5557 +       ver = GET_APIC_VERSION(v);
5558 +       maxlvt = get_maxlvt();
5559 +
5560 +       v = apic_read(APIC_TASKPRI);
5561 +       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
5562 +
5563 +       if (APIC_INTEGRATED(ver)) {                     /* !82489DX */
5564 +               v = apic_read(APIC_ARBPRI);
5565 +               printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
5566 +                       v & APIC_ARBPRI_MASK);
5567 +               v = apic_read(APIC_PROCPRI);
5568 +               printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
5569 +       }
5570 +
5571 +       v = apic_read(APIC_EOI);
5572 +       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
5573 +       v = apic_read(APIC_RRR);
5574 +       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
5575 +       v = apic_read(APIC_LDR);
5576 +       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
5577 +       v = apic_read(APIC_DFR);
5578 +       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
5579 +       v = apic_read(APIC_SPIV);
5580 +       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
5581 +
5582 +       printk(KERN_DEBUG "... APIC ISR field:\n");
5583 +       print_APIC_bitfield(APIC_ISR);
5584 +       printk(KERN_DEBUG "... APIC TMR field:\n");
5585 +       print_APIC_bitfield(APIC_TMR);
5586 +       printk(KERN_DEBUG "... APIC IRR field:\n");
5587 +       print_APIC_bitfield(APIC_IRR);
5588 +
5589 +       if (APIC_INTEGRATED(ver)) {             /* !82489DX */
5590 +               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
5591 +                       apic_write(APIC_ESR, 0);
5592 +               v = apic_read(APIC_ESR);
5593 +               printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
5594 +       }
5595 +
5596 +       v = apic_read(APIC_ICR);
5597 +       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
5598 +       v = apic_read(APIC_ICR2);
5599 +       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
5600 +
5601 +       v = apic_read(APIC_LVTT);
5602 +       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
5603 +
5604 +       if (maxlvt > 3) {                       /* PC is LVT#4. */
5605 +               v = apic_read(APIC_LVTPC);
5606 +               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
5607 +       }
5608 +       v = apic_read(APIC_LVT0);
5609 +       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
5610 +       v = apic_read(APIC_LVT1);
5611 +       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
5612 +
5613 +       if (maxlvt > 2) {                       /* ERR is LVT#3. */
5614 +               v = apic_read(APIC_LVTERR);
5615 +               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
5616 +       }
5617 +
5618 +       v = apic_read(APIC_TMICT);
5619 +       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
5620 +       v = apic_read(APIC_TMCCT);
5621 +       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
5622 +       v = apic_read(APIC_TDCR);
5623 +       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
5624 +       printk("\n");
5625 +}
5626 +
5627 +void print_all_local_APICs (void)
5628 +{
5629 +       on_each_cpu(print_local_APIC, NULL, 1, 1);
5630 +}
5631 +
5632 +void /*__init*/ print_PIC(void)
5633 +{
5634 +       unsigned int v;
5635 +       unsigned long flags;
5636 +
5637 +       if (apic_verbosity == APIC_QUIET)
5638 +               return;
5639 +
5640 +       printk(KERN_DEBUG "\nprinting PIC contents\n");
5641 +
5642 +       spin_lock_irqsave(&i8259A_lock, flags);
5643 +
5644 +       v = inb(0xa1) << 8 | inb(0x21);
5645 +       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
5646 +
5647 +       v = inb(0xa0) << 8 | inb(0x20);
5648 +       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
5649 +
5650 +       outb(0x0b,0xa0);
5651 +       outb(0x0b,0x20);
5652 +       v = inb(0xa0) << 8 | inb(0x20);
5653 +       outb(0x0a,0xa0);
5654 +       outb(0x0a,0x20);
5655 +
5656 +       spin_unlock_irqrestore(&i8259A_lock, flags);
5657 +
5658 +       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
5659 +
5660 +       v = inb(0x4d1) << 8 | inb(0x4d0);
5661 +       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
5662 +}
5663 +
5664 +#endif  /*  0  */
5665 +
5666 +#else
5667 +void __init print_IO_APIC(void) { }
5668 +#endif /* !CONFIG_XEN */
5669 +
5670 +static void __init enable_IO_APIC(void)
5671 +{
5672 +       union IO_APIC_reg_01 reg_01;
5673 +       int i8259_apic, i8259_pin;
5674 +       int i, apic;
5675 +       unsigned long flags;
5676 +
5677 +       for (i = 0; i < PIN_MAP_SIZE; i++) {
5678 +               irq_2_pin[i].pin = -1;
5679 +               irq_2_pin[i].next = 0;
5680 +       }
5681 +       if (!pirqs_enabled)
5682 +               for (i = 0; i < MAX_PIRQS; i++)
5683 +                       pirq_entries[i] = -1;
5684 +
5685 +       /*
5686 +        * The number of IO-APIC IRQ registers (== #pins):
5687 +        */
5688 +       for (apic = 0; apic < nr_ioapics; apic++) {
5689 +               spin_lock_irqsave(&ioapic_lock, flags);
5690 +               reg_01.raw = io_apic_read(apic, 1);
5691 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5692 +               nr_ioapic_registers[apic] = reg_01.bits.entries+1;
5693 +       }
5694 +       for(apic = 0; apic < nr_ioapics; apic++) {
5695 +               int pin;
5696 +               /* See if any of the pins is in ExtINT mode */
5697 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
5698 +                       struct IO_APIC_route_entry entry;
5699 +                       spin_lock_irqsave(&ioapic_lock, flags);
5700 +                       *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
5701 +                       *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
5702 +                       spin_unlock_irqrestore(&ioapic_lock, flags);
5703 +
5704 +
5705 +                       /* If the interrupt line is enabled and in ExtInt mode
5706 +                        * I have found the pin where the i8259 is connected.
5707 +                        */
5708 +                       if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
5709 +                               ioapic_i8259.apic = apic;
5710 +                               ioapic_i8259.pin  = pin;
5711 +                               goto found_i8259;
5712 +                       }
5713 +               }
5714 +       }
5715 + found_i8259:
5716 +       /* Look to see if the MP table has reported the ExtINT */
5717 +       /* If we could not find the appropriate pin by looking at the ioapic,
5718 +        * the i8259 is probably not connected to the ioapic, but give the
5719 +        * mptable a chance anyway.
5720 +        */
5721 +       i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
5722 +       i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
5723 +       /* Trust the MP table if nothing is set up in the hardware */
5724 +       if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
5725 +               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
5726 +               ioapic_i8259.pin  = i8259_pin;
5727 +               ioapic_i8259.apic = i8259_apic;
5728 +       }
5729 +       /* Complain if the MP table and the hardware disagree */
5730 +       if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
5731 +               (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
5732 +       {
5733 +               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
5734 +       }
5735 +
5736 +       /*
5737 +        * Do not trust the IO-APIC being empty at bootup
5738 +        */
5739 +       clear_IO_APIC();
5740 +}
5741 +
5742 +/*
5743 + * Not an __init, needed by the reboot code
5744 + */
5745 +void disable_IO_APIC(void)
5746 +{
5747 +       /*
5748 +        * Clear the IO-APIC before rebooting:
5749 +        */
5750 +       clear_IO_APIC();
5751 +
5752 +#ifndef CONFIG_XEN
5753 +       /*
5754 +        * If the i8259 is routed through an IOAPIC
5755 +        * Put that IOAPIC in virtual wire mode
5756 +        * so legacy interrupts can be delivered.
5757 +        */
5758 +       if (ioapic_i8259.pin != -1) {
5759 +               struct IO_APIC_route_entry entry;
5760 +               unsigned long flags;
5761 +
5762 +               memset(&entry, 0, sizeof(entry));
5763 +               entry.mask            = 0; /* Enabled */
5764 +               entry.trigger         = 0; /* Edge */
5765 +               entry.irr             = 0;
5766 +               entry.polarity        = 0; /* High */
5767 +               entry.delivery_status = 0;
5768 +               entry.dest_mode       = 0; /* Physical */
5769 +               entry.delivery_mode   = dest_ExtINT; /* ExtInt */
5770 +               entry.vector          = 0;
5771 +               entry.dest.physical.physical_dest =
5772 +                                       GET_APIC_ID(apic_read(APIC_ID));
5773 +
5774 +               /*
5775 +                * Add it to the IO-APIC irq-routing table:
5776 +                */
5777 +               spin_lock_irqsave(&ioapic_lock, flags);
5778 +               io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
5779 +                       *(((int *)&entry)+1));
5780 +               io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
5781 +                       *(((int *)&entry)+0));
5782 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5783 +       }
5784 +       disconnect_bsp_APIC(ioapic_i8259.pin != -1);
5785 +#endif
5786 +}
5787 +
5788 +/*
5789 + * function to set the IO-APIC physical IDs based on the
5790 + * values stored in the MPC table.
5791 + *
5792 + * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
5793 + */
5794 +
5795 +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
5796 +static void __init setup_ioapic_ids_from_mpc(void)
5797 +{
5798 +       union IO_APIC_reg_00 reg_00;
5799 +       physid_mask_t phys_id_present_map;
5800 +       int apic;
5801 +       int i;
5802 +       unsigned char old_id;
5803 +       unsigned long flags;
5804 +
5805 +       /*
5806 +        * Don't check I/O APIC IDs for xAPIC systems.  They have
5807 +        * no meaning without the serial APIC bus.
5808 +        */
5809 +       if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 < 15))
5810 +               return;
5811 +       /*
5812 +        * This is broken; anything with a real cpu count has to
5813 +        * circumvent this idiocy regardless.
5814 +        */
5815 +       phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
5816 +
5817 +       /*
5818 +        * Set the IOAPIC ID to the value stored in the MPC table.
5819 +        */
5820 +       for (apic = 0; apic < nr_ioapics; apic++) {
5821 +
5822 +               /* Read the register 0 value */
5823 +               spin_lock_irqsave(&ioapic_lock, flags);
5824 +               reg_00.raw = io_apic_read(apic, 0);
5825 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5826 +               
5827 +               old_id = mp_ioapics[apic].mpc_apicid;
5828 +
5829 +               if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
5830 +                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
5831 +                               apic, mp_ioapics[apic].mpc_apicid);
5832 +                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5833 +                               reg_00.bits.ID);
5834 +                       mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
5835 +               }
5836 +
5837 +               /*
5838 +                * Sanity check, is the ID really free? Every APIC in a
5839 +                * system must have a unique ID or we get lots of nice
5840 +                * 'stuck on smp_invalidate_needed IPI wait' messages.
5841 +                */
5842 +               if (check_apicid_used(phys_id_present_map,
5843 +                                       mp_ioapics[apic].mpc_apicid)) {
5844 +                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
5845 +                               apic, mp_ioapics[apic].mpc_apicid);
5846 +                       for (i = 0; i < get_physical_broadcast(); i++)
5847 +                               if (!physid_isset(i, phys_id_present_map))
5848 +                                       break;
5849 +                       if (i >= get_physical_broadcast())
5850 +                               panic("Max APIC ID exceeded!\n");
5851 +                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
5852 +                               i);
5853 +                       physid_set(i, phys_id_present_map);
5854 +                       mp_ioapics[apic].mpc_apicid = i;
5855 +               } else {
5856 +                       physid_mask_t tmp;
5857 +                       tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
5858 +                       apic_printk(APIC_VERBOSE, "Setting %d in the "
5859 +                                       "phys_id_present_map\n",
5860 +                                       mp_ioapics[apic].mpc_apicid);
5861 +                       physids_or(phys_id_present_map, phys_id_present_map, tmp);
5862 +               }
5863 +
5864 +
5865 +               /*
5866 +                * We need to adjust the IRQ routing table
5867 +                * if the ID changed.
5868 +                */
5869 +               if (old_id != mp_ioapics[apic].mpc_apicid)
5870 +                       for (i = 0; i < mp_irq_entries; i++)
5871 +                               if (mp_irqs[i].mpc_dstapic == old_id)
5872 +                                       mp_irqs[i].mpc_dstapic
5873 +                                               = mp_ioapics[apic].mpc_apicid;
5874 +
5875 +               /*
5876 +                * Read the right value from the MPC table and
5877 +                * write it into the ID register.
5878 +                */
5879 +               apic_printk(APIC_VERBOSE, KERN_INFO
5880 +                       "...changing IO-APIC physical APIC ID to %d ...",
5881 +                       mp_ioapics[apic].mpc_apicid);
5882 +
5883 +               reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
5884 +               spin_lock_irqsave(&ioapic_lock, flags);
5885 +               io_apic_write(apic, 0, reg_00.raw);
5886 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5887 +
5888 +               /*
5889 +                * Sanity check
5890 +                */
5891 +               spin_lock_irqsave(&ioapic_lock, flags);
5892 +               reg_00.raw = io_apic_read(apic, 0);
5893 +               spin_unlock_irqrestore(&ioapic_lock, flags);
5894 +               if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
5895 +                       printk("could not set ID!\n");
5896 +               else
5897 +                       apic_printk(APIC_VERBOSE, " ok.\n");
5898 +       }
5899 +}
5900 +#else
5901 +static void __init setup_ioapic_ids_from_mpc(void) { }
5902 +#endif
5903 +
5904 +#ifndef CONFIG_XEN
5905 +/*
5906 + * There is a nasty bug in some older SMP boards: their mptable lies
5907 + * about the timer IRQ. We do the following to work around the situation:
5908 + *
5909 + *     - timer IRQ defaults to IO-APIC IRQ
5910 + *     - if this function detects that timer IRQs are defunct, then we fall
5911 + *       back to ISA timer IRQs
5912 + */
5913 +static int __init timer_irq_works(void)
5914 +{
5915 +       unsigned long t1 = jiffies;
5916 +
5917 +       local_irq_enable();
5918 +       /* Let ten ticks pass... */
5919 +       mdelay((10 * 1000) / HZ);
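+       /* e.g. with HZ=100 this is mdelay(100), i.e. 100 ms or ten ticks. */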
5920 +
5921 +       /*
5922 +        * Expect a few ticks at least, to be sure some possible
5923 +        * glue logic does not lock up after one or two first
5924 +        * ticks in a non-ExtINT mode.  Also the local APIC
5925 +        * might have cached one ExtINT interrupt.  Finally, at
5926 +        * least one tick may be lost due to delays.
5927 +        */
5928 +       if (jiffies - t1 > 4)
5929 +               return 1;
5930 +
5931 +       return 0;
5932 +}
5933 +
5934 +/*
5935 + * In the SMP+IOAPIC case it might happen that there are an unspecified
5936 + * number of pending IRQ events unhandled. These cases are very rare,
5937 + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
5938 + * better to do it this way, as then we do not have to be aware of
5939 + * 'pending' interrupts in the IRQ path, except at this point.
5940 + */
5941 +/*
5942 + * Edge triggered needs to resend any interrupt
5943 + * that was delayed but this is now handled in the device
5944 + * independent code.
5945 + */
5946 +
5947 +/*
5948 + * Starting up an edge-triggered IO-APIC interrupt is
5949 + * nasty - we need to make sure that we get the edge.
5950 + * If it is already asserted for some reason, we need
5951 + * to return 1 to indicate that it was pending.
5952 + *
5953 + * This is not complete - we should be able to fake
5954 + * an edge even if it isn't on the 8259A...
5955 + */
5956 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
5957 +{
5958 +       int was_pending = 0;
5959 +       unsigned long flags;
5960 +
5961 +       spin_lock_irqsave(&ioapic_lock, flags);
5962 +       if (irq < 16) {
5963 +               disable_8259A_irq(irq);
5964 +               if (i8259A_irq_pending(irq))
5965 +                       was_pending = 1;
5966 +       }
5967 +       __unmask_IO_APIC_irq(irq);
5968 +       spin_unlock_irqrestore(&ioapic_lock, flags);
5969 +
5970 +       return was_pending;
5971 +}
5972 +
5973 +/*
5974 + * Once we have recorded IRQ_PENDING already, we can mask the
5975 + * interrupt for real. This prevents IRQ storms from unhandled
5976 + * devices.
5977 + */
5978 +static void ack_edge_ioapic_irq(unsigned int irq)
5979 +{
5980 +       move_irq(irq);
5981 +       if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
5982 +                                       == (IRQ_PENDING | IRQ_DISABLED))
5983 +               mask_IO_APIC_irq(irq);
5984 +       ack_APIC_irq();
5985 +}
5986 +
5987 +/*
5988 + * Level triggered interrupts can just be masked,
5989 + * and shutting down and starting up the interrupt
5990 + * is the same as enabling and disabling them -- except
5991 + * with a startup need to return a "was pending" value.
5992 + *
5993 + * Level triggered interrupts are special because we
5994 + * do not touch any IO-APIC register while handling
5995 + * them. We ack the APIC in the end-IRQ handler, not
5996 + * in the start-IRQ-handler. Protection against reentrance
5997 + * from the same interrupt is still provided, both by the
5998 + * generic IRQ layer and by the fact that an unacked local
5999 + * APIC does not accept IRQs.
6000 + */
6001 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
6002 +{
6003 +       unmask_IO_APIC_irq(irq);
6004 +
6005 +       return 0; /* don't check for pending */
6006 +}
6007 +
6008 +static void end_level_ioapic_irq (unsigned int irq)
6009 +{
6010 +       unsigned long v;
6011 +       int i;
6012 +
6013 +       move_irq(irq);
6014 +/*
6015 + * It appears there is an erratum which affects at least version 0x11
6016 + * of I/O APIC (that's the 82093AA and cores integrated into various
6017 + * chipsets).  Under certain conditions a level-triggered interrupt is
6018 + * erroneously delivered as edge-triggered one but the respective IRR
6019 + * bit gets set nevertheless.  As a result the I/O unit expects an EOI
6020 + * message but it will never arrive and further interrupts are blocked
6021 + * from the source.  The exact reason is so far unknown, but the
6022 + * phenomenon was observed when two consecutive interrupt requests
6023 + * from a given source get delivered to the same CPU and the source is
6024 + * temporarily disabled in between.
6025 + *
6026 + * A workaround is to simulate an EOI message manually.  We achieve it
6027 + * by setting the trigger mode to edge and then to level when the edge
6028 + * trigger mode gets detected in the TMR of a local APIC for a
6029 + * level-triggered interrupt.  We mask the source for the time of the
6030 + * operation to prevent an edge-triggered interrupt escaping meanwhile.
6031 + * The idea is from Manfred Spraul.  --macro
6032 + */
6033 +       i = IO_APIC_VECTOR(irq);
6034 +
6035 +       v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
6036 +
6037 +       ack_APIC_irq();
6038 +
6039 +       if (!(v & (1 << (i & 0x1f)))) {
6040 +               atomic_inc(&irq_mis_count);
6041 +               spin_lock(&ioapic_lock);
6042 +               __mask_and_edge_IO_APIC_irq(irq);
6043 +               __unmask_and_level_IO_APIC_irq(irq);
6044 +               spin_unlock(&ioapic_lock);
6045 +       }
6046 +}
6047 +
6048 +#ifdef CONFIG_PCI_MSI
6049 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
6050 +{
6051 +       int irq = vector_to_irq(vector);
6052 +
6053 +       return startup_edge_ioapic_irq(irq);
6054 +}
6055 +
6056 +static void ack_edge_ioapic_vector(unsigned int vector)
6057 +{
6058 +       int irq = vector_to_irq(vector);
6059 +
6060 +       move_native_irq(vector);
6061 +       ack_edge_ioapic_irq(irq);
6062 +}
6063 +
6064 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
6065 +{
6066 +       int irq = vector_to_irq(vector);
6067 +
6068 +       return startup_level_ioapic_irq (irq);
6069 +}
6070 +
6071 +static void end_level_ioapic_vector (unsigned int vector)
6072 +{
6073 +       int irq = vector_to_irq(vector);
6074 +
6075 +       move_native_irq(vector);
6076 +       end_level_ioapic_irq(irq);
6077 +}
6078 +
6079 +static void mask_IO_APIC_vector (unsigned int vector)
6080 +{
6081 +       int irq = vector_to_irq(vector);
6082 +
6083 +       mask_IO_APIC_irq(irq);
6084 +}
6085 +
6086 +static void unmask_IO_APIC_vector (unsigned int vector)
6087 +{
6088 +       int irq = vector_to_irq(vector);
6089 +
6090 +       unmask_IO_APIC_irq(irq);
6091 +}
6092 +
6093 +#ifdef CONFIG_SMP
6094 +static void set_ioapic_affinity_vector (unsigned int vector,
6095 +                                       cpumask_t cpu_mask)
6096 +{
6097 +       int irq = vector_to_irq(vector);
6098 +
6099 +       set_native_irq_info(vector, cpu_mask);
6100 +       set_ioapic_affinity_irq(irq, cpu_mask);
6101 +}
6102 +#endif
6103 +#endif
6104 +
6105 +/*
6106 + * Level and edge triggered IO-APIC interrupts need different handling,
6107 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
6108 + * handled with the level-triggered descriptor, but that one has slightly
6109 + * more overhead. Level-triggered interrupts cannot be handled with the
6110 + * edge-triggered handler, without risking IRQ storms and other ugly
6111 + * races.
6112 + */
6113 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
6114 +       .typename       = "IO-APIC-edge",
6115 +       .startup        = startup_edge_ioapic,
6116 +       .shutdown       = shutdown_edge_ioapic,
6117 +       .enable         = enable_edge_ioapic,
6118 +       .disable        = disable_edge_ioapic,
6119 +       .ack            = ack_edge_ioapic,
6120 +       .end            = end_edge_ioapic,
6121 +#ifdef CONFIG_SMP
6122 +       .set_affinity   = set_ioapic_affinity,
6123 +#endif
6124 +};
6125 +
6126 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
6127 +       .typename       = "IO-APIC-level",
6128 +       .startup        = startup_level_ioapic,
6129 +       .shutdown       = shutdown_level_ioapic,
6130 +       .enable         = enable_level_ioapic,
6131 +       .disable        = disable_level_ioapic,
6132 +       .ack            = mask_and_ack_level_ioapic,
6133 +       .end            = end_level_ioapic,
6134 +#ifdef CONFIG_SMP
6135 +       .set_affinity   = set_ioapic_affinity,
6136 +#endif
6137 +};
6138 +#endif /* !CONFIG_XEN */
6139 +
6140 +static inline void init_IO_APIC_traps(void)
6141 +{
6142 +       int irq;
6143 +
6144 +       /*
6145 +        * NOTE! The local APIC isn't very good at handling
6146 +        * multiple interrupts at the same interrupt level.
6147 +        * As the interrupt level is determined by taking the
6148 +        * vector number and shifting that right by 4, we
6149 +        * want to spread these out a bit so that they don't
6150 +        * all fall in the same interrupt level.
6151 +        *
6152 +        * Also, we've got to be careful not to trash gate
6153 +        * 0x80, because int 0x80 is hm, kind of importantish. ;)
6154 +        */
6155 +       for (irq = 0; irq < NR_IRQS ; irq++) {
6156 +               int tmp = irq;
6157 +               if (use_pci_vector()) {
6158 +                       if (!platform_legacy_irq(tmp))
6159 +                               if ((tmp = vector_to_irq(tmp)) == -1)
6160 +                                       continue;
6161 +               }
6162 +               if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
6163 +                       /*
6164 +                        * Hmm.. We don't have an entry for this,
6165 +                        * so default to an old-fashioned 8259
6166 +                        * interrupt if we can..
6167 +                        */
6168 +                       if (irq < 16)
6169 +                               make_8259A_irq(irq);
6170 +#ifndef CONFIG_XEN
6171 +                       else
6172 +                               /* Strange. Oh, well.. */
6173 +                               irq_desc[irq].handler = &no_irq_type;
6174 +#endif
6175 +               }
6176 +       }
6177 +}
6178 +
6179 +#ifndef CONFIG_XEN
6180 +static void enable_lapic_irq (unsigned int irq)
6181 +{
6182 +       unsigned long v;
6183 +
6184 +       v = apic_read(APIC_LVT0);
6185 +       apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
6186 +}
6187 +
6188 +static void disable_lapic_irq (unsigned int irq)
6189 +{
6190 +       unsigned long v;
6191 +
6192 +       v = apic_read(APIC_LVT0);
6193 +       apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
6194 +}
6195 +
6196 +static void ack_lapic_irq (unsigned int irq)
6197 +{
6198 +       ack_APIC_irq();
6199 +}
6200 +
6201 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
6202 +
6203 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
6204 +       .typename       = "local-APIC-edge",
6205 +       .startup        = NULL, /* startup_irq() not used for IRQ0 */
6206 +       .shutdown       = NULL, /* shutdown_irq() not used for IRQ0 */
6207 +       .enable         = enable_lapic_irq,
6208 +       .disable        = disable_lapic_irq,
6209 +       .ack            = ack_lapic_irq,
6210 +       .end            = end_lapic_irq
6211 +};
6212 +
6213 +static void setup_nmi (void)
6214 +{
6215 +       /*
6216 +        * Dirty trick to enable the NMI watchdog ...
6217 +        * We put the 8259A master into AEOI mode and
6218 +        * unmask on all local APICs LVT0 as NMI.
6219 +        *
6220 +        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
6221 +        * is from Maciej W. Rozycki - so we do not have to EOI from
6222 +        * the NMI handler or the timer interrupt.
6223 +        */ 
6224 +       apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
6225 +
6226 +       on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
6227 +
6228 +       apic_printk(APIC_VERBOSE, " done.\n");
6229 +}
6230 +
6231 +/*
6232 + * This looks a bit hackish but it's about the only way of sending
6233 + * a few INTA cycles to 8259As and any associated glue logic.  ICR does
6234 + * not support the ExtINT mode, unfortunately.  We need to send these
6235 + * cycles as some i82489DX-based boards have glue logic that keeps the
6236 + * 8259A interrupt line asserted until INTA.  --macro
6237 + */
6238 +static inline void unlock_ExtINT_logic(void)
6239 +{
6240 +       int apic, pin, i;
6241 +       struct IO_APIC_route_entry entry0, entry1;
6242 +       unsigned char save_control, save_freq_select;
6243 +       unsigned long flags;
6244 +
6245 +       pin  = find_isa_irq_pin(8, mp_INT);
6246 +       apic = find_isa_irq_apic(8, mp_INT);
6247 +       if (pin == -1)
6248 +               return;
6249 +
6250 +       spin_lock_irqsave(&ioapic_lock, flags);
6251 +       *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
6252 +       *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
6253 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6254 +       clear_IO_APIC_pin(apic, pin);
6255 +
6256 +       memset(&entry1, 0, sizeof(entry1));
6257 +
6258 +       entry1.dest_mode = 0;                   /* physical delivery */
6259 +       entry1.mask = 0;                        /* unmask IRQ now */
6260 +       entry1.dest.physical.physical_dest = hard_smp_processor_id();
6261 +       entry1.delivery_mode = dest_ExtINT;
6262 +       entry1.polarity = entry0.polarity;
6263 +       entry1.trigger = 0;
6264 +       entry1.vector = 0;
6265 +
6266 +       spin_lock_irqsave(&ioapic_lock, flags);
6267 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
6268 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
6269 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6270 +
6271 +       save_control = CMOS_READ(RTC_CONTROL);
6272 +       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
6273 +       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
6274 +                  RTC_FREQ_SELECT);
6275 +       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
6276 +
6277 +       i = 100;
6278 +       while (i-- > 0) {
6279 +               mdelay(10);
6280 +               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
6281 +                       i -= 10;
6282 +       }
6283 +
6284 +       CMOS_WRITE(save_control, RTC_CONTROL);
6285 +       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
6286 +       clear_IO_APIC_pin(apic, pin);
6287 +
6288 +       spin_lock_irqsave(&ioapic_lock, flags);
6289 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
6290 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
6291 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6292 +}
6293 +
6294 +/*
6295 + * This code may look a bit paranoid, but it's supposed to cooperate with
6296 + * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
6297 + * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
6298 + * fanatically on his truly buggy board.
6299 + */
6300 +static inline void check_timer(void)
6301 +{
6302 +       int apic1, pin1, apic2, pin2;
6303 +       int vector;
6304 +
6305 +       /*
6306 +        * get/set the timer IRQ vector:
6307 +        */
6308 +       disable_8259A_irq(0);
6309 +       vector = assign_irq_vector(0);
6310 +       set_intr_gate(vector, interrupt[0]);
6311 +
6312 +       /*
6313 +        * Subtle, code in do_timer_interrupt() expects an AEOI
6314 +        * mode for the 8259A whenever interrupts are routed
6315 +        * through I/O APICs.  Also IRQ0 has to be enabled in
6316 +        * the 8259A which implies the virtual wire has to be
6317 +        * disabled in the local APIC.
6318 +        */
6319 +       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
6320 +       init_8259A(1);
6321 +       timer_ack = 1;
6322 +       if (timer_over_8254 > 0)
6323 +               enable_8259A_irq(0);
6324 +
6325 +       pin1  = find_isa_irq_pin(0, mp_INT);
6326 +       apic1 = find_isa_irq_apic(0, mp_INT);
6327 +       pin2  = ioapic_i8259.pin;
6328 +       apic2 = ioapic_i8259.apic;
6329 +
6330 +       printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
6331 +               vector, apic1, pin1, apic2, pin2);
6332 +
6333 +       if (pin1 != -1) {
6334 +               /*
6335 +                * Ok, does IRQ0 through the IOAPIC work?
6336 +                */
6337 +               unmask_IO_APIC_irq(0);
6338 +               if (timer_irq_works()) {
6339 +                       if (nmi_watchdog == NMI_IO_APIC) {
6340 +                               disable_8259A_irq(0);
6341 +                               setup_nmi();
6342 +                               enable_8259A_irq(0);
6343 +                       }
6344 +                       if (disable_timer_pin_1 > 0)
6345 +                               clear_IO_APIC_pin(0, pin1);
6346 +                       return;
6347 +               }
6348 +               clear_IO_APIC_pin(apic1, pin1);
6349 +               printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
6350 +                               "IO-APIC\n");
6351 +       }
6352 +
6353 +       printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
6354 +       if (pin2 != -1) {
6355 +               printk("\n..... (found pin %d) ...", pin2);
6356 +               /*
6357 +                * legacy devices should be connected to IO APIC #0
6358 +                */
6359 +               setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
6360 +               if (timer_irq_works()) {
6361 +                       printk("works.\n");
6362 +                       if (pin1 != -1)
6363 +                               replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
6364 +                       else
6365 +                               add_pin_to_irq(0, apic2, pin2);
6366 +                       if (nmi_watchdog == NMI_IO_APIC) {
6367 +                               setup_nmi();
6368 +                       }
6369 +                       return;
6370 +               }
6371 +               /*
6372 +                * Cleanup, just in case ...
6373 +                */
6374 +               clear_IO_APIC_pin(apic2, pin2);
6375 +       }
6376 +       printk(" failed.\n");
6377 +
6378 +       if (nmi_watchdog == NMI_IO_APIC) {
6379 +               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
6380 +               nmi_watchdog = 0;
6381 +       }
6382 +
6383 +       printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
6384 +
6385 +       disable_8259A_irq(0);
6386 +       irq_desc[0].handler = &lapic_irq_type;
6387 +       apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);   /* Fixed mode */
6388 +       enable_8259A_irq(0);
6389 +
6390 +       if (timer_irq_works()) {
6391 +               printk(" works.\n");
6392 +               return;
6393 +       }
6394 +       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
6395 +       printk(" failed.\n");
6396 +
6397 +       printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
6398 +
6399 +       timer_ack = 0;
6400 +       init_8259A(0);
6401 +       make_8259A_irq(0);
6402 +       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
6403 +
6404 +       unlock_ExtINT_logic();
6405 +
6406 +       if (timer_irq_works()) {
6407 +               printk(" works.\n");
6408 +               return;
6409 +       }
6410 +       printk(" failed :(.\n");
6411 +       panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
6412 +               "report.  Then try booting with the 'noapic' option");
6413 +}
6414 +#else
6415 +#define check_timer() ((void)0)
6416 +#endif
6417 +
6418 +/*
6419 + *
6420 + * IRQs that are handled by the PIC in the MPS IOAPIC case.
6421 + * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
6422 + *   Linux doesn't really care, as it's not actually used
6423 + *   for any interrupt handling anyway.
6424 + */
6425 +#define PIC_IRQS       (1 << PIC_CASCADE_IR)
6426 +
6427 +void __init setup_IO_APIC(void)
6428 +{
6429 +       enable_IO_APIC();
6430 +
6431 +       if (acpi_ioapic)
6432 +               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
6433 +       else
6434 +               io_apic_irqs = ~PIC_IRQS;
6435 +
6436 +       printk("ENABLING IO-APIC IRQs\n");
6437 +
6438 +       /*
6439 +        * Set up IO-APIC IRQ routing.
6440 +        */
6441 +       if (!acpi_ioapic)
6442 +               setup_ioapic_ids_from_mpc();
6443 +#ifndef CONFIG_XEN
6444 +       sync_Arb_IDs();
6445 +#endif
6446 +       setup_IO_APIC_irqs();
6447 +       init_IO_APIC_traps();
6448 +       check_timer();
6449 +       if (!acpi_ioapic)
6450 +               print_IO_APIC();
6451 +}
6452 +
6453 +static int __init setup_disable_8254_timer(char *s)
6454 +{
6455 +       timer_over_8254 = -1;
6456 +       return 1;
6457 +}
6458 +static int __init setup_enable_8254_timer(char *s)
6459 +{
6460 +       timer_over_8254 = 2;
6461 +       return 1;
6462 +}
6463 +
6464 +__setup("disable_8254_timer", setup_disable_8254_timer);
6465 +__setup("enable_8254_timer", setup_enable_8254_timer);
6466 +
6467 +/*
6468 + *     Called after all the initialization is done. If we didn't find any
6469 + *     APIC bugs then we can allow the modify fast path
6470 + */
6471 +
6472 +static int __init io_apic_bug_finalize(void)
6473 +{
6474 +       if (sis_apic_bug == -1)
6475 +               sis_apic_bug = 0;
6476 +       return 0;
6477 +}
6478 +
6479 +late_initcall(io_apic_bug_finalize);
6480 +
6481 +struct sysfs_ioapic_data {
6482 +       struct sys_device dev;
6483 +       struct IO_APIC_route_entry entry[0];
6484 +};
6485 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
6486 +
6487 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
6488 +{
6489 +       struct IO_APIC_route_entry *entry;
6490 +       struct sysfs_ioapic_data *data;
6491 +       unsigned long flags;
6492 +       int i;
6493 +       
6494 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
6495 +       entry = data->entry;
6496 +       spin_lock_irqsave(&ioapic_lock, flags);
6497 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i++, entry++) {
6498 +               *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
6499 +               *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
6500 +       }
6501 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6502 +
6503 +       return 0;
6504 +}
6505 +
6506 +static int ioapic_resume(struct sys_device *dev)
6507 +{
6508 +       struct IO_APIC_route_entry *entry;
6509 +       struct sysfs_ioapic_data *data;
6510 +       unsigned long flags;
6511 +       union IO_APIC_reg_00 reg_00;
6512 +       int i;
6513 +       
6514 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
6515 +       entry = data->entry;
6516 +
6517 +       spin_lock_irqsave(&ioapic_lock, flags);
6518 +       reg_00.raw = io_apic_read(dev->id, 0);
6519 +       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
6520 +               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
6521 +               io_apic_write(dev->id, 0, reg_00.raw);
6522 +       }
6523 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i++, entry++) {
6524 +               io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
6525 +               io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
6526 +       }
6527 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6528 +
6529 +       return 0;
6530 +}
6531 +
6532 +static struct sysdev_class ioapic_sysdev_class = {
6533 +       set_kset_name("ioapic"),
6534 +       .suspend = ioapic_suspend,
6535 +       .resume = ioapic_resume,
6536 +};
6537 +
6538 +static int __init ioapic_init_sysfs(void)
6539 +{
6540 +       struct sys_device * dev;
6541 +       int i, size, error = 0;
6542 +
6543 +       error = sysdev_class_register(&ioapic_sysdev_class);
6544 +       if (error)
6545 +               return error;
6546 +
6547 +       for (i = 0; i < nr_ioapics; i++ ) {
6548 +               size = sizeof(struct sys_device) + nr_ioapic_registers[i] 
6549 +                       * sizeof(struct IO_APIC_route_entry);
6550 +               mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
6551 +               if (!mp_ioapic_data[i]) {
6552 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
6553 +                       continue;
6554 +               }
6555 +               memset(mp_ioapic_data[i], 0, size);
6556 +               dev = &mp_ioapic_data[i]->dev;
6557 +               dev->id = i; 
6558 +               dev->cls = &ioapic_sysdev_class;
6559 +               error = sysdev_register(dev);
6560 +               if (error) {
6561 +                       kfree(mp_ioapic_data[i]);
6562 +                       mp_ioapic_data[i] = NULL;
6563 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
6564 +                       continue;
6565 +               }
6566 +       }
6567 +
6568 +       return 0;
6569 +}
6570 +
6571 +device_initcall(ioapic_init_sysfs);
6572 +
6573 +/* --------------------------------------------------------------------------
6574 +                          ACPI-based IOAPIC Configuration
6575 +   -------------------------------------------------------------------------- */
6576 +
6577 +#ifdef CONFIG_ACPI
6578 +
6579 +int __init io_apic_get_unique_id (int ioapic, int apic_id)
6580 +{
6581 +#ifndef CONFIG_XEN
6582 +       union IO_APIC_reg_00 reg_00;
6583 +       static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
6584 +       physid_mask_t tmp;
6585 +       unsigned long flags;
6586 +       int i = 0;
6587 +
6588 +       /*
6589 +        * The P4 platform supports up to 256 APIC IDs on two separate APIC 
6590 +        * buses (one for LAPICs, one for IOAPICs), whereas its predecessors
6591 +        * supported only up to 16 on one shared APIC bus.
6592 +        * 
6593 +        * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
6594 +        *      advantage of new APIC bus architecture.
6595 +        */
6596 +
6597 +       if (physids_empty(apic_id_map))
6598 +               apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
6599 +
6600 +       spin_lock_irqsave(&ioapic_lock, flags);
6601 +       reg_00.raw = io_apic_read(ioapic, 0);
6602 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6603 +
6604 +       if (apic_id >= get_physical_broadcast()) {
6605 +               printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
6606 +                       "%d\n", ioapic, apic_id, reg_00.bits.ID);
6607 +               apic_id = reg_00.bits.ID;
6608 +       }
6609 +
6610 +       /*
6611 +        * Every APIC in a system must have a unique ID or we get lots of nice 
6612 +        * 'stuck on smp_invalidate_needed IPI wait' messages.
6613 +        */
6614 +       if (check_apicid_used(apic_id_map, apic_id)) {
6615 +
6616 +               for (i = 0; i < get_physical_broadcast(); i++) {
6617 +                       if (!check_apicid_used(apic_id_map, i))
6618 +                               break;
6619 +               }
6620 +
6621 +               if (i == get_physical_broadcast())
6622 +                       panic("Max apic_id exceeded!\n");
6623 +
6624 +               printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
6625 +                       "trying %d\n", ioapic, apic_id, i);
6626 +
6627 +               apic_id = i;
6628 +       } 
6629 +
6630 +       tmp = apicid_to_cpu_present(apic_id);
6631 +       physids_or(apic_id_map, apic_id_map, tmp);
6632 +
6633 +       if (reg_00.bits.ID != apic_id) {
6634 +               reg_00.bits.ID = apic_id;
6635 +
6636 +               spin_lock_irqsave(&ioapic_lock, flags);
6637 +               io_apic_write(ioapic, 0, reg_00.raw);
6638 +               reg_00.raw = io_apic_read(ioapic, 0);
6639 +               spin_unlock_irqrestore(&ioapic_lock, flags);
6640 +
6641 +               /* Sanity check */
6642 +               if (reg_00.bits.ID != apic_id) {
6643 +                       printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
6644 +                       return -1;
6645 +               }
6646 +       }
6647 +
6648 +       apic_printk(APIC_VERBOSE, KERN_INFO
6649 +                       "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
6650 +#endif /* !CONFIG_XEN */
6651 +
6652 +       return apic_id;
6653 +}
6654 +
6655 +
6656 +int __init io_apic_get_version (int ioapic)
6657 +{
6658 +       union IO_APIC_reg_01    reg_01;
6659 +       unsigned long flags;
6660 +
6661 +       spin_lock_irqsave(&ioapic_lock, flags);
6662 +       reg_01.raw = io_apic_read(ioapic, 1);
6663 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6664 +
6665 +       return reg_01.bits.version;
6666 +}
6667 +
6668 +
6669 +int __init io_apic_get_redir_entries (int ioapic)
6670 +{
6671 +       union IO_APIC_reg_01    reg_01;
6672 +       unsigned long flags;
6673 +
6674 +       spin_lock_irqsave(&ioapic_lock, flags);
6675 +       reg_01.raw = io_apic_read(ioapic, 1);
6676 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6677 +
6678 +       return reg_01.bits.entries;
6679 +}
6680 +
6681 +
6682 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
6683 +{
6684 +       struct IO_APIC_route_entry entry;
6685 +       unsigned long flags;
6686 +
6687 +       if (!IO_APIC_IRQ(irq)) {
6688 +               printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
6689 +                       ioapic);
6690 +               return -EINVAL;
6691 +       }
6692 +
6693 +       /*
6694 +        * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
6695 +        * Note that we mask (disable) IRQs now -- these get enabled when the
6696 +        * corresponding device driver registers for this IRQ.
6697 +        */
6698 +
6699 +       memset(&entry, 0, sizeof(entry));
6700 +
6701 +       entry.delivery_mode = INT_DELIVERY_MODE;
6702 +       entry.dest_mode = INT_DEST_MODE;
6703 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
6704 +       entry.trigger = edge_level;
6705 +       entry.polarity = active_high_low;
6706 +       entry.mask  = 1;
6707 +
6708 +       /*
6709 +        * IRQs < 16 are already in the irq_2_pin[] map
6710 +        */
6711 +       if (irq >= 16)
6712 +               add_pin_to_irq(irq, ioapic, pin);
6713 +
6714 +       entry.vector = assign_irq_vector(irq);
6715 +
6716 +       apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
6717 +               "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
6718 +               mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
6719 +               edge_level, active_high_low);
6720 +
6721 +       ioapic_register_intr(irq, entry.vector, edge_level);
6722 +
6723 +       if (!ioapic && (irq < 16))
6724 +               disable_8259A_irq(irq);
6725 +
6726 +       spin_lock_irqsave(&ioapic_lock, flags);
6727 +       io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
6728 +       io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
6729 +       set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
6730 +       spin_unlock_irqrestore(&ioapic_lock, flags);
6731 +
6732 +       return 0;
6733 +}
6734 +
6735 +#endif /* CONFIG_ACPI */
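For reference, io_apic_set_pci_routing() above is the routine the ACPI boot code invokes when it registers a GSI for a PCI device. A minimal, hypothetical call site (PCI interrupts are normally level-triggered and active-low; IOAPIC 0, pin 18, IRQ 18 are illustrative values, not taken from the patch):

        int ioapic = 0, pin = 18, irq = 18;     /* hypothetical routing */

        /* edge_level: 1 = level-triggered; active_high_low: 1 = active-low */
        if (io_apic_set_pci_routing(ioapic, pin, irq, 1, 1) < 0)
                printk(KERN_ERR "cannot route IRQ %d\n", irq);

The redirection entry is written masked (entry.mask = 1 above); the line is unmasked later, when a driver actually claims the IRQ.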
6736 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/ioport-xen.c linux-2.6.16/arch/i386/kernel/ioport-xen.c
6737 --- linux-2.6.16.orig/arch/i386/kernel/ioport-xen.c     1970-01-01 01:00:00.000000000 +0100
6738 +++ linux-2.6.16/arch/i386/kernel/ioport-xen.c  2006-06-26 09:51:32.000000000 +0200
6739 @@ -0,0 +1,122 @@
6740 +/*
6741 + *     linux/arch/i386/kernel/ioport.c
6742 + *
6743 + * This contains the io-permission bitmap code - written by obz, with changes
6744 + * by Linus.
6745 + */
6746 +
6747 +#include <linux/sched.h>
6748 +#include <linux/kernel.h>
6749 +#include <linux/capability.h>
6750 +#include <linux/errno.h>
6751 +#include <linux/types.h>
6752 +#include <linux/ioport.h>
6753 +#include <linux/smp.h>
6754 +#include <linux/smp_lock.h>
6755 +#include <linux/stddef.h>
6756 +#include <linux/slab.h>
6757 +#include <linux/thread_info.h>
6758 +#include <xen/interface/physdev.h>
6759 +
6760 +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
6761 +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
6762 +{
6763 +       unsigned long mask;
6764 +       unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
6765 +       unsigned int low_index = base & (BITS_PER_LONG-1);
6766 +       int length = low_index + extent;
6767 +
6768 +       if (low_index != 0) {
6769 +               mask = (~0UL << low_index);
6770 +               if (length < BITS_PER_LONG)
6771 +                       mask &= ~(~0UL << length);
6772 +               if (new_value)
6773 +                       *bitmap_base++ |= mask;
6774 +               else
6775 +                       *bitmap_base++ &= ~mask;
6776 +               length -= BITS_PER_LONG;
6777 +       }
6778 +
6779 +       mask = (new_value ? ~0UL : 0UL);
6780 +       while (length >= BITS_PER_LONG) {
6781 +               *bitmap_base++ = mask;
6782 +               length -= BITS_PER_LONG;
6783 +       }
6784 +
6785 +       if (length > 0) {
6786 +               mask = ~(~0UL << length);
6787 +               if (new_value)
6788 +                       *bitmap_base++ |= mask;
6789 +               else
6790 +                       *bitmap_base++ &= ~mask;
6791 +       }
6792 +}
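+/*
+ * Worked example, assuming BITS_PER_LONG == 32: ioperm(0x378, 3, 1) from
+ * userspace reaches this as set_bitmap(bitmap, 0x378, 3, 0).  Since
+ * 0x378 = 888 = 27*32 + 24, low_index is 24 and length is 27, so only
+ * the first branch runs, clearing bits 24-26 of word 27 (in the I/O
+ * bitmap a 0 bit means the port is accessible, a 1 bit traps).
+ */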
6793 +
6794 +
6795 +/*
6796 + * this changes the io permissions bitmap in the current task.
6797 + */
6798 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
6799 +{
6800 +       struct thread_struct * t = &current->thread;
6801 +       unsigned long *bitmap;
6802 +       physdev_op_t op;
6803 +
6804 +       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
6805 +               return -EINVAL;
6806 +       if (turn_on && !capable(CAP_SYS_RAWIO))
6807 +               return -EPERM;
6808 +
6809 +       /*
6810 +        * If it's the first ioperm() call in this thread's lifetime, set the
6811 +        * IO bitmap up. ioperm() is much less timing critical than clone(),
6812 +        * which is why we delay this operation until now:
6813 +        */
6814 +       if (!t->io_bitmap_ptr) {
6815 +               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
6816 +               if (!bitmap)
6817 +                       return -ENOMEM;
6818 +
6819 +               memset(bitmap, 0xff, IO_BITMAP_BYTES);
6820 +               t->io_bitmap_ptr = bitmap;
6821 +
6822 +               op.cmd = PHYSDEVOP_SET_IOBITMAP;
6823 +               op.u.set_iobitmap.bitmap   = (char *)bitmap;
6824 +               op.u.set_iobitmap.nr_ports = IO_BITMAP_BITS;
6825 +               HYPERVISOR_physdev_op(&op);
6826 +       }
6827 +
6828 +       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
6829 +
6830 +       return 0;
6831 +}
6832 +
6833 +/*
6834 + * sys_iopl has to be used when you want to access the IO ports
6835 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
6836 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
6837 + *
6838 + * Here we just change the eflags value on the stack: we allow
6839 + * only the super-user to do it. This depends on the stack-layout
6840 + * on system-call entry - see also fork() and the signal handling
6841 + * code.
6842 + */
6843 +
6844 +asmlinkage long sys_iopl(unsigned long unused)
6845 +{
6846 +       volatile struct pt_regs * regs = (struct pt_regs *) &unused;
6847 +       unsigned int level = regs->ebx;
6848 +       struct thread_struct *t = &current->thread;
6849 +       unsigned int old = (t->iopl >> 12) & 3;
6850 +
6851 +       if (level > 3)
6852 +               return -EINVAL;
6853 +       /* Trying to gain more privileges? */
6854 +       if (level > old) {
6855 +               if (!capable(CAP_SYS_RAWIO))
6856 +                       return -EPERM;
6857 +       }
6858 +       t->iopl = level << 12;
6859 +       set_iopl_mask(t->iopl);
6860 +       return 0;
6861 +}
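For reference, a minimal userspace sketch of how the two syscalls above are exercised, via the glibc wrappers (the parallel-port address 0x378 is an illustrative choice; this needs CAP_SYS_RAWIO, and under Xen is useful in practice only from a privileged domain):

        #include <stdio.h>
        #include <sys/io.h>     /* ioperm(), outb() */

        int main(void)
        {
                if (ioperm(0x378, 3, 1) < 0) {  /* allow ports 0x378-0x37a */
                        perror("ioperm");
                        return 1;
                }
                outb(0xff, 0x378);              /* drive all data lines high */
                ioperm(0x378, 3, 0);            /* drop the permission again */
                return 0;
        }

The Xen-specific twist is only in the kernel: instead of pointing the TSS at the bitmap, the PHYSDEVOP_SET_IOBITMAP hypercall above registers it with the hypervisor, which then performs the same checks on port access.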
6862 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/irq-xen.c linux-2.6.16/arch/i386/kernel/irq-xen.c
6863 --- linux-2.6.16.orig/arch/i386/kernel/irq-xen.c        1970-01-01 01:00:00.000000000 +0100
6864 +++ linux-2.6.16/arch/i386/kernel/irq-xen.c     2006-06-26 09:51:32.000000000 +0200
6865 @@ -0,0 +1,306 @@
6866 +/*
6867 + *     linux/arch/i386/kernel/irq.c
6868 + *
6869 + *     Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
6870 + *
6871 + * This file contains the lowest level x86-specific interrupt
6872 + * entry, irq-stacks and irq statistics code. All the remaining
6873 + * irq logic is done by the generic kernel/irq/ code and
6874 + * by the x86-specific irq controller code. (e.g. i8259.c and
6875 + * io_apic.c.)
6876 + */
6877 +
6878 +#include <asm/uaccess.h>
6879 +#include <linux/module.h>
6880 +#include <linux/seq_file.h>
6881 +#include <linux/interrupt.h>
6882 +#include <linux/kernel_stat.h>
6883 +#include <linux/notifier.h>
6884 +#include <linux/cpu.h>
6885 +#include <linux/delay.h>
6886 +
6887 +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
6888 +EXPORT_PER_CPU_SYMBOL(irq_stat);
6889 +
6890 +#ifndef CONFIG_X86_LOCAL_APIC
6891 +/*
6892 + * 'what should we do if we get a hw irq event on an illegal vector'.
6893 + * 'What should we do if we get a hw irq event on an illegal vector?'
6894 + * Each architecture has to answer this for itself.
6895 +void ack_bad_irq(unsigned int irq)
6896 +{
6897 +       printk("unexpected IRQ trap at vector %02x\n", irq);
6898 +}
6899 +#endif
6900 +
6901 +#ifdef CONFIG_4KSTACKS
6902 +/*
6903 + * per-CPU IRQ handling contexts (thread information and stack)
6904 + */
6905 +union irq_ctx {
6906 +       struct thread_info      tinfo;
6907 +       u32                     stack[THREAD_SIZE/sizeof(u32)];
6908 +};
6909 +
6910 +static union irq_ctx *hardirq_ctx[NR_CPUS];
6911 +static union irq_ctx *softirq_ctx[NR_CPUS];
6912 +#endif
6913 +
6914 +/*
6915 + * do_IRQ handles all normal device IRQs (the special
6916 + * SMP cross-CPU interrupts have their own specific
6917 + * handlers).
6918 + */
6919 +fastcall unsigned int do_IRQ(struct pt_regs *regs)
6920 +{      
6921 +       /* high bit used in ret_from_ code */
6922 +       int irq = ~regs->orig_eax;
6923 +#ifdef CONFIG_4KSTACKS
6924 +       union irq_ctx *curctx, *irqctx;
6925 +       u32 *isp;
6926 +#endif
6927 +
6928 +       irq_enter();
6929 +#ifdef CONFIG_DEBUG_STACKOVERFLOW
6930 +       /* Debugging check for stack overflow: is there less than 1KB free? */
6931 +       {
6932 +               long esp;
6933 +
6934 +               __asm__ __volatile__("andl %%esp,%0" :
6935 +                                       "=r" (esp) : "0" (THREAD_SIZE - 1));
6936 +               if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
6937 +                       printk("do_IRQ: stack overflow: %ld\n",
6938 +                               esp - sizeof(struct thread_info));
6939 +                       dump_stack();
6940 +               }
6941 +       }
6942 +#endif
6943 +
6944 +#ifdef CONFIG_4KSTACKS
6945 +
6946 +       curctx = (union irq_ctx *) current_thread_info();
6947 +       irqctx = hardirq_ctx[smp_processor_id()];
6948 +
6949 +       /*
6950 +        * this is where we switch to the IRQ stack. However, if we are
6951 +        * already using the IRQ stack (because we interrupted a hardirq
6952 +        * handler) we can't do that and just have to keep using the
6953 +        * current stack (which is the irq stack already after all)
6954 +        */
6955 +       if (curctx != irqctx) {
6956 +               int arg1, arg2, ebx;
6957 +
6958 +               /* build the stack frame on the IRQ stack */
6959 +               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
6960 +               irqctx->tinfo.task = curctx->tinfo.task;
6961 +               irqctx->tinfo.previous_esp = current_stack_pointer;
6962 +
6963 +               asm volatile(
6964 +                       "       xchgl   %%ebx,%%esp      \n"
6965 +                       "       call    __do_IRQ         \n"
6966 +                       "       movl   %%ebx,%%esp      \n"
6967 +                       : "=a" (arg1), "=d" (arg2), "=b" (ebx)
6968 +                       :  "0" (irq),   "1" (regs),  "2" (isp)
6969 +                       : "memory", "cc", "ecx"
6970 +               );
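+               /*
+                * The xchgl above swaps %esp with isp (held in %ebx), so
+                * __do_IRQ runs on the per-CPU hardirq stack; the final
+                * movl restores the original stack pointer, which xchgl
+                * left behind in %ebx.
+                */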
6971 +       } else
6972 +#endif
6973 +               __do_IRQ(irq, regs);
6974 +
6975 +       irq_exit();
6976 +
6977 +       return 1;
6978 +}
6979 +
6980 +#ifdef CONFIG_4KSTACKS
6981 +
6982 +/*
6983 + * These should really be __section__(".bss.page_aligned") as well, but
6984 + * gcc 3.0 and earlier don't handle that correctly.
6985 + */
6986 +static char softirq_stack[NR_CPUS * THREAD_SIZE]
6987 +               __attribute__((__aligned__(THREAD_SIZE)));
6988 +
6989 +static char hardirq_stack[NR_CPUS * THREAD_SIZE]
6990 +               __attribute__((__aligned__(THREAD_SIZE)));
6991 +
6992 +/*
6993 + * allocate per-cpu stacks for hardirq and for softirq processing
6994 + */
6995 +void irq_ctx_init(int cpu)
6996 +{
6997 +       union irq_ctx *irqctx;
6998 +
6999 +       if (hardirq_ctx[cpu])
7000 +               return;
7001 +
7002 +       irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
7003 +       irqctx->tinfo.task              = NULL;
7004 +       irqctx->tinfo.exec_domain       = NULL;
7005 +       irqctx->tinfo.cpu               = cpu;
7006 +       irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
7007 +       irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
7008 +
7009 +       hardirq_ctx[cpu] = irqctx;
7010 +
7011 +       irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
7012 +       irqctx->tinfo.task              = NULL;
7013 +       irqctx->tinfo.exec_domain       = NULL;
7014 +       irqctx->tinfo.cpu               = cpu;
7015 +       irqctx->tinfo.preempt_count     = SOFTIRQ_OFFSET;
7016 +       irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
7017 +
7018 +       softirq_ctx[cpu] = irqctx;
7019 +
7020 +       printk("CPU %u irqstacks, hard=%p soft=%p\n",
7021 +               cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
7022 +}
7023 +
7024 +void irq_ctx_exit(int cpu)
7025 +{
7026 +       hardirq_ctx[cpu] = NULL;
7027 +}
7028 +
7029 +extern asmlinkage void __do_softirq(void);
7030 +
7031 +asmlinkage void do_softirq(void)
7032 +{
7033 +       unsigned long flags;
7034 +       struct thread_info *curctx;
7035 +       union irq_ctx *irqctx;
7036 +       u32 *isp;
7037 +
7038 +       if (in_interrupt())
7039 +               return;
7040 +
7041 +       local_irq_save(flags);
7042 +
7043 +       if (local_softirq_pending()) {
7044 +               curctx = current_thread_info();
7045 +               irqctx = softirq_ctx[smp_processor_id()];
7046 +               irqctx->tinfo.task = curctx->task;
7047 +               irqctx->tinfo.previous_esp = current_stack_pointer;
7048 +
7049 +               /* build the stack frame on the softirq stack */
7050 +               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
7051 +
7052 +               asm volatile(
7053 +                       "       xchgl   %%ebx,%%esp     \n"
7054 +                       "       call    __do_softirq    \n"
7055 +                       "       movl    %%ebx,%%esp     \n"
7056 +                       : "=b"(isp)
7057 +                       : "0"(isp)
7058 +                       : "memory", "cc", "edx", "ecx", "eax"
7059 +               );
7060 +       }
7061 +
7062 +       local_irq_restore(flags);
7063 +}
7064 +
7065 +EXPORT_SYMBOL(do_softirq);
7066 +#endif
7067 +
7068 +/*
7069 + * Interrupt statistics:
7070 + */
7071 +
7072 +atomic_t irq_err_count;
7073 +
7074 +/*
7075 + * /proc/interrupts printing:
7076 + */
7077 +
7078 +int show_interrupts(struct seq_file *p, void *v)
7079 +{
7080 +       int i = *(loff_t *) v, j;
7081 +       struct irqaction * action;
7082 +       unsigned long flags;
7083 +
7084 +       if (i == 0) {
7085 +               seq_printf(p, "           ");
7086 +               for_each_online_cpu(j)
7087 +                       seq_printf(p, "CPU%d       ",j);
7088 +               seq_putc(p, '\n');
7089 +       }
7090 +
7091 +       if (i < NR_IRQS) {
7092 +               spin_lock_irqsave(&irq_desc[i].lock, flags);
7093 +               action = irq_desc[i].action;
7094 +               if (!action)
7095 +                       goto skip;
7096 +               seq_printf(p, "%3d: ",i);
7097 +#ifndef CONFIG_SMP
7098 +               seq_printf(p, "%10u ", kstat_irqs(i));
7099 +#else
7100 +               for_each_online_cpu(j)
7101 +                       seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
7102 +#endif
7103 +               seq_printf(p, " %14s", irq_desc[i].handler->typename);
7104 +               seq_printf(p, "  %s", action->name);
7105 +
7106 +               for (action=action->next; action; action = action->next)
7107 +                       seq_printf(p, ", %s", action->name);
7108 +
7109 +               seq_putc(p, '\n');
7110 +skip:
7111 +               spin_unlock_irqrestore(&irq_desc[i].lock, flags);
7112 +       } else if (i == NR_IRQS) {
7113 +               seq_printf(p, "NMI: ");
7114 +               for_each_online_cpu(j)
7115 +                       seq_printf(p, "%10u ", nmi_count(j));
7116 +               seq_putc(p, '\n');
7117 +#ifdef CONFIG_X86_LOCAL_APIC
7118 +               seq_printf(p, "LOC: ");
7119 +               for_each_online_cpu(j)
7120 +                       seq_printf(p, "%10u ",
7121 +                               per_cpu(irq_stat,j).apic_timer_irqs);
7122 +               seq_putc(p, '\n');
7123 +#endif
7124 +               seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
7125 +#if defined(CONFIG_X86_IO_APIC)
7126 +               seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
7127 +#endif
7128 +       }
7129 +       return 0;
7130 +}
7131 +
7132 +#ifdef CONFIG_HOTPLUG_CPU
7133 +
7134 +void fixup_irqs(cpumask_t map)
7135 +{
7136 +       unsigned int irq;
7137 +       static int warned;
7138 +
7139 +       for (irq = 0; irq < NR_IRQS; irq++) {
7140 +               cpumask_t mask;
7141 +               if (irq == 2)
7142 +                       continue;
7143 +
7144 +               cpus_and(mask, irq_affinity[irq], map);
7145 +               if (any_online_cpu(mask) == NR_CPUS) {
7146 +                       /*printk("Breaking affinity for irq %i\n", irq);*/
7147 +                       mask = map;
7148 +               }
7149 +               if (irq_desc[irq].handler->set_affinity)
7150 +                       irq_desc[irq].handler->set_affinity(irq, mask);
7151 +               else if (irq_desc[irq].action && !(warned++))
7152 +                       printk("Cannot set affinity for irq %i\n", irq);
7153 +       }
7154 +
7155 +#if 0
7156 +       barrier();
7157 +       /* Ingo Molnar says: "after the IO-APIC masks have been redirected
7158 +          [note the nop - the interrupt-enable boundary on x86 is two
7159 +          instructions from sti] - to flush out pending hardirqs and
7160 +          IPIs. After this point nothing is supposed to reach this CPU." */
7161 +       __asm__ __volatile__("sti; nop; cli");
7162 +       barrier();
7163 +#else
7164 +       /* That doesn't seem sufficient.  Give it 1ms. */
7165 +       local_irq_enable();
7166 +       mdelay(1);
7167 +       local_irq_disable();
7168 +#endif
7169 +}
7170 +#endif
7171 +
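For a rough idea of what show_interrupts() above produces in /proc/interrupts (values are illustrative; on a Xen kernel the handler column typically names the event-channel IRQ types, e.g. "Phys-irq"/"Dynamic-irq", instead of the IO-APIC ones):

                   CPU0       CPU1
          1:       1234        987    Phys-irq  i8042
         14:      56789      54321    Phys-irq  ide0
        NMI:          0          0
        ERR:          0

The per-IRQ counts come from kstat, the NMI row from nmi_count(), and ERR from irq_err_count; the LOC and MIS rows appear only with CONFIG_X86_LOCAL_APIC and CONFIG_X86_IO_APIC respectively.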
7172 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/ldt-xen.c linux-2.6.16/arch/i386/kernel/ldt-xen.c
7173 --- linux-2.6.16.orig/arch/i386/kernel/ldt-xen.c        1970-01-01 01:00:00.000000000 +0100
7174 +++ linux-2.6.16/arch/i386/kernel/ldt-xen.c     2006-06-26 09:51:32.000000000 +0200
7175 @@ -0,0 +1,269 @@
7176 +/*
7177 + * linux/kernel/ldt.c
7178 + *
7179 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
7180 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
7181 + */
7182 +
7183 +#include <linux/errno.h>
7184 +#include <linux/sched.h>
7185 +#include <linux/string.h>
7186 +#include <linux/mm.h>
7187 +#include <linux/smp.h>
7188 +#include <linux/smp_lock.h>
7189 +#include <linux/vmalloc.h>
7190 +#include <linux/slab.h>
7191 +
7192 +#include <asm/uaccess.h>
7193 +#include <asm/system.h>
7194 +#include <asm/ldt.h>
7195 +#include <asm/desc.h>
7196 +#include <asm/mmu_context.h>
7197 +
7198 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
7199 +static void flush_ldt(void *null)
7200 +{
7201 +       if (current->active_mm)
7202 +               load_LDT(&current->active_mm->context);
7203 +}
7204 +#endif
7205 +
7206 +static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
7207 +{
7208 +       void *oldldt;
7209 +       void *newldt;
7210 +       int oldsize;
7211 +
7212 +       if (mincount <= pc->size)
7213 +               return 0;
7214 +       oldsize = pc->size;
7215 +       mincount = (mincount+511)&(~511);
7216 +       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
7217 +               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
7218 +       else
7219 +               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
7220 +
7221 +       if (!newldt)
7222 +               return -ENOMEM;
7223 +
7224 +       if (oldsize)
7225 +               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
7226 +       oldldt = pc->ldt;
7227 +       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
7228 +       pc->ldt = newldt;
7229 +       wmb();
7230 +       pc->size = mincount;
7231 +       wmb();
7232 +
7233 +       if (reload) {
7234 +#ifdef CONFIG_SMP
7235 +               cpumask_t mask;
7236 +               preempt_disable();
7237 +#endif
7238 +               make_pages_readonly(
7239 +                       pc->ldt,
7240 +                       (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7241 +                       XENFEAT_writable_descriptor_tables);
7242 +               load_LDT(pc);
7243 +#ifdef CONFIG_SMP
7244 +               mask = cpumask_of_cpu(smp_processor_id());
7245 +               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
7246 +                       smp_call_function(flush_ldt, NULL, 1, 1);
7247 +               preempt_enable();
7248 +#endif
7249 +       }
7250 +       if (oldsize) {
7251 +               make_pages_writable(
7252 +                       oldldt,
7253 +                       (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
7254 +                       XENFEAT_writable_descriptor_tables);
7255 +               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
7256 +                       vfree(oldldt);
7257 +               else
7258 +                       kfree(oldldt);
7259 +       }
7260 +       return 0;
7261 +}
7262 +
7263 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
7264 +{
7265 +       int err = alloc_ldt(new, old->size, 0);
7266 +       if (err < 0)
7267 +               return err;
7268 +       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
7269 +       make_pages_readonly(
7270 +               new->ldt,
7271 +               (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7272 +               XENFEAT_writable_descriptor_tables);
7273 +       return 0;
7274 +}
7275 +
7276 +/*
7277 + * We do not have to muck with descriptors here; that is
7278 + * done in switch_mm() as needed.
7279 + */
7280 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
7281 +{
7282 +       struct mm_struct * old_mm;
7283 +       int retval = 0;
7284 +
7285 +       init_MUTEX(&mm->context.sem);
7286 +       mm->context.size = 0;
7287 +       old_mm = current->mm;
7288 +       if (old_mm && old_mm->context.size > 0) {
7289 +               down(&old_mm->context.sem);
7290 +               retval = copy_ldt(&mm->context, &old_mm->context);
7291 +               up(&old_mm->context.sem);
7292 +       }
7293 +       return retval;
7294 +}
7295 +
7296 +/*
7297 + * No need to lock the MM as we are the last user
7298 + */
7299 +void destroy_context(struct mm_struct *mm)
7300 +{
7301 +       if (mm->context.size) {
7302 +               if (mm == current->active_mm)
7303 +                       clear_LDT();
7304 +               make_pages_writable(
7305 +                       mm->context.ldt,
7306 +                       (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
7307 +                       XENFEAT_writable_descriptor_tables);
7308 +               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
7309 +                       vfree(mm->context.ldt);
7310 +               else
7311 +                       kfree(mm->context.ldt);
7312 +               mm->context.size = 0;
7313 +       }
7314 +}
7315 +
7316 +static int read_ldt(void __user * ptr, unsigned long bytecount)
7317 +{
7318 +       int err;
7319 +       unsigned long size;
7320 +       struct mm_struct * mm = current->mm;
7321 +
7322 +       if (!mm->context.size)
7323 +               return 0;
7324 +       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
7325 +               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
7326 +
7327 +       down(&mm->context.sem);
7328 +       size = mm->context.size*LDT_ENTRY_SIZE;
7329 +       if (size > bytecount)
7330 +               size = bytecount;
7331 +
7332 +       err = 0;
7333 +       if (copy_to_user(ptr, mm->context.ldt, size))
7334 +               err = -EFAULT;
7335 +       up(&mm->context.sem);
7336 +       if (err < 0)
7337 +               goto error_return;
7338 +       if (size != bytecount) {
7339 +               /* zero-fill the rest */
7340 +               if (clear_user(ptr+size, bytecount-size) != 0) {
7341 +                       err = -EFAULT;
7342 +                       goto error_return;
7343 +               }
7344 +       }
7345 +       return bytecount;
7346 +error_return:
7347 +       return err;
7348 +}
7349 +
7350 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
7351 +{
7352 +       int err;
7353 +       unsigned long size;
7354 +       void *address;
7355 +
7356 +       err = 0;
7357 +       address = &default_ldt[0];
7358 +       size = 5*sizeof(struct desc_struct);
7359 +       if (size > bytecount)
7360 +               size = bytecount;
7361 +
7362 +       err = size;
7363 +       if (copy_to_user(ptr, address, size))
7364 +               err = -EFAULT;
7365 +
7366 +       return err;
7367 +}
7368 +
7369 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
7370 +{
7371 +       struct mm_struct * mm = current->mm;
7372 +       __u32 entry_1, entry_2;
7373 +       int error;
7374 +       struct user_desc ldt_info;
7375 +
7376 +       error = -EINVAL;
7377 +       if (bytecount != sizeof(ldt_info))
7378 +               goto out;
7379 +       error = -EFAULT;        
7380 +       if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
7381 +               goto out;
7382 +
7383 +       error = -EINVAL;
7384 +       if (ldt_info.entry_number >= LDT_ENTRIES)
7385 +               goto out;
7386 +       if (ldt_info.contents == 3) {
7387 +               if (oldmode)
7388 +                       goto out;
7389 +               if (ldt_info.seg_not_present == 0)
7390 +                       goto out;
7391 +       }
7392 +
7393 +       down(&mm->context.sem);
7394 +       if (ldt_info.entry_number >= mm->context.size) {
7395 +               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
7396 +               if (error < 0)
7397 +                       goto out_unlock;
7398 +       }
7399 +
7400 +       /* Allow LDTs to be cleared by the user. */
7401 +       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
7402 +               if (oldmode || LDT_empty(&ldt_info)) {
7403 +                       entry_1 = 0;
7404 +                       entry_2 = 0;
7405 +                       goto install;
7406 +               }
7407 +       }
7408 +
7409 +       entry_1 = LDT_entry_a(&ldt_info);
7410 +       entry_2 = LDT_entry_b(&ldt_info);
7411 +       if (oldmode)
7412 +               entry_2 &= ~(1 << 20);
7413 +
7414 +       /* Install the new entry ...  */
7415 +install:
7416 +       error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number,
7417 +                               entry_1, entry_2);
7418 +
7419 +out_unlock:
7420 +       up(&mm->context.sem);
7421 +out:
7422 +       return error;
7423 +}
7424 +
7425 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
7426 +{
7427 +       int ret = -ENOSYS;
7428 +
7429 +       switch (func) {
7430 +       case 0:
7431 +               ret = read_ldt(ptr, bytecount);
7432 +               break;
7433 +       case 1:
7434 +               ret = write_ldt(ptr, bytecount, 1);
7435 +               break;
7436 +       case 2:
7437 +               ret = read_default_ldt(ptr, bytecount);
7438 +               break;
7439 +       case 0x11:
7440 +               ret = write_ldt(ptr, bytecount, 0);
7441 +               break;
7442 +       }
7443 +       return ret;
7444 +}
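Since glibc provides no wrapper for modify_ldt(), callers (historically Wine and dosemu) invoke it through syscall(). A minimal sketch installing one LDT data segment via func 1, i.e. write_ldt() above, with made-up base and limit values:

        #include <asm/ldt.h>            /* struct user_desc */
        #include <stdio.h>
        #include <string.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        int main(void)
        {
                struct user_desc d;

                memset(&d, 0, sizeof(d));
                d.entry_number = 0;
                d.base_addr    = 0x1000;        /* hypothetical base */
                d.limit        = 0xfff;         /* 4 KiB limit */
                d.seg_32bit    = 1;
                d.contents     = 0;             /* plain data segment */

                if (syscall(SYS_modify_ldt, 1, &d, sizeof(d)) != 0)
                        perror("modify_ldt");
                return 0;
        }

The Xen twist is again kernel-side only: the LDT pages are kept read-only and the descriptor is installed through write_ldt_entry(), i.e. a hypercall, rather than a direct store.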
7445 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/microcode-xen.c linux-2.6.16/arch/i386/kernel/microcode-xen.c
7446 --- linux-2.6.16.orig/arch/i386/kernel/microcode-xen.c  1970-01-01 01:00:00.000000000 +0100
7447 +++ linux-2.6.16/arch/i386/kernel/microcode-xen.c       2006-06-26 09:51:32.000000000 +0200
7448 @@ -0,0 +1,159 @@
7449 +/*
7450 + *     Intel CPU Microcode Update Driver for Linux
7451 + *
7452 + *     Copyright (C) 2000-2004 Tigran Aivazian
7453 + *
7454 + *     This driver allows upgrading the microcode on Intel processors
7455 + *     belonging to the IA-32 family - PentiumPro, Pentium II,
7456 + *     Pentium III, Xeon, Pentium 4, etc.
7457 + *
7458 + *     Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, 
7459 + *     Order Number 245472 or free download from:
7460 + *             
7461 + *     http://developer.intel.com/design/pentium4/manuals/245472.htm
7462 + *
7463 + *     For more information, go to http://www.urbanmyth.org/microcode
7464 + *
7465 + *     This program is free software; you can redistribute it and/or
7466 + *     modify it under the terms of the GNU General Public License
7467 + *     as published by the Free Software Foundation; either version
7468 + *     2 of the License, or (at your option) any later version.
7469 + */
7470 +
7471 +//#define DEBUG /* pr_debug */
7472 +#include <linux/capability.h>
7473 +#include <linux/kernel.h>
7474 +#include <linux/init.h>
7475 +#include <linux/sched.h>
7476 +#include <linux/cpumask.h>
7477 +#include <linux/module.h>
7478 +#include <linux/slab.h>
7479 +#include <linux/vmalloc.h>
7480 +#include <linux/miscdevice.h>
7481 +#include <linux/spinlock.h>
7482 +#include <linux/mm.h>
7483 +#include <linux/syscalls.h>
7484 +
7485 +#include <asm/msr.h>
7486 +#include <asm/uaccess.h>
7487 +#include <asm/processor.h>
7488 +
7489 +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
7490 +MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
7491 +MODULE_LICENSE("GPL");
7492 +
7493 +#define MICROCODE_VERSION      "1.14-xen"
7494 +
7495 +#define DEFAULT_UCODE_DATASIZE         (2000)    /* 2000 bytes */
7496 +#define MC_HEADER_SIZE         (sizeof (microcode_header_t))     /* 48 bytes */
7497 +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
7498 +
7499 +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
7500 +static DECLARE_MUTEX(microcode_sem);
7501 +
7502 +static int microcode_open (struct inode *unused1, struct file *unused2)
7503 +{
7504 +       return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
7505 +}
7506 +
7507 +
7508 +static int do_microcode_update (const void __user *ubuf, size_t len)
7509 +{
7510 +       int err;
7511 +       void *kbuf;
7512 +
7513 +       kbuf = vmalloc(len);
7514 +       if (!kbuf)
7515 +               return -ENOMEM;
7516 +
7517 +       if (copy_from_user(kbuf, ubuf, len) == 0) {
7518 +               dom0_op_t op;
7519 +
7520 +               op.cmd = DOM0_MICROCODE;
7521 +               op.u.microcode.data = kbuf;
7522 +               op.u.microcode.length = len;
7523 +               err = HYPERVISOR_dom0_op(&op);
7524 +       } else
7525 +               err = -EFAULT;
7526 +
7527 +       vfree(kbuf);
7528 +
7529 +       return err;
7530 +}
7531 +
7532 +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
7533 +{
7534 +       ssize_t ret;
7535 +
7536 +       if (len < DEFAULT_UCODE_TOTALSIZE) {
7537 +               printk(KERN_ERR "microcode: not enough data\n"); 
7538 +               return -EINVAL;
7539 +       }
7540 +
7541 +       down(&microcode_sem);
7542 +
7543 +       ret = do_microcode_update(buf, len);
7544 +       if (!ret)
7545 +               ret = (ssize_t)len;
7546 +
7547 +       up(&microcode_sem);
7548 +
7549 +       return ret;
7550 +}
7551 +
7552 +static int microcode_ioctl (struct inode *inode, struct file *file, 
7553 +               unsigned int cmd, unsigned long arg)
7554 +{
7555 +       switch (cmd) {
7556 +               /* 
7557 +                *  XXX: will be removed after microcode_ctl 
7558 +                *  is updated to ignore failure of this ioctl()
7559 +                */
7560 +               case MICROCODE_IOCFREE:
7561 +                       return 0;
7562 +               default:
7563 +                       return -EINVAL;
7564 +       }
7565 +       return -EINVAL;
7566 +}
7567 +
7568 +static struct file_operations microcode_fops = {
7569 +       .owner          = THIS_MODULE,
7570 +       .write          = microcode_write,
7571 +       .ioctl          = microcode_ioctl,
7572 +       .open           = microcode_open,
7573 +};
7574 +
7575 +static struct miscdevice microcode_dev = {
7576 +       .minor          = MICROCODE_MINOR,
7577 +       .name           = "microcode",
7578 +       .devfs_name     = "cpu/microcode",
7579 +       .fops           = &microcode_fops,
7580 +};
7581 +
7582 +static int __init microcode_init (void)
7583 +{
7584 +       int error;
7585 +
7586 +       error = misc_register(&microcode_dev);
7587 +       if (error) {
7588 +               printk(KERN_ERR
7589 +                       "microcode: can't misc_register on minor=%d\n",
7590 +                       MICROCODE_MINOR);
7591 +               return error;
7592 +       }
7593 +
7594 +       printk(KERN_INFO 
7595 +               "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
7596 +       return 0;
7597 +}
7598 +
7599 +static void __exit microcode_exit (void)
7600 +{
7601 +       misc_deregister(&microcode_dev);
7602 +       printk(KERN_INFO "IA-32 Microcode Update Driver v" MICROCODE_VERSION " unregistered\n");
7603 +}
7604 +
7605 +module_init(microcode_init)
7606 +module_exit(microcode_exit)
7607 +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
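
For context (not part of the patch), the whole update path is a single write(2) of the vendor microcode image to this misc device; the dom0 kernel then copies the blob and hands it to the hypervisor via the DOM0_MICROCODE dom0_op. A minimal user-space sketch, assuming the node exists at /dev/cpu/microcode and the caller holds CAP_SYS_RAWIO (real tools such as microcode_ctl do this with proper sizing and error reporting):

        #include <fcntl.h>
        #include <unistd.h>

        int load_microcode(const char *path)
        {
                static char buf[256 * 1024];    /* generous; default block is 2048 bytes */
                int in = open(path, O_RDONLY);
                int dev = open("/dev/cpu/microcode", O_WRONLY);
                int ret = -1;
                ssize_t n;

                if (in >= 0 && dev >= 0) {
                        n = read(in, buf, sizeof(buf));
                        /* the driver expects the full image in one write() */
                        if (n >= 2048 && write(dev, buf, n) == n)
                                ret = 0;
                }
                if (in >= 0)
                        close(in);
                if (dev >= 0)
                        close(dev);
                return ret;
        }
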
7608 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/mpparse-xen.c linux-2.6.16/arch/i386/kernel/mpparse-xen.c
7609 --- linux-2.6.16.orig/arch/i386/kernel/mpparse-xen.c    1970-01-01 01:00:00.000000000 +0100
7610 +++ linux-2.6.16/arch/i386/kernel/mpparse-xen.c 2006-06-26 09:51:32.000000000 +0200
7611 @@ -0,0 +1,1188 @@
7612 +/*
7613 + *     Intel Multiprocessor Specification 1.1 and 1.4
7614 + *     compliant MP-table parsing routines.
7615 + *
7616 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
7617 + *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
7618 + *
7619 + *     Fixes
7620 + *             Erich Boleyn    :       MP v1.4 and additional changes.
7621 + *             Alan Cox        :       Added EBDA scanning
7622 + *             Ingo Molnar     :       various cleanups and rewrites
7623 + *             Maciej W. Rozycki:      Bits for default MP configurations
7624 + *             Paul Diefenbaugh:       Added full ACPI support
7625 + */
7626 +
7627 +#include <linux/mm.h>
7628 +#include <linux/init.h>
7629 +#include <linux/acpi.h>
7630 +#include <linux/delay.h>
7631 +#include <linux/config.h>
7632 +#include <linux/bootmem.h>
7633 +#include <linux/smp_lock.h>
7634 +#include <linux/kernel_stat.h>
7635 +#include <linux/mc146818rtc.h>
7636 +#include <linux/bitops.h>
7637 +
7638 +#include <asm/smp.h>
7639 +#include <asm/acpi.h>
7640 +#include <asm/mtrr.h>
7641 +#include <asm/mpspec.h>
7642 +#include <asm/io_apic.h>
7643 +
7644 +#include <mach_apic.h>
7645 +#include <mach_mpparse.h>
7646 +#include <bios_ebda.h>
7647 +
7648 +/* Have we found an MP table */
7649 +int smp_found_config;
7650 +unsigned int __initdata maxcpus = NR_CPUS;
7651 +
7652 +#ifdef CONFIG_HOTPLUG_CPU
7653 +#define CPU_HOTPLUG_ENABLED    (1)
7654 +#else
7655 +#define CPU_HOTPLUG_ENABLED    (0)
7656 +#endif
7657 +
7658 +/*
7659 + * Various Linux-internal data structures created from the
7660 + * MP-table.
7661 + */
7662 +int apic_version [MAX_APICS];
7663 +int mp_bus_id_to_type [MAX_MP_BUSSES];
7664 +int mp_bus_id_to_node [MAX_MP_BUSSES];
7665 +int mp_bus_id_to_local [MAX_MP_BUSSES];
7666 +int quad_local_to_mp_bus_id [NR_CPUS/4][4];
7667 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
7668 +static int mp_current_pci_id;
7669 +
7670 +/* I/O APIC entries */
7671 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
7672 +
7673 +/* # of MP IRQ source entries */
7674 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
7675 +
7676 +/* MP IRQ source entries */
7677 +int mp_irq_entries;
7678 +
7679 +int nr_ioapics;
7680 +
7681 +int pic_mode;
7682 +unsigned long mp_lapic_addr;
7683 +
7684 +unsigned int def_to_bigsmp = 0;
7685 +
7686 +/* Processor that is doing the boot up */
7687 +unsigned int boot_cpu_physical_apicid = -1U;
7688 +/* Internal processor count */
7689 +static unsigned int __devinitdata num_processors;
7690 +
7691 +/* Bitmask of physically existing CPUs */
7692 +physid_mask_t phys_cpu_present_map;
7693 +
7694 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
7695 +
7696 +/*
7697 + * Intel MP BIOS table parsing routines:
7698 + */
7699 +
7700 +
7701 +/*
7702 + * Checksum an MP configuration block.
7703 + */
7704 +
7705 +static int __init mpf_checksum(unsigned char *mp, int len)
7706 +{
7707 +       int sum = 0;
7708 +
7709 +       while (len--)
7710 +               sum += *mp++;
7711 +
7712 +       return sum & 0xFF;
7713 +}
7714 +
7715 +/*
7716 + * We have to match translation table entries to main table entries by
7717 + * counter, hence the mpc_record variable... can't see a less disgusting
7718 + * way of doing this...
7719 + */
7720 +
7721 +static int mpc_record; 
7722 +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
7723 +
7724 +#ifdef CONFIG_X86_NUMAQ
7725 +static int MP_valid_apicid(int apicid, int version)
7726 +{
7727 +       return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf;
7728 +}
7729 +#elif !defined(CONFIG_XEN)
7730 +static int MP_valid_apicid(int apicid, int version)
7731 +{
7732 +       if (version >= 0x14)
7733 +               return apicid < 0xff;
7734 +       else
7735 +               return apicid < 0xf;
7736 +}
7737 +#endif
7738 +
7739 +#ifndef CONFIG_XEN
7740 +static void __devinit MP_processor_info (struct mpc_config_processor *m)
7741 +{
7742 +       int ver, apicid;
7743 +       physid_mask_t phys_cpu;
7744 +       
7745 +       if (!(m->mpc_cpuflag & CPU_ENABLED))
7746 +               return;
7747 +
7748 +       apicid = mpc_apic_id(m, translation_table[mpc_record]);
7749 +
7750 +       if (m->mpc_featureflag&(1<<0))
7751 +               Dprintk("    Floating point unit present.\n");
7752 +       if (m->mpc_featureflag&(1<<7))
7753 +               Dprintk("    Machine Exception supported.\n");
7754 +       if (m->mpc_featureflag&(1<<8))
7755 +               Dprintk("    64 bit compare & exchange supported.\n");
7756 +       if (m->mpc_featureflag&(1<<9))
7757 +               Dprintk("    Internal APIC present.\n");
7758 +       if (m->mpc_featureflag&(1<<11))
7759 +               Dprintk("    SEP present.\n");
7760 +       if (m->mpc_featureflag&(1<<12))
7761 +               Dprintk("    MTRR  present.\n");
7762 +       if (m->mpc_featureflag&(1<<13))
7763 +               Dprintk("    PGE  present.\n");
7764 +       if (m->mpc_featureflag&(1<<14))
7765 +               Dprintk("    MCA  present.\n");
7766 +       if (m->mpc_featureflag&(1<<15))
7767 +               Dprintk("    CMOV  present.\n");
7768 +       if (m->mpc_featureflag&(1<<16))
7769 +               Dprintk("    PAT  present.\n");
7770 +       if (m->mpc_featureflag&(1<<17))
7771 +               Dprintk("    PSE  present.\n");
7772 +       if (m->mpc_featureflag&(1<<18))
7773 +               Dprintk("    PSN  present.\n");
7774 +       if (m->mpc_featureflag&(1<<19))
7775 +               Dprintk("    Cache Line Flush Instruction present.\n");
7776 +       /* 20 Reserved */
7777 +       if (m->mpc_featureflag&(1<<21))
7778 +               Dprintk("    Debug Trace and EMON Store present.\n");
7779 +       if (m->mpc_featureflag&(1<<22))
7780 +               Dprintk("    ACPI Thermal Throttle Registers  present.\n");
7781 +       if (m->mpc_featureflag&(1<<23))
7782 +               Dprintk("    MMX  present.\n");
7783 +       if (m->mpc_featureflag&(1<<24))
7784 +               Dprintk("    FXSR  present.\n");
7785 +       if (m->mpc_featureflag&(1<<25))
7786 +               Dprintk("    XMM  present.\n");
7787 +       if (m->mpc_featureflag&(1<<26))
7788 +               Dprintk("    Willamette New Instructions  present.\n");
7789 +       if (m->mpc_featureflag&(1<<27))
7790 +               Dprintk("    Self Snoop  present.\n");
7791 +       if (m->mpc_featureflag&(1<<28))
7792 +               Dprintk("    HT  present.\n");
7793 +       if (m->mpc_featureflag&(1<<29))
7794 +               Dprintk("    Thermal Monitor present.\n");
7795 +       /* 30, 31 Reserved */
7796 +
7797 +
7798 +       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
7799 +               Dprintk("    Bootup CPU\n");
7800 +               boot_cpu_physical_apicid = m->mpc_apicid;
7801 +       }
7802 +
7803 +       ver = m->mpc_apicver;
7804 +
7805 +       if (!MP_valid_apicid(apicid, ver)) {
7806 +               printk(KERN_WARNING "Processor #%d INVALID. (Max ID: %d).\n",
7807 +                       m->mpc_apicid, MAX_APICS);
7808 +               return;
7809 +       }
7810 +
7811 +       /*
7812 +        * Validate version
7813 +        */
7814 +       if (ver == 0x0) {
7815 +               printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
7816 +                               "fixing up to 0x10. (tell your hw vendor)\n",
7817 +                               m->mpc_apicid);
7818 +               ver = 0x10;
7819 +       }
7820 +       apic_version[m->mpc_apicid] = ver;
7821 +
7822 +       phys_cpu = apicid_to_cpu_present(apicid);
7823 +       physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
7824 +
7825 +       if (num_processors >= NR_CPUS) {
7826 +               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
7827 +                       "  Processor ignored.\n", NR_CPUS);
7828 +               return;
7829 +       }
7830 +
7831 +       if (num_processors >= maxcpus) {
7832 +               printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
7833 +                       " Processor ignored.\n", maxcpus);
7834 +               return;
7835 +       }
7836 +
7837 +       cpu_set(num_processors, cpu_possible_map);
7838 +       num_processors++;
7839 +
7840 +       if (CPU_HOTPLUG_ENABLED || (num_processors > 8)) {
7841 +               switch (boot_cpu_data.x86_vendor) {
7842 +               case X86_VENDOR_INTEL:
7843 +                       if (!APIC_XAPIC(ver)) {
7844 +                               def_to_bigsmp = 0;
7845 +                               break;
7846 +                       }
7847 +                       /* P4 and above: fall through */
7848 +               case X86_VENDOR_AMD:
7849 +                       def_to_bigsmp = 1;
7850 +               }
7851 +       }
7852 +       bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
7853 +}
7854 +#else
7855 +void __init MP_processor_info (struct mpc_config_processor *m)
7856 +{
7857 +       num_processors++;
7858 +}
7859 +#endif /* CONFIG_XEN */
7860 +
7861 +static void __init MP_bus_info (struct mpc_config_bus *m)
7862 +{
7863 +       char str[7];
7864 +
7865 +       memcpy(str, m->mpc_bustype, 6);
7866 +       str[6] = 0;
7867 +
7868 +       mpc_oem_bus_info(m, str, translation_table[mpc_record]);
7869 +
7870 +       if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
7871 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
7872 +       } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
7873 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
7874 +       } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
7875 +               mpc_oem_pci_bus(m, translation_table[mpc_record]);
7876 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
7877 +               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
7878 +               mp_current_pci_id++;
7879 +       } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
7880 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
7881 +       } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
7882 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
7883 +       } else {
7884 +               printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
7885 +       }
7886 +}
7887 +
7888 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
7889 +{
7890 +       if (!(m->mpc_flags & MPC_APIC_USABLE))
7891 +               return;
7892 +
7893 +       printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
7894 +               m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
7895 +       if (nr_ioapics >= MAX_IO_APICS) {
7896 +               printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
7897 +                       MAX_IO_APICS, nr_ioapics);
7898 +               panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
7899 +       }
7900 +       if (!m->mpc_apicaddr) {
7901 +               printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
7902 +                       " found in MP table, skipping!\n");
7903 +               return;
7904 +       }
7905 +       mp_ioapics[nr_ioapics] = *m;
7906 +       nr_ioapics++;
7907 +}
7908 +
7909 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
7910 +{
7911 +       mp_irqs [mp_irq_entries] = *m;
7912 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
7913 +               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
7914 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
7915 +                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
7916 +                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
7917 +       if (++mp_irq_entries == MAX_IRQ_SOURCES)
7918 +               panic("Max # of irq sources exceeded!!\n");
7919 +}
7920 +
7921 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
7922 +{
7923 +       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
7924 +               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
7925 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
7926 +                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
7927 +                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
7928 +       /*
7929 +        * Well it seems all SMP boards in existence
7930 +        * use ExtINT/LVT1 == LINT0 and
7931 +        * NMI/LVT2 == LINT1 - the following check
7932 +        * will show us if this assumption is false.
7933 +        * Until then we do not have to add baggage.
7934 +        */
7935 +       if ((m->mpc_irqtype == mp_ExtINT) &&
7936 +               (m->mpc_destapiclint != 0))
7937 +                       BUG();
7938 +       if ((m->mpc_irqtype == mp_NMI) &&
7939 +               (m->mpc_destapiclint != 1))
7940 +                       BUG();
7941 +}
7942 +
7943 +#ifdef CONFIG_X86_NUMAQ
7944 +static void __init MP_translation_info (struct mpc_config_translation *m)
7945 +{
7946 +       printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
7947 +
7948 +       if (mpc_record >= MAX_MPC_ENTRY) 
7949 +               printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
7950 +       else
7951 +               translation_table[mpc_record] = m; /* stash this for later */
7952 +       if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
7953 +               node_set_online(m->trans_quad);
7954 +}
7955 +
7956 +/*
7957 + * Read/parse the MPC oem tables
7958 + */
7959 +
7960 +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
7961 +       unsigned short oemsize)
7962 +{
7963 +       int count = sizeof (*oemtable); /* the header size */
7964 +       unsigned char *oemptr = ((unsigned char *)oemtable)+count;
7965 +       
7966 +       mpc_record = 0;
7967 +       printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
7968 +       if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
7969 +       {
7970 +               printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
7971 +                       oemtable->oem_signature[0],
7972 +                       oemtable->oem_signature[1],
7973 +                       oemtable->oem_signature[2],
7974 +                       oemtable->oem_signature[3]);
7975 +               return;
7976 +       }
7977 +       if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
7978 +       {
7979 +               printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
7980 +               return;
7981 +       }
7982 +       while (count < oemtable->oem_length) {
7983 +               switch (*oemptr) {
7984 +                       case MP_TRANSLATION:
7985 +                       {
7986 +                               struct mpc_config_translation *m=
7987 +                                       (struct mpc_config_translation *)oemptr;
7988 +                               MP_translation_info(m);
7989 +                               oemptr += sizeof(*m);
7990 +                               count += sizeof(*m);
7991 +                               ++mpc_record;
7992 +                               break;
7993 +                       }
7994 +                       default:
7995 +                       {
7996 +                               printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
7997 +                               return;
7998 +                       }
7999 +               }
8000 +       }
8001 +}
8002 +
8003 +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
8004 +               char *productid)
8005 +{
8006 +       if (strncmp(oem, "IBM NUMA", 8))
8007 +               printk("Warning!  May not be a NUMA-Q system!\n");
8008 +       if (mpc->mpc_oemptr)
8009 +               smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
8010 +                               mpc->mpc_oemsize);
8011 +}
8012 +#endif /* CONFIG_X86_NUMAQ */
8013 +
8014 +/*
8015 + * Read/parse the MPC
8016 + */
8017 +
8018 +static int __init smp_read_mpc(struct mp_config_table *mpc)
8019 +{
8020 +       char str[16];
8021 +       char oem[10];
8022 +       int count=sizeof(*mpc);
8023 +       unsigned char *mpt=((unsigned char *)mpc)+count;
8024 +
8025 +       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
8026 +               printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
8027 +                       *(u32 *)mpc->mpc_signature);
8028 +               return 0;
8029 +       }
8030 +       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
8031 +               printk(KERN_ERR "SMP mptable: checksum error!\n");
8032 +               return 0;
8033 +       }
8034 +       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
8035 +               printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
8036 +                       mpc->mpc_spec);
8037 +               return 0;
8038 +       }
8039 +       if (!mpc->mpc_lapic) {
8040 +               printk(KERN_ERR "SMP mptable: null local APIC address!\n");
8041 +               return 0;
8042 +       }
8043 +       memcpy(oem,mpc->mpc_oem,8);
8044 +       oem[8]=0;
8045 +       printk(KERN_INFO "OEM ID: %s ",oem);
8046 +
8047 +       memcpy(str,mpc->mpc_productid,12);
8048 +       str[12]=0;
8049 +       printk("Product ID: %s ",str);
8050 +
8051 +       mps_oem_check(mpc, oem, str);
8052 +
8053 +       printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
8054 +
8055 +       /* 
8056 +        * Save the local APIC address (it might be non-default) -- but only
8057 +        * if we're not using ACPI.
8058 +        */
8059 +       if (!acpi_lapic)
8060 +               mp_lapic_addr = mpc->mpc_lapic;
8061 +
8062 +       /*
8063 +        *      Now process the configuration blocks.
8064 +        */
8065 +       mpc_record = 0;
8066 +       while (count < mpc->mpc_length) {
8067 +               switch(*mpt) {
8068 +                       case MP_PROCESSOR:
8069 +                       {
8070 +                               struct mpc_config_processor *m=
8071 +                                       (struct mpc_config_processor *)mpt;
8072 +                               /* ACPI may have already provided this data */
8073 +                               if (!acpi_lapic)
8074 +                                       MP_processor_info(m);
8075 +                               mpt += sizeof(*m);
8076 +                               count += sizeof(*m);
8077 +                               break;
8078 +                       }
8079 +                       case MP_BUS:
8080 +                       {
8081 +                               struct mpc_config_bus *m=
8082 +                                       (struct mpc_config_bus *)mpt;
8083 +                               MP_bus_info(m);
8084 +                               mpt += sizeof(*m);
8085 +                               count += sizeof(*m);
8086 +                               break;
8087 +                       }
8088 +                       case MP_IOAPIC:
8089 +                       {
8090 +                               struct mpc_config_ioapic *m=
8091 +                                       (struct mpc_config_ioapic *)mpt;
8092 +                               MP_ioapic_info(m);
8093 +                               mpt+=sizeof(*m);
8094 +                               count+=sizeof(*m);
8095 +                               break;
8096 +                       }
8097 +                       case MP_INTSRC:
8098 +                       {
8099 +                               struct mpc_config_intsrc *m=
8100 +                                       (struct mpc_config_intsrc *)mpt;
8101 +
8102 +                               MP_intsrc_info(m);
8103 +                               mpt+=sizeof(*m);
8104 +                               count+=sizeof(*m);
8105 +                               break;
8106 +                       }
8107 +                       case MP_LINTSRC:
8108 +                       {
8109 +                               struct mpc_config_lintsrc *m=
8110 +                                       (struct mpc_config_lintsrc *)mpt;
8111 +                               MP_lintsrc_info(m);
8112 +                               mpt+=sizeof(*m);
8113 +                               count+=sizeof(*m);
8114 +                               break;
8115 +                       }
8116 +                       default:
8117 +                       {
8118 +                               count = mpc->mpc_length;
8119 +                               break;
8120 +                       }
8121 +               }
8122 +               ++mpc_record;
8123 +       }
8124 +       clustered_apic_check();
8125 +       if (!num_processors)
8126 +               printk(KERN_ERR "SMP mptable: no processors registered!\n");
8127 +       return num_processors;
8128 +}
8129 +
8130 +static int __init ELCR_trigger(unsigned int irq)
8131 +{
8132 +       unsigned int port;
8133 +
8134 +       port = 0x4d0 + (irq >> 3);
8135 +       return (inb(port) >> (irq & 7)) & 1;
8136 +}
8137 +
8138 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
8139 +{
8140 +       struct mpc_config_intsrc intsrc;
8141 +       int i;
8142 +       int ELCR_fallback = 0;
8143 +
8144 +       intsrc.mpc_type = MP_INTSRC;
8145 +       intsrc.mpc_irqflag = 0;                 /* conforming */
8146 +       intsrc.mpc_srcbus = 0;
8147 +       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
8148 +
8149 +       intsrc.mpc_irqtype = mp_INT;
8150 +
8151 +       /*
8152 +        *  If true, we have an ISA/PCI system with no IRQ entries
8153 +        *  in the MP table. To prevent the PCI interrupts from being set up
8154 +        *  incorrectly, we try to use the ELCR. The sanity check to see if
8155 +        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
8156 +        *  never be level sensitive, so we simply see if the ELCR agrees.
8157 +        *  If it does, we assume it's valid.
8158 +        */
8159 +       if (mpc_default_type == 5) {
8160 +               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
8161 +
8162 +               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
8163 +                       printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
8164 +               else {
8165 +                       printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
8166 +                       ELCR_fallback = 1;
8167 +               }
8168 +       }
8169 +
8170 +       for (i = 0; i < 16; i++) {
8171 +               switch (mpc_default_type) {
8172 +               case 2:
8173 +                       if (i == 0 || i == 13)
8174 +                               continue;       /* IRQ0 & IRQ13 not connected */
8175 +                       /* fall through */
8176 +               default:
8177 +                       if (i == 2)
8178 +                               continue;       /* IRQ2 is never connected */
8179 +               }
8180 +
8181 +               if (ELCR_fallback) {
8182 +                       /*
8183 +                        *  If the ELCR indicates a level-sensitive interrupt, we
8184 +                        *  copy that information over to the MP table in the
8185 +                        *  irqflag field (level sensitive, active high polarity).
8186 +                        */
8187 +                       if (ELCR_trigger(i))
8188 +                               intsrc.mpc_irqflag = 13;
8189 +                       else
8190 +                               intsrc.mpc_irqflag = 0;
8191 +               }
8192 +
8193 +               intsrc.mpc_srcbusirq = i;
8194 +               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
8195 +               MP_intsrc_info(&intsrc);
8196 +       }
8197 +
8198 +       intsrc.mpc_irqtype = mp_ExtINT;
8199 +       intsrc.mpc_srcbusirq = 0;
8200 +       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
8201 +       MP_intsrc_info(&intsrc);
8202 +}
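
A note on the magic value used in the ELCR fallback above (annotation, not patch content): per the MP 1.4 specification, mpc_irqflag packs the polarity in bits 1:0 and the trigger mode in bits 3:2:

        /* mpc_irqflag (MP spec 1.4 interrupt entries):
         *   bits 1:0  polarity: 0 = conforms to bus, 1 = active high, 3 = active low
         *   bits 3:2  trigger:  0 = conforms to bus, 1 = edge,        3 = level
         * hence 13 == 0b1101: level triggered, active high, exactly as the
         * comment in construct_default_ioirq_mptable() says.
         */
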
8203 +
8204 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
8205 +{
8206 +       struct mpc_config_processor processor;
8207 +       struct mpc_config_bus bus;
8208 +       struct mpc_config_ioapic ioapic;
8209 +       struct mpc_config_lintsrc lintsrc;
8210 +       int linttypes[2] = { mp_ExtINT, mp_NMI };
8211 +       int i;
8212 +
8213 +       /*
8214 +        * local APIC has default address
8215 +        */
8216 +       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
8217 +
8218 +       /*
8219 +        * 2 CPUs, numbered 0 & 1.
8220 +        */
8221 +       processor.mpc_type = MP_PROCESSOR;
8222 +       /* Either an integrated APIC or a discrete 82489DX. */
8223 +       processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
8224 +       processor.mpc_cpuflag = CPU_ENABLED;
8225 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
8226 +                                  (boot_cpu_data.x86_model << 4) |
8227 +                                  boot_cpu_data.x86_mask;
8228 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
8229 +       processor.mpc_reserved[0] = 0;
8230 +       processor.mpc_reserved[1] = 0;
8231 +       for (i = 0; i < 2; i++) {
8232 +               processor.mpc_apicid = i;
8233 +               MP_processor_info(&processor);
8234 +       }
8235 +
8236 +       bus.mpc_type = MP_BUS;
8237 +       bus.mpc_busid = 0;
8238 +       switch (mpc_default_type) {
8239 +               default:
8240 +                       printk("???\n");
8241 +                       printk(KERN_ERR "Unknown standard configuration %d\n",
8242 +                               mpc_default_type);
8243 +                       /* fall through */
8244 +               case 1:
8245 +               case 5:
8246 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
8247 +                       break;
8248 +               case 2:
8249 +               case 6:
8250 +               case 3:
8251 +                       memcpy(bus.mpc_bustype, "EISA  ", 6);
8252 +                       break;
8253 +               case 4:
8254 +               case 7:
8255 +                       memcpy(bus.mpc_bustype, "MCA   ", 6);
8256 +       }
8257 +       MP_bus_info(&bus);
8258 +       if (mpc_default_type > 4) {
8259 +               bus.mpc_busid = 1;
8260 +               memcpy(bus.mpc_bustype, "PCI   ", 6);
8261 +               MP_bus_info(&bus);
8262 +       }
8263 +
8264 +       ioapic.mpc_type = MP_IOAPIC;
8265 +       ioapic.mpc_apicid = 2;
8266 +       ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
8267 +       ioapic.mpc_flags = MPC_APIC_USABLE;
8268 +       ioapic.mpc_apicaddr = 0xFEC00000;
8269 +       MP_ioapic_info(&ioapic);
8270 +
8271 +       /*
8272 +        * We set up most of the low 16 IO-APIC pins according to MPS rules.
8273 +        */
8274 +       construct_default_ioirq_mptable(mpc_default_type);
8275 +
8276 +       lintsrc.mpc_type = MP_LINTSRC;
8277 +       lintsrc.mpc_irqflag = 0;                /* conforming */
8278 +       lintsrc.mpc_srcbusid = 0;
8279 +       lintsrc.mpc_srcbusirq = 0;
8280 +       lintsrc.mpc_destapic = MP_APIC_ALL;
8281 +       for (i = 0; i < 2; i++) {
8282 +               lintsrc.mpc_irqtype = linttypes[i];
8283 +               lintsrc.mpc_destapiclint = i;
8284 +               MP_lintsrc_info(&lintsrc);
8285 +       }
8286 +}
8287 +
8288 +static struct intel_mp_floating *mpf_found;
8289 +
8290 +/*
8291 + * Parse the MP configuration block found by find_smp_config().
8292 + */
8293 +void __init get_smp_config (void)
8294 +{
8295 +       struct intel_mp_floating *mpf = mpf_found;
8296 +
8297 +       /*
8298 +        * ACPI supports both logical (e.g. Hyper-Threading) and physical 
8299 +        * processors, where MPS only supports physical.
8300 +        */
8301 +       if (acpi_lapic && acpi_ioapic) {
8302 +               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
8303 +               return;
8304 +       }
8305 +       else if (acpi_lapic)
8306 +               printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
8307 +
8308 +       printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
8309 +       if (mpf->mpf_feature2 & (1<<7)) {
8310 +               printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
8311 +               pic_mode = 1;
8312 +       } else {
8313 +               printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
8314 +               pic_mode = 0;
8315 +       }
8316 +
8317 +       /*
8318 +        * Now see if we need to read further.
8319 +        */
8320 +       if (mpf->mpf_feature1 != 0) {
8321 +
8322 +               printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
8323 +               construct_default_ISA_mptable(mpf->mpf_feature1);
8324 +
8325 +       } else if (mpf->mpf_physptr) {
8326 +
8327 +               /*
8328 +                * Read the physical hardware table.  Anything here will
8329 +                * override the defaults.
8330 +                */
8331 +               if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
8332 +                       smp_found_config = 0;
8333 +                       printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
8334 +                       printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
8335 +                       return;
8336 +               }
8337 +               /*
8338 +                * If there are no explicit MP IRQ entries, then we are
8339 +                * broken.  We set up most of the low 16 IO-APIC pins to
8340 +                * ISA defaults and hope it will work.
8341 +                */
8342 +               if (!mp_irq_entries) {
8343 +                       struct mpc_config_bus bus;
8344 +
8345 +                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
8346 +
8347 +                       bus.mpc_type = MP_BUS;
8348 +                       bus.mpc_busid = 0;
8349 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
8350 +                       MP_bus_info(&bus);
8351 +
8352 +                       construct_default_ioirq_mptable(0);
8353 +               }
8354 +
8355 +       } else
8356 +               BUG();
8357 +
8358 +       printk(KERN_INFO "Processors: %d\n", num_processors);
8359 +       /*
8360 +        * Only use the first configuration found.
8361 +        */
8362 +}
8363 +
8364 +static int __init smp_scan_config (unsigned long base, unsigned long length)
8365 +{
8366 +       unsigned long *bp = isa_bus_to_virt(base);
8367 +       struct intel_mp_floating *mpf;
8368 +
8369 +       Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
8370 +       if (sizeof(*mpf) != 16)
8371 +               printk("Error: MPF size\n");
8372 +
8373 +       while (length > 0) {
8374 +               mpf = (struct intel_mp_floating *)bp;
8375 +               if ((*bp == SMP_MAGIC_IDENT) &&
8376 +                       (mpf->mpf_length == 1) &&
8377 +                       !mpf_checksum((unsigned char *)bp, 16) &&
8378 +                       ((mpf->mpf_specification == 1)
8379 +                               || (mpf->mpf_specification == 4)) ) {
8380 +
8381 +                       smp_found_config = 1;
8382 +#ifndef CONFIG_XEN
8383 +                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
8384 +                                               virt_to_phys(mpf));
8385 +                       reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
8386 +                       if (mpf->mpf_physptr) {
8387 +                               /*
8388 +                                * We cannot access the MPC table to compute its
8389 +                                * size yet, as only the first few megabytes of
8390 +                                * memory are mapped at this point.
8391 +                                * The PC-9800 places its MPC table at the very
8392 +                                * end of physical memory, so blindly reserving
8393 +                                * PAGE_SIZE from mpf->mpf_physptr would trigger
8394 +                                * a BUG() in reserve_bootmem.
8395 +                                */
8396 +                               unsigned long size = PAGE_SIZE;
8397 +                               unsigned long end = max_low_pfn * PAGE_SIZE;
8398 +                               if (mpf->mpf_physptr + size > end)
8399 +                                       size = end - mpf->mpf_physptr;
8400 +                               reserve_bootmem(mpf->mpf_physptr, size);
8401 +                       }
8402 +#else
8403 +                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
8404 +                               ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
8405 +#endif
8406 +
8407 +                       mpf_found = mpf;
8408 +                       return 1;
8409 +               }
8410 +               bp += 4;
8411 +               length -= 16;
8412 +       }
8413 +       return 0;
8414 +}
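
Two details worth spelling out here (not in the patch itself): bp points to 4-byte words, so "bp += 4" advances exactly 16 bytes, which is both the size and the required paragraph alignment of the floating pointer structure; and SMP_MAGIC_IDENT is simply the signature "_MP_" viewed as a little-endian 32-bit word:

        /* from include/asm-i386/mpspec_def.h */
        #define SMP_MAGIC_IDENT (('_'<<24) | ('P'<<16) | ('M'<<8) | '_')
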
8415 +
8416 +void __init find_smp_config (void)
8417 +{
8418 +#ifndef CONFIG_XEN
8419 +       unsigned int address;
8420 +#endif
8421 +
8422 +       /*
8423 +        * FIXME: Linux assumes you have 640K of base ram..
8424 +        * this continues the error...
8425 +        *
8426 +        * 1) Scan the bottom 1K for a signature
8427 +        * 2) Scan the top 1K of base RAM
8428 +        * 3) Scan the 64K of bios
8429 +        */
8430 +       if (smp_scan_config(0x0,0x400) ||
8431 +               smp_scan_config(639*0x400,0x400) ||
8432 +                       smp_scan_config(0xF0000,0x10000))
8433 +               return;
8434 +       /*
8435 +        * If it is an SMP machine we should know now, unless the
8436 +        * configuration is in an EISA/MCA bus machine with an
8437 +        * extended bios data area.
8438 +        *
8439 +        * There is a real-mode segmented pointer to the 4K EBDA
8440 +        * area at 0x40E; calculate its address and scan the area here.
8441 +        *
8442 +        * NOTE! There are Linux loaders that will corrupt the EBDA
8443 +        * area, and as such this kind of SMP config may be less
8444 +        * trustworthy, simply because the SMP table may have been
8445 +        * stomped on during early boot. These loaders are buggy and
8446 +        * should be fixed.
8447 +        *
8448 +        * The MP 1.4 spec states to scan only the first 1K of the 4K EBDA.
8449 +        */
8450 +
8451 +#ifndef CONFIG_XEN
8452 +       address = get_bios_ebda();
8453 +       if (address)
8454 +               smp_scan_config(address, 0x400);
8455 +#endif
8456 +}
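
For reference, get_bios_ebda() (from <bios_ebda.h>, included above) converts the real-mode EBDA segment stored at physical address 0x40E into a linear address; it is essentially:

        static inline unsigned int get_bios_ebda(void)
        {
                /* 0x40E holds the EBDA segment; a segment is 16 bytes, so << 4 */
                unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
                address <<= 4;
                return address;
        }
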
8457 +
8458 +/* --------------------------------------------------------------------------
8459 +                            ACPI-based MP Configuration
8460 +   -------------------------------------------------------------------------- */
8461 +
8462 +#ifdef CONFIG_ACPI
8463 +
8464 +void __init mp_register_lapic_address (
8465 +       u64                     address)
8466 +{
8467 +#ifndef CONFIG_XEN
8468 +       mp_lapic_addr = (unsigned long) address;
8469 +
8470 +       set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
8471 +
8472 +       if (boot_cpu_physical_apicid == -1U)
8473 +               boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
8474 +
8475 +       Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
8476 +#endif
8477 +}
8478 +
8479 +
8480 +void __devinit mp_register_lapic (
8481 +       u8                      id, 
8482 +       u8                      enabled)
8483 +{
8484 +       struct mpc_config_processor processor;
8485 +       int                     boot_cpu = 0;
8486 +       
8487 +       if (MAX_APICS - id <= 0) {
8488 +               printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
8489 +                       id, MAX_APICS);
8490 +               return;
8491 +       }
8492 +
8493 +       if (id == boot_cpu_physical_apicid)
8494 +               boot_cpu = 1;
8495 +
8496 +#ifndef CONFIG_XEN
8497 +       processor.mpc_type = MP_PROCESSOR;
8498 +       processor.mpc_apicid = id;
8499 +       processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
8500 +       processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
8501 +       processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
8502 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 
8503 +               (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
8504 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
8505 +       processor.mpc_reserved[0] = 0;
8506 +       processor.mpc_reserved[1] = 0;
8507 +#endif
8508 +
8509 +       MP_processor_info(&processor);
8510 +}
8511 +
8512 +#ifdef CONFIG_X86_IO_APIC
8513 +
8514 +#define MP_ISA_BUS             0
8515 +#define MP_MAX_IOAPIC_PIN      127
8516 +
8517 +static struct mp_ioapic_routing {
8518 +       int                     apic_id;
8519 +       int                     gsi_base;
8520 +       int                     gsi_end;
8521 +       u32                     pin_programmed[4];
8522 +} mp_ioapic_routing[MAX_IO_APICS];
8523 +
8524 +
8525 +static int mp_find_ioapic (
8526 +       int                     gsi)
8527 +{
8528 +       int                     i = 0;
8529 +
8530 +       /* Find the IOAPIC that manages this GSI. */
8531 +       for (i = 0; i < nr_ioapics; i++) {
8532 +               if ((gsi >= mp_ioapic_routing[i].gsi_base)
8533 +                       && (gsi <= mp_ioapic_routing[i].gsi_end))
8534 +                       return i;
8535 +       }
8536 +
8537 +       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
8538 +
8539 +       return -1;
8540 +}
8541 +       
8542 +
8543 +void __init mp_register_ioapic (
8544 +       u8                      id, 
8545 +       u32                     address,
8546 +       u32                     gsi_base)
8547 +{
8548 +       int                     idx = 0;
8549 +       int                     tmpid;
8550 +
8551 +       if (nr_ioapics >= MAX_IO_APICS) {
8552 +               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
8553 +                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
8554 +               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
8555 +       }
8556 +       if (!address) {
8557 +               printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
8558 +                       " found in MADT table, skipping!\n");
8559 +               return;
8560 +       }
8561 +
8562 +       idx = nr_ioapics++;
8563 +
8564 +       mp_ioapics[idx].mpc_type = MP_IOAPIC;
8565 +       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
8566 +       mp_ioapics[idx].mpc_apicaddr = address;
8567 +
8568 +#ifndef CONFIG_XEN
8569 +       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
8570 +#endif
8571 +       if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15))
8572 +               tmpid = io_apic_get_unique_id(idx, id);
8573 +       else
8574 +               tmpid = id;
8575 +       if (tmpid == -1) {
8576 +               nr_ioapics--;
8577 +               return;
8578 +       }
8579 +       mp_ioapics[idx].mpc_apicid = tmpid;
8580 +       mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
8581 +       
8582 +       /* 
8583 +        * Build basic GSI lookup table to facilitate gsi->io_apic lookups
8584 +        * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
8585 +        */
8586 +       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
8587 +       mp_ioapic_routing[idx].gsi_base = gsi_base;
8588 +       mp_ioapic_routing[idx].gsi_end = gsi_base + 
8589 +               io_apic_get_redir_entries(idx);
8590 +
8591 +       printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
8592 +               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
8593 +               mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
8594 +               mp_ioapic_routing[idx].gsi_base,
8595 +               mp_ioapic_routing[idx].gsi_end);
8596 +
8597 +       return;
8598 +}
8599 +
8600 +
8601 +void __init mp_override_legacy_irq (
8602 +       u8                      bus_irq,
8603 +       u8                      polarity, 
8604 +       u8                      trigger, 
8605 +       u32                     gsi)
8606 +{
8607 +       struct mpc_config_intsrc intsrc;
8608 +       int                     ioapic = -1;
8609 +       int                     pin = -1;
8610 +
8611 +       /* 
8612 +        * Convert 'gsi' to 'ioapic.pin'.
8613 +        */
8614 +       ioapic = mp_find_ioapic(gsi);
8615 +       if (ioapic < 0)
8616 +               return;
8617 +       pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
8618 +
8619 +       /*
8620 +        * TBD: This check is for faulty timer entries, where the override
8621 +        *      erroneously sets the trigger to level, resulting in a HUGE 
8622 +        *      increase of timer interrupts!
8623 +        */
8624 +       if ((bus_irq == 0) && (trigger == 3))
8625 +               trigger = 1;
8626 +
8627 +       intsrc.mpc_type = MP_INTSRC;
8628 +       intsrc.mpc_irqtype = mp_INT;
8629 +       intsrc.mpc_irqflag = (trigger << 2) | polarity;
8630 +       intsrc.mpc_srcbus = MP_ISA_BUS;
8631 +       intsrc.mpc_srcbusirq = bus_irq;                                /* IRQ */
8632 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;        /* APIC ID */
8633 +       intsrc.mpc_dstirq = pin;                                    /* INTIN# */
8634 +
8635 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
8636 +               intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
8637 +               (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
8638 +               intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
8639 +
8640 +       mp_irqs[mp_irq_entries] = intsrc;
8641 +       if (++mp_irq_entries == MAX_IRQ_SOURCES)
8642 +               panic("Max # of irq sources exceeded!\n");
8643 +
8644 +       return;
8645 +}
8646 +
8647 +int es7000_plat;
8648 +
8649 +void __init mp_config_acpi_legacy_irqs (void)
8650 +{
8651 +       struct mpc_config_intsrc intsrc;
8652 +       int                     i = 0;
8653 +       int                     ioapic = -1;
8654 +
8655 +       /* 
8656 +        * Fabricate the legacy ISA bus (MP_ISA_BUS, bus #0).
8657 +        */
8658 +       mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
8659 +       Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
8660 +
8661 +       /*
8662 +        * Older generations of ES7000 have no legacy identity mappings
8663 +        */
8664 +       if (es7000_plat == 1)
8665 +               return;
8666 +
8667 +       /* 
8668 +        * Locate the IOAPIC that manages the ISA IRQs (0-15). 
8669 +        */
8670 +       ioapic = mp_find_ioapic(0);
8671 +       if (ioapic < 0)
8672 +               return;
8673 +
8674 +       intsrc.mpc_type = MP_INTSRC;
8675 +       intsrc.mpc_irqflag = 0;                                 /* Conforming */
8676 +       intsrc.mpc_srcbus = MP_ISA_BUS;
8677 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
8678 +
8679 +       /* 
8680 +        * Use the default configuration for IRQs 0-15, unless
8681 +        * overridden by (MADT) interrupt source override entries.
8682 +        */
8683 +       for (i = 0; i < 16; i++) {
8684 +               int idx;
8685 +
8686 +               for (idx = 0; idx < mp_irq_entries; idx++) {
8687 +                       struct mpc_config_intsrc *irq = mp_irqs + idx;
8688 +
8689 +                       /* Do we already have a mapping for this ISA IRQ? */
8690 +                       if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
8691 +                               break;
8692 +
8693 +                       /* Do we already have a mapping for this IOAPIC pin */
8694 +                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
8695 +                               (irq->mpc_dstirq == i))
8696 +                               break;
8697 +               }
8698 +
8699 +               if (idx != mp_irq_entries) {
8700 +                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
8701 +                       continue;                       /* IRQ already used */
8702 +               }
8703 +
8704 +               intsrc.mpc_irqtype = mp_INT;
8705 +               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
8706 +               intsrc.mpc_dstirq = i;
8707 +
8708 +               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
8709 +                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
8710 +                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
8711 +                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
8712 +                       intsrc.mpc_dstirq);
8713 +
8714 +               mp_irqs[mp_irq_entries] = intsrc;
8715 +               if (++mp_irq_entries == MAX_IRQ_SOURCES)
8716 +                       panic("Max # of irq sources exceeded!\n");
8717 +       }
8718 +}
8719 +
8720 +#define MAX_GSI_NUM    4096
8721 +
8722 +int mp_register_gsi (u32 gsi, int triggering, int polarity)
8723 +{
8724 +       int                     ioapic = -1;
8725 +       int                     ioapic_pin = 0;
8726 +       int                     idx, bit = 0;
8727 +       static int              pci_irq = 16;
8728 +       /*
8729 +        * Mapping between Global System Interrupts, which
8730 +        * represent all possible interrupts, and IRQs
8731 +        * assigned to actual devices.
8732 +        */
8733 +       static int              gsi_to_irq[MAX_GSI_NUM];
8734 +
8735 +       /* Don't set up the ACPI SCI because it's already set up */
8736 +       if (acpi_fadt.sci_int == gsi)
8737 +               return gsi;
8738 +
8739 +       ioapic = mp_find_ioapic(gsi);
8740 +       if (ioapic < 0) {
8741 +               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
8742 +               return gsi;
8743 +       }
8744 +
8745 +       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
8746 +
8747 +       if (ioapic_renumber_irq)
8748 +               gsi = ioapic_renumber_irq(ioapic, gsi);
8749 +
8750 +       /* 
8751 +        * Avoid pin reprogramming.  PRTs typically include entries  
8752 +        * with redundant pin->gsi mappings (but unique PCI devices);
8753 +        * we only program the IOAPIC on the first.
8754 +        */
8755 +       bit = ioapic_pin % 32;
8756 +       idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
8757 +       if (idx > 3) {
8758 +               printk(KERN_ERR "Invalid reference to IOAPIC pin "
8759 +                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 
8760 +                       ioapic_pin);
8761 +               return gsi;
8762 +       }
8763 +       if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
8764 +               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
8765 +                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
8766 +               return gsi_to_irq[gsi];
8767 +       }
8768 +
8769 +       mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
8770 +
8771 +       if (triggering == ACPI_LEVEL_SENSITIVE) {
8772 +               /*
8773 +                * For PCI devices assign IRQs in order, avoiding gaps
8774 +                * due to unused I/O APIC pins.
8775 +                */
8776 +               int irq = gsi;
8777 +               if (gsi < MAX_GSI_NUM) {
8778 +                       if (gsi > 15)
8779 +                               gsi = pci_irq++;
8780 +                       /*
8781 +                        * Don't assign IRQ used by ACPI SCI
8782 +                        */
8783 +                       if (gsi == acpi_fadt.sci_int)
8784 +                               gsi = pci_irq++;
8785 +                       gsi_to_irq[irq] = gsi;
8786 +               } else {
8787 +                       printk(KERN_ERR "GSI %u is too high\n", gsi);
8788 +                       return gsi;
8789 +               }
8790 +       }
8791 +
8792 +       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
8793 +                   triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
8794 +                   polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
8795 +       return gsi;
8796 +}
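
The pin_programmed bookkeeping above packs all MP_MAX_IOAPIC_PIN + 1 = 128 possible pins into four 32-bit words; a worked example (annotation, not patch content):

        /* ioapic_pin = 45:
         *   idx = 45 / 32 = 1,  bit = 45 % 32 = 13
         * so the pin is tested and recorded via
         *   pin_programmed[1] & (1 << 13)
         */
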
8797 +
8798 +#endif /* CONFIG_X86_IO_APIC */
8799 +#endif /* CONFIG_ACPI */
8800 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/pci-dma-xen.c linux-2.6.16/arch/i386/kernel/pci-dma-xen.c
8801 --- linux-2.6.16.orig/arch/i386/kernel/pci-dma-xen.c    1970-01-01 01:00:00.000000000 +0100
8802 +++ linux-2.6.16/arch/i386/kernel/pci-dma-xen.c 2006-06-26 09:51:32.000000000 +0200
8803 @@ -0,0 +1,344 @@
8804 +/*
8805 + * Dynamic DMA mapping support.
8806 + *
8807 + * On i386 there is no hardware dynamic DMA address translation,
8808 + * so consistent alloc/free are merely page allocation/freeing.
8809 + * The rest of the dynamic DMA mapping interface is implemented
8810 + * in asm/pci.h.
8811 + */
8812 +
8813 +#include <linux/types.h>
8814 +#include <linux/mm.h>
8815 +#include <linux/string.h>
8816 +#include <linux/pci.h>
8817 +#include <linux/module.h>
8818 +#include <linux/version.h>
8819 +#include <asm/io.h>
8820 +#include <xen/balloon.h>
8821 +#include <asm/tlbflush.h>
8822 +#include <asm-i386/mach-xen/asm/swiotlb.h>
8823 +#include <asm/bug.h>
8824 +
8825 +#ifdef __x86_64__
8826 +int iommu_merge __read_mostly = 0;
8827 +EXPORT_SYMBOL(iommu_merge);
8828 +
8829 +dma_addr_t bad_dma_address __read_mostly;
8830 +EXPORT_SYMBOL(bad_dma_address);
8831 +
8832 +/* This tells the BIO block layer to assume merging. Default to off
8833 +   because we cannot guarantee merging later. */
8834 +int iommu_bio_merge __read_mostly = 0;
8835 +EXPORT_SYMBOL(iommu_bio_merge);
8836 +
8837 +__init int iommu_setup(char *p)
8838 +{
8839 +    return 1;
8840 +}
8841 +#endif
8842 +
8843 +struct dma_coherent_mem {
8844 +       void            *virt_base;
8845 +       u32             device_base;
8846 +       int             size;
8847 +       int             flags;
8848 +       unsigned long   *bitmap;
8849 +};
8850 +
8851 +#define IOMMU_BUG_ON(test)                             \
8852 +do {                                                   \
8853 +       if (unlikely(test)) {                           \
8854 +               printk(KERN_ALERT "Fatal DMA error! "   \
8855 +                      "Please use 'swiotlb=force'\n"); \
8856 +               BUG();                                  \
8857 +       }                                               \
8858 +} while (0)
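
Why this macro exists (annotation): under Xen, pseudo-physical pages handed out by the allocator are not guaranteed to be machine-contiguous or to sit below a device's addressing limit, and without swiotlb there is no way to bounce such a buffer, so the only safe reaction is to BUG() and point the user at the swiotlb=force boot option. The address_needs_mapping() check used below comes from the swiotlb code and amounts to roughly:

        /* sketch of the check (see the swiotlb implementation) */
        static int address_needs_mapping(struct device *hwdev, dma_addr_t addr)
        {
                dma_addr_t mask = 0xffffffff;
                /* use the device's DMA mask when one is set */
                if (hwdev && hwdev->dma_mask)
                        mask = *hwdev->dma_mask;
                return (addr & ~mask) != 0;
        }
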
8859 +
8860 +int
8861 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8862 +          enum dma_data_direction direction)
8863 +{
8864 +       int i, rc;
8865 +
8866 +       if (direction == DMA_NONE)
8867 +               BUG();
8868 +       WARN_ON(nents == 0 || sg[0].length == 0);
8869 +
8870 +       if (swiotlb) {
8871 +               rc = swiotlb_map_sg(hwdev, sg, nents, direction);
8872 +       } else {
8873 +               for (i = 0; i < nents; i++ ) {
8874 +                       sg[i].dma_address =
8875 +                               page_to_phys(sg[i].page) + sg[i].offset;
8876 +                       sg[i].dma_length  = sg[i].length;
8877 +                       BUG_ON(!sg[i].page);
8878 +                       IOMMU_BUG_ON(address_needs_mapping(
8879 +                               hwdev, sg[i].dma_address));
8880 +               }
8881 +               rc = nents;
8882 +       }
8883 +
8884 +       flush_write_buffers();
8885 +       return rc;
8886 +}
8887 +EXPORT_SYMBOL(dma_map_sg);
8888 +
8889 +void
8890 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
8891 +            enum dma_data_direction direction)
8892 +{
8893 +       BUG_ON(direction == DMA_NONE);
8894 +       if (swiotlb)
8895 +               swiotlb_unmap_sg(hwdev, sg, nents, direction);
8896 +}
8897 +EXPORT_SYMBOL(dma_unmap_sg);
8898 +
8899 +dma_addr_t
8900 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
8901 +            size_t size, enum dma_data_direction direction)
8902 +{
8903 +       dma_addr_t dma_addr;
8904 +
8905 +       BUG_ON(direction == DMA_NONE);
8906 +
8907 +       if (swiotlb) {
8908 +               dma_addr = swiotlb_map_page(
8909 +                       dev, page, offset, size, direction);
8910 +       } else {
8911 +               dma_addr = page_to_phys(page) + offset;
8912 +               IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr));
8913 +       }
8914 +
8915 +       return dma_addr;
8916 +}
8917 +EXPORT_SYMBOL(dma_map_page);
8918 +
8919 +void
8920 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
8921 +              enum dma_data_direction direction)
8922 +{
8923 +       BUG_ON(direction == DMA_NONE);
8924 +       if (swiotlb)
8925 +               swiotlb_unmap_page(dev, dma_address, size, direction);
8926 +}
8927 +EXPORT_SYMBOL(dma_unmap_page);
8928 +
8929 +int
8930 +dma_mapping_error(dma_addr_t dma_addr)
8931 +{
8932 +       if (swiotlb)
8933 +               return swiotlb_dma_mapping_error(dma_addr);
8934 +       return 0;
8935 +}
8936 +EXPORT_SYMBOL(dma_mapping_error);
8937 +
8938 +int
8939 +dma_supported(struct device *dev, u64 mask)
8940 +{
8941 +       if (swiotlb)
8942 +               return swiotlb_dma_supported(dev, mask);
8943 +       /*
8944 +        * By default we BUG() when an infeasible DMA mapping is requested,
8945 +        * and ask the user to boot with 'swiotlb=force' (see IOMMU_BUG_ON).
8946 +        */
8947 +       return 1;
8948 +}
8949 +EXPORT_SYMBOL(dma_supported);
8950 +
8951 +void *dma_alloc_coherent(struct device *dev, size_t size,
8952 +                          dma_addr_t *dma_handle, gfp_t gfp)
8953 +{
8954 +       void *ret;
8955 +       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8956 +       unsigned int order = get_order(size);
8957 +       unsigned long vstart;
8958 +       /* ignore region specifiers */
8959 +       gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
8960 +
8961 +       if (mem) {
8962 +               int page = bitmap_find_free_region(mem->bitmap, mem->size,
8963 +                                                    order);
8964 +               if (page >= 0) {
8965 +                       *dma_handle = mem->device_base + (page << PAGE_SHIFT);
8966 +                       ret = mem->virt_base + (page << PAGE_SHIFT);
8967 +                       memset(ret, 0, size);
8968 +                       return ret;
8969 +               }
8970 +               if (mem->flags & DMA_MEMORY_EXCLUSIVE)
8971 +                       return NULL;
8972 +       }
8973 +
8974 +       if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
8975 +               gfp |= GFP_DMA;
8976 +
8977 +       vstart = __get_free_pages(gfp, order);
8978 +       ret = (void *)vstart;
8979 +
8980 +       if (ret != NULL) {
8981 +               /* NB. Hardcode 31 address bits for now: aacraid limitation. */
8982 +               if (xen_create_contiguous_region(vstart, order, 31) != 0) {
8983 +                       free_pages(vstart, order);
8984 +                       return NULL;
8985 +               }
8986 +               memset(ret, 0, size);
8987 +               *dma_handle = virt_to_bus(ret);
8988 +       }
8989 +       return ret;
8990 +}
8991 +EXPORT_SYMBOL(dma_alloc_coherent);
8992 +
8993 +void dma_free_coherent(struct device *dev, size_t size,
8994 +                        void *vaddr, dma_addr_t dma_handle)
8995 +{
8996 +       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
8997 +       int order = get_order(size);
8998 +       
8999 +       if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
9000 +               int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
9001 +
9002 +               bitmap_release_region(mem->bitmap, page, order);
9003 +       } else {
9004 +               xen_destroy_contiguous_region((unsigned long)vaddr, order);
9005 +               free_pages((unsigned long)vaddr, order);
9006 +       }
9007 +}
9008 +EXPORT_SYMBOL(dma_free_coherent);
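
A minimal sketch of the alloc/free pair above, assuming a driver that wants a small descriptor ring shared with its device; the names are illustrative. Under Xen the buffer comes back machine-contiguous and within 31 address bits (the aacraid workaround noted in dma_alloc_coherent).

        struct example_ring {
                void            *vaddr;         /* kernel virtual address */
                dma_addr_t      bus;            /* bus/machine address for the device */
        };

        static int example_ring_alloc(struct device *dev, struct example_ring *r)
        {
                r->vaddr = dma_alloc_coherent(dev, PAGE_SIZE, &r->bus, GFP_KERNEL);
                return r->vaddr ? 0 : -ENOMEM;
        }

        static void example_ring_free(struct device *dev, struct example_ring *r)
        {
                dma_free_coherent(dev, PAGE_SIZE, r->vaddr, r->bus);
        }
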
9009 +
9010 +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
9011 +                               dma_addr_t device_addr, size_t size, int flags)
9012 +{
9013 +       void __iomem *mem_base;
9014 +       int pages = size >> PAGE_SHIFT;
9015 +       int bitmap_size = (pages + 31)/32;
9016 +
9017 +       if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
9018 +               goto out;
9019 +       if (!size)
9020 +               goto out;
9021 +       if (dev->dma_mem)
9022 +               goto out;
9023 +
9024 +       /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
9025 +
9026 +       mem_base = ioremap(bus_addr, size);
9027 +       if (!mem_base)
9028 +               goto out;
9029 +
9030 +       dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
9031 +       if (!dev->dma_mem)
9032 +               goto out;
9033 +       memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
9034 +       dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
9035 +       if (!dev->dma_mem->bitmap)
9036 +               goto free1_out;
9037 +       memset(dev->dma_mem->bitmap, 0, bitmap_size);
9038 +
9039 +       dev->dma_mem->virt_base = mem_base;
9040 +       dev->dma_mem->device_base = device_addr;
9041 +       dev->dma_mem->size = pages;
9042 +       dev->dma_mem->flags = flags;
9043 +
9044 +       if (flags & DMA_MEMORY_MAP)
9045 +               return DMA_MEMORY_MAP;
9046 +
9047 +       return DMA_MEMORY_IO;
9048 +
9049 + free1_out:
9050 +       kfree(dev->dma_mem->bitmap);
9051 + out:
9052 +       return 0;
9053 +}
9054 +EXPORT_SYMBOL(dma_declare_coherent_memory);
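
For context, a hedged sketch of a caller: a device with a window of on-board memory declares it so that subsequent dma_alloc_coherent() calls are satisfied from that window (tracked by the bitmap above). The bus address and size are invented for the example.

        static int example_declare(struct device *dev)
        {
                int rc;

                /* 64KB of device-local RAM at bus address 0xd0000000 (made up). */
                rc = dma_declare_coherent_memory(dev, 0xd0000000, 0xd0000000,
                                                 0x10000,
                                                 DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE);
                if (rc == 0)            /* 0 means the declaration failed */
                        return -ENOMEM;
                /* undone later with dma_release_declared_memory(dev), below */
                return 0;
        }
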
9055 +
9056 +void dma_release_declared_memory(struct device *dev)
9057 +{
9058 +       struct dma_coherent_mem *mem = dev->dma_mem;
9059 +       
9060 +       if (!mem)
9061 +               return;
9062 +       dev->dma_mem = NULL;
9063 +       iounmap(mem->virt_base);
9064 +       kfree(mem->bitmap);
9065 +       kfree(mem);
9066 +}
9067 +EXPORT_SYMBOL(dma_release_declared_memory);
9068 +
9069 +void *dma_mark_declared_memory_occupied(struct device *dev,
9070 +                                       dma_addr_t device_addr, size_t size)
9071 +{
9072 +       struct dma_coherent_mem *mem = dev->dma_mem;
9073 +       int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
9074 +       int pos, err;
9075 +
9076 +       if (!mem)
9077 +               return ERR_PTR(-EINVAL);
9078 +
9079 +       pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
9080 +       err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
9081 +       if (err != 0)
9082 +               return ERR_PTR(err);
9083 +       return mem->virt_base + (pos << PAGE_SHIFT);
9084 +}
9085 +EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
9086 +
9087 +dma_addr_t
9088 +dma_map_single(struct device *dev, void *ptr, size_t size,
9089 +              enum dma_data_direction direction)
9090 +{
9091 +       dma_addr_t dma;
9092 +
9093 +       if (direction == DMA_NONE)
9094 +               BUG();
9095 +       WARN_ON(size == 0);
9096 +
9097 +       if (swiotlb) {
9098 +               dma = swiotlb_map_single(dev, ptr, size, direction);
9099 +       } else {
9100 +               dma = virt_to_bus(ptr);
9101 +               IOMMU_BUG_ON(range_straddles_page_boundary(ptr, size));
9102 +               IOMMU_BUG_ON(address_needs_mapping(dev, dma));
9103 +       }
9104 +
9105 +       flush_write_buffers();
9106 +       return dma;
9107 +}
9108 +EXPORT_SYMBOL(dma_map_single);
9109 +
9110 +void
9111 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
9112 +                enum dma_data_direction direction)
9113 +{
9114 +       if (direction == DMA_NONE)
9115 +               BUG();
9116 +       if (swiotlb)
9117 +               swiotlb_unmap_single(dev, dma_addr, size, direction);
9118 +}
9119 +EXPORT_SYMBOL(dma_unmap_single);
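
A small usage sketch for the streaming single-buffer path, including the dma_mapping_error() check defined earlier; start_hw_tx() is a hypothetical stand-in for real device programming.

        static int example_tx(struct device *dev, void *buf, size_t len)
        {
                dma_addr_t handle;

                handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
                if (dma_mapping_error(handle))
                        return -EIO;            /* only reachable via swiotlb */

                start_hw_tx(handle, len);       /* hypothetical device kick */

                /* ... in the completion path: */
                dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
                return 0;
        }
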
9120 +
9121 +void
9122 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
9123 +                       enum dma_data_direction direction)
9124 +{
9125 +       if (swiotlb)
9126 +               swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction);
9127 +}
9128 +EXPORT_SYMBOL(dma_sync_single_for_cpu);
9129 +
9130 +void
9131 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
9132 +                           enum dma_data_direction direction)
9133 +{
9134 +       if (swiotlb)
9135 +               swiotlb_sync_single_for_device(dev, dma_handle, size, direction);
9136 +}
9137 +EXPORT_SYMBOL(dma_sync_single_for_device);
9138 +
9139 +/*
9140 + * Local variables:
9141 + *  c-file-style: "linux"
9142 + *  indent-tabs-mode: t
9143 + *  c-indent-level: 8
9144 + *  c-basic-offset: 8
9145 + *  tab-width: 8
9146 + * End:
9147 + */
9148 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/process-xen.c linux-2.6.16/arch/i386/kernel/process-xen.c
9149 --- linux-2.6.16.orig/arch/i386/kernel/process-xen.c    1970-01-01 01:00:00.000000000 +0100
9150 +++ linux-2.6.16/arch/i386/kernel/process-xen.c 2006-06-26 09:51:32.000000000 +0200
9151 @@ -0,0 +1,830 @@
9152 +/*
9153 + *  linux/arch/i386/kernel/process.c
9154 + *
9155 + *  Copyright (C) 1995  Linus Torvalds
9156 + *
9157 + *  Pentium III FXSR, SSE support
9158 + *     Gareth Hughes <gareth@valinux.com>, May 2000
9159 + */
9160 +
9161 +/*
9162 + * This file handles the architecture-dependent parts of process handling.
9163 + */
9164 +
9165 +#include <stdarg.h>
9166 +
9167 +#include <linux/cpu.h>
9168 +#include <linux/errno.h>
9169 +#include <linux/sched.h>
9170 +#include <linux/fs.h>
9171 +#include <linux/kernel.h>
9172 +#include <linux/mm.h>
9173 +#include <linux/elfcore.h>
9174 +#include <linux/smp.h>
9175 +#include <linux/smp_lock.h>
9176 +#include <linux/stddef.h>
9177 +#include <linux/slab.h>
9178 +#include <linux/vmalloc.h>
9179 +#include <linux/user.h>
9180 +#include <linux/a.out.h>
9181 +#include <linux/interrupt.h>
9182 +#include <linux/config.h>
9183 +#include <linux/utsname.h>
9184 +#include <linux/delay.h>
9185 +#include <linux/reboot.h>
9186 +#include <linux/init.h>
9187 +#include <linux/mc146818rtc.h>
9188 +#include <linux/module.h>
9189 +#include <linux/kallsyms.h>
9190 +#include <linux/ptrace.h>
9191 +#include <linux/random.h>
9192 +#include <linux/kprobes.h>
9193 +
9194 +#include <asm/uaccess.h>
9195 +#include <asm/pgtable.h>
9196 +#include <asm/system.h>
9197 +#include <asm/io.h>
9198 +#include <asm/ldt.h>
9199 +#include <asm/processor.h>
9200 +#include <asm/i387.h>
9201 +#include <asm/desc.h>
9202 +#include <asm/vm86.h>
9203 +#ifdef CONFIG_MATH_EMULATION
9204 +#include <asm/math_emu.h>
9205 +#endif
9206 +
9207 +#include <xen/interface/physdev.h>
9208 +#include <xen/interface/vcpu.h>
9209 +
9210 +#include <linux/err.h>
9211 +
9212 +#include <asm/tlbflush.h>
9213 +#include <asm/cpu.h>
9214 +
9218 +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
9219 +
9220 +static int hlt_counter;
9221 +
9222 +unsigned long boot_option_idle_override = 0;
9223 +EXPORT_SYMBOL(boot_option_idle_override);
9224 +
9225 +/*
9226 + * Return saved PC of a blocked thread.
9227 + */
9228 +unsigned long thread_saved_pc(struct task_struct *tsk)
9229 +{
9230 +       return ((unsigned long *)tsk->thread.esp)[3];
9231 +}
9232 +
9233 +/*
9234 + * Powermanagement idle function, if any..
9235 + */
9236 +void (*pm_idle)(void);
9237 +EXPORT_SYMBOL(pm_idle);
9238 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
9239 +
9240 +void disable_hlt(void)
9241 +{
9242 +       hlt_counter++;
9243 +}
9244 +
9245 +EXPORT_SYMBOL(disable_hlt);
9246 +
9247 +void enable_hlt(void)
9248 +{
9249 +       hlt_counter--;
9250 +}
9251 +
9252 +EXPORT_SYMBOL(enable_hlt);
9253 +
9254 +/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
9255 +extern void stop_hz_timer(void);
9256 +extern void start_hz_timer(void);
9257 +void xen_idle(void)
9258 +{
9259 +       local_irq_disable();
9260 +
9261 +       if (need_resched())
9262 +               local_irq_enable();
9263 +       else {
9264 +               clear_thread_flag(TIF_POLLING_NRFLAG);
9265 +               smp_mb__after_clear_bit();
9266 +               stop_hz_timer();
9267 +               /* Blocking includes an implicit local_irq_enable(). */
9268 +               HYPERVISOR_block();
9269 +               start_hz_timer();
9270 +               set_thread_flag(TIF_POLLING_NRFLAG);
9271 +       }
9272 +}
9273 +#ifdef CONFIG_APM_MODULE
9274 +EXPORT_SYMBOL(default_idle);
9275 +#endif
9276 +
9277 +#ifdef CONFIG_HOTPLUG_CPU
9278 +extern cpumask_t cpu_initialized;
9279 +static inline void play_dead(void)
9280 +{
9281 +       idle_task_exit();
9282 +       local_irq_disable();
9283 +       cpu_clear(smp_processor_id(), cpu_initialized);
9284 +       preempt_enable_no_resched();
9285 +       HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
9286 +       /* Same as drivers/xen/core/smpboot.c:cpu_bringup(). */
9287 +       cpu_init();
9288 +       touch_softlockup_watchdog();
9289 +       preempt_disable();
9290 +       local_irq_enable();
9291 +}
9292 +#else
9293 +static inline void play_dead(void)
9294 +{
9295 +       BUG();
9296 +}
9297 +#endif /* CONFIG_HOTPLUG_CPU */
9298 +
9299 +/*
9300 + * The idle thread. There's no useful work to be
9301 + * done, so just try to conserve power and have a
9302 + * low exit latency (i.e. sit in a loop waiting for
9303 + * somebody to say that they'd like to reschedule)
9304 + */
9305 +void cpu_idle(void)
9306 +{
9307 +       int cpu = smp_processor_id();
9308 +
9309 +       set_thread_flag(TIF_POLLING_NRFLAG);
9310 +
9311 +       /* endless idle loop with no priority at all */
9312 +       while (1) {
9313 +               while (!need_resched()) {
9314 +
9315 +                       if (__get_cpu_var(cpu_idle_state))
9316 +                               __get_cpu_var(cpu_idle_state) = 0;
9317 +
9318 +                       rmb();
9319 +
9320 +                       if (cpu_is_offline(cpu))
9321 +                               play_dead();
9322 +
9323 +                       __get_cpu_var(irq_stat).idle_timestamp = jiffies;
9324 +                       xen_idle();
9325 +               }
9326 +               preempt_enable_no_resched();
9327 +               schedule();
9328 +               preempt_disable();
9329 +       }
9330 +}
9331 +
9332 +void cpu_idle_wait(void)
9333 +{
9334 +       unsigned int cpu, this_cpu = get_cpu();
9335 +       cpumask_t map;
9336 +
9337 +       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
9338 +       put_cpu();
9339 +
9340 +       cpus_clear(map);
9341 +       for_each_online_cpu(cpu) {
9342 +               per_cpu(cpu_idle_state, cpu) = 1;
9343 +               cpu_set(cpu, map);
9344 +       }
9345 +
9346 +       __get_cpu_var(cpu_idle_state) = 0;
9347 +
9348 +       wmb();
9349 +       do {
9350 +               ssleep(1);
9351 +               for_each_online_cpu(cpu) {
9352 +                       if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
9353 +                               cpu_clear(cpu, map);
9354 +               }
9355 +               cpus_and(map, map, cpu_online_map);
9356 +       } while (!cpus_empty(map));
9357 +}
9358 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
9359 +
9360 +/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
9361 +/* Always use xen_idle() instead. */
9362 +void __devinit select_idle_routine(const struct cpuinfo_x86 *c) {}
9363 +
9364 +void show_regs(struct pt_regs * regs)
9365 +{
9366 +       unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
9367 +
9368 +       printk("\n");
9369 +       printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
9370 +       printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
9371 +       print_symbol("EIP is at %s\n", regs->eip);
9372 +
9373 +       if (user_mode(regs))
9374 +               printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
9375 +       printk(" EFLAGS: %08lx    %s  (%s %.*s)\n",
9376 +              regs->eflags, print_tainted(), system_utsname.release,
9377 +              (int)strcspn(system_utsname.version, " "),
9378 +              system_utsname.version);
9379 +       printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
9380 +               regs->eax,regs->ebx,regs->ecx,regs->edx);
9381 +       printk("ESI: %08lx EDI: %08lx EBP: %08lx",
9382 +               regs->esi, regs->edi, regs->ebp);
9383 +       printk(" DS: %04x ES: %04x\n",
9384 +               0xffff & regs->xds,0xffff & regs->xes);
9385 +
9386 +       cr0 = read_cr0();
9387 +       cr2 = read_cr2();
9388 +       cr3 = read_cr3();
9389 +       cr4 = read_cr4_safe();
9390 +       printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
9391 +       show_trace(NULL, &regs->esp);
9392 +}
9393 +
9394 +/*
9395 + * This gets run with %ebx containing the
9396 + * function to call, and %edx containing
9397 + * the "args".
9398 + */
9399 +extern void kernel_thread_helper(void);
9400 +__asm__(".section .text\n"
9401 +       ".align 4\n"
9402 +       "kernel_thread_helper:\n\t"
9403 +       "movl %edx,%eax\n\t"
9404 +       "pushl %edx\n\t"
9405 +       "call *%ebx\n\t"
9406 +       "pushl %eax\n\t"
9407 +       "call do_exit\n"
9408 +       ".previous");
9409 +
9410 +/*
9411 + * Create a kernel thread
9412 + */
9413 +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
9414 +{
9415 +       struct pt_regs regs;
9416 +
9417 +       memset(&regs, 0, sizeof(regs));
9418 +
9419 +       regs.ebx = (unsigned long) fn;
9420 +       regs.edx = (unsigned long) arg;
9421 +
9422 +       regs.xds = __USER_DS;
9423 +       regs.xes = __USER_DS;
9424 +       regs.orig_eax = -1;
9425 +       regs.eip = (unsigned long) kernel_thread_helper;
9426 +       regs.xcs = GET_KERNEL_CS();
9427 +       regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
9428 +
9429 +       /* Ok, create the new process.. */
9430 +       return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
9431 +}
9432 +EXPORT_SYMBOL(kernel_thread);
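
A hedged example of a caller: the entry function is staged via %ebx/%edx as described above, and falls into do_exit() through kernel_thread_helper when it returns. The flags mirror common 2.6-era usage; the names are illustrative.

        static int example_thread_fn(void *data)
        {
                /* ... do some background work ... */
                return 0;
        }

        static void example_spawn(void *data)
        {
                int pid = kernel_thread(example_thread_fn, data,
                                        CLONE_FS | CLONE_FILES | SIGCHLD);
                if (pid < 0)
                        printk(KERN_ERR "example: kernel_thread failed: %d\n", pid);
        }
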
9433 +
9434 +/*
9435 + * Free current thread data structures, etc.
9436 + */
9437 +void exit_thread(void)
9438 +{
9439 +       struct task_struct *tsk = current;
9440 +       struct thread_struct *t = &tsk->thread;
9441 +
9442 +       /*
9443 +        * Remove function-return probe instances associated with this task
9444 +        * and put them back on the free list. Do not insert an exit probe for
9445 +        * this function, it will be disabled by kprobe_flush_task if you do.
9446 +        */
9447 +       kprobe_flush_task(tsk);
9448 +
9449 +       /* The process may have allocated an io port bitmap... nuke it. */
9450 +       if (unlikely(NULL != t->io_bitmap_ptr)) {
9451 +               physdev_op_t op = { 0 };
9452 +               op.cmd = PHYSDEVOP_SET_IOBITMAP;
9453 +               HYPERVISOR_physdev_op(&op);
9454 +               kfree(t->io_bitmap_ptr);
9455 +               t->io_bitmap_ptr = NULL;
9456 +       }
9457 +}
9458 +
9459 +void flush_thread(void)
9460 +{
9461 +       struct task_struct *tsk = current;
9462 +
9463 +       memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
9464 +       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
9465 +       /*
9466 +        * Forget coprocessor state..
9467 +        */
9468 +       clear_fpu(tsk);
9469 +       clear_used_math();
9470 +}
9471 +
9472 +void release_thread(struct task_struct *dead_task)
9473 +{
9474 +       BUG_ON(dead_task->mm);
9475 +       release_vm86_irqs(dead_task);
9476 +}
9477 +
9478 +/*
9479 + * This gets called before we allocate a new thread and copy
9480 + * the current task into it.
9481 + */
9482 +void prepare_to_copy(struct task_struct *tsk)
9483 +{
9484 +       unlazy_fpu(tsk);
9485 +}
9486 +
9487 +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
9488 +       unsigned long unused,
9489 +       struct task_struct * p, struct pt_regs * regs)
9490 +{
9491 +       struct pt_regs * childregs;
9492 +       struct task_struct *tsk;
9493 +       int err;
9494 +
9495 +       childregs = task_pt_regs(p);
9496 +       *childregs = *regs;
9497 +       childregs->eax = 0;
9498 +       childregs->esp = esp;
9499 +
9500 +       p->thread.esp = (unsigned long) childregs;
9501 +       p->thread.esp0 = (unsigned long) (childregs+1);
9502 +
9503 +       p->thread.eip = (unsigned long) ret_from_fork;
9504 +
9505 +       savesegment(fs,p->thread.fs);
9506 +       savesegment(gs,p->thread.gs);
9507 +
9508 +       tsk = current;
9509 +       if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) {
9510 +               p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
9511 +               if (!p->thread.io_bitmap_ptr) {
9512 +                       p->thread.io_bitmap_max = 0;
9513 +                       return -ENOMEM;
9514 +               }
9515 +               memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
9516 +                       IO_BITMAP_BYTES);
9517 +       }
9518 +
9519 +       /*
9520 +        * Set a new TLS for the child thread?
9521 +        */
9522 +       if (clone_flags & CLONE_SETTLS) {
9523 +               struct desc_struct *desc;
9524 +               struct user_desc info;
9525 +               int idx;
9526 +
9527 +               err = -EFAULT;
9528 +               if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
9529 +                       goto out;
9530 +               err = -EINVAL;
9531 +               if (LDT_empty(&info))
9532 +                       goto out;
9533 +
9534 +               idx = info.entry_number;
9535 +               if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9536 +                       goto out;
9537 +
9538 +               desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
9539 +               desc->a = LDT_entry_a(&info);
9540 +               desc->b = LDT_entry_b(&info);
9541 +       }
9542 +
9543 +       p->thread.iopl = current->thread.iopl;
9544 +
9545 +       err = 0;
9546 + out:
9547 +       if (err && p->thread.io_bitmap_ptr) {
9548 +               kfree(p->thread.io_bitmap_ptr);
9549 +               p->thread.io_bitmap_max = 0;
9550 +       }
9551 +       return err;
9552 +}
9553 +
9554 +/*
9555 + * Fill in the user structure for a core dump.
9556 + */
9557 +void dump_thread(struct pt_regs * regs, struct user * dump)
9558 +{
9559 +       int i;
9560 +
9561 +/* changed the size calculations - should hopefully work better. lbt */
9562 +       dump->magic = CMAGIC;
9563 +       dump->start_code = 0;
9564 +       dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
9565 +       dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
9566 +       dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
9567 +       dump->u_dsize -= dump->u_tsize;
9568 +       dump->u_ssize = 0;
9569 +       for (i = 0; i < 8; i++)
9570 +               dump->u_debugreg[i] = current->thread.debugreg[i];  
9571 +
9572 +       if (dump->start_stack < TASK_SIZE)
9573 +               dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
9574 +
9575 +       dump->regs.ebx = regs->ebx;
9576 +       dump->regs.ecx = regs->ecx;
9577 +       dump->regs.edx = regs->edx;
9578 +       dump->regs.esi = regs->esi;
9579 +       dump->regs.edi = regs->edi;
9580 +       dump->regs.ebp = regs->ebp;
9581 +       dump->regs.eax = regs->eax;
9582 +       dump->regs.ds = regs->xds;
9583 +       dump->regs.es = regs->xes;
9584 +       savesegment(fs,dump->regs.fs);
9585 +       savesegment(gs,dump->regs.gs);
9586 +       dump->regs.orig_eax = regs->orig_eax;
9587 +       dump->regs.eip = regs->eip;
9588 +       dump->regs.cs = regs->xcs;
9589 +       dump->regs.eflags = regs->eflags;
9590 +       dump->regs.esp = regs->esp;
9591 +       dump->regs.ss = regs->xss;
9592 +
9593 +       dump->u_fpvalid = dump_fpu (regs, &dump->i387);
9594 +}
9595 +EXPORT_SYMBOL(dump_thread);
9596 +
9597 +/* 
9598 + * Capture the user space registers if the task is not running (in user space)
9599 + */
9600 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
9601 +{
9602 +       struct pt_regs ptregs = *task_pt_regs(tsk);
9603 +       ptregs.xcs &= 0xffff;
9604 +       ptregs.xds &= 0xffff;
9605 +       ptregs.xes &= 0xffff;
9606 +       ptregs.xss &= 0xffff;
9607 +
9608 +       elf_core_copy_regs(regs, &ptregs);
9609 +
9610 +       return 1;
9611 +}
9612 +
9613 +/*
9614 + * This function selects if the context switch from prev to next
9615 + * has to tweak the TSC disable bit in the cr4.
9616 + */
9617 +static inline void disable_tsc(struct task_struct *prev_p,
9618 +                              struct task_struct *next_p)
9619 +{
9620 +       struct thread_info *prev, *next;
9621 +
9622 +       /*
9623 +        * gcc should eliminate the ->thread_info dereference if
9624 +        * has_secure_computing returns 0 at compile time (SECCOMP=n).
9625 +        */
9626 +       prev = task_thread_info(prev_p);
9627 +       next = task_thread_info(next_p);
9628 +
9629 +       if (has_secure_computing(prev) || has_secure_computing(next)) {
9630 +               /* slow path here */
9631 +               if (has_secure_computing(prev) &&
9632 +                   !has_secure_computing(next)) {
9633 +                       write_cr4(read_cr4() & ~X86_CR4_TSD);
9634 +               } else if (!has_secure_computing(prev) &&
9635 +                          has_secure_computing(next))
9636 +                       write_cr4(read_cr4() | X86_CR4_TSD);
9637 +       }
9638 +}
9639 +
9640 +/*
9641 + *     switch_to(x,y) should switch tasks from x to y.
9642 + *
9643 + * We fsave/fwait so that an exception goes off at the right time
9644 + * (as a call from the fsave or fwait in effect) rather than to
9645 + * the wrong process. Lazy FP saving no longer makes any sense
9646 + * with modern CPUs, and this simplifies a lot of things (SMP
9647 + * and UP become the same).
9648 + *
9649 + * NOTE! We used to use the x86 hardware context switching. The
9650 + * reason for not using it any more becomes apparent when you
9651 + * try to recover gracefully from saved state that is no longer
9652 + * valid (stale segment register values in particular). With the
9653 + * hardware task-switch, there is no way to fix up bad state in
9654 + * a reasonable manner.
9655 + *
9656 + * The fact that Intel documents the hardware task-switching to
9657 + * be slow is a fairly red herring - this code is not noticeably
9658 + * faster. However, there _is_ some room for improvement here,
9659 + * so the performance issues may eventually be a valid point.
9660 + * More important, however, is the fact that this allows us much
9661 + * more flexibility.
9662 + *
9663 + * The return value (in %eax) will be the "prev" task after
9664 + * the task-switch, and shows up in ret_from_fork in entry.S,
9665 + * for example.
9666 + */
9667 +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
9668 +{
9669 +       struct thread_struct *prev = &prev_p->thread,
9670 +                                *next = &next_p->thread;
9671 +       int cpu = smp_processor_id();
9672 +#ifndef CONFIG_X86_NO_TSS
9673 +       struct tss_struct *tss = &per_cpu(init_tss, cpu);
9674 +#endif
9675 +       physdev_op_t iopl_op, iobmp_op;
9676 +       multicall_entry_t _mcl[8], *mcl = _mcl;
9677 +
9678 +       /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
9679 +
9680 +       /*
9681 +        * This is basically '__unlazy_fpu', except that we queue a
9682 +        * multicall to indicate FPU task switch, rather than
9683 +        * synchronously trapping to Xen.
9684 +        */
9685 +       if (prev_p->thread_info->status & TS_USEDFPU) {
9686 +               __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
9687 +               mcl->op      = __HYPERVISOR_fpu_taskswitch;
9688 +               mcl->args[0] = 1;
9689 +               mcl++;
9690 +       }
9691 +#if 0 /* lazy fpu sanity check */
9692 +       else BUG_ON(!(read_cr0() & 8));
9693 +#endif
9694 +
9695 +       /*
9696 +        * Reload esp0.
9697 +        * This is load_esp0(tss, next) with a multicall.
9698 +        */
9699 +       mcl->op      = __HYPERVISOR_stack_switch;
9700 +       mcl->args[0] = __KERNEL_DS;
9701 +       mcl->args[1] = next->esp0;
9702 +       mcl++;
9703 +
9704 +       /*
9705 +        * Load the per-thread Thread-Local Storage descriptor.
9706 +        * This is load_TLS(next, cpu) with multicalls.
9707 +        */
9708 +#define C(i) do {                                                      \
9709 +       if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||    \
9710 +                    next->tls_array[i].b != prev->tls_array[i].b)) {   \
9711 +               mcl->op = __HYPERVISOR_update_descriptor;               \
9712 +               *(u64 *)&mcl->args[0] = virt_to_machine(                \
9713 +                       &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
9714 +               *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i];    \
9715 +               mcl++;                                                  \
9716 +       }                                                               \
9717 +} while (0)
9718 +       C(0); C(1); C(2);
9719 +#undef C
9720 +
9721 +       if (unlikely(prev->iopl != next->iopl)) {
9722 +               iopl_op.cmd             = PHYSDEVOP_SET_IOPL;
9723 +               iopl_op.u.set_iopl.iopl = (next->iopl == 0) ? 1 :
9724 +                       (next->iopl >> 12) & 3;
9725 +               mcl->op      = __HYPERVISOR_physdev_op;
9726 +               mcl->args[0] = (unsigned long)&iopl_op;
9727 +               mcl++;
9728 +       }
9729 +
9730 +       if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
9731 +               iobmp_op.cmd                     =
9732 +                       PHYSDEVOP_SET_IOBITMAP;
9733 +               iobmp_op.u.set_iobitmap.bitmap   =
9734 +                       (char *)next->io_bitmap_ptr;
9735 +               iobmp_op.u.set_iobitmap.nr_ports =
9736 +                       next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
9737 +               mcl->op      = __HYPERVISOR_physdev_op;
9738 +               mcl->args[0] = (unsigned long)&iobmp_op;
9739 +               mcl++;
9740 +       }
9741 +
9742 +       (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
9743 +
9744 +       /*
9745 +        * Restore %fs and %gs if needed.
9746 +        *
9747 +        * Glibc normally makes %fs be zero, and %gs is one of
9748 +        * the TLS segments.
9749 +        */
9750 +       if (unlikely(next->fs))
9751 +               loadsegment(fs, next->fs);
9752 +
9753 +       if (next->gs)
9754 +               loadsegment(gs, next->gs);
9755 +
9756 +       /*
9757 +        * Now maybe reload the debug registers
9758 +        */
9759 +       if (unlikely(next->debugreg[7])) {
9760 +               set_debugreg(next->debugreg[0], 0);
9761 +               set_debugreg(next->debugreg[1], 1);
9762 +               set_debugreg(next->debugreg[2], 2);
9763 +               set_debugreg(next->debugreg[3], 3);
9764 +               /* no 4 and 5 */
9765 +               set_debugreg(next->debugreg[6], 6);
9766 +               set_debugreg(next->debugreg[7], 7);
9767 +       }
9768 +
9769 +       disable_tsc(prev_p, next_p);
9770 +
9771 +       return prev_p;
9772 +}
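
The batching idea above, reduced to a standalone sketch: rather than trapping into Xen once per operation, the switch path queues hypercalls into a multicall_entry_t array and flushes them with a single HYPERVISOR_multicall. A minimal two-entry version, using only interfaces that appear in this file:

        static void example_batched_switch(unsigned long next_esp0)
        {
                multicall_entry_t mcl[2];

                mcl[0].op      = __HYPERVISOR_fpu_taskswitch;
                mcl[0].args[0] = 1;                     /* set CR0.TS */

                mcl[1].op      = __HYPERVISOR_stack_switch;
                mcl[1].args[0] = __KERNEL_DS;
                mcl[1].args[1] = next_esp0;

                (void)HYPERVISOR_multicall(mcl, 2);     /* one trap, two ops */
        }
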
9773 +
9774 +asmlinkage int sys_fork(struct pt_regs regs)
9775 +{
9776 +       return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9777 +}
9778 +
9779 +asmlinkage int sys_clone(struct pt_regs regs)
9780 +{
9781 +       unsigned long clone_flags;
9782 +       unsigned long newsp;
9783 +       int __user *parent_tidptr, *child_tidptr;
9784 +
9785 +       clone_flags = regs.ebx;
9786 +       newsp = regs.ecx;
9787 +       parent_tidptr = (int __user *)regs.edx;
9788 +       child_tidptr = (int __user *)regs.edi;
9789 +       if (!newsp)
9790 +               newsp = regs.esp;
9791 +       return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
9792 +}
9793 +
9794 +/*
9795 + * This is trivial, and on the face of it looks like it
9796 + * could equally well be done in user mode.
9797 + *
9798 + * Not so, for quite unobvious reasons - register pressure.
9799 + * In user mode vfork() cannot have a stack frame, and if
9800 + * done by calling the "clone()" system call directly, you
9801 + * do not have enough call-clobbered registers to hold all
9802 + * the information you need.
9803 + */
9804 +asmlinkage int sys_vfork(struct pt_regs regs)
9805 +{
9806 +       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
9807 +}
9808 +
9809 +/*
9810 + * sys_execve() executes a new program.
9811 + */
9812 +asmlinkage int sys_execve(struct pt_regs regs)
9813 +{
9814 +       int error;
9815 +       char * filename;
9816 +
9817 +       filename = getname((char __user *) regs.ebx);
9818 +       error = PTR_ERR(filename);
9819 +       if (IS_ERR(filename))
9820 +               goto out;
9821 +       error = do_execve(filename,
9822 +                       (char __user * __user *) regs.ecx,
9823 +                       (char __user * __user *) regs.edx,
9824 +                       &regs);
9825 +       if (error == 0) {
9826 +               task_lock(current);
9827 +               current->ptrace &= ~PT_DTRACE;
9828 +               task_unlock(current);
9829 +               /* Make sure we don't return using sysenter.. */
9830 +               set_thread_flag(TIF_IRET);
9831 +       }
9832 +       putname(filename);
9833 +out:
9834 +       return error;
9835 +}
9836 +
9837 +#define top_esp                (THREAD_SIZE - sizeof(unsigned long))
9838 +#define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
9839 +
9840 +unsigned long get_wchan(struct task_struct *p)
9841 +{
9842 +       unsigned long ebp, esp, eip;
9843 +       unsigned long stack_page;
9844 +       int count = 0;
9845 +       if (!p || p == current || p->state == TASK_RUNNING)
9846 +               return 0;
9847 +       stack_page = (unsigned long)task_stack_page(p);
9848 +       esp = p->thread.esp;
9849 +       if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
9850 +               return 0;
9851 +       /* include/asm-i386/system.h:switch_to() pushes ebp last. */
9852 +       ebp = *(unsigned long *) esp;
9853 +       do {
9854 +               if (ebp < stack_page || ebp > top_ebp+stack_page)
9855 +                       return 0;
9856 +               eip = *(unsigned long *) (ebp+4);
9857 +               if (!in_sched_functions(eip))
9858 +                       return eip;
9859 +               ebp = *(unsigned long *) ebp;
9860 +       } while (count++ < 16);
9861 +       return 0;
9862 +}
9863 +EXPORT_SYMBOL(get_wchan);
9864 +
9865 +/*
9866 + * get_free_idx: find a yet-unused TLS descriptor index for sys_set_thread_area().
9867 + */
9868 +static int get_free_idx(void)
9869 +{
9870 +       struct thread_struct *t = &current->thread;
9871 +       int idx;
9872 +
9873 +       for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
9874 +               if (desc_empty(t->tls_array + idx))
9875 +                       return idx + GDT_ENTRY_TLS_MIN;
9876 +       return -ESRCH;
9877 +}
9878 +
9879 +/*
9880 + * Set a given TLS descriptor:
9881 + */
9882 +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
9883 +{
9884 +       struct thread_struct *t = &current->thread;
9885 +       struct user_desc info;
9886 +       struct desc_struct *desc;
9887 +       int cpu, idx;
9888 +
9889 +       if (copy_from_user(&info, u_info, sizeof(info)))
9890 +               return -EFAULT;
9891 +       idx = info.entry_number;
9892 +
9893 +       /*
9894 +        * index -1 means the kernel should try to find and
9895 +        * allocate an empty descriptor:
9896 +        */
9897 +       if (idx == -1) {
9898 +               idx = get_free_idx();
9899 +               if (idx < 0)
9900 +                       return idx;
9901 +               if (put_user(idx, &u_info->entry_number))
9902 +                       return -EFAULT;
9903 +       }
9904 +
9905 +       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9906 +               return -EINVAL;
9907 +
9908 +       desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
9909 +
9910 +       /*
9911 +        * We must not get preempted while modifying the TLS.
9912 +        */
9913 +       cpu = get_cpu();
9914 +
9915 +       if (LDT_empty(&info)) {
9916 +               desc->a = 0;
9917 +               desc->b = 0;
9918 +       } else {
9919 +               desc->a = LDT_entry_a(&info);
9920 +               desc->b = LDT_entry_b(&info);
9921 +       }
9922 +       load_TLS(t, cpu);
9923 +
9924 +       put_cpu();
9925 +
9926 +       return 0;
9927 +}
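
From the user side, the syscall above is what glibc uses to install %gs-based TLS. A hedged userspace sketch, passing entry_number == -1 so the kernel picks a free slot and writes the chosen index back:

        #include <asm/ldt.h>            /* struct user_desc */
        #include <sys/syscall.h>
        #include <unistd.h>

        static int example_set_tls(void *base, unsigned int limit_pages)
        {
                struct user_desc desc = {
                        .entry_number   = -1,   /* ask get_free_idx() for a slot */
                        .base_addr      = (unsigned long)base,
                        .limit          = limit_pages,
                        .seg_32bit      = 1,
                        .limit_in_pages = 1,
                        .useable        = 1,
                };

                if (syscall(SYS_set_thread_area, &desc) != 0)
                        return -1;
                return desc.entry_number;       /* index chosen by the kernel */
        }
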
9928 +
9929 +/*
9930 + * Get the current Thread-Local Storage area:
9931 + */
9932 +
9933 +#define GET_BASE(desc) ( \
9934 +       (((desc)->a >> 16) & 0x0000ffff) | \
9935 +       (((desc)->b << 16) & 0x00ff0000) | \
9936 +       ( (desc)->b        & 0xff000000)   )
9937 +
9938 +#define GET_LIMIT(desc) ( \
9939 +       ((desc)->a & 0x0ffff) | \
9940 +        ((desc)->b & 0xf0000) )
9941 +       
9942 +#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
9943 +#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
9944 +#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
9945 +#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
9946 +#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
9947 +#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
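
A worked decode may help here, since a GDT entry splits the 32-bit base and 20-bit limit across both words. Ignoring the type/flag bits, for base 0x12345678 and limit 0xabcde:

        /* desc->a = 0x5678bcde  (base[15:0] << 16 | limit[15:0])
         * desc->b = 0x120a0034  (base[31:24], limit[19:16], base[23:16])
         *
         * GET_BASE:  0x00005678 | 0x00340000 | 0x12000000 = 0x12345678
         * GET_LIMIT: 0x0000bcde | 0x000a0000              = 0x000abcde
         */
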
9948 +
9949 +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
9950 +{
9951 +       struct user_desc info;
9952 +       struct desc_struct *desc;
9953 +       int idx;
9954 +
9955 +       if (get_user(idx, &u_info->entry_number))
9956 +               return -EFAULT;
9957 +       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
9958 +               return -EINVAL;
9959 +
9960 +       memset(&info, 0, sizeof(info));
9961 +
9962 +       desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
9963 +
9964 +       info.entry_number = idx;
9965 +       info.base_addr = GET_BASE(desc);
9966 +       info.limit = GET_LIMIT(desc);
9967 +       info.seg_32bit = GET_32BIT(desc);
9968 +       info.contents = GET_CONTENTS(desc);
9969 +       info.read_exec_only = !GET_WRITABLE(desc);
9970 +       info.limit_in_pages = GET_LIMIT_PAGES(desc);
9971 +       info.seg_not_present = !GET_PRESENT(desc);
9972 +       info.useable = GET_USEABLE(desc);
9973 +
9974 +       if (copy_to_user(u_info, &info, sizeof(info)))
9975 +               return -EFAULT;
9976 +       return 0;
9977 +}
9978 +
9979 +unsigned long arch_align_stack(unsigned long sp)
9980 +{
9981 +       if (randomize_va_space)
9982 +               sp -= get_random_int() % 8192;
9983 +       return sp & ~0xf;
9984 +}
9985 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/quirks-xen.c linux-2.6.16/arch/i386/kernel/quirks-xen.c
9986 --- linux-2.6.16.orig/arch/i386/kernel/quirks-xen.c     1970-01-01 01:00:00.000000000 +0100
9987 +++ linux-2.6.16/arch/i386/kernel/quirks-xen.c  2006-06-26 09:51:32.000000000 +0200
9988 @@ -0,0 +1,48 @@
9989 +/*
9990 + * This file contains work-arounds for x86 and x86_64 platform bugs.
9991 + */
9992 +#include <linux/config.h>
9993 +#include <linux/pci.h>
9994 +#include <linux/irq.h>
9995 +
9996 +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
9997 +
9998 +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
9999 +{
10000 +       u8 config, rev;
10001 +       u32 word;
10002 +
10003 +       /* BIOS may enable hardware IRQ balancing for
10004 +        * E7520/E7320/E7525(revision ID 0x9 and below)
10005 +        * based platforms.
10006 +        * Disable SW irqbalance/affinity on those platforms.
10007 +        */
10008 +       pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
10009 +       if (rev > 0x9)
10010 +               return;
10011 +
10012 +       printk(KERN_INFO "Intel E7520/7320/7525 detected.\n");
10013 +
10014 +       /* enable access to config space*/
10015 +       pci_read_config_byte(dev, 0xf4, &config);
10016 +       pci_write_config_byte(dev, 0xf4, config|0x2);
10017 +
10018 +       /* read xTPR register */
10019 +       raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
10020 +
10021 +       if (!(word & (1 << 13))) {
10022 +               dom0_op_t op;
10023 +               printk(KERN_INFO "Disabling irq balancing and affinity\n");
10024 +               op.cmd = DOM0_PLATFORM_QUIRK;
10025 +               op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
10026 +               (void)HYPERVISOR_dom0_op(&op);
10027 +       }
10028 +
10029 +       /* put back the original value for config space*/
10030 +       if (!(config & 0x2))
10031 +               pci_write_config_byte(dev, 0xf4, config);
10032 +}
10033 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  quirk_intel_irqbalance);
10034 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  quirk_intel_irqbalance);
10035 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  quirk_intel_irqbalance);
10036 +#endif
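
The DECLARE_PCI_FIXUP_FINAL() lines register quirk_intel_irqbalance() against each E75xx MCH device ID; the PCI core then invokes the function as matching devices finish enumeration. A hedged illustration of the mechanism (device ID invented):

        static void __devinit example_quirk(struct pci_dev *dev)
        {
                printk(KERN_INFO "example quirk fired for %s\n", pci_name(dev));
        }
        DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1234, example_quirk);
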
10037 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/setup-xen.c linux-2.6.16/arch/i386/kernel/setup-xen.c
10038 --- linux-2.6.16.orig/arch/i386/kernel/setup-xen.c      1970-01-01 01:00:00.000000000 +0100
10039 +++ linux-2.6.16/arch/i386/kernel/setup-xen.c   2006-06-26 09:51:32.000000000 +0200
10040 @@ -0,0 +1,1892 @@
10041 +/*
10042 + *  linux/arch/i386/kernel/setup.c
10043 + *
10044 + *  Copyright (C) 1995  Linus Torvalds
10045 + *
10046 + *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10047 + *
10048 + *  Memory region support
10049 + *     David Parsons <orc@pell.chi.il.us>, July-August 1999
10050 + *
10051 + *  Added E820 sanitization routine (removes overlapping memory regions);
10052 + *  Brian Moyle <bmoyle@mvista.com>, February 2001
10053 + *
10054 + * Moved CPU detection code to cpu/${cpu}.c
10055 + *    Patrick Mochel <mochel@osdl.org>, March 2002
10056 + *
10057 + *  Provisions for empty E820 memory regions (reported by certain BIOSes).
10058 + *  Alex Achenbach <xela@slit.de>, December 2002.
10059 + *
10060 + */
10061 +
10062 +/*
10063 + * This file handles the architecture-dependent parts of initialization
10064 + */
10065 +
10066 +#include <linux/config.h>
10067 +#include <linux/sched.h>
10068 +#include <linux/mm.h>
10069 +#include <linux/mmzone.h>
10070 +#include <linux/tty.h>
10071 +#include <linux/ioport.h>
10072 +#include <linux/acpi.h>
10073 +#include <linux/apm_bios.h>
10074 +#include <linux/initrd.h>
10075 +#include <linux/bootmem.h>
10076 +#include <linux/seq_file.h>
10077 +#include <linux/console.h>
10078 +#include <linux/mca.h>
10079 +#include <linux/root_dev.h>
10080 +#include <linux/highmem.h>
10081 +#include <linux/module.h>
10082 +#include <linux/efi.h>
10083 +#include <linux/init.h>
10084 +#include <linux/edd.h>
10085 +#include <linux/nodemask.h>
10086 +#include <linux/kernel.h>
10087 +#include <linux/percpu.h>
10088 +#include <linux/notifier.h>
10089 +#include <linux/kexec.h>
10090 +#include <linux/crash_dump.h>
10091 +#include <linux/dmi.h>
10092 +
10093 +#include <video/edid.h>
10094 +
10095 +#include <asm/apic.h>
10096 +#include <asm/e820.h>
10097 +#include <asm/mpspec.h>
10098 +#include <asm/setup.h>
10099 +#include <asm/arch_hooks.h>
10100 +#include <asm/sections.h>
10101 +#include <asm/io_apic.h>
10102 +#include <asm/ist.h>
10103 +#include <asm/io.h>
10104 +#include <asm/hypervisor.h>
10105 +#include <xen/interface/physdev.h>
10106 +#include <xen/interface/memory.h>
10107 +#include <xen/features.h>
10108 +#include "setup_arch_pre.h"
10109 +#include <bios_ebda.h>
10110 +
10111 +/* Forward Declaration. */
10112 +void __init find_max_pfn(void);
10113 +
10114 +/* Allows setting of maximum possible memory size  */
10115 +static unsigned long xen_override_max_pfn;
10116 +
10117 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
10118 +static struct notifier_block xen_panic_block = {
10119 +       xen_panic_event, NULL, 0 /* try to go last */
10120 +};
10121 +
10122 +extern char hypercall_page[PAGE_SIZE];
10123 +EXPORT_SYMBOL(hypercall_page);
10124 +
10125 +int disable_pse __devinitdata = 0;
10126 +
10127 +/*
10128 + * Machine setup..
10129 + */
10130 +
10131 +#ifdef CONFIG_EFI
10132 +int efi_enabled = 0;
10133 +EXPORT_SYMBOL(efi_enabled);
10134 +#endif
10135 +
10136 +/* cpu data as detected by the assembly code in head.S */
10137 +struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
10138 +/* common cpu data for all cpus */
10139 +struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
10140 +EXPORT_SYMBOL(boot_cpu_data);
10141 +
10142 +unsigned long mmu_cr4_features;
10143 +
10144 +#ifdef CONFIG_ACPI
10145 +       int acpi_disabled = 0;
10146 +#else
10147 +       int acpi_disabled = 1;
10148 +#endif
10149 +EXPORT_SYMBOL(acpi_disabled);
10150 +
10151 +#ifdef CONFIG_ACPI
10152 +int __initdata acpi_force = 0;
10153 +extern acpi_interrupt_flags    acpi_sci_flags;
10154 +#endif
10155 +
10156 +/* for MCA, but anyone else can use it if they want */
10157 +unsigned int machine_id;
10158 +#ifdef CONFIG_MCA
10159 +EXPORT_SYMBOL(machine_id);
10160 +#endif
10161 +unsigned int machine_submodel_id;
10162 +unsigned int BIOS_revision;
10163 +unsigned int mca_pentium_flag;
10164 +
10165 +/* For PCI or other memory-mapped resources */
10166 +unsigned long pci_mem_start = 0x10000000;
10167 +#ifdef CONFIG_PCI
10168 +EXPORT_SYMBOL(pci_mem_start);
10169 +#endif
10170 +
10171 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
10172 +int bootloader_type;
10173 +
10174 +/* user-defined highmem size */
10175 +static unsigned int highmem_pages = -1;
10176 +
10177 +/*
10178 + * Setup options
10179 + */
10180 +struct drive_info_struct { char dummy[32]; } drive_info;
10181 +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \
10182 +    defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
10183 +EXPORT_SYMBOL(drive_info);
10184 +#endif
10185 +struct screen_info screen_info;
10186 +EXPORT_SYMBOL(screen_info);
10187 +struct apm_info apm_info;
10188 +EXPORT_SYMBOL(apm_info);
10189 +struct sys_desc_table_struct {
10190 +       unsigned short length;
10191 +       unsigned char table[0];
10192 +};
10193 +struct edid_info edid_info;
10194 +EXPORT_SYMBOL_GPL(edid_info);
10195 +struct ist_info ist_info;
10196 +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
10197 +       defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
10198 +EXPORT_SYMBOL(ist_info);
10199 +#endif
10200 +struct e820map e820;
10201 +
10202 +extern void early_cpu_init(void);
10203 +extern void generic_apic_probe(char *);
10204 +extern int root_mountflags;
10205 +
10206 +unsigned long saved_videomode;
10207 +
10208 +#define RAMDISK_IMAGE_START_MASK       0x07FF
10209 +#define RAMDISK_PROMPT_FLAG            0x8000
10210 +#define RAMDISK_LOAD_FLAG              0x4000  
10211 +
10212 +static char command_line[COMMAND_LINE_SIZE];
10213 +
10214 +unsigned char __initdata boot_params[PARAM_SIZE];
10215 +
10216 +static struct resource data_resource = {
10217 +       .name   = "Kernel data",
10218 +       .start  = 0,
10219 +       .end    = 0,
10220 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
10221 +};
10222 +
10223 +static struct resource code_resource = {
10224 +       .name   = "Kernel code",
10225 +       .start  = 0,
10226 +       .end    = 0,
10227 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
10228 +};
10229 +
10230 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
10231 +static struct resource system_rom_resource = {
10232 +       .name   = "System ROM",
10233 +       .start  = 0xf0000,
10234 +       .end    = 0xfffff,
10235 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10236 +};
10237 +
10238 +static struct resource extension_rom_resource = {
10239 +       .name   = "Extension ROM",
10240 +       .start  = 0xe0000,
10241 +       .end    = 0xeffff,
10242 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10243 +};
10244 +
10245 +static struct resource adapter_rom_resources[] = { {
10246 +       .name   = "Adapter ROM",
10247 +       .start  = 0xc8000,
10248 +       .end    = 0,
10249 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10250 +}, {
10251 +       .name   = "Adapter ROM",
10252 +       .start  = 0,
10253 +       .end    = 0,
10254 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10255 +}, {
10256 +       .name   = "Adapter ROM",
10257 +       .start  = 0,
10258 +       .end    = 0,
10259 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10260 +}, {
10261 +       .name   = "Adapter ROM",
10262 +       .start  = 0,
10263 +       .end    = 0,
10264 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10265 +}, {
10266 +       .name   = "Adapter ROM",
10267 +       .start  = 0,
10268 +       .end    = 0,
10269 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10270 +}, {
10271 +       .name   = "Adapter ROM",
10272 +       .start  = 0,
10273 +       .end    = 0,
10274 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10275 +} };
10276 +
10277 +#define ADAPTER_ROM_RESOURCES \
10278 +       (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
10279 +
10280 +static struct resource video_rom_resource = {
10281 +       .name   = "Video ROM",
10282 +       .start  = 0xc0000,
10283 +       .end    = 0xc7fff,
10284 +       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
10285 +};
10286 +#endif
10287 +
10288 +static struct resource video_ram_resource = {
10289 +       .name   = "Video RAM area",
10290 +       .start  = 0xa0000,
10291 +       .end    = 0xbffff,
10292 +       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
10293 +};
10294 +
10295 +static struct resource standard_io_resources[] = { {
10296 +       .name   = "dma1",
10297 +       .start  = 0x0000,
10298 +       .end    = 0x001f,
10299 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10300 +}, {
10301 +       .name   = "pic1",
10302 +       .start  = 0x0020,
10303 +       .end    = 0x0021,
10304 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10305 +}, {
10306 +       .name   = "timer0",
10307 +       .start  = 0x0040,
10308 +       .end    = 0x0043,
10309 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10310 +}, {
10311 +       .name   = "timer1",
10312 +       .start  = 0x0050,
10313 +       .end    = 0x0053,
10314 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10315 +}, {
10316 +       .name   = "keyboard",
10317 +       .start  = 0x0060,
10318 +       .end    = 0x006f,
10319 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10320 +}, {
10321 +       .name   = "dma page reg",
10322 +       .start  = 0x0080,
10323 +       .end    = 0x008f,
10324 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10325 +}, {
10326 +       .name   = "pic2",
10327 +       .start  = 0x00a0,
10328 +       .end    = 0x00a1,
10329 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10330 +}, {
10331 +       .name   = "dma2",
10332 +       .start  = 0x00c0,
10333 +       .end    = 0x00df,
10334 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10335 +}, {
10336 +       .name   = "fpu",
10337 +       .start  = 0x00f0,
10338 +       .end    = 0x00ff,
10339 +       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
10340 +} };
10341 +
10342 +#define STANDARD_IO_RESOURCES \
10343 +       (sizeof standard_io_resources / sizeof standard_io_resources[0])
10344 +
10345 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
10346 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
10347 +
10348 +static int __init romchecksum(unsigned char *rom, unsigned long length)
10349 +{
10350 +       unsigned char *p, sum = 0;
10351 +
10352 +       for (p = rom; p < rom + length; p++)
10353 +               sum += *p;
10354 +       return sum == 0;
10355 +}
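
The convention romchecksum() verifies is the classic option-ROM rule: the image begins 0x55 0xaa and all of its bytes sum to zero modulo 256. A toy worked example (real ROMs are multiples of 512 bytes; this only shows the arithmetic):

        /* bytes: 0x55 0xaa 0x01 0x00
         * sum:   0x55 + 0xaa + 0x01 + 0x00 = 0x100, which truncates to 0 in
         * the u8 accumulator, so romchecksum() returns 1; romsignature()
         * separately matches the little-endian word 0xaa55 at the start. */
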
10356 +
10357 +static void __init probe_roms(void)
10358 +{
10359 +       unsigned long start, length, upper;
10360 +       unsigned char *rom;
10361 +       int           i;
10362 +
10363 +       /* Nothing to do if not running in dom0. */
10364 +       if (!(xen_start_info->flags & SIF_INITDOMAIN))
10365 +               return;
10366 +
10367 +       /* video rom */
10368 +       upper = adapter_rom_resources[0].start;
10369 +       for (start = video_rom_resource.start; start < upper; start += 2048) {
10370 +               rom = isa_bus_to_virt(start);
10371 +               if (!romsignature(rom))
10372 +                       continue;
10373 +
10374 +               video_rom_resource.start = start;
10375 +
10376 +               /* 0 < length <= 0x7f * 512, historically */
10377 +               length = rom[2] * 512;
10378 +
10379 +               /* if checksum okay, trust length byte */
10380 +               if (length && romchecksum(rom, length))
10381 +                       video_rom_resource.end = start + length - 1;
10382 +
10383 +               request_resource(&iomem_resource, &video_rom_resource);
10384 +               break;
10385 +       }
10386 +
10387 +       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
10388 +       if (start < upper)
10389 +               start = upper;
10390 +
10391 +       /* system rom */
10392 +       request_resource(&iomem_resource, &system_rom_resource);
10393 +       upper = system_rom_resource.start;
10394 +
10395 +       /* check for extension rom (ignore length byte!) */
10396 +       rom = isa_bus_to_virt(extension_rom_resource.start);
10397 +       if (romsignature(rom)) {
10398 +               length = extension_rom_resource.end - extension_rom_resource.start + 1;
10399 +               if (romchecksum(rom, length)) {
10400 +                       request_resource(&iomem_resource, &extension_rom_resource);
10401 +                       upper = extension_rom_resource.start;
10402 +               }
10403 +       }
10404 +
10405 +       /* check for adapter roms on 2k boundaries */
10406 +       for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
10407 +               rom = isa_bus_to_virt(start);
10408 +               if (!romsignature(rom))
10409 +                       continue;
10410 +
10411 +               /* 0 < length <= 0x7f * 512, historically */
10412 +               length = rom[2] * 512;
10413 +
10414 +               /* but accept any length that fits if checksum okay */
10415 +               if (!length || start + length > upper || !romchecksum(rom, length))
10416 +                       continue;
10417 +
10418 +               adapter_rom_resources[i].start = start;
10419 +               adapter_rom_resources[i].end = start + length - 1;
10420 +               request_resource(&iomem_resource, &adapter_rom_resources[i]);
10421 +
10422 +               start = adapter_rom_resources[i++].end & ~2047UL;
10423 +       }
10424 +}
10425 +#endif
10426 +
10427 +/*
10428 + * Point at the empty zero page to start with. We map the real shared_info
10429 + * page as soon as fixmap is up and running.
10430 + */
10431 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
10432 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
10433 +
10434 +unsigned long *phys_to_machine_mapping;
10435 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
10436 +EXPORT_SYMBOL(phys_to_machine_mapping);
10437 +
10438 +/* Raw start-of-day parameters from the hypervisor. */
10439 +start_info_t *xen_start_info;
10440 +EXPORT_SYMBOL(xen_start_info);
10441 +
10442 +static void __init limit_regions(unsigned long long size)
10443 +{
10444 +       unsigned long long current_addr = 0;
10445 +       int i;
10446 +
10447 +       if (efi_enabled) {
10448 +               efi_memory_desc_t *md;
10449 +               void *p;
10450 +
10451 +               for (p = memmap.map, i = 0; p < memmap.map_end;
10452 +                       p += memmap.desc_size, i++) {
10453 +                       md = p;
10454 +                       current_addr = md->phys_addr + (md->num_pages << 12);
10455 +                       if (md->type == EFI_CONVENTIONAL_MEMORY) {
10456 +                               if (current_addr >= size) {
10457 +                                       md->num_pages -=
10458 +                                               (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
10459 +                                       memmap.nr_map = i + 1;
10460 +                                       return;
10461 +                               }
10462 +                       }
10463 +               }
10464 +       }
10465 +       for (i = 0; i < e820.nr_map; i++) {
10466 +               current_addr = e820.map[i].addr + e820.map[i].size;
10467 +               if (current_addr < size)
10468 +                       continue;
10469 +
10470 +               if (e820.map[i].type != E820_RAM)
10471 +                       continue;
10472 +
10473 +               if (e820.map[i].addr >= size) {
10474 +                       /*
10475 +                        * This region starts past the end of the
10476 +                        * requested size, skip it completely.
10477 +                        */
10478 +                       e820.nr_map = i;
10479 +               } else {
10480 +                       e820.nr_map = i + 1;
10481 +                       e820.map[i].size -= current_addr - size;
10482 +               }
10483 +               return;
10484 +       }
10485 +}
10486 +
10487 +static void __init add_memory_region(unsigned long long start,
10488 +                                  unsigned long long size, int type)
10489 +{
10490 +       int x;
10491 +
10492 +       if (!efi_enabled) {
10493 +               x = e820.nr_map;
10494 +
10495 +               if (x == E820MAX) {
10496 +                       printk(KERN_ERR "Oops! Too many entries in the memory map!\n");
10497 +                       return;
10498 +               }
10499 +
10500 +               e820.map[x].addr = start;
10501 +               e820.map[x].size = size;
10502 +               e820.map[x].type = type;
10503 +               e820.nr_map++;
10504 +       }
10505 +} /* add_memory_region */
10506 +
10507 +#define E820_DEBUG     1
10508 +
10509 +static void __init print_memory_map(char *who)
10510 +{
10511 +       int i;
10512 +
10513 +       for (i = 0; i < e820.nr_map; i++) {
10514 +               printk(" %s: %016Lx - %016Lx ", who,
10515 +                       e820.map[i].addr,
10516 +                       e820.map[i].addr + e820.map[i].size);
10517 +               switch (e820.map[i].type) {
10518 +               case E820_RAM:  printk("(usable)\n");
10519 +                               break;
10520 +               case E820_RESERVED:
10521 +                               printk("(reserved)\n");
10522 +                               break;
10523 +               case E820_ACPI:
10524 +                               printk("(ACPI data)\n");
10525 +                               break;
10526 +               case E820_NVS:
10527 +                               printk("(ACPI NVS)\n");
10528 +                               break;
10529 +               default:        printk("type %lu\n", e820.map[i].type);
10530 +                               break;
10531 +               }
10532 +       }
10533 +}
10534 +
10535 +#if 0
10536 +/*
10537 + * Sanitize the BIOS e820 map.
10538 + *
10539 + * Some e820 responses include overlapping entries.  The following 
10540 + * replaces the original e820 map with a new one, removing overlaps.
10541 + *
10542 + */
10543 +struct change_member {
10544 +       struct e820entry *pbios; /* pointer to original bios entry */
10545 +       unsigned long long addr; /* address for this change point */
10546 +};
10547 +static struct change_member change_point_list[2*E820MAX] __initdata;
10548 +static struct change_member *change_point[2*E820MAX] __initdata;
10549 +static struct e820entry *overlap_list[E820MAX] __initdata;
10550 +static struct e820entry new_bios[E820MAX] __initdata;
10551 +
10552 +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
10553 +{
10554 +       struct change_member *change_tmp;
10555 +       unsigned long current_type, last_type;
10556 +       unsigned long long last_addr;
10557 +       int chgidx, still_changing;
10558 +       int overlap_entries;
10559 +       int new_bios_entry;
10560 +       int old_nr, new_nr, chg_nr;
10561 +       int i;
10562 +
10563 +       /*
10564 +               Visually we're performing the following (1,2,3,4 = memory types)...
10565 +
10566 +               Sample memory map (w/overlaps):
10567 +                  ____22__________________
10568 +                  ______________________4_
10569 +                  ____1111________________
10570 +                  _44_____________________
10571 +                  11111111________________
10572 +                  ____________________33__
10573 +                  ___________44___________
10574 +                  __________33333_________
10575 +                  ______________22________
10576 +                  ___________________2222_
10577 +                  _________111111111______
10578 +                  _____________________11_
10579 +                  _________________4______
10580 +
10581 +               Sanitized equivalent (no overlap):
10582 +                  1_______________________
10583 +                  _44_____________________
10584 +                  ___1____________________
10585 +                  ____22__________________
10586 +                  ______11________________
10587 +                  _________1______________
10588 +                  __________3_____________
10589 +                  ___________44___________
10590 +                  _____________33_________
10591 +                  _______________2________
10592 +                  ________________1_______
10593 +                  _________________4______
10594 +                  ___________________2____
10595 +                  ____________________33__
10596 +                  ______________________4_
10597 +       */
10598 +
10599 +       /* if there's only one memory region, don't bother */
10600 +       if (*pnr_map < 2)
10601 +               return -1;
10602 +
10603 +       old_nr = *pnr_map;
10604 +
10605 +       /* bail out if we find any unreasonable addresses in bios map */
10606 +       for (i=0; i<old_nr; i++)
10607 +               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
10608 +                       return -1;
10609 +
10610 +       /* create pointers for initial change-point information (for sorting) */
10611 +       for (i=0; i < 2*old_nr; i++)
10612 +               change_point[i] = &change_point_list[i];
10613 +
10614 +       /* record all known change-points (starting and ending addresses),
10615 +          omitting those that are for empty memory regions */
10616 +       chgidx = 0;
10617 +       for (i=0; i < old_nr; i++)      {
10618 +               if (biosmap[i].size != 0) {
10619 +                       change_point[chgidx]->addr = biosmap[i].addr;
10620 +                       change_point[chgidx++]->pbios = &biosmap[i];
10621 +                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
10622 +                       change_point[chgidx++]->pbios = &biosmap[i];
10623 +               }
10624 +       }
10625 +       chg_nr = chgidx;        /* true number of change-points */
10626 +
10627 +       /* sort change-point list by memory addresses (low -> high) */
10628 +       still_changing = 1;
10629 +       while (still_changing)  {
10630 +               still_changing = 0;
10631 +               for (i=1; i < chg_nr; i++)  {
10632 +                       /* if <current_addr> > <last_addr>, swap */
10633 +                       /* or, if current=<start_addr> & last=<end_addr>, swap */
10634 +                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
10635 +                               ((change_point[i]->addr == change_point[i-1]->addr) &&
10636 +                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
10637 +                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
10638 +                          )
10639 +                       {
10640 +                               change_tmp = change_point[i];
10641 +                               change_point[i] = change_point[i-1];
10642 +                               change_point[i-1] = change_tmp;
10643 +                               still_changing=1;
10644 +                       }
10645 +               }
10646 +       }
10647 +
10648 +       /* create a new bios memory map, removing overlaps */
10649 +       overlap_entries=0;       /* number of entries in the overlap table */
10650 +       new_bios_entry=0;        /* index for creating new bios map entries */
10651 +       last_type = 0;           /* start with undefined memory type */
10652 +       last_addr = 0;           /* start with 0 as last starting address */
10653 +       /* loop through change-points, determining effect on the new bios map */
10654 +       for (chgidx=0; chgidx < chg_nr; chgidx++)
10655 +       {
10656 +               /* keep track of all overlapping bios entries */
10657 +               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
10658 +               {
10659 +                       /* add map entry to overlap list (> 1 entry implies an overlap) */
10660 +                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
10661 +               }
10662 +               else
10663 +               {
10664 +                       /* remove entry from list (order independent, so swap with last) */
10665 +                       for (i=0; i<overlap_entries; i++)
10666 +                       {
10667 +                               if (overlap_list[i] == change_point[chgidx]->pbios)
10668 +                                       overlap_list[i] = overlap_list[overlap_entries-1];
10669 +                       }
10670 +                       overlap_entries--;
10671 +               }
10672 +               /* if there are overlapping entries, decide which "type" to use */
10673 +               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
10674 +               current_type = 0;
10675 +               for (i=0; i<overlap_entries; i++)
10676 +                       if (overlap_list[i]->type > current_type)
10677 +                               current_type = overlap_list[i]->type;
10678 +               /* continue building up new bios map based on this information */
10679 +               if (current_type != last_type)  {
10680 +                       if (last_type != 0)      {
10681 +                               new_bios[new_bios_entry].size =
10682 +                                       change_point[chgidx]->addr - last_addr;
10683 +                               /* move forward only if the new size was non-zero */
10684 +                               if (new_bios[new_bios_entry].size != 0)
10685 +                                       if (++new_bios_entry >= E820MAX)
10686 +                                               break;  /* no more space left for new bios entries */
10687 +                       }
10688 +                       if (current_type != 0)  {
10689 +                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
10690 +                               new_bios[new_bios_entry].type = current_type;
10691 +                               last_addr=change_point[chgidx]->addr;
10692 +                       }
10693 +                       last_type = current_type;
10694 +               }
10695 +       }
10696 +       new_nr = new_bios_entry;   /* retain count for new bios entries */
10697 +
10698 +       /* copy new bios mapping into original location */
10699 +       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
10700 +       *pnr_map = new_nr;
10701 +
10702 +       return 0;
10703 +}
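+
+/*
+ * Illustrative trace of the change-point walk (hypothetical input):
+ * entries [0x0,0x2000) type 1 and [0x1000,0x3000) type 2 yield sorted
+ * change-points 0x0, 0x1000, 0x2000, 0x3000; with "largest type wins"
+ * the walk emits [0x0,0x1000) as type 1 and [0x1000,0x3000) as type 2.
+ */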
10704 +
10705 +/*
10706 + * Copy the BIOS e820 map into a safe place.
10707 + *
10708 + * Sanity-check it while we're at it..
10709 + *
10710 + * If we're lucky and live on a modern system, the setup code
10711 + * will have given us a memory map that we can use to properly
10712 + * set up memory.  If we aren't, we'll fake a memory map.
10713 + *
10714 + * We check to see that the memory map contains at least 2 elements
10715 + * before we'll use it, because the detection code in setup.S may
10716 + * not be perfect and most every PC known to man has two memory
10717 + * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
10718 + * thinkpad 560x, for example, does not cooperate with the memory
10719 + * detection code.)
10720 + */
10721 +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
10722 +{
10723 +       /* Only one memory region (or negative)? Ignore it */
10724 +       if (nr_map < 2)
10725 +               return -1;
10726 +
10727 +       do {
10728 +               unsigned long long start = biosmap->addr;
10729 +               unsigned long long size = biosmap->size;
10730 +               unsigned long long end = start + size;
10731 +               unsigned long type = biosmap->type;
10732 +
10733 +               /* Overflow in 64 bits? Ignore the memory map. */
10734 +               if (start > end)
10735 +                       return -1;
10736 +
10737 +               /*
10738 +                * Some BIOSes claim RAM in the 640k - 1M region.
10739 +                * Not right. Fix it up.
10740 +                */
10741 +               if (type == E820_RAM) {
10742 +                       if (start < 0x100000ULL && end > 0xA0000ULL) {
10743 +                               if (start < 0xA0000ULL)
10744 +                                       add_memory_region(start, 0xA0000ULL-start, type);
10745 +                               if (end <= 0x100000ULL)
10746 +                                       continue;
10747 +                               start = 0x100000ULL;
10748 +                               size = end - start;
10749 +                       }
10750 +               }
10751 +               add_memory_region(start, size, type);
10752 +       } while (biosmap++,--nr_map);
10753 +       return 0;
10754 +}
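+
+/*
+ * Example of the 640k-1M fixup above (illustrative): a BIOS entry
+ * claiming [0x0, 0x200000) as RAM is split into [0x0, 0xA0000) RAM,
+ * the 0xA0000-0xFFFFF hole is dropped, and [0x100000, 0x200000) RAM
+ * is re-added.
+ */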
10755 +#endif
10756 +
10757 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
10758 +struct edd edd;
10759 +#ifdef CONFIG_EDD_MODULE
10760 +EXPORT_SYMBOL(edd);
10761 +#endif
10762 +/**
10763 + * copy_edd() - Copy the BIOS EDD information
10764 + *              from boot_params into a safe place.
10765 + *
10766 + */
10767 +static inline void copy_edd(void)
10768 +{
10769 +     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
10770 +     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
10771 +     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
10772 +     edd.edd_info_nr = EDD_NR;
10773 +}
10774 +#else
10775 +static inline void copy_edd(void)
10776 +{
10777 +}
10778 +#endif
10779 +
10780 +/*
10781 + * Do NOT EVER look at the BIOS memory size location.
10782 + * It does not work on many machines.
10783 + */
10784 +#define LOWMEMSIZE()   (0x9f000)
10785 +
10786 +static void __init parse_cmdline_early (char ** cmdline_p)
10787 +{
10788 +       char c = ' ', *to = command_line, *from = saved_command_line;
10789 +       int len = 0, max_cmdline;
10790 +       int userdef = 0;
10791 +
10792 +       if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
10793 +               max_cmdline = COMMAND_LINE_SIZE;
10794 +       memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
10795 +       /* Save unparsed command line copy for /proc/cmdline */
10796 +       saved_command_line[max_cmdline-1] = '\0';
10797 +
10798 +       for (;;) {
10799 +               if (c != ' ')
10800 +                       goto next_char;
10801 +               /*
10802 +                * "mem=nopentium" disables the 4MB page tables.
10803 +                * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
10804 +                * to <mem>, overriding the bios size.
10805 +                * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
10806 +                * <start> to <start>+<mem>, overriding the bios size.
10807 +                *
10808 +                * HPA tells me bootloaders need to parse mem=, so no new
10809 +                * option should be mem=  [also see Documentation/i386/boot.txt]
10810 +                */
10811 +               if (!memcmp(from, "mem=", 4)) {
10812 +                       if (to != command_line)
10813 +                               to--;
10814 +                       if (!memcmp(from+4, "nopentium", 9)) {
10815 +                               from += 9+4;
10816 +                               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
10817 +                               disable_pse = 1;
10818 +                       } else {
10819 +                               /* If the user specifies memory size, we
10820 +                                * limit the BIOS-provided memory map to
10821 +                                * that size. exactmap can be used to specify
10822 +                                * the exact map. mem=number can be used to
10823 +                                * trim the existing memory map.
10824 +                                */
10825 +                               unsigned long long mem_size;
10826 +
10827 +                               mem_size = memparse(from+4, &from);
10828 +#if 0
10829 +                               limit_regions(mem_size);
10830 +                               userdef=1;
10831 +#else
10832 +                               xen_override_max_pfn =
10833 +                                       (unsigned long)(mem_size>>PAGE_SHIFT);
10834 +#endif
10835 +                       }
10836 +               }
10837 +
10838 +               else if (!memcmp(from, "memmap=", 7)) {
10839 +                       if (to != command_line)
10840 +                               to--;
10841 +                       if (!memcmp(from+7, "exactmap", 8)) {
10842 +#ifdef CONFIG_CRASH_DUMP
10843 +                               /* If we are doing a crash dump, we
10844 +                                * still need to know the real mem
10845 +                                * size before original memory map is
10846 +                                * reset.
10847 +                                */
10848 +                               find_max_pfn();
10849 +                               saved_max_pfn = max_pfn;
10850 +#endif
10851 +                               from += 8+7;
10852 +                               e820.nr_map = 0;
10853 +                               userdef = 1;
10854 +                       } else {
10855 +                               /* If the user specifies memory size, we
10856 +                                * limit the BIOS-provided memory map to
10857 +                                * that size. exactmap can be used to specify
10858 +                                * the exact map. mem=number can be used to
10859 +                                * trim the existing memory map.
10860 +                                */
10861 +                               unsigned long long start_at, mem_size;
10862 +
10863 +                               mem_size = memparse(from+7, &from);
10864 +                               if (*from == '@') {
10865 +                                       start_at = memparse(from+1, &from);
10866 +                                       add_memory_region(start_at, mem_size, E820_RAM);
10867 +                               } else if (*from == '#') {
10868 +                                       start_at = memparse(from+1, &from);
10869 +                                       add_memory_region(start_at, mem_size, E820_ACPI);
10870 +                               } else if (*from == '$') {
10871 +                                       start_at = memparse(from+1, &from);
10872 +                                       add_memory_region(start_at, mem_size, E820_RESERVED);
10873 +                               } else {
10874 +                                       limit_regions(mem_size);
10875 +                                       userdef=1;
10876 +                               }
10877 +                       }
10878 +               }
10879 +
10880 +               else if (!memcmp(from, "noexec=", 7))
10881 +                       noexec_setup(from + 7);
10882 +
10883 +
10884 +#ifdef  CONFIG_X86_MPPARSE
10885 +               /*
10886 +                * If the BIOS enumerates physical processors before logical,
10887 +                * maxcpus=N at enumeration-time can be used to disable HT.
10888 +                */
10889 +               else if (!memcmp(from, "maxcpus=", 8)) {
10890 +                       extern unsigned int maxcpus;
10891 +
10892 +                       maxcpus = simple_strtoul(from + 8, NULL, 0);
10893 +               }
10894 +#endif
10895 +
10896 +#ifdef CONFIG_ACPI
10897 +               /* "acpi=off" disables both ACPI table parsing and interpreter */
10898 +               else if (!memcmp(from, "acpi=off", 8)) {
10899 +                       disable_acpi();
10900 +               }
10901 +
10902 +               /* acpi=force to over-ride black-list */
10903 +               else if (!memcmp(from, "acpi=force", 10)) {
10904 +                       acpi_force = 1;
10905 +                       acpi_ht = 1;
10906 +                       acpi_disabled = 0;
10907 +               }
10908 +
10909 +               /* acpi=strict disables out-of-spec workarounds */
10910 +               else if (!memcmp(from, "acpi=strict", 11)) {
10911 +                       acpi_strict = 1;
10912 +               }
10913 +
10914 +               /* Limit ACPI just to boot-time to enable HT */
10915 +               else if (!memcmp(from, "acpi=ht", 7)) {
10916 +                       if (!acpi_force)
10917 +                               disable_acpi();
10918 +                       acpi_ht = 1;
10919 +               }
10920 +               
10921 +               /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
10922 +               else if (!memcmp(from, "pci=noacpi", 10)) {
10923 +                       acpi_disable_pci();
10924 +               }
10925 +               /* "acpi=noirq" disables ACPI interrupt routing */
10926 +               else if (!memcmp(from, "acpi=noirq", 10)) {
10927 +                       acpi_noirq_set();
10928 +               }
10929 +
10930 +               else if (!memcmp(from, "acpi_sci=edge", 13))
10931 +                       acpi_sci_flags.trigger = 1;
10932 +
10933 +               else if (!memcmp(from, "acpi_sci=level", 14))
10934 +                       acpi_sci_flags.trigger = 3;
10935 +
10936 +               else if (!memcmp(from, "acpi_sci=high", 13))
10937 +                       acpi_sci_flags.polarity = 1;
10938 +
10939 +               else if (!memcmp(from, "acpi_sci=low", 12))
10940 +                       acpi_sci_flags.polarity = 3;
10941 +
10942 +#ifdef CONFIG_X86_IO_APIC
10943 +               else if (!memcmp(from, "acpi_skip_timer_override", 24))
10944 +                       acpi_skip_timer_override = 1;
10945 +
10946 +               if (!memcmp(from, "disable_timer_pin_1", 19))
10947 +                       disable_timer_pin_1 = 1;
10948 +               if (!memcmp(from, "enable_timer_pin_1", 18))
10949 +                       disable_timer_pin_1 = -1;
10950 +
10951 +               /* disable IO-APIC */
10952 +               else if (!memcmp(from, "noapic", 6))
10953 +                       disable_ioapic_setup();
10954 +#endif /* CONFIG_X86_IO_APIC */
10955 +#endif /* CONFIG_ACPI */
10956 +
10957 +#ifdef CONFIG_X86_LOCAL_APIC
10958 +               /* enable local APIC */
10959 +               else if (!memcmp(from, "lapic", 5))
10960 +                       lapic_enable();
10961 +
10962 +               /* disable local APIC */
10963 +               else if (!memcmp(from, "nolapic", 7))
10964 +                       lapic_disable();
10965 +#endif /* CONFIG_X86_LOCAL_APIC */
10966 +
10967 +#ifdef CONFIG_KEXEC
10968 +               /* crashkernel=size@addr specifies the location to reserve for
10969 +                * a crash kernel.  By reserving this memory we guarantee
10970 +                * that Linux never sets it up as a DMA target.
10971 +                * Useful for holding code to do something appropriate
10972 +                * after a kernel panic.
10973 +                */
10974 +               else if (!memcmp(from, "crashkernel=", 12)) {
10975 +                       unsigned long size, base;
10976 +                       size = memparse(from+12, &from);
10977 +                       if (*from == '@') {
10978 +                               base = memparse(from+1, &from);
10979 +                               /* FIXME: Do I want a sanity check
10980 +                                * to validate the memory range?
10981 +                                */
10982 +                               crashk_res.start = base;
10983 +                               crashk_res.end   = base + size - 1;
10984 +                       }
10985 +               }
10986 +#endif
10987 +#ifdef CONFIG_PROC_VMCORE
10988 +               /* elfcorehdr= specifies the location of elf core header
10989 +                * stored by the crashed kernel.
10990 +                */
10991 +               else if (!memcmp(from, "elfcorehdr=", 11))
10992 +                       elfcorehdr_addr = memparse(from+11, &from);
10993 +#endif
10994 +
10995 +               /*
10996 +                * highmem=size forces highmem to be exactly 'size' bytes.
10997 +                * This works even on boxes that have no highmem otherwise.
10998 +                * This also works to reduce highmem size on bigger boxes.
10999 +                */
11000 +               else if (!memcmp(from, "highmem=", 8))
11001 +                       highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
11002 +       
11003 +               /*
11004 +                * vmalloc=size forces the vmalloc area to be exactly 'size'
11005 +                * bytes. This can be used to increase (or decrease) the
11006 +                * vmalloc area - the default is 128m.
11007 +                */
11008 +               else if (!memcmp(from, "vmalloc=", 8))
11009 +                       __VMALLOC_RESERVE = memparse(from+8, &from);
11010 +
11011 +       next_char:
11012 +               c = *(from++);
11013 +               if (!c)
11014 +                       break;
11015 +               if (COMMAND_LINE_SIZE <= ++len)
11016 +                       break;
11017 +               *(to++) = c;
11018 +       }
11019 +       *to = '\0';
11020 +       *cmdline_p = command_line;
11021 +       if (userdef) {
11022 +               printk(KERN_INFO "user-defined physical RAM map:\n");
11023 +               print_memory_map("user");
11024 +       }
11025 +}
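+
+/*
+ * The parsing above leans on memparse() (lib/cmdline.c), which reads a
+ * number with an optional K/M/G suffix and advances the cursor.  A
+ * minimal sketch of the idea, as a hypothetical helper for illustration
+ * only (not part of this patch):
+ */
+#if 0
+static unsigned long long memparse_sketch(char *s, char **retptr)
+{
+       unsigned long long v = simple_strtoull(s, retptr, 0);
+
+       switch (**retptr) {
+       case 'G': case 'g': v <<= 10;   /* fall through */
+       case 'M': case 'm': v <<= 10;   /* fall through */
+       case 'K': case 'k': v <<= 10;
+               (*retptr)++;
+       default:
+               break;
+       }
+       return v;
+}
+#endif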
11026 +
11027 +#if 0 /* !XEN */
11028 +/*
11029 + * Callback for efi_memory_walk.
11030 + */
11031 +static int __init
11032 +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
11033 +{
11034 +       unsigned long *max_pfn = arg, pfn;
11035 +
11036 +       if (start < end) {
11037 +               pfn = PFN_UP(end - 1);
11038 +               if (pfn > *max_pfn)
11039 +                       *max_pfn = pfn;
11040 +       }
11041 +       return 0;
11042 +}
11043 +
11044 +static int __init
11045 +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
11046 +{
11047 +       memory_present(0, start, end);
11048 +       return 0;
11049 +}
11050 +
11051 +/*
11052 + * Find the highest page frame number we have available
11053 + */
11054 +void __init find_max_pfn(void)
11055 +{
11056 +       int i;
11057 +
11058 +       max_pfn = 0;
11059 +       if (efi_enabled) {
11060 +               efi_memmap_walk(efi_find_max_pfn, &max_pfn);
11061 +               efi_memmap_walk(efi_memory_present_wrapper, NULL);
11062 +               return;
11063 +       }
11064 +
11065 +       for (i = 0; i < e820.nr_map; i++) {
11066 +               unsigned long start, end;
11067 +               /* RAM? */
11068 +               if (e820.map[i].type != E820_RAM)
11069 +                       continue;
11070 +               start = PFN_UP(e820.map[i].addr);
11071 +               end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
11072 +               if (start >= end)
11073 +                       continue;
11074 +               if (end > max_pfn)
11075 +                       max_pfn = end;
11076 +               memory_present(0, start, end);
11077 +       }
11078 +}
11079 +#else
11080 +/* We don't use the fake e820 because we need to respond to user override. */
11081 +void __init find_max_pfn(void)
11082 +{
11083 +       if (xen_override_max_pfn == 0) {
11084 +               max_pfn = xen_start_info->nr_pages;
11085 +               /* Default 8MB slack (to balance backend allocations). */
11086 +               max_pfn += 8 << (20 - PAGE_SHIFT);
11087 +       } else if (xen_override_max_pfn > xen_start_info->nr_pages) {
11088 +               max_pfn = xen_override_max_pfn;
11089 +       } else {
11090 +               max_pfn = xen_start_info->nr_pages;
11091 +       }
11092 +}
11093 +#endif /* XEN */
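+
+/*
+ * Sizing note (illustrative): the slack added above is
+ * 8 << (20 - PAGE_SHIFT) frames; with 4k pages (PAGE_SHIFT == 12) that
+ * is 8 << 8 = 2048 pfns, i.e. exactly 8MB of headroom.
+ */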
11094 +
11095 +/*
11096 + * Determine low and high memory ranges:
11097 + */
11098 +unsigned long __init find_max_low_pfn(void)
11099 +{
11100 +       unsigned long max_low_pfn;
11101 +
11102 +       max_low_pfn = max_pfn;
11103 +       if (max_low_pfn > MAXMEM_PFN) {
11104 +               if (highmem_pages == -1)
11105 +                       highmem_pages = max_pfn - MAXMEM_PFN;
11106 +               if (highmem_pages + MAXMEM_PFN < max_pfn)
11107 +                       max_pfn = MAXMEM_PFN + highmem_pages;
11108 +               if (highmem_pages + MAXMEM_PFN > max_pfn) {
11109 +                       printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages));
11110 +                       highmem_pages = 0;
11111 +               }
11112 +               max_low_pfn = MAXMEM_PFN;
11113 +#ifndef CONFIG_HIGHMEM
11114 +               /* Maximum memory usable is what is directly addressable */
11115 +               printk(KERN_WARNING "Warning: only %ldMB will be used.\n",
11116 +                                       MAXMEM>>20);
11117 +               if (max_pfn > MAX_NONPAE_PFN)
11118 +                       printk(KERN_WARNING "Use a PAE enabled kernel.\n");
11119 +               else
11120 +                       printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
11121 +               max_pfn = MAXMEM_PFN;
11122 +#else /* !CONFIG_HIGHMEM */
11123 +#ifndef CONFIG_X86_PAE
11124 +               if (max_pfn > MAX_NONPAE_PFN) {
11125 +                       max_pfn = MAX_NONPAE_PFN;
11126 +                       printk(KERN_WARNING "Warning: only 4GB will be used.\n");
11127 +                       printk(KERN_WARNING "Use a PAE enabled kernel.\n");
11128 +               }
11129 +#endif /* !CONFIG_X86_PAE */
11130 +#endif /* !CONFIG_HIGHMEM */
11131 +       } else {
11132 +               if (highmem_pages == -1)
11133 +                       highmem_pages = 0;
11134 +#ifdef CONFIG_HIGHMEM
11135 +               if (highmem_pages >= max_pfn) {
11136 +                       printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
11137 +                       highmem_pages = 0;
11138 +               }
11139 +               if (highmem_pages) {
11140 +                       if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
11141 +                               printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages));
11142 +                               highmem_pages = 0;
11143 +                       }
11144 +                       max_low_pfn -= highmem_pages;
11145 +               }
11146 +#else
11147 +               if (highmem_pages)
11148 +                       printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
11149 +#endif
11150 +       }
11151 +       return max_low_pfn;
11152 +}
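+
+/*
+ * Worked example (illustrative, 4k pages): for 1GB of RAM,
+ * max_pfn = 0x40000; with MAXMEM_PFN at roughly 896MB worth of pfns,
+ * highmem_pages defaults to max_pfn - MAXMEM_PFN (~128MB of pages) and
+ * max_low_pfn is clamped to MAXMEM_PFN: the classic 896MB lowmem split.
+ */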
11153 +
11154 +/*
11155 + * Free all available memory for boot time allocation.  Used
11156 + * as a callback function by efi_memory_walk()
11157 + */
11158 +
11159 +static int __init
11160 +free_available_memory(unsigned long start, unsigned long end, void *arg)
11161 +{
11162 +       /* check max_low_pfn */
11163 +       if (start >= ((max_low_pfn + 1) << PAGE_SHIFT))
11164 +               return 0;
11165 +       if (end >= ((max_low_pfn + 1) << PAGE_SHIFT))
11166 +               end = (max_low_pfn + 1) << PAGE_SHIFT;
11167 +       if (start < end)
11168 +               free_bootmem(start, end - start);
11169 +
11170 +       return 0;
11171 +}
11172 +/*
11173 + * Register fully available low RAM pages with the bootmem allocator.
11174 + */
11175 +static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
11176 +{
11177 +       int i;
11178 +
11179 +       if (efi_enabled) {
11180 +               efi_memmap_walk(free_available_memory, NULL);
11181 +               return;
11182 +       }
11183 +       for (i = 0; i < e820.nr_map; i++) {
11184 +               unsigned long curr_pfn, last_pfn, size;
11185 +               /*
11186 +                * Register usable low memory
11187 +                */
11188 +               if (e820.map[i].type != E820_RAM)
11189 +                       continue;
11190 +               /*
11191 +                * We are rounding up the start address of usable memory:
11192 +                */
11193 +               curr_pfn = PFN_UP(e820.map[i].addr);
11194 +               if (curr_pfn >= max_low_pfn)
11195 +                       continue;
11196 +               /*
11197 +                * ... and at the end of the usable range downwards:
11198 +                */
11199 +               last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
11200 +
11201 +               if (last_pfn > max_low_pfn)
11202 +                       last_pfn = max_low_pfn;
11203 +
11204 +               /*
11205 +                * .. finally, did all the rounding and playing
11206 +                * around just make the area go away?
11207 +                */
11208 +               if (last_pfn <= curr_pfn)
11209 +                       continue;
11210 +
11211 +               size = last_pfn - curr_pfn;
11212 +               free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
11213 +       }
11214 +}
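+
+/*
+ * The PFN_UP/PFN_DOWN pair rounds toward the interior of each region,
+ * so partially covered pages never reach bootmem.  Assuming the usual
+ * definitions,
+ *   PFN_UP(x)   == (x + PAGE_SIZE - 1) >> PAGE_SHIFT
+ *   PFN_DOWN(x) == (x) >> PAGE_SHIFT
+ * an entry [0x9fc00, 0x100000) yields curr_pfn = 0xa0 and
+ * last_pfn = 0x100 (illustrative figures, 4k pages).
+ */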
11215 +
11216 +#ifndef CONFIG_XEN
11217 +/*
11218 + * workaround for Dell systems that neglect to reserve EBDA
11219 + */
11220 +static void __init reserve_ebda_region(void)
11221 +{
11222 +       unsigned int addr;
11223 +       addr = get_bios_ebda();
11224 +       if (addr)
11225 +               reserve_bootmem(addr, PAGE_SIZE);       
11226 +}
11227 +#endif
11228 +
11229 +#ifndef CONFIG_NEED_MULTIPLE_NODES
11230 +void __init setup_bootmem_allocator(void);
11231 +static unsigned long __init setup_memory(void)
11232 +{
11233 +       /*
11234 +        * partially used pages are not usable - thus
11235 +        * we are rounding upwards:
11236 +        */
11237 +       min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) +
11238 +               xen_start_info->nr_pt_frames;
11239 +
11240 +       find_max_pfn();
11241 +
11242 +       max_low_pfn = find_max_low_pfn();
11243 +
11244 +#ifdef CONFIG_HIGHMEM
11245 +       highstart_pfn = highend_pfn = max_pfn;
11246 +       if (max_pfn > max_low_pfn) {
11247 +               highstart_pfn = max_low_pfn;
11248 +       }
11249 +       printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
11250 +               pages_to_mb(highend_pfn - highstart_pfn));
11251 +#endif
11252 +       printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
11253 +                       pages_to_mb(max_low_pfn));
11254 +
11255 +       setup_bootmem_allocator();
11256 +
11257 +       return max_low_pfn;
11258 +}
11259 +
11260 +void __init zone_sizes_init(void)
11261 +{
11262 +       unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
11263 +       unsigned int max_dma, low;
11264 +
11265 +       /*
11266 +        * XEN: Our notion of "DMA memory" is fake when running over Xen.
11267 +        * We simply put all RAM in the DMA zone so that those drivers which
11268 +        * needlessly specify GFP_DMA do not get starved of RAM unnecessarily.
11269 +        * Those drivers that *do* require lowmem are screwed anyway when
11270 +        * running over Xen!
11271 +        */
11272 +       max_dma = max_low_pfn;
11273 +       low = max_low_pfn;
11274 +
11275 +       if (low < max_dma)
11276 +               zones_size[ZONE_DMA] = low;
11277 +       else {
11278 +               zones_size[ZONE_DMA] = max_dma;
11279 +               zones_size[ZONE_NORMAL] = low - max_dma;
11280 +#ifdef CONFIG_HIGHMEM
11281 +               zones_size[ZONE_HIGHMEM] = highend_pfn - low;
11282 +#endif
11283 +       }
11284 +       free_area_init(zones_size);
11285 +}
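+
+/*
+ * Note (illustrative): since max_dma == low above, the else branch is
+ * taken, ZONE_DMA covers all of lowmem and ZONE_NORMAL stays empty --
+ * exactly the "everything is DMA-able under Xen" policy described in
+ * the comment.
+ */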
11286 +#else
11287 +extern unsigned long __init setup_memory(void);
11288 +extern void zone_sizes_init(void);
11289 +#endif /* !CONFIG_NEED_MULTIPLE_NODES */
11290 +
11291 +void __init setup_bootmem_allocator(void)
11292 +{
11293 +       unsigned long bootmap_size;
11294 +       /*
11295 +        * Initialize the boot-time allocator (with low memory only):
11296 +        */
11297 +       bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
11298 +
11299 +       register_bootmem_low_pages(max_low_pfn);
11300 +
11301 +       /*
11302 +        * Reserve the bootmem bitmap itself as well. We do this in two
11303 +        * steps (first step was init_bootmem()) because this catches
11304 +        * the (very unlikely) case of us accidentally initializing the
11305 +        * bootmem allocator with an invalid RAM area.
11306 +        */
11307 +       reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
11308 +                        bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
11309 +
11310 +#ifndef CONFIG_XEN
11311 +       /*
11312 +        * reserve physical page 0 - it's a special BIOS page on many boxes,
11313 +        * enabling clean reboots, SMP operation, laptop functions.
11314 +        */
11315 +       reserve_bootmem(0, PAGE_SIZE);
11316 +
11317 +       /* reserve EBDA region, it's a 4K region */
11318 +       reserve_ebda_region();
11319 +
11320 +       /* Could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
11321 +          PCI prefetch into it (errata #56). Usually the page is reserved anyway,
11322 +          unless you have no PS/2 mouse plugged in. */
11323 +       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
11324 +           boot_cpu_data.x86 == 6)
11325 +            reserve_bootmem(0xa0000 - 4096, 4096);
11326 +
11327 +#ifdef CONFIG_SMP
11328 +       /*
11329 +        * But first pinch a few for the stack/trampoline stuff
11330 +        * FIXME: Don't need the extra page at 4K, but need to fix
11331 +        * trampoline before removing it. (see the GDT stuff)
11332 +        */
11333 +       reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
11334 +#endif
11335 +#ifdef CONFIG_ACPI_SLEEP
11336 +       /*
11337 +        * Reserve low memory region for sleep support.
11338 +        */
11339 +       acpi_reserve_bootmem();
11340 +#endif
11341 +#endif /* !CONFIG_XEN */
11342 +
11343 +#ifdef CONFIG_BLK_DEV_INITRD
11344 +       if (xen_start_info->mod_start) {
11345 +               if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
11346 +                       /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
11347 +                       initrd_start = INITRD_START + PAGE_OFFSET;
11348 +                       initrd_end = initrd_start+INITRD_SIZE;
11349 +                       initrd_below_start_ok = 1;
11350 +               }
11351 +               else {
11352 +                       printk(KERN_ERR "initrd extends beyond end of memory "
11353 +                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
11354 +                           INITRD_START + INITRD_SIZE,
11355 +                           max_low_pfn << PAGE_SHIFT);
11356 +                       initrd_start = 0;
11357 +               }
11358 +       }
11359 +#endif
11360 +
11361 +       if (!xen_feature(XENFEAT_auto_translated_physmap))
11362 +               phys_to_machine_mapping =
11363 +                       (unsigned long *)xen_start_info->mfn_list;
11364 +}
11365 +
11366 +/*
11367 + * The node 0 pgdat is initialized before all of these because
11368 + * it's needed for bootmem.  node>0 pgdats have their virtual
11369 + * space allocated before the pagetables are in place to access
11370 + * them, so they can't be cleared then.
11371 + *
11372 + * This should all compile down to nothing when NUMA is off.
11373 + */
11374 +void __init remapped_pgdat_init(void)
11375 +{
11376 +       int nid;
11377 +
11378 +       for_each_online_node(nid) {
11379 +               if (nid != 0)
11380 +                       memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
11381 +       }
11382 +}
11383 +
11384 +/*
11385 + * Request address space for all standard RAM and ROM resources
11386 + * and also for regions reported as reserved by the e820.
11387 + */
11388 +static void __init
11389 +legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
11390 +{
11391 +       int i;
11392 +#ifdef CONFIG_XEN
11393 +       dom0_op_t op;
11394 +       struct dom0_memory_map_entry *map;
11395 +       unsigned long gapstart, gapsize;
11396 +       unsigned long long last;
11397 +#endif
11398 +
11399 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
11400 +       probe_roms();
11401 +#endif
11402 +
11403 +#ifdef CONFIG_XEN
11404 +       map = alloc_bootmem_low_pages(PAGE_SIZE);
11405 +       op.cmd = DOM0_PHYSICAL_MEMORY_MAP;
11406 +       op.u.physical_memory_map.memory_map = map;
11407 +       op.u.physical_memory_map.max_map_entries =
11408 +               PAGE_SIZE / sizeof(struct dom0_memory_map_entry);
11409 +       BUG_ON(HYPERVISOR_dom0_op(&op));
11410 +
11411 +       last = 0x100000000ULL;
11412 +       gapstart = 0x10000000;
11413 +       gapsize = 0x400000;
11414 +
11415 +       for (i = op.u.physical_memory_map.nr_map_entries - 1; i >= 0; i--) {
11416 +               struct resource *res;
11417 +
11418 +               if ((last > map[i].end) && ((last - map[i].end) > gapsize)) {
11419 +                       gapsize = last - map[i].end;
11420 +                       gapstart = map[i].end;
11421 +               }
11422 +               if (map[i].start < last)
11423 +                       last = map[i].start;
11424 +
11425 +               if (map[i].end > 0x100000000ULL)
11426 +                       continue;
11427 +               res = alloc_bootmem_low(sizeof(struct resource));
11428 +               res->name = map[i].is_ram ? "System RAM" : "reserved";
11429 +               res->start = map[i].start;
11430 +               res->end = map[i].end - 1;
11431 +               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
11432 +               request_resource(&iomem_resource, res);
11433 +       }
11434 +
11435 +       free_bootmem(__pa(map), PAGE_SIZE);
11436 +
11437 +       /*
11438 +        * Start allocating dynamic PCI memory a bit into the gap,
11439 +        * aligned up to the nearest megabyte.
11440 +        *
11441 +        * Question: should we try to pad it up a bit (do something
11442 +        * like " + (gapsize >> 3)" in there too?). We now have the
11443 +        * technology.
11444 +        */
11445 +       pci_mem_start = (gapstart + 0xfffff) & ~0xfffff;
11446 +
11447 +       printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
11448 +               pci_mem_start, gapstart, gapsize);
11449 +#else
11450 +       for (i = 0; i < e820.nr_map; i++) {
11451 +               struct resource *res;
11452 +               if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
11453 +                       continue;
11454 +               res = alloc_bootmem_low(sizeof(struct resource));
11455 +               switch (e820.map[i].type) {
11456 +               case E820_RAM:  res->name = "System RAM"; break;
11457 +               case E820_ACPI: res->name = "ACPI Tables"; break;
11458 +               case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
11459 +               default:        res->name = "reserved";
11460 +               }
11461 +               res->start = e820.map[i].addr;
11462 +               res->end = res->start + e820.map[i].size - 1;
11463 +               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
11464 +               request_resource(&iomem_resource, res);
11465 +               if (e820.map[i].type == E820_RAM) {
11466 +                       /*
11467 +                        *  We don't know which RAM region contains kernel data,
11468 +                        *  so we try it repeatedly and let the resource manager
11469 +                        *  test it.
11470 +                        */
11471 +                       request_resource(res, code_resource);
11472 +                       request_resource(res, data_resource);
11473 +#ifdef CONFIG_KEXEC
11474 +                       request_resource(res, &crashk_res);
11475 +#endif
11476 +               }
11477 +       }
11478 +#endif
11479 +#ifdef CONFIG_KEXEC
11480 +       if (crashk_res.start != crashk_res.end)
11481 +               reserve_bootmem(crashk_res.start,
11482 +                       crashk_res.end - crashk_res.start + 1);
11483 +#endif
11484 +}
11485 +
11486 +/*
11487 + * Request address space for all standard resources
11488 + */
11489 +static void __init register_memory(void)
11490 +{
11491 +#ifndef CONFIG_XEN
11492 +       unsigned long gapstart, gapsize, round;
11493 +       unsigned long long last;
11494 +#endif
11495 +       int           i;
11496 +
11497 +       /* Nothing to do if not running in dom0. */
11498 +       if (!(xen_start_info->flags & SIF_INITDOMAIN))
11499 +               return;
11500 +
11501 +       if (efi_enabled)
11502 +               efi_initialize_iomem_resources(&code_resource, &data_resource);
11503 +       else
11504 +               legacy_init_iomem_resources(&code_resource, &data_resource);
11505 +
11506 +       /* EFI systems may still have VGA */
11507 +       request_resource(&iomem_resource, &video_ram_resource);
11508 +
11509 +       /* request I/O space for devices used on all i[345]86 PCs */
11510 +       for (i = 0; i < STANDARD_IO_RESOURCES; i++)
11511 +               request_resource(&ioport_resource, &standard_io_resources[i]);
11512 +
11513 +#ifndef CONFIG_XEN
11514 +       /*
11515 +        * Search for the biggest gap in the low 32 bits of the e820
11516 +        * memory space.
11517 +        */
11518 +       last = 0x100000000ull;
11519 +       gapstart = 0x10000000;
11520 +       gapsize = 0x400000;
11521 +       i = e820.nr_map;
11522 +       while (--i >= 0) {
11523 +               unsigned long long start = e820.map[i].addr;
11524 +               unsigned long long end = start + e820.map[i].size;
11525 +
11526 +               /*
11527 +                * Since "last" is at most 4GB, we know we'll
11528 +                * fit in 32 bits if this condition is true
11529 +                */
11530 +               if (last > end) {
11531 +                       unsigned long gap = last - end;
11532 +
11533 +                       if (gap > gapsize) {
11534 +                               gapsize = gap;
11535 +                               gapstart = end;
11536 +                       }
11537 +               }
11538 +               if (start < last)
11539 +                       last = start;
11540 +       }
11541 +
11542 +       /*
11543 +        * See how much we want to round up: start off with
11544 +        * rounding to the next 1MB area.
11545 +        */
11546 +       round = 0x100000;
11547 +       while ((gapsize >> 4) > round)
11548 +               round += round;
11549 +       /* Fun with two's complement */
11550 +       pci_mem_start = (gapstart + round) & -round;
11551 +
11552 +       printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
11553 +               pci_mem_start, gapstart, gapsize);
11554 +#endif
11555 +}
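+
+/*
+ * "Fun with two's complement": -round == ~round + 1, so
+ * (gapstart + round) & -round rounds up to the next multiple of round.
+ * E.g. (illustrative) gapstart = 0x1fa00000, round = 0x100000 gives
+ * pci_mem_start = 0x1fb00000, a little way into the gap as intended.
+ */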
11556 +
11557 +/* Use inline assembly to define this because the nops are defined 
11558 +   as inline assembly strings in the include files and we cannot 
11559 +   get them easily into strings. */
11560 +asm("\t.data\nintelnops: " 
11561 +    GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
11562 +    GENERIC_NOP7 GENERIC_NOP8); 
11563 +asm("\t.data\nk8nops: " 
11564 +    K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
11565 +    K8_NOP7 K8_NOP8); 
11566 +asm("\t.data\nk7nops: " 
11567 +    K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
11568 +    K7_NOP7 K7_NOP8); 
11569 +    
11570 +extern unsigned char intelnops[], k8nops[], k7nops[];
11571 +static unsigned char *intel_nops[ASM_NOP_MAX+1] = { 
11572 +     NULL,
11573 +     intelnops,
11574 +     intelnops + 1,
11575 +     intelnops + 1 + 2,
11576 +     intelnops + 1 + 2 + 3,
11577 +     intelnops + 1 + 2 + 3 + 4,
11578 +     intelnops + 1 + 2 + 3 + 4 + 5,
11579 +     intelnops + 1 + 2 + 3 + 4 + 5 + 6,
11580 +     intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
11581 +}; 
11582 +static unsigned char *k8_nops[ASM_NOP_MAX+1] = { 
11583 +     NULL,
11584 +     k8nops,
11585 +     k8nops + 1,
11586 +     k8nops + 1 + 2,
11587 +     k8nops + 1 + 2 + 3,
11588 +     k8nops + 1 + 2 + 3 + 4,
11589 +     k8nops + 1 + 2 + 3 + 4 + 5,
11590 +     k8nops + 1 + 2 + 3 + 4 + 5 + 6,
11591 +     k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
11592 +}; 
11593 +static unsigned char *k7_nops[ASM_NOP_MAX+1] = { 
11594 +     NULL,
11595 +     k7nops,
11596 +     k7nops + 1,
11597 +     k7nops + 1 + 2,
11598 +     k7nops + 1 + 2 + 3,
11599 +     k7nops + 1 + 2 + 3 + 4,
11600 +     k7nops + 1 + 2 + 3 + 4 + 5,
11601 +     k7nops + 1 + 2 + 3 + 4 + 5 + 6,
11602 +     k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
11603 +}; 
11604 +static struct nop { 
11605 +     int cpuid; 
11606 +     unsigned char **noptable; 
11607 +} noptypes[] = { 
11608 +     { X86_FEATURE_K8, k8_nops }, 
11609 +     { X86_FEATURE_K7, k7_nops }, 
11610 +     { -1, NULL }
11611 +}; 
11612 +
11613 +/* Replace instructions with better alternatives for this CPU type.
11614 +
11615 +   This runs before SMP is initialized to avoid SMP problems with
11616 +   self-modifying code. This implies that asymmetric systems where
11617 +   APs have fewer capabilities than the boot processor are not
11618 +   handled. Tough. Make sure you disable such features by hand. */
11619 +void apply_alternatives(void *start, void *end) 
11620 +{ 
11621 +       struct alt_instr *a; 
11622 +       int diff, i, k;
11623 +       unsigned char **noptable = intel_nops;
11624 +       for (i = 0; noptypes[i].cpuid >= 0; i++) { 
11625 +               if (boot_cpu_has(noptypes[i].cpuid)) { 
11626 +                       noptable = noptypes[i].noptable;
11627 +                       break;
11628 +               }
11629 +       } 
11630 +       for (a = start; (void *)a < end; a++) { 
11631 +               if (!boot_cpu_has(a->cpuid))
11632 +                       continue;
11633 +               BUG_ON(a->replacementlen > a->instrlen); 
11634 +               memcpy(a->instr, a->replacement, a->replacementlen); 
11635 +               diff = a->instrlen - a->replacementlen; 
11636 +               /* Pad the rest with nops */
11637 +               for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
11638 +                       k = diff;
11639 +                       if (k > ASM_NOP_MAX)
11640 +                               k = ASM_NOP_MAX;
11641 +                       memcpy(a->instr + i, noptable[k], k); 
11642 +               } 
11643 +       }
11644 +} 
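+
+/*
+ * Padding example (illustrative): a 9-byte hole with ASM_NOP_MAX == 8
+ * is filled with one 8-byte sequence from noptable[8] plus one 1-byte
+ * nop from noptable[1]; the loop never copies more than ASM_NOP_MAX
+ * bytes per iteration.
+ */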
11645 +
11646 +void __init alternative_instructions(void)
11647 +{
11648 +       extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
11649 +       apply_alternatives(__alt_instructions, __alt_instructions_end);
11650 +}
11651 +
11652 +static char * __init machine_specific_memory_setup(void);
11653 +
11654 +#ifdef CONFIG_MCA
11655 +static void set_mca_bus(int x)
11656 +{
11657 +       MCA_bus = x;
11658 +}
11659 +#else
11660 +static void set_mca_bus(int x) { }
11661 +#endif
11662 +
11663 +/*
11664 + * Determine if we were loaded by an EFI loader.  If so, then we have also been
11665 + * passed the efi memmap, systab, etc., so we should use these data structures
11666 + * for initialization.  Note, the efi init code path is determined by the
11667 + * global efi_enabled. This allows the same kernel image to be used on existing
11668 + * systems (with a traditional BIOS) as well as on EFI systems.
11669 + */
11670 +void __init setup_arch(char **cmdline_p)
11671 +{
11672 +       int i, j, k, fpp;
11673 +       physdev_op_t op;
11674 +       unsigned long max_low_pfn;
11675 +
11676 +       /* Force a quick death if the kernel panics (not domain 0). */
11677 +       extern int panic_timeout;
11678 +       if (!panic_timeout && !(xen_start_info->flags & SIF_INITDOMAIN))
11679 +               panic_timeout = 1;
11680 +
11681 +       /* Register a call for panic conditions. */
11682 +       notifier_chain_register(&panic_notifier_list, &xen_panic_block);
11683 +
11684 +       HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
11685 +       HYPERVISOR_vm_assist(VMASST_CMD_enable,
11686 +                            VMASST_TYPE_writable_pagetables);
11687 +
11688 +       memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
11689 +       early_cpu_init();
11690 +
11691 +       /*
11692 +        * FIXME: This isn't an official loader_type right
11693 +        * now but does currently work with elilo.
11694 +        * If we were configured as an EFI kernel, check to make
11695 +        * sure that we were loaded correctly from elilo and that
11696 +        * the system table is valid.  If not, then initialize normally.
11697 +        */
11698 +#ifdef CONFIG_EFI
11699 +       if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
11700 +               efi_enabled = 1;
11701 +#endif
11702 +
11703 +       /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
11704 +          properly.  Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
11705 +       */
11706 +       ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
11707 +       drive_info = DRIVE_INFO;
11708 +       screen_info = SCREEN_INFO;
11709 +       edid_info = EDID_INFO;
11710 +       apm_info.bios = APM_BIOS_INFO;
11711 +       ist_info = IST_INFO;
11712 +       saved_videomode = VIDEO_MODE;
11713 +       if (SYS_DESC_TABLE.length != 0) {
11714 +               set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
11715 +               machine_id = SYS_DESC_TABLE.table[0];
11716 +               machine_submodel_id = SYS_DESC_TABLE.table[1];
11717 +               BIOS_revision = SYS_DESC_TABLE.table[2];
11718 +       }
11719 +       bootloader_type = LOADER_TYPE;
11720 +
11721 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
11722 +               /* This is drawn from a dump from vgacon:startup in
11723 +                * standard Linux. */
11724 +               screen_info.orig_video_mode = 3; 
11725 +               screen_info.orig_video_isVGA = 1;
11726 +               screen_info.orig_video_lines = 25;
11727 +               screen_info.orig_video_cols = 80;
11728 +               screen_info.orig_video_ega_bx = 3;
11729 +               screen_info.orig_video_points = 16;
11730 +       } else
11731 +               screen_info.orig_video_isVGA = 0;
11732 +
11733 +#ifdef CONFIG_BLK_DEV_RAM
11734 +       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
11735 +       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
11736 +       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
11737 +#endif
11738 +
11739 +       setup_xen_features();
11740 +
11741 +       ARCH_SETUP
11742 +       if (efi_enabled)
11743 +               efi_init();
11744 +       else {
11745 +               printk(KERN_INFO "BIOS-provided physical RAM map:\n");
11746 +               print_memory_map(machine_specific_memory_setup());
11747 +       }
11748 +
11749 +       copy_edd();
11750 +
11751 +       if (!MOUNT_ROOT_RDONLY)
11752 +               root_mountflags &= ~MS_RDONLY;
11753 +       init_mm.start_code = (unsigned long) _text;
11754 +       init_mm.end_code = (unsigned long) _etext;
11755 +       init_mm.end_data = (unsigned long) _edata;
11756 +       init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) +
11757 +                      xen_start_info->nr_pt_frames) << PAGE_SHIFT;
11758 +
11759 +       /* XEN: This is nonsense: kernel may not even be contiguous in RAM. */
11760 +       /*code_resource.start = virt_to_phys(_text);*/
11761 +       /*code_resource.end = virt_to_phys(_etext)-1;*/
11762 +       /*data_resource.start = virt_to_phys(_etext);*/
11763 +       /*data_resource.end = virt_to_phys(_edata)-1;*/
11764 +
11765 +       parse_cmdline_early(cmdline_p);
11766 +
11767 +       max_low_pfn = setup_memory();
11768 +
11769 +       /*
11770 +        * NOTE: before this point _nobody_ is allowed to allocate
11771 +        * any memory using the bootmem allocator.  Although the
11772 +        * allocator is now initialised, only the first 8MB of the kernel
11773 +        * virtual address space has been mapped.  All allocations before
11774 +        * paging_init() has completed must use the alloc_bootmem_low_pages()
11775 +        * variant (which allocates DMA'able memory), and care must be taken
11776 +        * not to exceed the 8MB limit.
11777 +        */
11778 +
11779 +#ifdef CONFIG_SMP
11780 +       smp_alloc_memory(); /* AP processor realmode stacks in low memory */
11781 +#endif
11782 +       paging_init();
11783 +       remapped_pgdat_init();
11784 +       sparse_init();
11785 +       zone_sizes_init();
11786 +
11787 +#ifdef CONFIG_X86_FIND_SMP_CONFIG
11788 +       /*
11789 +        * Find and reserve possible boot-time SMP configuration:
11790 +        */
11791 +       find_smp_config();
11792 +#endif
11793 +
11794 +       /* Make sure we have a correctly sized P->M table. */
11795 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
11796 +               phys_to_machine_mapping = alloc_bootmem_low_pages(
11797 +                    max_pfn * sizeof(unsigned long));
11798 +               memset(phys_to_machine_mapping, ~0,
11799 +                      max_pfn * sizeof(unsigned long));
11800 +               memcpy(phys_to_machine_mapping,
11801 +                      (unsigned long *)xen_start_info->mfn_list,
11802 +                      xen_start_info->nr_pages * sizeof(unsigned long));
11803 +               free_bootmem(
11804 +                    __pa(xen_start_info->mfn_list),
11805 +                    PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
11806 +                                    sizeof(unsigned long))));
11807 +
11808 +               /*
11809 +                * Initialise the list of frames that holds the list of
11810 +                * frames making up the p2m table. Used by save/restore.
11811 +                */
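+               /*
+                * Layout sketch: the single frame_list_list page holds up
+                * to 16 entries; each names a pfn_to_mfn_frame_list page
+                * of fpp (PAGE_SIZE/sizeof(unsigned long), 1024 here)
+                * entries, and each of those names one page of the p2m
+                * array covering fpp PFNs -- so the BUG_ON(k>=16) below
+                * bounds max_pfn at roughly 16*1024*1024 pages (64GB).
+                */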
11812 +               pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE);
11813 +               HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
11814 +                    virt_to_mfn(pfn_to_mfn_frame_list_list);
11815 +
11816 +               fpp = PAGE_SIZE/sizeof(unsigned long);
11817 +               for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) {
11818 +                       if ((j % fpp) == 0) {
11819 +                               k++;
11820 +                               BUG_ON(k>=16);
11821 +                               pfn_to_mfn_frame_list[k] =
11822 +                                       alloc_bootmem_low_pages(PAGE_SIZE);
11823 +                               pfn_to_mfn_frame_list_list[k] =
11824 +                                       virt_to_mfn(pfn_to_mfn_frame_list[k]);
11825 +                               j=0;
11826 +                       }
11827 +                       pfn_to_mfn_frame_list[k][j] =
11828 +                               virt_to_mfn(&phys_to_machine_mapping[i]);
11829 +               }
11830 +               HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
11831 +       }
11832 +
11833 +       /*
11834 +        * NOTE: at this point the bootmem allocator is fully available.
11835 +        */
11836 +
11837 +#ifdef CONFIG_EARLY_PRINTK
11838 +       {
11839 +               char *s = strstr(*cmdline_p, "earlyprintk=");
11840 +               if (s) {
11841 +                       extern void setup_early_printk(char *);
11842 +
11843 +                       setup_early_printk(strchr(s, '=') + 1);
11844 +                       printk("early console enabled\n");
11845 +               }
11846 +       }
11847 +#endif
11848 +
11849 +       if (xen_start_info->flags & SIF_INITDOMAIN)
11850 +               dmi_scan_machine();
11851 +
11852 +#ifdef CONFIG_X86_GENERICARCH
11853 +       generic_apic_probe(*cmdline_p);
11854 +#endif 
11855 +       if (efi_enabled)
11856 +               efi_map_memmap();
11857 +
11858 +       op.cmd             = PHYSDEVOP_SET_IOPL;
11859 +       op.u.set_iopl.iopl = 1;
11860 +       HYPERVISOR_physdev_op(&op);
11861 +
11862 +#ifdef CONFIG_X86_IO_APIC
11863 +       check_acpi_pci();       /* Checks more than just ACPI actually */
11864 +#endif
11865 +
11866 +#ifdef CONFIG_ACPI
11867 +       if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
11868 +               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
11869 +               acpi_disabled = 1;
11870 +               acpi_ht = 0;
11871 +       }
11872 +
11873 +       /*
11874 +        * Parse the ACPI tables for possible boot-time SMP configuration.
11875 +        */
11876 +       acpi_boot_table_init();
11877 +       acpi_boot_init();
11878 +
11879 +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
11880 +       if (def_to_bigsmp)
11881 +               printk(KERN_WARNING "More than 8 CPUs detected and "
11882 +                       "CONFIG_X86_PC cannot handle it.\nUse "
11883 +                       "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
11884 +#endif
11885 +#endif
11886 +#ifdef CONFIG_X86_LOCAL_APIC
11887 +       if (smp_found_config)
11888 +               get_smp_config();
11889 +#endif
11890 +
11891 +       /* XXX Disable irqdebug until we have a way to avoid interrupt
11892 +        * conflicts. */
11893 +       noirqdebug_setup("");
11894 +
11895 +       register_memory();
11896 +
11897 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
11898 +               if (!(xen_start_info->flags & SIF_PRIVILEGED))
11899 +                       panic("Xen granted us console access "
11900 +                             "but not privileged status");
11901 +
11902 +#ifdef CONFIG_VT
11903 +#if defined(CONFIG_VGA_CONSOLE)
11904 +               if (!efi_enabled ||
11905 +                   (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
11906 +                       conswitchp = &vga_con;
11907 +#elif defined(CONFIG_DUMMY_CONSOLE)
11908 +               conswitchp = &dummy_con;
11909 +#endif
11910 +#endif
11911 +       } else {
11912 +               extern int console_use_vt;
11913 +               console_use_vt = 0;
11914 +       }
11915 +}
11916 +
11917 +static int
11918 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
11919 +{
11920 +       HYPERVISOR_shutdown(SHUTDOWN_crash);
11921 +       /* we're never actually going to get here... */
11922 +       return NOTIFY_DONE;
11923 +}
11924 +
11925 +#include "setup_arch_post.h"
11926 +/*
11927 + * Local Variables:
11928 + * mode:c
11929 + * c-file-style:"k&r"
11930 + * c-basic-offset:8
11931 + * End:
11932 + */
11933 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/smp-xen.c linux-2.6.16/arch/i386/kernel/smp-xen.c
11934 --- linux-2.6.16.orig/arch/i386/kernel/smp-xen.c        1970-01-01 01:00:00.000000000 +0100
11935 +++ linux-2.6.16/arch/i386/kernel/smp-xen.c     2006-06-26 09:51:32.000000000 +0200
11936 @@ -0,0 +1,617 @@
11937 +/*
11938 + *     Intel SMP support routines.
11939 + *
11940 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
11941 + *     (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
11942 + *
11943 + *     This code is released under the GNU General Public License version 2 or
11944 + *     later.
11945 + */
11946 +
11947 +#include <linux/init.h>
11948 +
11949 +#include <linux/mm.h>
11950 +#include <linux/delay.h>
11951 +#include <linux/spinlock.h>
11952 +#include <linux/smp_lock.h>
11953 +#include <linux/kernel_stat.h>
11954 +#include <linux/mc146818rtc.h>
11955 +#include <linux/cache.h>
11956 +#include <linux/interrupt.h>
11957 +#include <linux/cpu.h>
11958 +#include <linux/module.h>
11959 +
11960 +#include <asm/mtrr.h>
11961 +#include <asm/tlbflush.h>
11962 +#if 0
11963 +#include <mach_apic.h>
11964 +#endif
11965 +#include <xen/evtchn.h>
11966 +
11967 +/*
11968 + *     Some notes on x86 processor bugs affecting SMP operation:
11969 + *
11970 + *     Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
11971 + *     The Linux implications for SMP are handled as follows:
11972 + *
11973 + *     Pentium III / [Xeon]
11974 + *             None of the E1AP-E3AP errata are visible to the user.
11975 + *
11976 + *     E1AP.   see PII A1AP
11977 + *     E2AP.   see PII A2AP
11978 + *     E3AP.   see PII A3AP
11979 + *
11980 + *     Pentium II / [Xeon]
11981 + *             None of the A1AP-A3AP errata are visible to the user.
11982 + *
11983 + *     A1AP.   see PPro 1AP
11984 + *     A2AP.   see PPro 2AP
11985 + *     A3AP.   see PPro 7AP
11986 + *
11987 + *     Pentium Pro
11988 + *             None of 1AP-9AP errata are visible to the normal user,
11989 + *     except occasional delivery of 'spurious interrupt' as trap #15.
11990 + *     This is very rare and a non-problem.
11991 + *
11992 + *     1AP.    Linux maps APIC as non-cacheable
11993 + *     2AP.    worked around in hardware
11994 + *     3AP.    fixed in C0 and above steppings microcode update.
11995 + *             Linux does not use excessive STARTUP_IPIs.
11996 + *     4AP.    worked around in hardware
11997 + *     5AP.    symmetric IO mode (normal Linux operation) not affected.
11998 + *             'noapic' mode has vector 0xf filled out properly.
11999 + *     6AP.    'noapic' mode might be affected - fixed in later steppings
12000 + *     7AP.    We do not assume writes to the LVT deasserting IRQs
12001 + *     8AP.    We do not enable low power mode (deep sleep) during MP bootup
12002 + *     9AP.    We do not use mixed mode
12003 + *
12004 + *     Pentium
12005 + *             There is a marginal case where REP MOVS on 100MHz SMP
12006 + *     machines with B stepping processors can fail. XXX should provide
12007 + *     an L1cache=Writethrough or L1cache=off option.
12008 + *
12009 + *             B stepping CPUs may hang. There are hardware workarounds
12010 + *     for this. We warn about it in case your board doesn't have the
12011 + *     workarounds. Basically that's so I can tell anyone with a B stepping
12012 + *     CPU and SMP problems "tough".
12013 + *
12014 + *     Specific items [From Pentium Processor Specification Update]
12015 + *
12016 + *     1AP.    Linux doesn't use remote read
12017 + *     2AP.    Linux doesn't trust APIC errors
12018 + *     3AP.    We work around this
12019 + *     4AP.    Linux never generated 3 interrupts of the same priority
12020 + *             to cause a lost local interrupt.
12021 + *     5AP.    Remote read is never used
12022 + *     6AP.    not affected - worked around in hardware
12023 + *     7AP.    not affected - worked around in hardware
12024 + *     8AP.    worked around in hardware - we get explicit CS errors if not
12025 + *     9AP.    only 'noapic' mode affected. Might generate spurious
12026 + *             interrupts, we log only the first one and count the
12027 + *             rest silently.
12028 + *     10AP.   not affected - worked around in hardware
12029 + *     11AP.   Linux reads the APIC between writes to avoid this, as per
12030 + *             the documentation. Make sure you preserve this as it affects
12031 + *             the C stepping chips too.
12032 + *     12AP.   not affected - worked around in hardware
12033 + *     13AP.   not affected - worked around in hardware
12034 + *     14AP.   we always deassert INIT during bootup
12035 + *     15AP.   not affected - worked around in hardware
12036 + *     16AP.   not affected - worked around in hardware
12037 + *     17AP.   not affected - worked around in hardware
12038 + *     18AP.   not affected - worked around in hardware
12039 + *     19AP.   not affected - worked around in BIOS
12040 + *
12041 + *     If this sounds worrying, believe me: these bugs are either ___RARE___,
12042 + *     or are signal timing bugs worked around in hardware, and there's
12043 + *     next to nothing of note from C stepping upwards.
12044 + */
12045 +
12046 +DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
12047 +
12048 +/*
12049 + * the following functions deal with sending IPIs between CPUs.
12050 + *
12051 + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
12052 + */
12053 +
12054 +static inline int __prepare_ICR (unsigned int shortcut, int vector)
12055 +{
12056 +       return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL;
12057 +}
12058 +
12059 +static inline int __prepare_ICR2 (unsigned int mask)
12060 +{
12061 +       return SET_APIC_DEST_FIELD(mask);
12062 +}
12063 +
12064 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
12065 +
12066 +static inline void __send_IPI_one(unsigned int cpu, int vector)
12067 +{
12068 +       int irq = per_cpu(ipi_to_irq, cpu)[vector];
12069 +       BUG_ON(irq < 0);
12070 +       notify_remote_via_irq(irq);
12071 +}
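+
+/*
+ * NB. Under Xen there is no local APIC to program: each IPI "vector" is
+ * looked up in the per-cpu ipi_to_irq table and delivered via
+ * notify_remote_via_irq() as an event-channel notification rather than
+ * an APIC ICR write.
+ */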
12072 +
12073 +void __send_IPI_shortcut(unsigned int shortcut, int vector)
12074 +{
12075 +       int cpu;
12076 +
12077 +       switch (shortcut) {
12078 +       case APIC_DEST_SELF:
12079 +               __send_IPI_one(smp_processor_id(), vector);
12080 +               break;
12081 +       case APIC_DEST_ALLBUT:
12082 +               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
12083 +                       if (cpu == smp_processor_id())
12084 +                               continue;
12085 +                       if (cpu_isset(cpu, cpu_online_map)) {
12086 +                               __send_IPI_one(cpu, vector);
12087 +                       }
12088 +               }
12089 +               break;
12090 +       default:
12091 +               printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
12092 +                      vector);
12093 +               break;
12094 +       }
12095 +}
12096 +
12097 +void fastcall send_IPI_self(int vector)
12098 +{
12099 +       __send_IPI_shortcut(APIC_DEST_SELF, vector);
12100 +}
12101 +
12102 +/*
12103 + * This is only used on smaller machines.
12104 + */
12105 +void send_IPI_mask_bitmask(cpumask_t mask, int vector)
12106 +{
12107 +       unsigned long flags;
12108 +       unsigned int cpu;
12109 +
12110 +       local_irq_save(flags);
12111 +       WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
12112 +
12113 +       for (cpu = 0; cpu < NR_CPUS; ++cpu) {
12114 +               if (cpu_isset(cpu, mask)) {
12115 +                       __send_IPI_one(cpu, vector);
12116 +               }
12117 +       }
12118 +
12119 +       local_irq_restore(flags);
12120 +}
12121 +
12122 +void send_IPI_mask_sequence(cpumask_t mask, int vector)
12123 +{
12124 +
12125 +       send_IPI_mask_bitmask(mask, vector);
12126 +}
12127 +
12128 +#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
12129 +
12130 +#if 0 /* XEN */
12131 +/*
12132 + *     Smarter SMP flushing macros. 
12133 + *             c/o Linus Torvalds.
12134 + *
12135 + *     These mean you can really definitely utterly forget about
12136 + *     writing to user space from interrupts. (It's not allowed anyway.)
12137 + *
12138 + *     Optimizations Manfred Spraul <manfred@colorfullife.com>
12139 + */
12140 +
12141 +static cpumask_t flush_cpumask;
12142 +static struct mm_struct * flush_mm;
12143 +static unsigned long flush_va;
12144 +static DEFINE_SPINLOCK(tlbstate_lock);
12145 +#define FLUSH_ALL      0xffffffff
12146 +
12147 +/*
12148 + * We cannot call mmdrop() because we are in interrupt context;
12149 + * instead we update mm->cpu_vm_mask.
12150 + *
12151 + * We need to reload %cr3 since the page tables may be going
12152 + * away from under us..
12153 + */
12154 +static inline void leave_mm (unsigned long cpu)
12155 +{
12156 +       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
12157 +               BUG();
12158 +       cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
12159 +       load_cr3(swapper_pg_dir);
12160 +}
12161 +
12162 +/*
12163 + *
12164 + * The flush IPI assumes that a thread switch happens in this order:
12165 + * [cpu0: the cpu that switches]
12166 + * 1) switch_mm() either 1a) or 1b)
12167 + * 1a) thread switch to a different mm
12168 + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
12169 + *     Stop ipi delivery for the old mm. This is not synchronized with
12170 + *     the other cpus, but smp_invalidate_interrupt ignores flush ipis
12171 + *     for the wrong mm, and in the worst case we perform a superfluous
12172 + *     tlb flush.
12173 + * 1a2) set cpu_tlbstate to TLBSTATE_OK
12174 + *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
12175 + *     was in lazy tlb mode.
12176 + * 1a3) update cpu_tlbstate[].active_mm
12177 + *     Now cpu0 accepts tlb flushes for the new mm.
12178 + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
12179 + *     Now the other cpus will send tlb flush ipis.
12180 + * 1a5) change cr3.
12181 + * 1b) thread switch without mm change
12182 + *     cpu_tlbstate[].active_mm is correct, cpu0 already handles
12183 + *     flush ipis.
12184 + * 1b1) set cpu_tlbstate to TLBSTATE_OK
12185 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
12186 + *     Atomically set the bit [other cpus will start sending flush ipis],
12187 + *     and test the bit.
12188 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
12189 + * 2) switch %%esp, ie current
12190 + *
12191 + * The interrupt must handle 2 special cases:
12192 + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
12193 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
12194 + *   runs in kernel space, the cpu could load tlb entries for user space
12195 + *   pages.
12196 + *
12197 + * The good news is that cpu_tlbstate is local to each cpu, no
12198 + * write/read ordering problems.
12199 + */
12200 +
12201 +/*
12202 + * TLB flush IPI:
12203 + *
12204 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
12205 + * 2) Leave the mm if we are in the lazy tlb mode.
12206 + */
12207 +
12208 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
12209 +                                    struct pt_regs *regs)
12210 +{
12211 +       unsigned long cpu;
12212 +
12213 +       cpu = get_cpu();
12214 +
12215 +       if (!cpu_isset(cpu, flush_cpumask))
12216 +               goto out;
12217 +               /* 
12218 +                * This was a BUG() but until someone can quote me the
12219 +                * line from the intel manual that guarantees an IPI to
12220 +                * multiple CPUs is retried _only_ on the erroring CPUs
12221 +                * it's staying as a return.
12222 +                *
12223 +                * BUG();
12224 +                */
12225 +                
12226 +       if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
12227 +               if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
12228 +                       if (flush_va == FLUSH_ALL)
12229 +                               local_flush_tlb();
12230 +                       else
12231 +                               __flush_tlb_one(flush_va);
12232 +               } else
12233 +                       leave_mm(cpu);
12234 +       }
12235 +       smp_mb__before_clear_bit();
12236 +       cpu_clear(cpu, flush_cpumask);
12237 +       smp_mb__after_clear_bit();
12238 +out:
12239 +       put_cpu_no_resched();
12240 +
12241 +       return IRQ_HANDLED;
12242 +}
12243 +
12244 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
12245 +                                               unsigned long va)
12246 +{
12247 +       /*
12248 +        * A couple of (to be removed) sanity checks:
12249 +        *
12250 +        * - current CPU must not be in mask
12251 +        * - mask must exist :)
12252 +        */
12253 +       BUG_ON(cpus_empty(cpumask));
12254 +       BUG_ON(cpu_isset(smp_processor_id(), cpumask));
12255 +       BUG_ON(!mm);
12256 +
12257 +       /* If a CPU which we ran on has gone down, OK. */
12258 +       cpus_and(cpumask, cpumask, cpu_online_map);
12259 +       if (cpus_empty(cpumask))
12260 +               return;
12261 +
12262 +       /*
12263 +        * I'm not happy about this global shared spinlock in the
12264 +        * MM hot path, but we'll see how contended it is.
12265 +        * Temporarily this turns IRQs off, so that lockups are
12266 +        * detected by the NMI watchdog.
12267 +        */
12268 +       spin_lock(&tlbstate_lock);
12269 +       
12270 +       flush_mm = mm;
12271 +       flush_va = va;
12272 +#if NR_CPUS <= BITS_PER_LONG
12273 +       atomic_set_mask(cpumask, &flush_cpumask);
12274 +#else
12275 +       {
12276 +               int k;
12277 +               unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
12278 +               unsigned long *cpu_mask = (unsigned long *)&cpumask;
12279 +               for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
12280 +                       atomic_set_mask(cpu_mask[k], &flush_mask[k]);
12281 +       }
12282 +#endif
12283 +       /*
12284 +        * We have to send the IPI only to
12285 +        * CPUs affected.
12286 +        */
12287 +       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
12288 +
12289 +       while (!cpus_empty(flush_cpumask))
12290 +               /* nothing. lockup detection does not belong here */
12291 +               mb();
12292 +
12293 +       flush_mm = NULL;
12294 +       flush_va = 0;
12295 +       spin_unlock(&tlbstate_lock);
12296 +}
12297 +       
12298 +void flush_tlb_current_task(void)
12299 +{
12300 +       struct mm_struct *mm = current->mm;
12301 +       cpumask_t cpu_mask;
12302 +
12303 +       preempt_disable();
12304 +       cpu_mask = mm->cpu_vm_mask;
12305 +       cpu_clear(smp_processor_id(), cpu_mask);
12306 +
12307 +       local_flush_tlb();
12308 +       if (!cpus_empty(cpu_mask))
12309 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
12310 +       preempt_enable();
12311 +}
12312 +
12313 +void flush_tlb_mm (struct mm_struct * mm)
12314 +{
12315 +       cpumask_t cpu_mask;
12316 +
12317 +       preempt_disable();
12318 +       cpu_mask = mm->cpu_vm_mask;
12319 +       cpu_clear(smp_processor_id(), cpu_mask);
12320 +
12321 +       if (current->active_mm == mm) {
12322 +               if (current->mm)
12323 +                       local_flush_tlb();
12324 +               else
12325 +                       leave_mm(smp_processor_id());
12326 +       }
12327 +       if (!cpus_empty(cpu_mask))
12328 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
12329 +
12330 +       preempt_enable();
12331 +}
12332 +
12333 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
12334 +{
12335 +       struct mm_struct *mm = vma->vm_mm;
12336 +       cpumask_t cpu_mask;
12337 +
12338 +       preempt_disable();
12339 +       cpu_mask = mm->cpu_vm_mask;
12340 +       cpu_clear(smp_processor_id(), cpu_mask);
12341 +
12342 +       if (current->active_mm == mm) {
12343 +               if(current->mm)
12344 +                       __flush_tlb_one(va);
12345 +               else
12346 +                       leave_mm(smp_processor_id());
12347 +       }
12348 +
12349 +       if (!cpus_empty(cpu_mask))
12350 +               flush_tlb_others(cpu_mask, mm, va);
12351 +
12352 +       preempt_enable();
12353 +}
12354 +EXPORT_SYMBOL(flush_tlb_page);
12355 +
12356 +static void do_flush_tlb_all(void* info)
12357 +{
12358 +       unsigned long cpu = smp_processor_id();
12359 +
12360 +       __flush_tlb_all();
12361 +       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
12362 +               leave_mm(cpu);
12363 +}
12364 +
12365 +void flush_tlb_all(void)
12366 +{
12367 +       on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
12368 +}
12369 +
12370 +#else
12371 +
12372 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
12373 +                                    struct pt_regs *regs)
12374 +{ return 0; }
12375 +void flush_tlb_current_task(void)
12376 +{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
12377 +void flush_tlb_mm(struct mm_struct * mm)
12378 +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
12379 +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
12380 +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
12381 +void flush_tlb_all(void)
12382 +{ xen_tlb_flush_all(); }
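+
+/*
+ * NB. These stubs replace the whole IPI-based flush protocol above:
+ * xen_tlb_flush_mask(), xen_invlpg_mask() and xen_tlb_flush_all() are
+ * presumed here to be thin wrappers that ask the hypervisor, via mmuext
+ * hypercalls, to perform the flush on every CPU in the mask for us.
+ */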
12383 +
12384 +#endif /* XEN */
12385 +
12386 +/*
12387 + * this function sends a 'reschedule' IPI to another CPU.
12388 + * it goes straight through and wastes no time serializing
12389 + * anything. Worst case is that we lose a reschedule ...
12390 + */
12391 +void smp_send_reschedule(int cpu)
12392 +{
12393 +       WARN_ON(cpu_is_offline(cpu));
12394 +       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
12395 +}
12396 +
12397 +/*
12398 + * Structure and data for smp_call_function(). This is designed to minimise
12399 + * static memory requirements. It also looks cleaner.
12400 + */
12401 +static DEFINE_SPINLOCK(call_lock);
12402 +
12403 +struct call_data_struct {
12404 +       void (*func) (void *info);
12405 +       void *info;
12406 +       atomic_t started;
12407 +       atomic_t finished;
12408 +       int wait;
12409 +};
12410 +
12411 +void lock_ipi_call_lock(void)
12412 +{
12413 +       spin_lock_irq(&call_lock);
12414 +}
12415 +
12416 +void unlock_ipi_call_lock(void)
12417 +{
12418 +       spin_unlock_irq(&call_lock);
12419 +}
12420 +
12421 +static struct call_data_struct * call_data;
12422 +
12423 +/*
12424 + * this function sends a 'generic call function' IPI to all other CPUs
12425 + * in the system.
12426 + */
12427 +
12428 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
12429 +                       int wait)
12430 +/*
12431 + * [SUMMARY] Run a function on all other CPUs.
12432 + * <func> The function to run. This must be fast and non-blocking.
12433 + * <info> An arbitrary pointer to pass to the function.
12434 + * <nonatomic> currently unused.
12435 + * <wait> If true, wait (atomically) until function has completed on other CPUs.
12436 + * [RETURNS] 0 on success, else a negative status code. Does not return until
12437 + * remote CPUs are nearly ready to execute <<func>> or have already executed it.
12438 + *
12439 + * You must not call this function with disabled interrupts or from a
12440 + * hardware interrupt handler or from a bottom half handler.
12441 + */
12442 +{
12443 +       struct call_data_struct data;
12444 +       int cpus;
12445 +
12446 +       /* Holding any lock stops cpus from going down. */
12447 +       spin_lock(&call_lock);
12448 +       cpus = num_online_cpus() - 1;
12449 +       if (!cpus) {
12450 +               spin_unlock(&call_lock);
12451 +               return 0;
12452 +       }
12453 +
12454 +       /* Can deadlock when called with interrupts disabled */
12455 +       WARN_ON(irqs_disabled());
12456 +
12457 +       data.func = func;
12458 +       data.info = info;
12459 +       atomic_set(&data.started, 0);
12460 +       data.wait = wait;
12461 +       if (wait)
12462 +               atomic_set(&data.finished, 0);
12463 +
12464 +       call_data = &data;
12465 +       mb();
12466 +       
12467 +       /* Send a message to all other CPUs and wait for them to respond */
12468 +       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
12469 +
12470 +       /* Wait for response */
12471 +       while (atomic_read(&data.started) != cpus)
12472 +               barrier();
12473 +
12474 +       if (wait)
12475 +               while (atomic_read(&data.finished) != cpus)
12476 +                       barrier();
12477 +       spin_unlock(&call_lock);
12478 +
12479 +       return 0;
12480 +}
12481 +EXPORT_SYMBOL(smp_call_function);
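+
+/*
+ * Typical usage (illustrative sketch; drain_local_foo is a made-up
+ * helper): run a fast, non-blocking function on all other CPUs and
+ * wait for it to complete everywhere:
+ *
+ *      static void drain_local_foo(void *unused) { ... }
+ *
+ *      smp_call_function(drain_local_foo, NULL, 1, 1);
+ *
+ * See smp_send_stop() below for an in-file caller.
+ */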
12482 +
12483 +static void stop_this_cpu (void * dummy)
12484 +{
12485 +       /*
12486 +        * Remove this CPU:
12487 +        */
12488 +       cpu_clear(smp_processor_id(), cpu_online_map);
12489 +       local_irq_disable();
12490 +#if 0
12491 +       disable_local_APIC();
12492 +#endif
12493 +       if (cpu_data[smp_processor_id()].hlt_works_ok)
12494 +               for(;;) halt();
12495 +       for (;;);
12496 +}
12497 +
12498 +/*
12499 + * this function calls the 'stop' function on all other CPUs in the system.
12500 + */
12501 +
12502 +void smp_send_stop(void)
12503 +{
12504 +       smp_call_function(stop_this_cpu, NULL, 1, 0);
12505 +
12506 +       local_irq_disable();
12507 +#if 0
12508 +       disable_local_APIC();
12509 +#endif
12510 +       local_irq_enable();
12511 +}
12512 +
12513 +/*
12514 + * Reschedule callback. Nothing to do:
12515 + * all the work is done automatically when
12516 + * we return from the interrupt.
12517 + */
12518 +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
12519 +                                    struct pt_regs *regs)
12520 +{
12521 +
12522 +       return IRQ_HANDLED;
12523 +}
12524 +
12525 +#include <linux/kallsyms.h>
12526 +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
12527 +                                       struct pt_regs *regs)
12528 +{
12529 +       void (*func) (void *info) = call_data->func;
12530 +       void *info = call_data->info;
12531 +       int wait = call_data->wait;
12532 +
12533 +       /*
12534 +        * Notify initiating CPU that I've grabbed the data and am
12535 +        * about to execute the function
12536 +        */
12537 +       mb();
12538 +       atomic_inc(&call_data->started);
12539 +       /*
12540 +        * At this point the info structure may be out of scope unless wait==1
12541 +        */
12542 +       irq_enter();
12543 +       (*func)(info);
12544 +       irq_exit();
12545 +
12546 +       if (wait) {
12547 +               mb();
12548 +               atomic_inc(&call_data->finished);
12549 +       }
12550 +
12551 +       return IRQ_HANDLED;
12552 +}
12553 +
12554 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/smpalts.c linux-2.6.16/arch/i386/kernel/smpalts.c
12555 --- linux-2.6.16.orig/arch/i386/kernel/smpalts.c        1970-01-01 01:00:00.000000000 +0100
12556 +++ linux-2.6.16/arch/i386/kernel/smpalts.c     2006-06-26 09:51:32.000000000 +0200
12557 @@ -0,0 +1,85 @@
12558 +#include <linux/kernel.h>
12559 +#include <asm/system.h>
12560 +#include <asm/smp_alt.h>
12561 +#include <asm/processor.h>
12562 +#include <asm/string.h>
12563 +
12564 +struct smp_replacement_record {
12565 +       unsigned char targ_size;
12566 +       unsigned char smp1_size;
12567 +       unsigned char smp2_size;
12568 +       unsigned char up_size;
12569 +       unsigned char feature;
12570 +       unsigned char data[0];
12571 +};
12572 +
12573 +struct smp_alternative_record {
12574 +       void *targ_start;
12575 +       struct smp_replacement_record *repl;
12576 +};
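+
+/*
+ * As the memcpy() offsets in prepare_for_smp()/unprepare_for_smp()
+ * below imply, data[] holds three back-to-back byte sequences:
+ *
+ *      data[0 .. smp1_size-1]                    default SMP sequence
+ *      data[smp1_size .. +smp2_size-1]           feature-dependent SMP sequence
+ *      data[smp1_size+smp2_size .. +up_size-1]   UP sequence
+ *
+ * Whichever variant is copied in is NOP-padded (0x90) out to targ_size.
+ */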
12577 +
12578 +extern struct smp_alternative_record __start_smp_alternatives_table,
12579 +  __stop_smp_alternatives_table;
12580 +extern unsigned long __init_begin, __init_end;
12581 +
12582 +void prepare_for_smp(void)
12583 +{
12584 +       struct smp_alternative_record *r;
12585 +       printk(KERN_INFO "Enabling SMP...\n");
12586 +       for (r = &__start_smp_alternatives_table;
12587 +            r != &__stop_smp_alternatives_table;
12588 +            r++) {
12589 +               BUG_ON(r->repl->targ_size < r->repl->smp1_size);
12590 +               BUG_ON(r->repl->targ_size < r->repl->smp2_size);
12591 +               BUG_ON(r->repl->targ_size < r->repl->up_size);
12592 +               if (system_state == SYSTEM_RUNNING &&
12593 +                   r->targ_start >= (void *)&__init_begin &&
12594 +                   r->targ_start < (void *)&__init_end)
12595 +                       continue;
12596 +               if (r->repl->feature != (unsigned char)-1 &&
12597 +                   boot_cpu_has(r->repl->feature)) {
12598 +                       memcpy(r->targ_start,
12599 +                              r->repl->data + r->repl->smp1_size,
12600 +                              r->repl->smp2_size);
12601 +                       memset(r->targ_start + r->repl->smp2_size,
12602 +                              0x90,
12603 +                              r->repl->targ_size - r->repl->smp2_size);
12604 +               } else {
12605 +                       memcpy(r->targ_start,
12606 +                              r->repl->data,
12607 +                              r->repl->smp1_size);
12608 +                       memset(r->targ_start + r->repl->smp1_size,
12609 +                              0x90,
12610 +                              r->repl->targ_size - r->repl->smp1_size);
12611 +               }
12612 +       }
12613 +       /* Paranoia */
12614 +       asm volatile ("jmp 1f\n1:");
12615 +       mb();
12616 +}
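+
+/*
+ * NB. The "jmp 1f; 1:" above is the traditional x86 idiom after
+ * cross-modifying code: the branch forces the CPU to refetch the
+ * just-patched bytes instead of executing stale prefetched copies.
+ */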
12617 +
12618 +void unprepare_for_smp(void)
12619 +{
12620 +       struct smp_alternative_record *r;
12621 +       printk(KERN_INFO "Disabling SMP...\n");
12622 +       for (r = &__start_smp_alternatives_table;
12623 +            r != &__stop_smp_alternatives_table;
12624 +            r++) {
12625 +               BUG_ON(r->repl->targ_size < r->repl->smp1_size);
12626 +               BUG_ON(r->repl->targ_size < r->repl->smp2_size);
12627 +               BUG_ON(r->repl->targ_size < r->repl->up_size);
12628 +               if (system_state == SYSTEM_RUNNING &&
12629 +                   r->targ_start >= (void *)&__init_begin &&
12630 +                   r->targ_start < (void *)&__init_end)
12631 +                       continue;
12632 +               memcpy(r->targ_start,
12633 +                      r->repl->data + r->repl->smp1_size + r->repl->smp2_size,
12634 +                      r->repl->up_size);
12635 +               memset(r->targ_start + r->repl->up_size,
12636 +                      0x90,
12637 +                      r->repl->targ_size - r->repl->up_size);
12638 +       }
12639 +       /* Paranoia */
12640 +       asm volatile ("jmp 1f\n1:");
12641 +       mb();
12642 +}
12643 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/smpboot.c linux-2.6.16/arch/i386/kernel/smpboot.c
12644 --- linux-2.6.16.orig/arch/i386/kernel/smpboot.c        2006-03-20 06:53:29.000000000 +0100
12645 +++ linux-2.6.16/arch/i386/kernel/smpboot.c     2006-06-26 09:51:32.000000000 +0200
12646 @@ -1218,6 +1218,11 @@
12647                 if (max_cpus <= cpucount+1)
12648                         continue;
12649  
12650 +#ifdef CONFIG_SMP_ALTERNATIVES
12651 +               if (kicked == 1)
12652 +                       prepare_for_smp();
12653 +#endif
12654 +
12655                 if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
12656                         printk("CPU #%d not responding - cannot use it.\n",
12657                                                                 apicid);
12658 @@ -1396,6 +1401,11 @@
12659                 return -EIO;
12660         }
12661  
12662 +#ifdef CONFIG_SMP_ALTERNATIVES
12663 +       if (num_online_cpus() == 1)
12664 +               prepare_for_smp();
12665 +#endif
12666 +
12667         local_irq_enable();
12668         per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
12669         /* Unleash the CPU! */
12670 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/swiotlb.c linux-2.6.16/arch/i386/kernel/swiotlb.c
12671 --- linux-2.6.16.orig/arch/i386/kernel/swiotlb.c        1970-01-01 01:00:00.000000000 +0100
12672 +++ linux-2.6.16/arch/i386/kernel/swiotlb.c     2006-06-26 09:51:32.000000000 +0200
12673 @@ -0,0 +1,674 @@
12674 +/*
12675 + * Dynamic DMA mapping support.
12676 + *
12677 + * This implementation is a fallback for platforms that do not support
12678 + * I/O TLBs (aka DMA address translation hardware).
12679 + * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
12680 + * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
12681 + * Copyright (C) 2000, 2003 Hewlett-Packard Co
12682 + *     David Mosberger-Tang <davidm@hpl.hp.com>
12683 + * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
12684 + */
12685 +
12686 +#include <linux/cache.h>
12687 +#include <linux/mm.h>
12688 +#include <linux/module.h>
12689 +#include <linux/pci.h>
12690 +#include <linux/spinlock.h>
12691 +#include <linux/string.h>
12692 +#include <linux/types.h>
12693 +#include <linux/ctype.h>
12694 +#include <linux/init.h>
12695 +#include <linux/bootmem.h>
12696 +#include <linux/highmem.h>
12697 +#include <asm/io.h>
12698 +#include <asm/pci.h>
12699 +#include <asm/dma.h>
12700 +#include <asm/uaccess.h>
12701 +#include <xen/interface/memory.h>
12702 +
12703 +int swiotlb;
12704 +EXPORT_SYMBOL(swiotlb);
12705 +
12706 +#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
12707 +
12708 +#define SG_ENT_PHYS_ADDRESS(sg)        (page_to_phys((sg)->page) + (sg)->offset)
12709 +
12710 +/*
12711 + * Maximum allowable number of contiguous slabs to map,
12712 + * must be a power of 2.  What is the appropriate value?
12713 + * The complexity of {map,unmap}_single is linearly dependent on this value.
12714 + */
12715 +#define IO_TLB_SEGSIZE 128
12716 +
12717 +/*
12718 + * log of the size of each IO TLB slab.  The number of slabs is command line
12719 + * controllable.
12720 + */
12721 +#define IO_TLB_SHIFT 11
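+
+/*
+ * With IO_TLB_SHIFT 11 each slab is 1 << 11 = 2KB, so one IO_TLB_SEGSIZE
+ * segment of 128 slabs caps a single bounce mapping at 256KB.
+ */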
12722 +
12723 +static int swiotlb_force;
12724 +static char *iotlb_virt_start;
12725 +static unsigned long iotlb_nslabs;
12726 +
12727 +/*
12728 + * Used to do a quick range check in swiotlb_unmap_single and
12729 + * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
12730 + * API.
12731 + */
12732 +static dma_addr_t iotlb_bus_start, iotlb_bus_end, iotlb_bus_mask;
12733 +
12734 +/* Does the given dma address reside within the swiotlb aperture? */
12735 +#define in_swiotlb_aperture(a) (!(((a) ^ iotlb_bus_start) & iotlb_bus_mask))
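+
+/*
+ * NB. This test relies on the aperture being a power-of-two number of
+ * bytes, aligned to its size in bus space (arranged by the power-of-two
+ * rounding of iotlb_nslabs and the order-based allocation below):
+ * (a ^ iotlb_bus_start) then has bits set above the offset mask exactly
+ * when a lies outside [iotlb_bus_start, iotlb_bus_end).
+ */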
12736 +
12737 +/*
12738 + * When the IOMMU overflows we return a fallback buffer. This sets the size.
12739 + */
12740 +static unsigned long io_tlb_overflow = 32*1024;
12741 +
12742 +void *io_tlb_overflow_buffer;
12743 +
12744 +/*
12745 + * This is a free list describing the number of free entries available from
12746 + * each index
12747 + */
12748 +static unsigned int *io_tlb_list;
12749 +static unsigned int io_tlb_index;
12750 +
12751 +/*
12752 + * We need to save away the original address corresponding to a mapped entry
12753 + * for the sync operations.
12754 + */
12755 +static struct phys_addr {
12756 +       struct page *page;
12757 +       unsigned int offset;
12758 +} *io_tlb_orig_addr;
12759 +
12760 +/*
12761 + * Protect the above data structures in the map and unmap calls
12762 + */
12763 +static DEFINE_SPINLOCK(io_tlb_lock);
12764 +
12765 +static int __init
12766 +setup_io_tlb_npages(char *str)
12767 +{
12768 +       /* Unlike ia64, the size is the aperture size in megabytes, not 'slabs'! */
12769 +       if (isdigit(*str)) {
12770 +               iotlb_nslabs = simple_strtoul(str, &str, 0) <<
12771 +                       (20 - IO_TLB_SHIFT);
12772 +               iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
12773 +               /* Round up to power of two (xen_create_contiguous_region). */
12774 +               while (iotlb_nslabs & (iotlb_nslabs-1))
12775 +                       iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
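+               /*
+                * NB. (x & ~(x-1)) is the lowest set bit of x, so the
+                * loop above repeatedly carries that bit upward until a
+                * single bit -- a power of two -- remains.
+                */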
12776 +       }
12777 +       if (*str == ',')
12778 +               ++str;
12779 +       /*
12780 +         * NB. 'force' enables the swiotlb, but doesn't force its use for
12781 +         * every DMA like it does on native Linux. 'off' forcibly disables
12782 +         * use of the swiotlb.
12783 +         */
12784 +       if (!strcmp(str, "force"))
12785 +               swiotlb_force = 1;
12786 +       else if (!strcmp(str, "off"))
12787 +               swiotlb_force = -1;
12788 +       return 1;
12789 +}
12790 +__setup("swiotlb=", setup_io_tlb_npages);
12791 +/* make io_tlb_overflow tunable too? */
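+
+/*
+ * Examples: "swiotlb=16" reserves a 16MB aperture; "swiotlb=16,force"
+ * additionally enables the swiotlb where it would otherwise stay off
+ * (e.g. outside domain 0); "swiotlb=off" disables it entirely.
+ */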
12792 +
12793 +/*
12794 + * Statically reserve bounce buffer space and initialize bounce buffer data
12795 + * structures for the software IO TLB used to implement the PCI DMA API.
12796 + */
12797 +void
12798 +swiotlb_init_with_default_size (size_t default_size)
12799 +{
12800 +       unsigned long i, bytes;
12801 +       int rc;
12802 +
12803 +       if (!iotlb_nslabs) {
12804 +               iotlb_nslabs = (default_size >> IO_TLB_SHIFT);
12805 +               iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE);
12806 +               /* Round up to power of two (xen_create_contiguous_region). */
12807 +               while (iotlb_nslabs & (iotlb_nslabs-1))
12808 +                       iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1);
12809 +       }
12810 +
12811 +       bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT);
12812 +
12813 +       /*
12814 +        * Get IO TLB memory from the low pages
12815 +        */
12816 +       iotlb_virt_start = alloc_bootmem_low_pages(bytes);
12817 +       if (!iotlb_virt_start)
12818 +               panic("Cannot allocate SWIOTLB buffer!\n"
12819 +                     "Use dom0_mem Xen boot parameter to reserve\n"
12820 +                     "some DMA memory (e.g., dom0_mem=-128M).\n");
12821 +
12822 +       /* Hardcode 31 address bits for now: aacraid limitation. */
12823 +       rc = xen_create_contiguous_region(
12824 +               (unsigned long)iotlb_virt_start, get_order(bytes), 31);
12825 +       BUG_ON(rc);
12826 +
12827 +       /*
12828 +        * Allocate and initialize the free list array.  This array is used
12829 +        * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
12830 +        */
12831 +       io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int));
12832 +       for (i = 0; i < iotlb_nslabs; i++)
12833 +               io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
12834 +       io_tlb_index = 0;
12835 +       io_tlb_orig_addr = alloc_bootmem(
12836 +               iotlb_nslabs * sizeof(*io_tlb_orig_addr));
12837 +
12838 +       /*
12839 +        * Get the overflow emergency buffer
12840 +        */
12841 +       io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
12842 +
12843 +       iotlb_bus_start = virt_to_bus(iotlb_virt_start);
12844 +       iotlb_bus_end   = iotlb_bus_start + bytes;
12845 +       iotlb_bus_mask  = ~(dma_addr_t)(bytes - 1);
12846 +
12847 +       printk(KERN_INFO "Software IO TLB enabled:\n"
12848 +              " Aperture:     %lu megabytes\n"
12849 +              " Bus range:    0x%016lx - 0x%016lx\n"
12850 +              " Kernel range: 0x%016lx - 0x%016lx\n",
12851 +              bytes >> 20,
12852 +              (unsigned long)iotlb_bus_start,
12853 +              (unsigned long)iotlb_bus_end,
12854 +              (unsigned long)iotlb_virt_start,
12855 +              (unsigned long)iotlb_virt_start + bytes);
12856 +}
12857 +
12858 +void
12859 +swiotlb_init(void)
12860 +{
12861 +       long ram_end;
12862 +       size_t defsz = 64 * (1 << 20); /* 64MB default size */
12863 +
12864 +       if (swiotlb_force == 1) {
12865 +               swiotlb = 1;
12866 +       } else if ((swiotlb_force != -1) &&
12867 +                  (xen_start_info->flags & SIF_INITDOMAIN)) {
12868 +               /* Domain 0 always has a swiotlb. */
12869 +               ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
12870 +               if (ram_end <= 0x7ffff)
12871 +                       defsz = 2 * (1 << 20); /* 2MB on systems with <2GB of RAM. */
12872 +               swiotlb = 1;
12873 +       }
12874 +
12875 +       if (swiotlb)
12876 +               swiotlb_init_with_default_size(defsz);
12877 +       else
12878 +               printk(KERN_INFO "Software IO TLB disabled\n");
12879 +}
12880 +
12881 +/*
12882 + * We use __copy_to_user to transfer to the host buffer because the buffer
12883 + * may be mapped read-only (e.g., in the blkback driver) but lower-level
12884 + * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an
12885 + * unnecessary copy from the aperture to the host buffer, and a page fault.
12886 + */
12887 +static void
12888 +__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir)
12889 +{
12890 +       if (PageHighMem(buffer.page)) {
12891 +               size_t len, bytes;
12892 +               char *dev, *host, *kmp;
12893 +               len = size;
12894 +               while (len != 0) {
12895 +                       if (((bytes = len) + buffer.offset) > PAGE_SIZE)
12896 +                               bytes = PAGE_SIZE - buffer.offset;
12897 +                       kmp  = kmap_atomic(buffer.page, KM_SWIOTLB);
12898 +                       dev  = dma_addr + size - len;
12899 +                       host = kmp + buffer.offset;
12900 +                       if (dir == DMA_FROM_DEVICE) {
12901 +                               if (__copy_to_user(host, dev, bytes))
12902 +                                       /* inaccessible */;
12903 +                       } else
12904 +                               memcpy(dev, host, bytes);
12905 +                       kunmap_atomic(kmp, KM_SWIOTLB);
12906 +                       len -= bytes;
12907 +                       buffer.page++;
12908 +                       buffer.offset = 0;
12909 +               }
12910 +       } else {
12911 +               char *host = (char *)phys_to_virt(
12912 +                       page_to_pseudophys(buffer.page)) + buffer.offset;
12913 +               if (dir == DMA_FROM_DEVICE) {
12914 +                       if (__copy_to_user(host, dma_addr, size))
12915 +                               /* inaccessible */;
12916 +               } else if (dir == DMA_TO_DEVICE)
12917 +                       memcpy(dma_addr, host, size);
12918 +       }
12919 +}
12920 +
12921 +/*
12922 + * Allocates bounce buffer and returns its kernel virtual address.
12923 + */
12924 +static void *
12925 +map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir)
12926 +{
12927 +       unsigned long flags;
12928 +       char *dma_addr;
12929 +       unsigned int nslots, stride, index, wrap;
12930 +       int i;
12931 +
12932 +       /*
12933 +        * For mappings greater than a page, we limit the stride (and
12934 +        * hence alignment) to a page size.
12935 +        */
12936 +       nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
12937 +       if (size > PAGE_SIZE)
12938 +               stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
12939 +       else
12940 +               stride = 1;
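+       /*
+        * With PAGE_SHIFT 12 and IO_TLB_SHIFT 11 that stride is 2 slabs,
+        * i.e. multi-page bounce buffers start on a 4KB page boundary.
+        */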
12941 +
12942 +       BUG_ON(!nslots);
12943 +
12944 +       /*
12945 +        * Find a suitable number of IO TLB entries that will fit this
12946 +        * request, and allocate a buffer from that IO TLB pool.
12947 +        */
12948 +       spin_lock_irqsave(&io_tlb_lock, flags);
12949 +       {
12950 +               wrap = index = ALIGN(io_tlb_index, stride);
12951 +
12952 +               if (index >= iotlb_nslabs)
12953 +                       wrap = index = 0;
12954 +
12955 +               do {
12956 +                       /*
12957 +                        * If we find a slot that indicates we have 'nslots'
12958 +                        * number of contiguous buffers, we allocate the
12959 +                        * buffers from that slot and mark the entries as '0'
12960 +                        * indicating unavailable.
12961 +                        */
12962 +                       if (io_tlb_list[index] >= nslots) {
12963 +                               int count = 0;
12964 +
12965 +                               for (i = index; i < (int)(index + nslots); i++)
12966 +                                       io_tlb_list[i] = 0;
12967 +                               for (i = index - 1;
12968 +                                    (OFFSET(i, IO_TLB_SEGSIZE) !=
12969 +                                     IO_TLB_SEGSIZE -1) && io_tlb_list[i];
12970 +                                    i--)
12971 +                                       io_tlb_list[i] = ++count;
12972 +                               dma_addr = iotlb_virt_start +
12973 +                                       (index << IO_TLB_SHIFT);
12974 +
12975 +                               /*
12976 +                                * Update the indices to avoid searching in
12977 +                                * the next round.
12978 +                                */
12979 +                               io_tlb_index = 
12980 +                                       ((index + nslots) < iotlb_nslabs
12981 +                                        ? (index + nslots) : 0);
12982 +
12983 +                               goto found;
12984 +                       }
12985 +                       index += stride;
12986 +                       if (index >= iotlb_nslabs)
12987 +                               index = 0;
12988 +               } while (index != wrap);
12989 +
12990 +               spin_unlock_irqrestore(&io_tlb_lock, flags);
12991 +               return NULL;
12992 +       }
12993 +  found:
12994 +       spin_unlock_irqrestore(&io_tlb_lock, flags);
12995 +
12996 +       /*
12997 +        * Save away the mapping from the original address to the DMA address.
12998 +        * This is needed when we sync the memory.  Then we sync the buffer if
12999 +        * needed.
13000 +        */
13001 +       io_tlb_orig_addr[index] = buffer;
13002 +       if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL))
13003 +               __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE);
13004 +
13005 +       return dma_addr;
13006 +}
13007 +
13008 +/*
13009 + * dma_addr is the kernel virtual address of the bounce buffer to unmap.
13010 + */
13011 +static void
13012 +unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
13013 +{
13014 +       unsigned long flags;
13015 +       int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
13016 +       int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
13017 +       struct phys_addr buffer = io_tlb_orig_addr[index];
13018 +
13019 +       /*
13020 +        * First, sync the memory before unmapping the entry
13021 +        */
13022 +       if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))
13023 +               __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE);
13024 +
13025 +       /*
13026 +        * Return the buffer to the free list by setting the corresponding
13027 +        * entries to indicate the number of contiguous entries available.
13028 +        * While returning the entries to the free list, we merge the entries
13029 +        * with slots below and above the pool being returned.
13030 +        */
13031 +       spin_lock_irqsave(&io_tlb_lock, flags);
13032 +       {
13033 +               count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
13034 +                        io_tlb_list[index + nslots] : 0);
13035 +               /*
13036 +                * Step 1: return the slots to the free list, merging the
13037 +                * slots with the succeeding slots
13038 +                */
13039 +               for (i = index + nslots - 1; i >= index; i--)
13040 +                       io_tlb_list[i] = ++count;
13041 +               /*
13042 +                * Step 2: merge the returned slots with the preceding slots,
13043 +                * if available (non zero)
13044 +                */
13045 +               for (i = index - 1;
13046 +                    (OFFSET(i, IO_TLB_SEGSIZE) !=
13047 +                     IO_TLB_SEGSIZE -1) && io_tlb_list[i];
13048 +                    i--)
13049 +                       io_tlb_list[i] = ++count;
13050 +       }
13051 +       spin_unlock_irqrestore(&io_tlb_lock, flags);
13052 +}
13053 +
13054 +static void
13055 +sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
13056 +{
13057 +       int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT;
13058 +       struct phys_addr buffer = io_tlb_orig_addr[index];
13059 +       BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE));
13060 +       __sync_single(buffer, dma_addr, size, dir);
13061 +}
13062 +
13063 +static void
13064 +swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
13065 +{
13066 +       /*
13067 +        * Ran out of IOMMU space for this operation. This is very bad.
13068 +        * Unfortunately the drivers cannot handle this operation properly
13069 +        * unless they check for pci_dma_mapping_error (most don't).
13070 +        * When the mapping is small enough, return a static buffer to limit
13071 +        * the damage, or panic when the transfer is too big.
13072 +        */
13073 +       printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
13074 +              "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?");
13075 +
13076 +       if (size > io_tlb_overflow && do_panic) {
13077 +               if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
13078 +                       panic("PCI-DMA: Memory would be corrupted\n");
13079 +               if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
13080 +                       panic("PCI-DMA: Random memory would be DMAed\n");
13081 +       }
13082 +}
13083 +
13084 +/*
13085 + * Map a single buffer of the indicated size for DMA in streaming mode.  The
13086 + * PCI address to use is returned.
13087 + *
13088 + * Once the device is given the dma address, the device owns this memory until
13089 + * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
13090 + */
13091 +dma_addr_t
13092 +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
13093 +{
13094 +       dma_addr_t dev_addr = virt_to_bus(ptr);
13095 +       void *map;
13096 +       struct phys_addr buffer;
13097 +
13098 +       BUG_ON(dir == DMA_NONE);
13099 +
13100 +       /*
13101 +        * If the pointer passed in happens to be in the device's DMA window,
13102 +        * we can safely return the device addr and not worry about bounce
13103 +        * buffering it.
13104 +        */
13105 +       if (!range_straddles_page_boundary(ptr, size) &&
13106 +           !address_needs_mapping(hwdev, dev_addr))
13107 +               return dev_addr;
13108 +
13109 +       /*
13110 +        * Oh well, have to allocate and map a bounce buffer.
13111 +        */
13112 +       buffer.page   = virt_to_page(ptr);
13113 +       buffer.offset = (unsigned long)ptr & ~PAGE_MASK;
13114 +       map = map_single(hwdev, buffer, size, dir);
13115 +       if (!map) {
13116 +               swiotlb_full(hwdev, size, dir, 1);
13117 +               map = io_tlb_overflow_buffer;
13118 +       }
13119 +
13120 +       dev_addr = virt_to_bus(map);
13121 +       return dev_addr;
13122 +}
13123 +
13124 +/*
13125 + * Unmap a single streaming mode DMA translation.  The dma_addr and size must
13126 + * match what was provided in a previous swiotlb_map_single call.  All
13127 + * other usages are undefined.
13128 + *
13129 + * After this call, reads by the cpu to the buffer are guaranteed to see
13130 + * whatever the device wrote there.
13131 + */
13132 +void
13133 +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
13134 +                    int dir)
13135 +{
13136 +       BUG_ON(dir == DMA_NONE);
13137 +       if (in_swiotlb_aperture(dev_addr))
13138 +               unmap_single(hwdev, bus_to_virt(dev_addr), size, dir);
13139 +}
13140 +
13141 +/*
13142 + * Make physical memory consistent for a single streaming mode DMA translation
13143 + * after a transfer.
13144 + *
13145 + * If you perform a swiotlb_map_single() but wish to interrogate the buffer
13146 + * using the cpu, yet do not wish to tear down the PCI dma mapping, you must
13147 + * call this function before doing so.  At the next point you give the PCI dma
13148 + * address back to the card, you must first perform a
13149 + * swiotlb_dma_sync_for_device, and then the device again owns the buffer
13150 + * swiotlb_dma_sync_for_device, and then the device again owns the buffer.
13151 +void
13152 +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
13153 +                           size_t size, int dir)
13154 +{
13155 +       BUG_ON(dir == DMA_NONE);
13156 +       if (in_swiotlb_aperture(dev_addr))
13157 +               sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
13158 +}
13159 +
13160 +void
13161 +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
13162 +                              size_t size, int dir)
13163 +{
13164 +       BUG_ON(dir == DMA_NONE);
13165 +       if (in_swiotlb_aperture(dev_addr))
13166 +               sync_single(hwdev, bus_to_virt(dev_addr), size, dir);
13167 +}
13168 +
13169 +/*
13170 + * Map a set of buffers described by scatterlist in streaming mode for DMA.
13171 + * This is the scatter-gather version of the above swiotlb_map_single
13172 + * interface.  Here the scatter gather list elements are each tagged with the
13173 + * appropriate dma address and length.  They are obtained via
13174 + * sg_dma_{address,length}(SG).
13175 + *
13176 + * NOTE: An implementation may be able to use a smaller number of
13177 + *       DMA address/length pairs than there are SG table elements.
13178 + *       (for example via virtual mapping capabilities)
13179 + *       The routine returns the number of addr/length pairs actually
13180 + *       used, at most nents.
13181 + *
13182 + * Device ownership issues as mentioned above for swiotlb_map_single are the
13183 + * same here.
13184 + */
13185 +int
13186 +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
13187 +              int dir)
13188 +{
13189 +       struct phys_addr buffer;
13190 +       dma_addr_t dev_addr;
13191 +       char *map;
13192 +       int i;
13193 +
13194 +       BUG_ON(dir == DMA_NONE);
13195 +
13196 +       for (i = 0; i < nelems; i++, sg++) {
13197 +               dev_addr = SG_ENT_PHYS_ADDRESS(sg);
13198 +               if (address_needs_mapping(hwdev, dev_addr)) {
13199 +                       buffer.page   = sg->page;
13200 +                       buffer.offset = sg->offset;
13201 +                       map = map_single(hwdev, buffer, sg->length, dir);
13202 +                       if (!map) {
13203 +                               /* Don't panic here, we expect map_sg users
13204 +                                  to do proper error handling. */
13205 +                               swiotlb_full(hwdev, sg->length, dir, 0);
13206 +                               swiotlb_unmap_sg(hwdev, sg - i, i, dir);
13207 +                               sg[0].dma_length = 0;
13208 +                               return 0;
13209 +                       }
13210 +                       sg->dma_address = (dma_addr_t)virt_to_bus(map);
13211 +               } else
13212 +                       sg->dma_address = dev_addr;
13213 +               sg->dma_length = sg->length;
13214 +       }
13215 +       return nelems;
13216 +}
13217 +
13218 +/*
13219 + * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
13220 + * concerning calls here are the same as for swiotlb_unmap_single() above.
13221 + */
13222 +void
13223 +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
13224 +                int dir)
13225 +{
13226 +       int i;
13227 +
13228 +       BUG_ON(dir == DMA_NONE);
13229 +
13230 +       for (i = 0; i < nelems; i++, sg++)
13231 +               if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
13232 +                       unmap_single(hwdev, 
13233 +                                    (void *)bus_to_virt(sg->dma_address),
13234 +                                    sg->dma_length, dir);
13235 +}
13236 +
13237 +/*
13238 + * Make physical memory consistent for a set of streaming mode DMA translations
13239 + * after a transfer.
13240 + *
13241 + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
13242 + * and usage.
13243 + */
13244 +void
13245 +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
13246 +                       int nelems, int dir)
13247 +{
13248 +       int i;
13249 +
13250 +       BUG_ON(dir == DMA_NONE);
13251 +
13252 +       for (i = 0; i < nelems; i++, sg++)
13253 +               if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
13254 +                       sync_single(hwdev,
13255 +                                   (void *)bus_to_virt(sg->dma_address),
13256 +                                   sg->dma_length, dir);
13257 +}
13258 +
13259 +void
13260 +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
13261 +                          int nelems, int dir)
13262 +{
13263 +       int i;
13264 +
13265 +       BUG_ON(dir == DMA_NONE);
13266 +
13267 +       for (i = 0; i < nelems; i++, sg++)
13268 +               if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
13269 +                       sync_single(hwdev,
13270 +                                   (void *)bus_to_virt(sg->dma_address),
13271 +                                   sg->dma_length, dir);
13272 +}
13273 +
13274 +dma_addr_t
13275 +swiotlb_map_page(struct device *hwdev, struct page *page,
13276 +                unsigned long offset, size_t size,
13277 +                enum dma_data_direction direction)
13278 +{
13279 +       struct phys_addr buffer;
13280 +       dma_addr_t dev_addr;
13281 +       char *map;
13282 +
13283 +       dev_addr = page_to_phys(page) + offset;
13284 +       if (address_needs_mapping(hwdev, dev_addr)) {
13285 +               buffer.page   = page;
13286 +               buffer.offset = offset;
13287 +               map = map_single(hwdev, buffer, size, direction);
13288 +               if (!map) {
13289 +                       swiotlb_full(hwdev, size, direction, 1);
13290 +                       map = io_tlb_overflow_buffer;
13291 +               }
13292 +               dev_addr = (dma_addr_t)virt_to_bus(map);
13293 +       }
13294 +
13295 +       return dev_addr;
13296 +}
13297 +
13298 +void
13299 +swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
13300 +                  size_t size, enum dma_data_direction direction)
13301 +{
13302 +       BUG_ON(direction == DMA_NONE);
13303 +       if (in_swiotlb_aperture(dma_address))
13304 +               unmap_single(hwdev, bus_to_virt(dma_address), size, direction);
13305 +}
13306 +
13307 +int
13308 +swiotlb_dma_mapping_error(dma_addr_t dma_addr)
13309 +{
13310 +       return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
13311 +}
13312 +
13313 +/*
13314 + * Return whether the given PCI device DMA address mask can be supported
13315 + * properly.  For example, if your device can only drive the low 24-bits
13316 + * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
13317 + * this function.
13318 + */
13319 +int
13320 +swiotlb_dma_supported (struct device *hwdev, u64 mask)
13321 +{
13322 +       return (mask >= (iotlb_bus_end - 1));
13323 +}
13324 +
13325 +EXPORT_SYMBOL(swiotlb_init);
13326 +EXPORT_SYMBOL(swiotlb_map_single);
13327 +EXPORT_SYMBOL(swiotlb_unmap_single);
13328 +EXPORT_SYMBOL(swiotlb_map_sg);
13329 +EXPORT_SYMBOL(swiotlb_unmap_sg);
13330 +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
13331 +EXPORT_SYMBOL(swiotlb_sync_single_for_device);
13332 +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
13333 +EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
13334 +EXPORT_SYMBOL(swiotlb_map_page);
13335 +EXPORT_SYMBOL(swiotlb_unmap_page);
13336 +EXPORT_SYMBOL(swiotlb_dma_mapping_error);
13337 +EXPORT_SYMBOL(swiotlb_dma_supported);
13338 +
13339 +/*
13340 + * Local variables:
13341 + *  c-file-style: "linux"
13342 + *  indent-tabs-mode: t
13343 + *  c-indent-level: 8
13344 + *  c-basic-offset: 8
13345 + *  tab-width: 8
13346 + * End:
13347 + */
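
Taken together, the routines above implement the familiar streaming-DMA lifecycle: map, optionally sync while the CPU peeks at the buffer, then unmap. The sketch below shows that lifecycle from the driver side; it is illustrative only (my_dev, buf and len are placeholders, and real drivers normally reach this code through the generic dma_map_single()/dma_unmap_single() wrappers rather than calling swiotlb_* directly):

/* Hedged sketch: driver-side use of the streaming-DMA calls exported above. */
static int example_dma_roundtrip(struct device *my_dev, void *buf, size_t len)
{
        dma_addr_t handle;

        /* CPU hands the buffer to the device. */
        handle = swiotlb_map_single(my_dev, buf, len, DMA_TO_DEVICE);
        if (swiotlb_dma_mapping_error(handle))
                return -ENOMEM;         /* bounce-buffer pool exhausted */

        /* ... start the device and wait for it to consume the data ... */

        /* Peek at the buffer from the CPU without unmapping it. */
        swiotlb_sync_single_for_cpu(my_dev, handle, len, DMA_TO_DEVICE);
        /* ... inspect buf ... */
        swiotlb_sync_single_for_device(my_dev, handle, len, DMA_TO_DEVICE);

        /* Tear the mapping down; bounce data is copied back as needed. */
        swiotlb_unmap_single(my_dev, handle, len, DMA_TO_DEVICE);
        return 0;
}

Note the mapping_error check: unlike plain page-translation DMA, a swiotlb map can fail outright once the bounce pool is exhausted, which is why map_sg above is careful not to panic.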
13348 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/sysenter.c linux-2.6.16/arch/i386/kernel/sysenter.c
13349 --- linux-2.6.16.orig/arch/i386/kernel/sysenter.c       2006-03-20 06:53:29.000000000 +0100
13350 +++ linux-2.6.16/arch/i386/kernel/sysenter.c    2006-06-26 09:51:32.000000000 +0200
13351 @@ -13,6 +13,7 @@
13352  #include <linux/gfp.h>
13353  #include <linux/string.h>
13354  #include <linux/elf.h>
13355 +#include <linux/mm.h>
13356  
13357  #include <asm/cpufeature.h>
13358  #include <asm/msr.h>
13359 @@ -23,6 +24,7 @@
13360  
13361  void enable_sep_cpu(void)
13362  {
13363 +#ifdef CONFIG_X86_SYSENTER
13364         int cpu = get_cpu();
13365         struct tss_struct *tss = &per_cpu(init_tss, cpu);
13366  
13367 @@ -37,6 +39,7 @@
13368         wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0);
13369         wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
13370         put_cpu();      
13371 +#endif
13372  }
13373  
13374  /*
13375 @@ -45,23 +48,90 @@
13376   */
13377  extern const char vsyscall_int80_start, vsyscall_int80_end;
13378  extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
13379 +static void *syscall_page;
13380  
13381  int __init sysenter_setup(void)
13382  {
13383 -       void *page = (void *)get_zeroed_page(GFP_ATOMIC);
13384 -
13385 -       __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
13386 +       syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
13387  
13388 -       if (!boot_cpu_has(X86_FEATURE_SEP)) {
13389 -               memcpy(page,
13390 -                      &vsyscall_int80_start,
13391 -                      &vsyscall_int80_end - &vsyscall_int80_start);
13392 +#ifdef CONFIG_X86_SYSENTER
13393 +       if (boot_cpu_has(X86_FEATURE_SEP)) {
13394 +               memcpy(syscall_page,
13395 +                      &vsyscall_sysenter_start,
13396 +                      &vsyscall_sysenter_end - &vsyscall_sysenter_start);
13397                 return 0;
13398         }
13399 +#endif
13400  
13401 -       memcpy(page,
13402 -              &vsyscall_sysenter_start,
13403 -              &vsyscall_sysenter_end - &vsyscall_sysenter_start);
13404 +       memcpy(syscall_page,
13405 +              &vsyscall_int80_start,
13406 +              &vsyscall_int80_end - &vsyscall_int80_start);
13407 +
13408 +       return 0;
13409 +}
13410 +
13411 +static struct page*
13412 +syscall_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
13413 +{
13414 +       struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
13415 +       get_page(p);
13416 +       return p;
13417 +}
13418 +
13419 +/* Prevent VMA merging */
13420 +static void syscall_vma_close(struct vm_area_struct *vma)
13421 +{
13422 +}
13423 +
13424 +static struct vm_operations_struct syscall_vm_ops = {
13425 +       .close = syscall_vma_close,
13426 +       .nopage = syscall_nopage,
13427 +};
13428  
13429 +/* Setup a VMA at program startup for the vsyscall page */
13430 +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
13431 +{
13432 +       struct vm_area_struct *vma;
13433 +       struct mm_struct *mm = current->mm;
13434 +       int ret;
13435 +
13436 +       vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
13437 +       if (!vma)
13438 +               return -ENOMEM;
13439 +
13440 +       memset(vma, 0, sizeof(struct vm_area_struct));
13441 +       /* Could randomize here */
13442 +       vma->vm_start = VSYSCALL_BASE;
13443 +       vma->vm_end = VSYSCALL_BASE + PAGE_SIZE;
13444 +       /* MAYWRITE to allow gdb to COW and set breakpoints */
13445 +       vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
13446 +       vma->vm_flags |= mm->def_flags;
13447 +       vma->vm_page_prot = protection_map[vma->vm_flags & 7];
13448 +       vma->vm_ops = &syscall_vm_ops;
13449 +       vma->vm_mm = mm;
13450 +
13451 +       down_write(&mm->mmap_sem);
13452 +       if ((ret = insert_vm_struct(mm, vma))) {
13453 +               up_write(&mm->mmap_sem);
13454 +               kmem_cache_free(vm_area_cachep, vma);
13455 +               return ret;
13456 +       }
13457 +       mm->total_vm++;
13458 +       up_write(&mm->mmap_sem);
13459 +       return 0;
13460 +}
13461 +
13462 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
13463 +{
13464 +       return NULL;
13465 +}
13466 +
13467 +int in_gate_area(struct task_struct *task, unsigned long addr)
13468 +{
13469 +       return 0;
13470 +}
13471 +
13472 +int in_gate_area_no_task(unsigned long addr)
13473 +{
13474         return 0;
13475  }
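
With this change the vsyscall page becomes an ordinary VMA faulted in through syscall_nopage() rather than a fixmap, and get_gate_vma()/in_gate_area() deliberately report nothing. Userspace is still expected to find the page through the ELF auxiliary vector. A hedged userspace sketch follows; it uses glibc's getauxval(), which postdates this kernel, purely to illustrate where AT_SYSINFO and AT_SYSINFO_EHDR point:

/* Hedged sketch: locating the vsyscall page the kernel maps at exec time. */
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
        unsigned long ehdr  = getauxval(AT_SYSINFO_EHDR); /* vsyscall ELF image */
        unsigned long entry = getauxval(AT_SYSINFO);      /* syscall entry stub */

        printf("vsyscall DSO at %#lx, entry %#lx\n", ehdr, entry);
        return 0;
}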
13476 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/time-xen.c linux-2.6.16/arch/i386/kernel/time-xen.c
13477 --- linux-2.6.16.orig/arch/i386/kernel/time-xen.c       1970-01-01 01:00:00.000000000 +0100
13478 +++ linux-2.6.16/arch/i386/kernel/time-xen.c    2006-06-26 09:51:32.000000000 +0200
13479 @@ -0,0 +1,1097 @@
13480 +/*
13481 + *  linux/arch/i386/kernel/time.c
13482 + *
13483 + *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
13484 + *
13485 + * This file contains the PC-specific time handling details:
13486 + * reading the RTC at bootup, etc..
13487 + * 1994-07-02    Alan Modra
13488 + *     fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
13489 + * 1995-03-26    Markus Kuhn
13490 + *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
13491 + *      precision CMOS clock update
13492 + * 1996-05-03    Ingo Molnar
13493 + *      fixed time warps in do_[slow|fast]_gettimeoffset()
13494 + * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
13495 + *             "A Kernel Model for Precision Timekeeping" by Dave Mills
13496 + * 1998-09-05    (Various)
13497 + *     More robust do_fast_gettimeoffset() algorithm implemented
13498 + *     (works with APM, Cyrix 6x86MX and Centaur C6),
13499 + *     monotonic gettimeofday() with fast_get_timeoffset(),
13500 + *     drift-proof precision TSC calibration on boot
13501 + *     (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
13502 + *     Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
13503 + *     ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
13504 + * 1998-12-16    Andrea Arcangeli
13505 + *     Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
13506 + *     because it was not accounting for lost_ticks.
13507 + * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
13508 + *     Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
13509 + *     serialize accesses to xtime/lost_ticks).
13510 + */
13511 +
13512 +#include <linux/errno.h>
13513 +#include <linux/sched.h>
13514 +#include <linux/kernel.h>
13515 +#include <linux/param.h>
13516 +#include <linux/string.h>
13517 +#include <linux/mm.h>
13518 +#include <linux/interrupt.h>
13519 +#include <linux/time.h>
13520 +#include <linux/delay.h>
13521 +#include <linux/init.h>
13522 +#include <linux/smp.h>
13523 +#include <linux/module.h>
13524 +#include <linux/sysdev.h>
13525 +#include <linux/bcd.h>
13526 +#include <linux/efi.h>
13527 +#include <linux/mca.h>
13528 +#include <linux/sysctl.h>
13529 +#include <linux/percpu.h>
13530 +#include <linux/kernel_stat.h>
13531 +#include <linux/posix-timers.h>
13532 +
13533 +#include <asm/io.h>
13534 +#include <asm/smp.h>
13535 +#include <asm/irq.h>
13536 +#include <asm/msr.h>
13537 +#include <asm/delay.h>
13538 +#include <asm/mpspec.h>
13539 +#include <asm/uaccess.h>
13540 +#include <asm/processor.h>
13541 +#include <asm/timer.h>
13542 +#include <asm/sections.h>
13543 +
13544 +#include "mach_time.h"
13545 +
13546 +#include <linux/timex.h>
13547 +#include <linux/config.h>
13548 +
13549 +#include <asm/hpet.h>
13550 +
13551 +#include <asm/arch_hooks.h>
13552 +
13553 +#include <xen/evtchn.h>
13554 +#include <xen/interface/vcpu.h>
13555 +
13556 +#if defined (__i386__)
13557 +#include <asm/i8259.h>
13558 +#endif
13559 +
13560 +int pit_latch_buggy;              /* extern */
13561 +
13562 +#if defined(__x86_64__)
13563 +unsigned long vxtime_hz = PIT_TICK_RATE;
13564 +struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
13565 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
13566 +unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
13567 +struct timespec __xtime __section_xtime;
13568 +struct timezone __sys_tz __section_sys_tz;
13569 +#endif
13570 +
13571 +unsigned int cpu_khz;  /* Detected as we calibrate the TSC */
13572 +EXPORT_SYMBOL(cpu_khz);
13573 +
13574 +extern unsigned long wall_jiffies;
13575 +
13576 +DEFINE_SPINLOCK(rtc_lock);
13577 +EXPORT_SYMBOL(rtc_lock);
13578 +
13579 +#if defined (__i386__)
13580 +#include <asm/i8253.h>
13581 +#endif
13582 +
13583 +DEFINE_SPINLOCK(i8253_lock);
13584 +EXPORT_SYMBOL(i8253_lock);
13585 +
13586 +extern struct init_timer_opts timer_tsc_init;
13587 +extern struct timer_opts timer_tsc;
13588 +#define timer_none timer_tsc
13589 +struct timer_opts *cur_timer __read_mostly = &timer_tsc;
13590 +
13591 +/* These are periodically updated in shared_info, and then copied here. */
13592 +struct shadow_time_info {
13593 +       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
13594 +       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
13595 +       u32 tsc_to_nsec_mul;
13596 +       u32 tsc_to_usec_mul;
13597 +       int tsc_shift;
13598 +       u32 version;
13599 +};
13600 +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
13601 +static struct timespec shadow_tv;
13602 +static u32 shadow_tv_version;
13603 +
13604 +/* Keep track of last time we did processing/updating of jiffies and xtime. */
13605 +static u64 processed_system_time;   /* System time (ns) at last processing. */
13606 +static DEFINE_PER_CPU(u64, processed_system_time);
13607 +
13608 +/* How much CPU time was spent blocked and how much was 'stolen'? */
13609 +static DEFINE_PER_CPU(u64, processed_stolen_time);
13610 +static DEFINE_PER_CPU(u64, processed_blocked_time);
13611 +
13612 +/* Current runstate of each CPU (updated automatically by the hypervisor). */
13613 +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
13614 +
13615 +/* Must be signed, as it's compared with s64 quantities which can be -ve. */
13616 +#define NS_PER_TICK (1000000000LL/HZ)
13617 +
13618 +static inline void __normalize_time(time_t *sec, s64 *nsec)
13619 +{
13620 +       while (*nsec >= NSEC_PER_SEC) {
13621 +               (*nsec) -= NSEC_PER_SEC;
13622 +               (*sec)++;
13623 +       }
13624 +       while (*nsec < 0) {
13625 +               (*nsec) += NSEC_PER_SEC;
13626 +               (*sec)--;
13627 +       }
13628 +}
13629 +
13630 +/* Does this guest OS track Xen time, or set its wall clock independently? */
13631 +static int independent_wallclock = 0;
13632 +static int __init __independent_wallclock(char *str)
13633 +{
13634 +       independent_wallclock = 1;
13635 +       return 1;
13636 +}
13637 +__setup("independent_wallclock", __independent_wallclock);
13638 +
13639 +/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
13640 +static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
13641 +static int __init __permitted_clock_jitter(char *str)
13642 +{
13643 +       permitted_clock_jitter = simple_strtoul(str, NULL, 0);
13644 +       return 1;
13645 +}
13646 +__setup("permitted_clock_jitter=", __permitted_clock_jitter);
13647 +
13648 +int tsc_disable __devinitdata = 0;
13649 +
13650 +static void delay_tsc(unsigned long loops)
13651 +{
13652 +       unsigned long bclock, now;
13653 +
13654 +       rdtscl(bclock);
13655 +       do {
13656 +               rep_nop();
13657 +               rdtscl(now);
13658 +       } while ((now - bclock) < loops);
13659 +}
13660 +
13661 +struct timer_opts timer_tsc = {
13662 +       .name = "tsc",
13663 +       .delay = delay_tsc,
13664 +};
13665 +
13666 +/*
13667 + * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
13668 + * yielding a 64-bit result.
13669 + */
13670 +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
13671 +{
13672 +       u64 product;
13673 +#ifdef __i386__
13674 +       u32 tmp1, tmp2;
13675 +#endif
13676 +
13677 +       if (shift < 0)
13678 +               delta >>= -shift;
13679 +       else
13680 +               delta <<= shift;
13681 +
13682 +#ifdef __i386__
13683 +       __asm__ (
13684 +               "mul  %5       ; "
13685 +               "mov  %4,%%eax ; "
13686 +               "mov  %%edx,%4 ; "
13687 +               "mul  %5       ; "
13688 +               "xor  %5,%5    ; "
13689 +               "add  %4,%%eax ; "
13690 +               "adc  %5,%%edx ; "
13691 +               : "=A" (product), "=r" (tmp1), "=r" (tmp2)
13692 +               : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
13693 +#else
13694 +       __asm__ (
13695 +               "mul %%rdx ; shrd $32,%%rdx,%%rax"
13696 +               : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
13697 +#endif
13698 +
13699 +       return product;
13700 +}
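
In both asm variants the computation is ((delta << shift) * mul_frac) >> 32, carried out on a 96-bit intermediate so the multiply cannot overflow. A portable C rendering, assuming a compiler that provides unsigned __int128 (so not buildable on plain i386; shown only as a reference):

/* Hedged sketch: portable equivalent of scale_delta() above, assuming
 * the compiler supplies unsigned __int128 (gcc on 64-bit targets). */
static inline u64 scale_delta_portable(u64 delta, u32 mul_frac, int shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

        /* 64x32 -> 96-bit product; keep bits [95:32]. */
        return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
}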
13701 +
13702 +#if defined (__i386__)
13703 +int read_current_timer(unsigned long *timer_val)
13704 +{
13705 +       rdtscl(*timer_val);
13706 +       return 0;
13707 +}
13708 +#endif
13709 +
13710 +void init_cpu_khz(void)
13711 +{
13712 +       u64 __cpu_khz = 1000000ULL << 32;
13713 +       struct vcpu_time_info *info;
13714 +       info = &HYPERVISOR_shared_info->vcpu_info[0].time;
13715 +       do_div(__cpu_khz, info->tsc_to_system_mul);
13716 +       if (info->tsc_shift < 0)
13717 +               cpu_khz = __cpu_khz << -info->tsc_shift;
13718 +       else
13719 +               cpu_khz = __cpu_khz >> info->tsc_shift;
13720 +}
13721 +
13722 +static u64 get_nsec_offset(struct shadow_time_info *shadow)
13723 +{
13724 +       u64 now, delta;
13725 +       rdtscll(now);
13726 +       delta = now - shadow->tsc_timestamp;
13727 +       return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
13728 +}
13729 +
13730 +static unsigned long get_usec_offset(struct shadow_time_info *shadow)
13731 +{
13732 +       u64 now, delta;
13733 +       rdtscll(now);
13734 +       delta = now - shadow->tsc_timestamp;
13735 +       return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift);
13736 +}
13737 +
13738 +static void __update_wallclock(time_t sec, long nsec)
13739 +{
13740 +       long wtm_nsec, xtime_nsec;
13741 +       time_t wtm_sec, xtime_sec;
13742 +       u64 tmp, wc_nsec;
13743 +
13744 +       /* Adjust wall-clock time base based on wall_jiffies ticks. */
13745 +       wc_nsec = processed_system_time;
13746 +       wc_nsec += sec * (u64)NSEC_PER_SEC;
13747 +       wc_nsec += nsec;
13748 +       wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
13749 +
13750 +       /* Split wallclock base into seconds and nanoseconds. */
13751 +       tmp = wc_nsec;
13752 +       xtime_nsec = do_div(tmp, 1000000000);
13753 +       xtime_sec  = (time_t)tmp;
13754 +
13755 +       wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
13756 +       wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
13757 +
13758 +       set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
13759 +       set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
13760 +
13761 +       ntp_clear();
13762 +}
13763 +
13764 +static void update_wallclock(void)
13765 +{
13766 +       shared_info_t *s = HYPERVISOR_shared_info;
13767 +
13768 +       do {
13769 +               shadow_tv_version = s->wc_version;
13770 +               rmb();
13771 +               shadow_tv.tv_sec  = s->wc_sec;
13772 +               shadow_tv.tv_nsec = s->wc_nsec;
13773 +               rmb();
13774 +       } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
13775 +
13776 +       if (!independent_wallclock)
13777 +               __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
13778 +}
13779 +
13780 +/*
13781 + * Reads a consistent set of time-base values from Xen into a shadow data
13782 + * area.
13783 + */
13784 +static void get_time_values_from_xen(void)
13785 +{
13786 +       shared_info_t           *s = HYPERVISOR_shared_info;
13787 +       struct vcpu_time_info   *src;
13788 +       struct shadow_time_info *dst;
13789 +
13790 +       src = &s->vcpu_info[smp_processor_id()].time;
13791 +       dst = &per_cpu(shadow_time, smp_processor_id());
13792 +
13793 +       do {
13794 +               dst->version = src->version;
13795 +               rmb();
13796 +               dst->tsc_timestamp     = src->tsc_timestamp;
13797 +               dst->system_timestamp  = src->system_time;
13798 +               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
13799 +               dst->tsc_shift         = src->tsc_shift;
13800 +               rmb();
13801 +       } while ((src->version & 1) | (dst->version ^ src->version));
13802 +
13803 +       dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
13804 +}
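
get_time_values_from_xen() is a consumer of Xen's lock-free version protocol: the hypervisor makes version odd while it updates the record and even once the record is stable, so a reader retries until it observes the same even value before and after copying. The same protocol in isolation, for a hypothetical two-field record, looks like this sketch:

/* Hedged sketch of the version-counter snapshot protocol used above.
 * 'shared' is a hypothetical record whose producer makes version odd
 * while writing and even when the data is stable. */
struct sample { u32 version; u64 a, b; };

static void snapshot(volatile struct sample *shared, struct sample *out)
{
        do {
                out->version = shared->version;
                rmb();          /* read version before the data */
                out->a = shared->a;
                out->b = shared->b;
                rmb();          /* read the data before re-checking version */
        } while ((shared->version & 1) |
                 (out->version ^ shared->version));
}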
13805 +
13806 +static inline int time_values_up_to_date(int cpu)
13807 +{
13808 +       struct vcpu_time_info   *src;
13809 +       struct shadow_time_info *dst;
13810 +
13811 +       src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
13812 +       dst = &per_cpu(shadow_time, cpu);
13813 +
13814 +       rmb();
13815 +       return (dst->version == src->version);
13816 +}
13817 +
13818 +/*
13819 + * This is a special lock that is owned by the CPU and holds the index
13820 + * register we are working with.  It is required for NMI access to the
13821 + * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
13822 + */
13823 +volatile unsigned long cmos_lock = 0;
13824 +EXPORT_SYMBOL(cmos_lock);
13825 +
13826 +/* Routines for accessing the CMOS RAM/RTC. */
13827 +unsigned char rtc_cmos_read(unsigned char addr)
13828 +{
13829 +       unsigned char val;
13830 +       lock_cmos_prefix(addr);
13831 +       outb_p(addr, RTC_PORT(0));
13832 +       val = inb_p(RTC_PORT(1));
13833 +       lock_cmos_suffix(addr);
13834 +       return val;
13835 +}
13836 +EXPORT_SYMBOL(rtc_cmos_read);
13837 +
13838 +void rtc_cmos_write(unsigned char val, unsigned char addr)
13839 +{
13840 +       lock_cmos_prefix(addr);
13841 +       outb_p(addr, RTC_PORT(0));
13842 +       outb_p(val, RTC_PORT(1));
13843 +       lock_cmos_suffix(addr);
13844 +}
13845 +EXPORT_SYMBOL(rtc_cmos_write);
13846 +
13847 +/*
13848 + * This version of gettimeofday has microsecond resolution
13849 + * and better than microsecond precision on fast x86 machines with TSC.
13850 + */
13851 +void do_gettimeofday(struct timeval *tv)
13852 +{
13853 +       unsigned long seq;
13854 +       unsigned long usec, sec;
13855 +       unsigned long max_ntp_tick;
13856 +       s64 nsec;
13857 +       unsigned int cpu;
13858 +       struct shadow_time_info *shadow;
13859 +       u32 local_time_version;
13860 +
13861 +       cpu = get_cpu();
13862 +       shadow = &per_cpu(shadow_time, cpu);
13863 +
13864 +       do {
13865 +               unsigned long lost;
13866 +
13867 +               local_time_version = shadow->version;
13868 +               seq = read_seqbegin(&xtime_lock);
13869 +
13870 +               usec = get_usec_offset(shadow);
13871 +               lost = jiffies - wall_jiffies;
13872 +
13873 +               /*
13874 +                * If time_adjust is negative then NTP is slowing the clock,
13875 +                * so make sure not to go into the next possible interval.
13876 +                * Better to lose some accuracy than have time go backwards.
13877 +                */
13878 +               if (unlikely(time_adjust < 0)) {
13879 +                       max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
13880 +                       usec = min(usec, max_ntp_tick);
13881 +
13882 +                       if (lost)
13883 +                               usec += lost * max_ntp_tick;
13884 +               }
13885 +               else if (unlikely(lost))
13886 +                       usec += lost * (USEC_PER_SEC / HZ);
13887 +
13888 +               sec = xtime.tv_sec;
13889 +               usec += (xtime.tv_nsec / NSEC_PER_USEC);
13890 +
13891 +               nsec = shadow->system_timestamp - processed_system_time;
13892 +               __normalize_time(&sec, &nsec);
13893 +               usec += (long)nsec / NSEC_PER_USEC;
13894 +
13895 +               if (unlikely(!time_values_up_to_date(cpu))) {
13896 +                       /*
13897 +                        * We may have blocked for a long time,
13898 +                        * rendering our calculations invalid
13899 +                        * (e.g. the time delta may have
13900 +                        * overflowed). Detect that and recalculate
13901 +                        * with fresh values.
13902 +                        */
13903 +                       get_time_values_from_xen();
13904 +                       continue;
13905 +               }
13906 +       } while (read_seqretry(&xtime_lock, seq) ||
13907 +                (local_time_version != shadow->version));
13908 +
13909 +       put_cpu();
13910 +
13911 +       while (usec >= USEC_PER_SEC) {
13912 +               usec -= USEC_PER_SEC;
13913 +               sec++;
13914 +       }
13915 +
13916 +       tv->tv_sec = sec;
13917 +       tv->tv_usec = usec;
13918 +}
13919 +
13920 +EXPORT_SYMBOL(do_gettimeofday);
13921 +
13922 +int do_settimeofday(struct timespec *tv)
13923 +{
13924 +       time_t sec;
13925 +       s64 nsec;
13926 +       unsigned int cpu;
13927 +       struct shadow_time_info *shadow;
13928 +       dom0_op_t op;
13929 +
13930 +       if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
13931 +               return -EINVAL;
13932 +
13933 +       cpu = get_cpu();
13934 +       shadow = &per_cpu(shadow_time, cpu);
13935 +
13936 +       write_seqlock_irq(&xtime_lock);
13937 +
13938 +       /*
13939 +        * If we block for a long time our time delta may overflow, leaving
13940 +        * the shadow time values stale; detect that case and retry with
13941 +        * fresh values from Xen.
13942 +        */
13943 +       for (;;) {
13944 +               nsec = tv->tv_nsec - get_nsec_offset(shadow);
13945 +               if (time_values_up_to_date(cpu))
13946 +                       break;
13947 +               get_time_values_from_xen();
13948 +       }
13949 +       sec = tv->tv_sec;
13950 +       __normalize_time(&sec, &nsec);
13951 +
13952 +       if ((xen_start_info->flags & SIF_INITDOMAIN) &&
13953 +           !independent_wallclock) {
13954 +               op.cmd = DOM0_SETTIME;
13955 +               op.u.settime.secs        = sec;
13956 +               op.u.settime.nsecs       = nsec;
13957 +               op.u.settime.system_time = shadow->system_timestamp;
13958 +               HYPERVISOR_dom0_op(&op);
13959 +               update_wallclock();
13960 +       } else if (independent_wallclock) {
13961 +               nsec -= shadow->system_timestamp;
13962 +               __normalize_time(&sec, &nsec);
13963 +               __update_wallclock(sec, nsec);
13964 +       }
13965 +
13966 +       write_sequnlock_irq(&xtime_lock);
13967 +
13968 +       put_cpu();
13969 +
13970 +       clock_was_set();
13971 +       return 0;
13972 +}
13973 +
13974 +EXPORT_SYMBOL(do_settimeofday);
13975 +
13976 +static void sync_xen_wallclock(unsigned long dummy);
13977 +static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
13978 +static void sync_xen_wallclock(unsigned long dummy)
13979 +{
13980 +       time_t sec;
13981 +       s64 nsec;
13982 +       dom0_op_t op;
13983 +
13984 +       if (!ntp_synced() || independent_wallclock ||
13985 +           !(xen_start_info->flags & SIF_INITDOMAIN))
13986 +               return;
13987 +
13988 +       write_seqlock_irq(&xtime_lock);
13989 +
13990 +       sec  = xtime.tv_sec;
13991 +       nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
13992 +       __normalize_time(&sec, &nsec);
13993 +
13994 +       op.cmd = DOM0_SETTIME;
13995 +       op.u.settime.secs        = sec;
13996 +       op.u.settime.nsecs       = nsec;
13997 +       op.u.settime.system_time = processed_system_time;
13998 +       HYPERVISOR_dom0_op(&op);
13999 +
14000 +       update_wallclock();
14001 +
14002 +       write_sequnlock_irq(&xtime_lock);
14003 +
14004 +       /* Once per minute. */
14005 +       mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
14006 +}
14007 +
14008 +static int set_rtc_mmss(unsigned long nowtime)
14009 +{
14010 +       int retval;
14011 +
14012 +       WARN_ON(irqs_disabled());
14013 +
14014 +       if (independent_wallclock || !(xen_start_info->flags & SIF_INITDOMAIN))
14015 +               return 0;
14016 +
14017 +       /* gets recalled with irq locally disabled */
14018 +       spin_lock_irq(&rtc_lock);
14019 +       if (efi_enabled)
14020 +               retval = efi_set_rtc_mmss(nowtime);
14021 +       else
14022 +               retval = mach_set_rtc_mmss(nowtime);
14023 +       spin_unlock_irq(&rtc_lock);
14024 +
14025 +       return retval;
14026 +}
14027 +
14028 +/* monotonic_clock(): returns # of nanoseconds passed since time_init()
14029 + *             Note: This function is required to return accurate
14030 + *             time even in the absence of multiple timer ticks.
14031 + */
14032 +unsigned long long monotonic_clock(void)
14033 +{
14034 +       int cpu = get_cpu();
14035 +       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
14036 +       u64 time;
14037 +       u32 local_time_version;
14038 +
14039 +       do {
14040 +               local_time_version = shadow->version;
14041 +               barrier();
14042 +               time = shadow->system_timestamp + get_nsec_offset(shadow);
14043 +               if (!time_values_up_to_date(cpu))
14044 +                       get_time_values_from_xen();
14045 +               barrier();
14046 +       } while (local_time_version != shadow->version);
14047 +
14048 +       put_cpu();
14049 +
14050 +       return time;
14051 +}
14052 +EXPORT_SYMBOL(monotonic_clock);
14053 +
14054 +unsigned long long sched_clock(void)
14055 +{
14056 +       return monotonic_clock();
14057 +}
14058 +
14059 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
14060 +unsigned long profile_pc(struct pt_regs *regs)
14061 +{
14062 +       unsigned long pc = instruction_pointer(regs);
14063 +
14064 +#ifdef __x86_64__
14065 +       /* Assume the lock function has either no stack frame or only a single word.
14066 +          This checks if the address on the stack looks like a kernel text address.
14067 +          There is a small window for false hits, but in that case the tick
14068 +          is just accounted to the spinlock function.
14069 +          It would be better to write these functions in assembler again
14070 +          and check exactly. */
14071 +       if (in_lock_functions(pc)) {
14072 +               char *v = *(char **)regs->rsp;
14073 +               if ((v >= _stext && v <= _etext) ||
14074 +                       (v >= _sinittext && v <= _einittext) ||
14075 +                       (v >= (char *)MODULES_VADDR  && v <= (char *)MODULES_END))
14076 +                       return (unsigned long)v;
14077 +               return ((unsigned long *)regs->rsp)[1];
14078 +       }
14079 +#else
14080 +       if (in_lock_functions(pc))
14081 +               return *(unsigned long *)(regs->ebp + 4);
14082 +#endif
14083 +
14084 +       return pc;
14085 +}
14086 +EXPORT_SYMBOL(profile_pc);
14087 +#endif
14088 +
14089 +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
14090 +{
14091 +       s64 delta, delta_cpu, stolen, blocked;
14092 +       u64 sched_time;
14093 +       int i, cpu = smp_processor_id();
14094 +       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
14095 +       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
14096 +
14097 +       write_seqlock(&xtime_lock);
14098 +
14099 +       do {
14100 +               get_time_values_from_xen();
14101 +
14102 +               /* Obtain a consistent snapshot of elapsed wallclock cycles. */
14103 +               delta = delta_cpu =
14104 +                       shadow->system_timestamp + get_nsec_offset(shadow);
14105 +               delta     -= processed_system_time;
14106 +               delta_cpu -= per_cpu(processed_system_time, cpu);
14107 +
14108 +               /*
14109 +                * Obtain a consistent snapshot of stolen/blocked cycles. We
14110 +                * can use state_entry_time to detect if we get preempted here.
14111 +                */
14112 +               do {
14113 +                       sched_time = runstate->state_entry_time;
14114 +                       barrier();
14115 +                       stolen = runstate->time[RUNSTATE_runnable] +
14116 +                               runstate->time[RUNSTATE_offline] -
14117 +                               per_cpu(processed_stolen_time, cpu);
14118 +                       blocked = runstate->time[RUNSTATE_blocked] -
14119 +                               per_cpu(processed_blocked_time, cpu);
14120 +                       barrier();
14121 +               } while (sched_time != runstate->state_entry_time);
14122 +       } while (!time_values_up_to_date(cpu));
14123 +
14124 +       if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
14125 +            unlikely(delta_cpu < -(s64)permitted_clock_jitter))
14126 +           && printk_ratelimit()) {
14127 +               printk("Timer ISR/%d: Time went backwards: "
14128 +                      "delta=%lld delta_cpu=%lld shadow=%lld "
14129 +                      "off=%lld processed=%lld cpu_processed=%lld\n",
14130 +                      cpu, delta, delta_cpu, shadow->system_timestamp,
14131 +                      (s64)get_nsec_offset(shadow),
14132 +                      processed_system_time,
14133 +                      per_cpu(processed_system_time, cpu));
14134 +               for (i = 0; i < num_online_cpus(); i++)
14135 +                       printk(" %d: %lld\n", i,
14136 +                              per_cpu(processed_system_time, i));
14137 +       }
14138 +
14139 +       /* System-wide jiffy work. */
14140 +       while (delta >= NS_PER_TICK) {
14141 +               delta -= NS_PER_TICK;
14142 +               processed_system_time += NS_PER_TICK;
14143 +               do_timer(regs);
14144 +       }
14145 +
14146 +       if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
14147 +               update_wallclock();
14148 +               clock_was_set();
14149 +       }
14150 +
14151 +       write_sequnlock(&xtime_lock);
14152 +
14153 +       /*
14154 +        * Account stolen ticks.
14155 +        * HACK: Passing NULL to account_steal_time()
14156 +        * ensures that the ticks are accounted as stolen.
14157 +        */
14158 +       if ((stolen > 0) && (delta_cpu > 0)) {
14159 +               delta_cpu -= stolen;
14160 +               if (unlikely(delta_cpu < 0))
14161 +                       stolen += delta_cpu; /* clamp local-time progress */
14162 +               do_div(stolen, NS_PER_TICK);
14163 +               per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
14164 +               per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
14165 +               account_steal_time(NULL, (cputime_t)stolen);
14166 +       }
14167 +
14168 +       /*
14169 +        * Account blocked ticks.
14170 +        * HACK: Passing idle_task to account_steal_time()
14171 +        * ensures that the ticks are accounted as idle/wait.
14172 +        */
14173 +       if ((blocked > 0) && (delta_cpu > 0)) {
14174 +               delta_cpu -= blocked;
14175 +               if (unlikely(delta_cpu < 0))
14176 +                       blocked += delta_cpu; /* clamp local-time progress */
14177 +               do_div(blocked, NS_PER_TICK);
14178 +               per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
14179 +               per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
14180 +               account_steal_time(idle_task(cpu), (cputime_t)blocked);
14181 +       }
14182 +
14183 +       /* Account user/system ticks. */
14184 +       if (delta_cpu > 0) {
14185 +               do_div(delta_cpu, NS_PER_TICK);
14186 +               per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
14187 +               if (user_mode(regs))
14188 +                       account_user_time(current, (cputime_t)delta_cpu);
14189 +               else
14190 +                       account_system_time(current, HARDIRQ_OFFSET,
14191 +                                           (cputime_t)delta_cpu);
14192 +       }
14193 +
14194 +       /* Local timer processing (see update_process_times()). */
14195 +       run_local_timers();
14196 +       if (rcu_pending(cpu))
14197 +               rcu_check_callbacks(cpu, user_mode(regs));
14198 +       scheduler_tick();
14199 +       run_posix_cpu_timers(current);
14200 +
14201 +       return IRQ_HANDLED;
14202 +}
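
The stolen, blocked and user/system branches of timer_interrupt() all repeat one idiom: subtract the interval from this CPU's local progress, clamp so local time never appears to run backwards, then convert nanoseconds to whole ticks with do_div(). A hedged sketch of that idiom on its own (the caller would advance the matching processed_*_time counter by the returned tick count times NS_PER_TICK and pass the count to the appropriate account_*_time() call):

/* Hedged sketch of the clamp-and-convert idiom used above. */
static unsigned long ns_to_ticks_clamped(s64 *delta_cpu, s64 interval)
{
        if (interval <= 0 || *delta_cpu <= 0)
                return 0;
        *delta_cpu -= interval;
        if (unlikely(*delta_cpu < 0))
                interval += *delta_cpu;         /* clamp local-time progress */
        do_div(interval, NS_PER_TICK);          /* ns -> whole ticks */
        return (unsigned long)interval;
}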
14203 +
14204 +static void init_missing_ticks_accounting(int cpu)
14205 +{
14206 +       struct vcpu_register_runstate_memory_area area;
14207 +       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
14208 +
14209 +       memset(runstate, 0, sizeof(*runstate));
14210 +
14211 +       area.addr.v = runstate;
14212 +       HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
14213 +
14214 +       per_cpu(processed_blocked_time, cpu) =
14215 +               runstate->time[RUNSTATE_blocked];
14216 +       per_cpu(processed_stolen_time, cpu) =
14217 +               runstate->time[RUNSTATE_runnable] +
14218 +               runstate->time[RUNSTATE_offline];
14219 +}
14220 +
14221 +/* not static: needed by APM */
14222 +unsigned long get_cmos_time(void)
14223 +{
14224 +       unsigned long retval;
14225 +
14226 +       spin_lock(&rtc_lock);
14227 +
14228 +       if (efi_enabled)
14229 +               retval = efi_get_time();
14230 +       else
14231 +               retval = mach_get_cmos_time();
14232 +
14233 +       spin_unlock(&rtc_lock);
14234 +
14235 +       return retval;
14236 +}
14237 +EXPORT_SYMBOL(get_cmos_time);
14238 +
14239 +static void sync_cmos_clock(unsigned long dummy);
14240 +
14241 +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
14242 +
14243 +static void sync_cmos_clock(unsigned long dummy)
14244 +{
14245 +       struct timeval now, next;
14246 +       int fail = 1;
14247 +
14248 +       /*
14249 +        * If we have an externally synchronized Linux clock, then update
14250 +        * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
14251 +        * called as close as possible to 500 ms before the new second starts.
14252 +        * This code is run on a timer.  If the clock is set, that timer
14253 +        * may not expire at the correct time.  Thus, we adjust...
14254 +        */
14255 +       if (!ntp_synced())
14256 +               /*
14257 +                * Not synced, exit, do not restart a timer (if one is
14258 +                * running, let it run out).
14259 +                */
14260 +               return;
14261 +
14262 +       do_gettimeofday(&now);
14263 +       if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
14264 +           now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
14265 +               fail = set_rtc_mmss(now.tv_sec);
14266 +
14267 +       next.tv_usec = USEC_AFTER - now.tv_usec;
14268 +       if (next.tv_usec <= 0)
14269 +               next.tv_usec += USEC_PER_SEC;
14270 +
14271 +       if (!fail)
14272 +               next.tv_sec = 659;
14273 +       else
14274 +               next.tv_sec = 0;
14275 +
14276 +       if (next.tv_usec >= USEC_PER_SEC) {
14277 +               next.tv_sec++;
14278 +               next.tv_usec -= USEC_PER_SEC;
14279 +       }
14280 +       mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
14281 +}
14282 +
14283 +void notify_arch_cmos_timer(void)
14284 +{
14285 +       mod_timer(&sync_cmos_timer, jiffies + 1);
14286 +       mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
14287 +}
14288 +
14289 +static long clock_cmos_diff, sleep_start;
14290 +
14291 +static struct timer_opts *last_timer;
14292 +static int timer_suspend(struct sys_device *dev, pm_message_t state)
14293 +{
14294 +       /*
14295 +        * Estimate time zone so that set_time can update the clock
14296 +        */
14297 +       clock_cmos_diff = -get_cmos_time();
14298 +       clock_cmos_diff += get_seconds();
14299 +       sleep_start = get_cmos_time();
14300 +       last_timer = cur_timer;
14301 +       cur_timer = &timer_none;
14302 +       if (last_timer->suspend)
14303 +               last_timer->suspend(state);
14304 +       return 0;
14305 +}
14306 +
14307 +static int timer_resume(struct sys_device *dev)
14308 +{
14309 +       unsigned long flags;
14310 +       unsigned long sec;
14311 +       unsigned long sleep_length;
14312 +
14313 +#ifdef CONFIG_HPET_TIMER
14314 +       if (is_hpet_enabled())
14315 +               hpet_reenable();
14316 +#endif
14317 +       sec = get_cmos_time() + clock_cmos_diff;
14318 +       sleep_length = (get_cmos_time() - sleep_start) * HZ;
14319 +       write_seqlock_irqsave(&xtime_lock, flags);
14320 +       xtime.tv_sec = sec;
14321 +       xtime.tv_nsec = 0;
14322 +       jiffies_64 += sleep_length;
14323 +       wall_jiffies += sleep_length;
14324 +       write_sequnlock_irqrestore(&xtime_lock, flags);
14325 +       if (last_timer->resume)
14326 +               last_timer->resume();
14327 +       cur_timer = last_timer;
14328 +       last_timer = NULL;
14329 +       touch_softlockup_watchdog();
14330 +       return 0;
14331 +}
14332 +
14333 +static struct sysdev_class timer_sysclass = {
14334 +       .resume = timer_resume,
14335 +       .suspend = timer_suspend,
14336 +       set_kset_name("timer"),
14337 +};
14338 +
14339 +
14340 +/* XXX this driverfs stuff should probably go elsewhere later -john */
14341 +static struct sys_device device_timer = {
14342 +       .id     = 0,
14343 +       .cls    = &timer_sysclass,
14344 +};
14345 +
14346 +static int time_init_device(void)
14347 +{
14348 +       int error = sysdev_class_register(&timer_sysclass);
14349 +       if (!error)
14350 +               error = sysdev_register(&device_timer);
14351 +       return error;
14352 +}
14353 +
14354 +device_initcall(time_init_device);
14355 +
14356 +#ifdef CONFIG_HPET_TIMER
14357 +extern void (*late_time_init)(void);
14358 +/* Duplicate of time_init() below, with hpet_enable part added */
14359 +static void __init hpet_time_init(void)
14360 +{
14361 +       xtime.tv_sec = get_cmos_time();
14362 +       xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
14363 +       set_normalized_timespec(&wall_to_monotonic,
14364 +               -xtime.tv_sec, -xtime.tv_nsec);
14365 +
14366 +       if ((hpet_enable() >= 0) && hpet_use_timer) {
14367 +               printk("Using HPET for base-timer\n");
14368 +       }
14369 +
14370 +       cur_timer = select_timer();
14371 +       printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
14372 +
14373 +       time_init_hook();
14374 +}
14375 +#endif
14376 +
14377 +/* Dynamically-mapped IRQ. */
14378 +DEFINE_PER_CPU(int, timer_irq);
14379 +
14380 +extern void (*late_time_init)(void);
14381 +static void setup_cpu0_timer_irq(void)
14382 +{
14383 +       per_cpu(timer_irq, 0) =
14384 +               bind_virq_to_irqhandler(
14385 +                       VIRQ_TIMER,
14386 +                       0,
14387 +                       timer_interrupt,
14388 +                       SA_INTERRUPT,
14389 +                       "timer0",
14390 +                       NULL);
14391 +       BUG_ON(per_cpu(timer_irq, 0) < 0);
14392 +}
14393 +
14394 +void __init time_init(void)
14395 +{
14396 +#ifdef CONFIG_HPET_TIMER
14397 +       if (is_hpet_capable()) {
14398 +               /*
14399 +                * HPET initialization needs to do memory-mapped io. So, let
14400 +                * us do a late initialization after mem_init().
14401 +                */
14402 +               late_time_init = hpet_time_init;
14403 +               return;
14404 +       }
14405 +#endif
14406 +       get_time_values_from_xen();
14407 +
14408 +       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
14409 +       per_cpu(processed_system_time, 0) = processed_system_time;
14410 +       init_missing_ticks_accounting(0);
14411 +
14412 +       update_wallclock();
14413 +
14414 +       init_cpu_khz();
14415 +       printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
14416 +              cpu_khz / 1000, cpu_khz % 1000);
14417 +
14418 +#if defined(__x86_64__)
14419 +       vxtime.mode = VXTIME_TSC;
14420 +       vxtime.quot = (1000000L << 32) / vxtime_hz;
14421 +       vxtime.tsc_quot = (1000L << 32) / cpu_khz;
14422 +       sync_core();
14423 +       rdtscll(vxtime.last_tsc);
14424 +#endif
14425 +
14426 +       /* Cannot request_irq() until kmem is initialised. */
14427 +       late_time_init = setup_cpu0_timer_irq;
14428 +}
14429 +
14430 +/* Convert jiffies to system time. */
14431 +u64 jiffies_to_st(unsigned long j)
14432 +{
14433 +       unsigned long seq;
14434 +       long delta;
14435 +       u64 st;
14436 +
14437 +       do {
14438 +               seq = read_seqbegin(&xtime_lock);
14439 +               delta = j - jiffies;
14440 +               /* NB. The next check can trigger in some wrap-around cases,
14441 +                * but that's ok: we'll just end up with a shorter timeout. */
14442 +               if (delta < 1)
14443 +                       delta = 1;
14444 +               st = processed_system_time + (delta * (u64)NS_PER_TICK);
14445 +       } while (read_seqretry(&xtime_lock, seq));
14446 +
14447 +       return st;
14448 +}
14449 +EXPORT_SYMBOL(jiffies_to_st);
14450 +
14451 +/*
14452 + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
14453 + * These functions are based on implementations from arch/s390/kernel/time.c
14454 + */
14455 +void stop_hz_timer(void)
14456 +{
14457 +       unsigned int cpu = smp_processor_id();
14458 +       unsigned long j;
14459 +
14460 +       /* We must do this /before/ checking rcu_pending(). */
14461 +       cpu_set(cpu, nohz_cpu_mask);
14462 +       smp_mb();
14463 +
14464 +       /* Leave ourselves in 'tick mode' if rcu or softirq pending. */
14465 +       if (rcu_pending(cpu) || local_softirq_pending()) {
14466 +               cpu_clear(cpu, nohz_cpu_mask);
14467 +               j = jiffies + 1;
14468 +       } else {
14469 +               j = next_timer_interrupt();
14470 +       }
14471 +
14472 +       BUG_ON(HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0);
14473 +}
14474 +
14475 +void start_hz_timer(void)
14476 +{
14477 +       cpu_clear(smp_processor_id(), nohz_cpu_mask);
14478 +}
14479 +
14480 +/* No locking required. We are the only CPU running, and interrupts are off. */
14481 +void time_resume(void)
14482 +{
14483 +       init_cpu_khz();
14484 +
14485 +       get_time_values_from_xen();
14486 +
14487 +       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
14488 +       per_cpu(processed_system_time, 0) = processed_system_time;
14489 +       init_missing_ticks_accounting(0);
14490 +
14491 +       update_wallclock();
14492 +}
14493 +
14494 +#ifdef CONFIG_SMP
14495 +static char timer_name[NR_CPUS][15];
14496 +
14497 +void local_setup_timer(unsigned int cpu)
14498 +{
14499 +       int seq;
14500 +
14501 +       BUG_ON(cpu == 0);
14502 +
14503 +       do {
14504 +               seq = read_seqbegin(&xtime_lock);
14505 +               /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
14506 +               per_cpu(processed_system_time, cpu) =
14507 +                       per_cpu(shadow_time, 0).system_timestamp;
14508 +               init_missing_ticks_accounting(cpu);
14509 +       } while (read_seqretry(&xtime_lock, seq));
14510 +
14511 +       sprintf(timer_name[cpu], "timer%d", cpu);
14512 +       per_cpu(timer_irq, cpu) =
14513 +               bind_virq_to_irqhandler(
14514 +                       VIRQ_TIMER,
14515 +                       cpu,
14516 +                       timer_interrupt,
14517 +                       SA_INTERRUPT,
14518 +                       timer_name[cpu],
14519 +                       NULL);
14520 +       BUG_ON(per_cpu(timer_irq, cpu) < 0);
14521 +}
14522 +
14523 +void local_teardown_timer(unsigned int cpu)
14524 +{
14525 +       BUG_ON(cpu == 0);
14526 +       unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
14527 +}
14528 +#endif
14529 +
14530 +/*
14531 + * /proc/sys/xen: This really belongs in another file, but it can stay
14532 + * here for now.
14533 + */
14534 +static ctl_table xen_subtable[] = {
14535 +       {
14536 +               .ctl_name       = 1,
14537 +               .procname       = "independent_wallclock",
14538 +               .data           = &independent_wallclock,
14539 +               .maxlen         = sizeof(independent_wallclock),
14540 +               .mode           = 0644,
14541 +               .proc_handler   = proc_dointvec
14542 +       },
14543 +       {
14544 +               .ctl_name       = 2,
14545 +               .procname       = "permitted_clock_jitter",
14546 +               .data           = &permitted_clock_jitter,
14547 +               .maxlen         = sizeof(permitted_clock_jitter),
14548 +               .mode           = 0644,
14549 +               .proc_handler   = proc_doulongvec_minmax
14550 +       },
14551 +       { 0 }
14552 +};
14553 +static ctl_table xen_table[] = {
14554 +       {
14555 +               .ctl_name       = 123,
14556 +               .procname       = "xen",
14557 +               .mode           = 0555,
14558 +               .child          = xen_subtable},
14559 +       { 0 }
14560 +};
14561 +static int __init xen_sysctl_init(void)
14562 +{
14563 +       (void)register_sysctl_table(xen_table, 0);
14564 +       return 0;
14565 +}
14566 +__initcall(xen_sysctl_init);
14567 +
14568 +/*
14569 + * Local variables:
14570 + *  c-file-style: "linux"
14571 + *  indent-tabs-mode: t
14572 + *  c-indent-level: 8
14573 + *  c-basic-offset: 8
14574 + *  tab-width: 8
14575 + * End:
14576 + */
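
The sysctl table at the end of the file makes both boot parameters tunable at run time under /proc/sys/xen/. A hedged userspace sketch of flipping the wallclock knob (path as registered above; error handling kept minimal):

/* Hedged sketch: toggling a knob registered by xen_sysctl_init() above. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/xen/independent_wallclock", "w");
        if (!f) {
                perror("independent_wallclock");
                return 1;
        }
        fputs("1\n", f);        /* stop tracking Xen's wallclock */
        fclose(f);
        return 0;
}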
14577 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/traps-xen.c linux-2.6.16/arch/i386/kernel/traps-xen.c
14578 --- linux-2.6.16.orig/arch/i386/kernel/traps-xen.c      1970-01-01 01:00:00.000000000 +0100
14579 +++ linux-2.6.16/arch/i386/kernel/traps-xen.c   2006-06-26 09:51:32.000000000 +0200
14580 @@ -0,0 +1,1094 @@
14581 +/*
14582 + *  linux/arch/i386/traps.c
14583 + *
14584 + *  Copyright (C) 1991, 1992  Linus Torvalds
14585 + *
14586 + *  Pentium III FXSR, SSE support
14587 + *     Gareth Hughes <gareth@valinux.com>, May 2000
14588 + */
14589 +
14590 +/*
14591 + * 'Traps.c' handles hardware traps and faults after we have saved some
14592 + * state in 'asm.s'.
14593 + */
14594 +#include <linux/config.h>
14595 +#include <linux/sched.h>
14596 +#include <linux/kernel.h>
14597 +#include <linux/string.h>
14598 +#include <linux/errno.h>
14599 +#include <linux/timer.h>
14600 +#include <linux/mm.h>
14601 +#include <linux/init.h>
14602 +#include <linux/delay.h>
14603 +#include <linux/spinlock.h>
14604 +#include <linux/interrupt.h>
14605 +#include <linux/highmem.h>
14606 +#include <linux/kallsyms.h>
14607 +#include <linux/ptrace.h>
14608 +#include <linux/utsname.h>
14609 +#include <linux/kprobes.h>
14610 +#include <linux/kexec.h>
14611 +
14612 +#ifdef CONFIG_EISA
14613 +#include <linux/ioport.h>
14614 +#include <linux/eisa.h>
14615 +#endif
14616 +
14617 +#ifdef CONFIG_MCA
14618 +#include <linux/mca.h>
14619 +#endif
14620 +
14621 +#include <asm/processor.h>
14622 +#include <asm/system.h>
14623 +#include <asm/uaccess.h>
14624 +#include <asm/io.h>
14625 +#include <asm/atomic.h>
14626 +#include <asm/debugreg.h>
14627 +#include <asm/desc.h>
14628 +#include <asm/i387.h>
14629 +#include <asm/nmi.h>
14630 +
14631 +#include <asm/smp.h>
14632 +#include <asm/arch_hooks.h>
14633 +#include <asm/kdebug.h>
14634 +
14635 +#include <linux/module.h>
14636 +
14637 +#include "mach_traps.h"
14638 +
14639 +asmlinkage int system_call(void);
14640 +
14641 +struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
14642 +               { 0, 0 }, { 0, 0 } };
14643 +
14644 +/* Do we ignore FPU interrupts? */
14645 +char ignore_fpu_irq = 0;
14646 +
14647 +#ifndef CONFIG_X86_NO_IDT
14648 +/*
14649 + * The IDT has to be page-aligned to simplify the Pentium
14650 + * F0 0F bug workaround.. We have a special link segment
14651 + * for this.
14652 + */
14653 +struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
14654 +#endif
14655 +
14656 +asmlinkage void divide_error(void);
14657 +asmlinkage void debug(void);
14658 +asmlinkage void nmi(void);
14659 +asmlinkage void int3(void);
14660 +asmlinkage void overflow(void);
14661 +asmlinkage void bounds(void);
14662 +asmlinkage void invalid_op(void);
14663 +asmlinkage void device_not_available(void);
14664 +asmlinkage void coprocessor_segment_overrun(void);
14665 +asmlinkage void invalid_TSS(void);
14666 +asmlinkage void segment_not_present(void);
14667 +asmlinkage void stack_segment(void);
14668 +asmlinkage void general_protection(void);
14669 +asmlinkage void page_fault(void);
14670 +asmlinkage void coprocessor_error(void);
14671 +asmlinkage void simd_coprocessor_error(void);
14672 +asmlinkage void alignment_check(void);
14673 +#ifndef CONFIG_XEN
14674 +asmlinkage void spurious_interrupt_bug(void);
14675 +#else
14676 +asmlinkage void fixup_4gb_segment(void);
14677 +#endif
14678 +asmlinkage void machine_check(void);
14679 +
14680 +static int kstack_depth_to_print = 24;
14681 +struct notifier_block *i386die_chain;
14682 +static DEFINE_SPINLOCK(die_notifier_lock);
14683 +
14684 +int register_die_notifier(struct notifier_block *nb)
14685 +{
14686 +       int err = 0;
14687 +       unsigned long flags;
14688 +       spin_lock_irqsave(&die_notifier_lock, flags);
14689 +       err = notifier_chain_register(&i386die_chain, nb);
14690 +       spin_unlock_irqrestore(&die_notifier_lock, flags);
14691 +       return err;
14692 +}
14693 +EXPORT_SYMBOL(register_die_notifier);
14694 +
14695 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
14696 +{
14697 +       return  p > (void *)tinfo &&
14698 +               p < (void *)tinfo + THREAD_SIZE - 3;
14699 +}
14700 +
14701 +static void print_addr_and_symbol(unsigned long addr, char *log_lvl)
14702 +{
14703 +       printk(log_lvl);
14704 +       printk(" [<%08lx>] ", addr);
14705 +       print_symbol("%s", addr);
14706 +       printk("\n");
14707 +}
14708 +
14709 +static inline unsigned long print_context_stack(struct thread_info *tinfo,
14710 +                               unsigned long *stack, unsigned long ebp,
14711 +                               char *log_lvl)
14712 +{
14713 +       unsigned long addr;
14714 +
14715 +#ifdef CONFIG_FRAME_POINTER
14716 +       while (valid_stack_ptr(tinfo, (void *)ebp)) {
14717 +               addr = *(unsigned long *)(ebp + 4);
14718 +               print_addr_and_symbol(addr, log_lvl);
14719 +               ebp = *(unsigned long *)ebp;
14720 +       }
14721 +#else
14722 +       while (valid_stack_ptr(tinfo, stack)) {
14723 +               addr = *stack++;
14724 +               if (__kernel_text_address(addr))
14725 +                       print_addr_and_symbol(addr, log_lvl);
14726 +       }
14727 +#endif
14728 +       return ebp;
14729 +}
14730 +
14731 +static void show_trace_log_lvl(struct task_struct *task,
14732 +                              unsigned long *stack, char *log_lvl)
14733 +{
14734 +       unsigned long ebp;
14735 +
14736 +       if (!task)
14737 +               task = current;
14738 +
14739 +       if (task == current) {
14740 +               /* Grab ebp right from our regs */
14741 +               asm ("movl %%ebp, %0" : "=r" (ebp) : );
14742 +       } else {
14743 +               /* ebp is the last reg pushed by switch_to */
14744 +               ebp = *(unsigned long *) task->thread.esp;
14745 +       }
14746 +
14747 +       while (1) {
14748 +               struct thread_info *context;
14749 +               context = (struct thread_info *)
14750 +                       ((unsigned long)stack & (~(THREAD_SIZE - 1)));
14751 +               ebp = print_context_stack(context, stack, ebp, log_lvl);
14752 +               stack = (unsigned long*)context->previous_esp;
14753 +               if (!stack)
14754 +                       break;
14755 +               printk(log_lvl);
14756 +               printk(" =======================\n");
14757 +       }
14758 +}
14759 +
14760 +void show_trace(struct task_struct *task, unsigned long * stack)
14761 +{
14762 +       show_trace_log_lvl(task, stack, "");
14763 +}
14764 +
14765 +static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,
14766 +                              char *log_lvl)
14767 +{
14768 +       unsigned long *stack;
14769 +       int i;
14770 +
14771 +       if (esp == NULL) {
14772 +               if (task)
14773 +                       esp = (unsigned long*)task->thread.esp;
14774 +               else
14775 +                       esp = (unsigned long *)&esp;
14776 +       }
14777 +
14778 +       stack = esp;
14779 +       printk(log_lvl);
14780 +       for(i = 0; i < kstack_depth_to_print; i++) {
14781 +               if (kstack_end(stack))
14782 +                       break;
14783 +               if (i && ((i % 8) == 0)) {
14784 +                       printk("\n");
14785 +                       printk(log_lvl);
14786 +                       printk("       ");
14787 +               }
14788 +               printk("%08lx ", *stack++);
14789 +       }
14790 +       printk("\n");
14791 +       printk(log_lvl);
14792 +       printk("Call Trace:\n");
14793 +       show_trace_log_lvl(task, esp, log_lvl);
14794 +}
14795 +
14796 +void show_stack(struct task_struct *task, unsigned long *esp)
14797 +{
14798 +       show_stack_log_lvl(task, esp, "");
14799 +}
14800 +
14801 +/*
14802 + * The architecture-independent dump_stack generator
14803 + */
14804 +void dump_stack(void)
14805 +{
14806 +       unsigned long stack;
14807 +
14808 +       show_trace(current, &stack);
14809 +}
14810 +
14811 +EXPORT_SYMBOL(dump_stack);
14812 +
14813 +void show_registers(struct pt_regs *regs)
14814 +{
14815 +       int i;
14816 +       int in_kernel = 1;
14817 +       unsigned long esp;
14818 +       unsigned short ss;
14819 +
14820 +       esp = (unsigned long) (&regs->esp);
14821 +       savesegment(ss, ss);
14822 +       if (user_mode(regs)) {
14823 +               in_kernel = 0;
14824 +               esp = regs->esp;
14825 +               ss = regs->xss & 0xffff;
14826 +       }
14827 +       print_modules();
14828 +       printk(KERN_EMERG "CPU:    %d\nEIP:    %04x:[<%08lx>]    %s VLI\n"
14829 +                       "EFLAGS: %08lx   (%s %.*s) \n",
14830 +               smp_processor_id(), 0xffff & regs->xcs, regs->eip,
14831 +               print_tainted(), regs->eflags, system_utsname.release,
14832 +               (int)strcspn(system_utsname.version, " "),
14833 +               system_utsname.version);
14834 +       print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
14835 +       printk(KERN_EMERG "eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
14836 +               regs->eax, regs->ebx, regs->ecx, regs->edx);
14837 +       printk(KERN_EMERG "esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
14838 +               regs->esi, regs->edi, regs->ebp, esp);
14839 +       printk(KERN_EMERG "ds: %04x   es: %04x   ss: %04x\n",
14840 +               regs->xds & 0xffff, regs->xes & 0xffff, ss);
14841 +       printk(KERN_EMERG "Process %s (pid: %d, threadinfo=%p task=%p)",
14842 +               current->comm, current->pid, current_thread_info(), current);
14843 +       /*
14844 +        * When in-kernel, we also print out the stack and code at the
14845 +        * time of the fault..
14846 +        */
14847 +       if (in_kernel) {
14848 +               u8 __user *eip;
14849 +
14850 +               printk("\n" KERN_EMERG "Stack: ");
14851 +               show_stack_log_lvl(NULL, (unsigned long *)esp, KERN_EMERG);
14852 +
14853 +               printk(KERN_EMERG "Code: ");
14854 +
14855 +               eip = (u8 __user *)regs->eip - 43;
14856 +               for (i = 0; i < 64; i++, eip++) {
14857 +                       unsigned char c;
14858 +
14859 +                       if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
14860 +                               printk(" Bad EIP value.");
14861 +                               break;
14862 +                       }
14863 +                       if (eip == (u8 __user *)regs->eip)
14864 +                               printk("<%02x> ", c);
14865 +                       else
14866 +                               printk("%02x ", c);
14867 +               }
14868 +       }
14869 +       printk("\n");
14870 +}      
14871 +
14872 +static void handle_BUG(struct pt_regs *regs)
14873 +{
14874 +       unsigned short ud2;
14875 +       unsigned short line;
14876 +       char *file;
14877 +       char c;
14878 +       unsigned long eip;
14879 +
14880 +       eip = regs->eip;
14881 +
14882 +       if (eip < PAGE_OFFSET)
14883 +               goto no_bug;
14884 +       if (__get_user(ud2, (unsigned short __user *)eip))
14885 +               goto no_bug;
14886 +       if (ud2 != 0x0b0f)
14887 +               goto no_bug;
14888 +       if (__get_user(line, (unsigned short __user *)(eip + 2)))
14889 +               goto bug;
14890 +       if (__get_user(file, (char * __user *)(eip + 4)) ||
14891 +               (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
14892 +               file = "<bad filename>";
14893 +
14894 +       printk(KERN_EMERG "------------[ cut here ]------------\n");
14895 +       printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
14896 +
14897 +no_bug:
14898 +       return;
14899 +
14900 +       /* Here we know it was a BUG but file-n-line is unavailable */
14901 +bug:
14902 +       printk(KERN_EMERG "Kernel BUG\n");
14903 +}
14904 +
14905 +/* This is gone through when something in the kernel
14906 + * has done something bad and is about to be terminated.
14907 + */
14908 +void die(const char * str, struct pt_regs * regs, long err)
14909 +{
14910 +       static struct {
14911 +               spinlock_t lock;
14912 +               u32 lock_owner;
14913 +               int lock_owner_depth;
14914 +       } die = {
14915 +               .lock =                 SPIN_LOCK_UNLOCKED,
14916 +               .lock_owner =           -1,
14917 +               .lock_owner_depth =     0
14918 +       };
14919 +       static int die_counter;
14920 +       unsigned long flags;
14921 +
14922 +       if (die.lock_owner != raw_smp_processor_id()) {
14923 +               console_verbose();
14924 +               spin_lock_irqsave(&die.lock, flags);
14925 +               die.lock_owner = smp_processor_id();
14926 +               die.lock_owner_depth = 0;
14927 +               bust_spinlocks(1);
14928 +       }
14929 +       else
14930 +               local_save_flags(flags);
14931 +
14932 +       if (++die.lock_owner_depth < 3) {
14933 +               int nl = 0;
14934 +               handle_BUG(regs);
14935 +               printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
14936 +#ifdef CONFIG_PREEMPT
14937 +               printk(KERN_EMERG "PREEMPT ");
14938 +               nl = 1;
14939 +#endif
14940 +#ifdef CONFIG_SMP
14941 +               if (!nl)
14942 +                       printk(KERN_EMERG);
14943 +               printk("SMP ");
14944 +               nl = 1;
14945 +#endif
14946 +#ifdef CONFIG_DEBUG_PAGEALLOC
14947 +               if (!nl)
14948 +                       printk(KERN_EMERG);
14949 +               printk("DEBUG_PAGEALLOC");
14950 +               nl = 1;
14951 +#endif
14952 +               if (nl)
14953 +                       printk("\n");
14954 +               notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
14955 +               show_registers(regs);
14956 +       } else
14957 +               printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
14958 +
14959 +       bust_spinlocks(0);
14960 +       die.lock_owner = -1;
14961 +       spin_unlock_irqrestore(&die.lock, flags);
14962 +
14963 +       if (kexec_should_crash(current))
14964 +               crash_kexec(regs);
14965 +
14966 +       if (in_interrupt())
14967 +               panic("Fatal exception in interrupt");
14968 +
14969 +       if (panic_on_oops) {
14970 +               printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n");
14971 +               ssleep(5);
14972 +               panic("Fatal exception");
14973 +       }
14974 +       do_exit(SIGSEGV);
14975 +}
14976 +
14977 +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
14978 +{
14979 +       if (!user_mode_vm(regs))
14980 +               die(str, regs, err);
14981 +}
14982 +
14983 +static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
14984 +                             struct pt_regs * regs, long error_code,
14985 +                             siginfo_t *info)
14986 +{
14987 +       struct task_struct *tsk = current;
14988 +       tsk->thread.error_code = error_code;
14989 +       tsk->thread.trap_no = trapnr;
14990 +
14991 +       if (regs->eflags & VM_MASK) {
14992 +               if (vm86)
14993 +                       goto vm86_trap;
14994 +               goto trap_signal;
14995 +       }
14996 +
14997 +       if (!user_mode(regs))
14998 +               goto kernel_trap;
14999 +
15000 +       trap_signal: {
15001 +               if (info)
15002 +                       force_sig_info(signr, info, tsk);
15003 +               else
15004 +                       force_sig(signr, tsk);
15005 +               return;
15006 +       }
15007 +
15008 +       kernel_trap: {
15009 +               if (!fixup_exception(regs))
15010 +                       die(str, regs, error_code);
15011 +               return;
15012 +       }
15013 +
15014 +       vm86_trap: {
15015 +               int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
15016 +               if (ret) goto trap_signal;
15017 +               return;
15018 +       }
15019 +}
15020 +
15021 +#define DO_ERROR(trapnr, signr, str, name) \
15022 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15023 +{ \
15024 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15025 +                                               == NOTIFY_STOP) \
15026 +               return; \
15027 +       do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
15028 +}
15029 +
15030 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
15031 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15032 +{ \
15033 +       siginfo_t info; \
15034 +       info.si_signo = signr; \
15035 +       info.si_errno = 0; \
15036 +       info.si_code = sicode; \
15037 +       info.si_addr = (void __user *)siaddr; \
15038 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15039 +                                               == NOTIFY_STOP) \
15040 +               return; \
15041 +       do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
15042 +}
15043 +
15044 +#define DO_VM86_ERROR(trapnr, signr, str, name) \
15045 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15046 +{ \
15047 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15048 +                                               == NOTIFY_STOP) \
15049 +               return; \
15050 +       do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
15051 +}
15052 +
15053 +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
15054 +fastcall void do_##name(struct pt_regs * regs, long error_code) \
15055 +{ \
15056 +       siginfo_t info; \
15057 +       info.si_signo = signr; \
15058 +       info.si_errno = 0; \
15059 +       info.si_code = sicode; \
15060 +       info.si_addr = (void __user *)siaddr; \
15061 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
15062 +                                               == NOTIFY_STOP) \
15063 +               return; \
15064 +       do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
15065 +}
15066 +
15067 +DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->eip)
15068 +#ifndef CONFIG_KPROBES
15069 +DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
15070 +#endif
15071 +DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
15072 +DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
15073 +DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip)
15074 +DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
15075 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
15076 +DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
15077 +DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
15078 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
15079 +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
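+
+/*
+ * For example, DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) above
+ * expands (mechanically, from the macro definition) to:
+ *
+ *     fastcall void do_invalid_TSS(struct pt_regs * regs, long error_code)
+ *     {
+ *             if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code,
+ *                                             10, SIGSEGV) == NOTIFY_STOP)
+ *                     return;
+ *             do_trap(10, SIGSEGV, "invalid TSS", 0, regs, error_code, NULL);
+ *     }
+ */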
15080 +
15081 +fastcall void __kprobes do_general_protection(struct pt_regs * regs,
15082 +                                             long error_code)
15083 +{
15084 +       current->thread.error_code = error_code;
15085 +       current->thread.trap_no = 13;
15086 +
15087 +       if (regs->eflags & VM_MASK)
15088 +               goto gp_in_vm86;
15089 +
15090 +       if (!user_mode(regs))
15091 +               goto gp_in_kernel;
15092 +
15093 +       current->thread.error_code = error_code;
15094 +       current->thread.trap_no = 13;
15095 +       force_sig(SIGSEGV, current);
15096 +       return;
15097 +
15098 +gp_in_vm86:
15099 +       local_irq_enable();
15100 +       handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
15101 +       return;
15102 +
15103 +gp_in_kernel:
15104 +       if (!fixup_exception(regs)) {
15105 +               if (notify_die(DIE_GPF, "general protection fault", regs,
15106 +                               error_code, 13, SIGSEGV) == NOTIFY_STOP)
15107 +                       return;
15108 +               die("general protection fault", regs, error_code);
15109 +       }
15110 +}
15111 +
15112 +static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
15113 +{
15114 +       printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
15115 +                       "to continue\n");
15116 +       printk(KERN_EMERG "You probably have a hardware problem with your RAM "
15117 +                       "chips\n");
15118 +
15119 +       /* Clear and disable the memory parity error line. */
15120 +       clear_mem_error(reason);
15121 +}
15122 +
15123 +static void io_check_error(unsigned char reason, struct pt_regs * regs)
15124 +{
15125 +       printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
15126 +       show_registers(regs);
15127 +
15128 +       /* Re-enable the IOCK line, wait for a few seconds */
15129 +       clear_io_check_error(reason);
15130 +}
15131 +
15132 +static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
15133 +{
15134 +#ifdef CONFIG_MCA
15135 +       /* Might actually be able to figure out what the guilty party
15136 +        * is. */
15137 +       if( MCA_bus ) {
15138 +               mca_handle_nmi();
15139 +               return;
15140 +       }
15141 +#endif
15142 +       printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
15143 +               reason, smp_processor_id());
15144 +       printk("Dazed and confused, but trying to continue\n");
15145 +       printk("Do you have a strange power saving mode enabled?\n");
15146 +}
15147 +
15148 +static DEFINE_SPINLOCK(nmi_print_lock);
15149 +
15150 +void die_nmi (struct pt_regs *regs, const char *msg)
15151 +{
15152 +       if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) ==
15153 +           NOTIFY_STOP)
15154 +               return;
15155 +
15156 +       spin_lock(&nmi_print_lock);
15157 +       /*
15158 +        * We are in trouble anyway, let's at least try
15159 +        * to get a message out.
15160 +        */
15161 +       bust_spinlocks(1);
15162 +       printk(KERN_EMERG "%s", msg);
15163 +       printk(" on CPU%d, eip %08lx, registers:\n",
15164 +               smp_processor_id(), regs->eip);
15165 +       show_registers(regs);
15166 +       printk(KERN_EMERG "console shuts up ...\n");
15167 +       console_silent();
15168 +       spin_unlock(&nmi_print_lock);
15169 +       bust_spinlocks(0);
15170 +
15171 +       /* If we are in the kernel we are probably nested up pretty badly
15172 +        * and might as well get out now while we still can.
15173 +        */
15174 +       if (!user_mode(regs)) {
15175 +               current->thread.trap_no = 2;
15176 +               crash_kexec(regs);
15177 +       }
15178 +
15179 +       do_exit(SIGSEGV);
15180 +}
15181 +
15182 +static void default_do_nmi(struct pt_regs * regs)
15183 +{
15184 +       unsigned char reason = 0;
15185 +
15186 +       /* Only the BSP gets external NMIs from the system.  */
15187 +       if (!smp_processor_id())
15188 +               reason = get_nmi_reason();
15189 +
15190 +       if (!(reason & 0xc0)) {
15191 +               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
15192 +                                                       == NOTIFY_STOP)
15193 +                       return;
15194 +#ifdef CONFIG_X86_LOCAL_APIC
15195 +               /*
15196 +                * Ok, so this is none of the documented NMI sources,
15197 +                * so it must be the NMI watchdog.
15198 +                */
15199 +               if (nmi_watchdog) {
15200 +                       nmi_watchdog_tick(regs);
15201 +                       return;
15202 +               }
15203 +#endif
15204 +               unknown_nmi_error(reason, regs);
15205 +               return;
15206 +       }
15207 +       if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP)
15208 +               return;
15209 +       if (reason & 0x80)
15210 +               mem_parity_error(reason, regs);
15211 +       if (reason & 0x40)
15212 +               io_check_error(reason, regs);
15213 +       /*
15214 +        * Reassert NMI in case it became active meanwhile
15215 +        * as it's edge-triggered.
15216 +        */
15217 +       reassert_nmi();
15218 +}
15219 +
15220 +static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
15221 +{
15222 +       return 0;
15223 +}
15224 +
15225 +static nmi_callback_t nmi_callback = dummy_nmi_callback;
15226 +
15227 +fastcall void do_nmi(struct pt_regs * regs, long error_code)
15228 +{
15229 +       int cpu;
15230 +
15231 +       nmi_enter();
15232 +
15233 +       cpu = smp_processor_id();
15234 +
15235 +       ++nmi_count(cpu);
15236 +
15237 +       if (!rcu_dereference(nmi_callback)(regs, cpu))
15238 +               default_do_nmi(regs);
15239 +
15240 +       nmi_exit();
15241 +}
15242 +
15243 +void set_nmi_callback(nmi_callback_t callback)
15244 +{
15245 +       rcu_assign_pointer(nmi_callback, callback);
15246 +}
15247 +EXPORT_SYMBOL_GPL(set_nmi_callback);
15248 +
15249 +void unset_nmi_callback(void)
15250 +{
15251 +       nmi_callback = dummy_nmi_callback;
15252 +}
15253 +EXPORT_SYMBOL_GPL(unset_nmi_callback);
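+
+/*
+ * Sketch of how a client (a hypothetical NMI-based profiler, say)
+ * would use this interface:
+ *
+ *     set_nmi_callback(my_nmi_handler);
+ *     ...
+ *     unset_nmi_callback();
+ *
+ * The callback returns non-zero once it has handled the NMI; do_nmi()
+ * above falls back to default_do_nmi() only when it returns zero.
+ */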
15254 +
15255 +#ifdef CONFIG_KPROBES
15256 +fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
15257 +{
15258 +       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
15259 +                       == NOTIFY_STOP)
15260 +               return;
15261 +       /* This is an interrupt gate, because kprobes wants interrupts
15262 +       disabled.  Normal trap handlers don't. */
15263 +       restore_interrupts(regs);
15264 +       do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
15265 +}
15266 +#endif
15267 +
15268 +/*
15269 + * Our handling of the processor debug registers is non-trivial.
15270 + * We do not clear them on entry and exit from the kernel. Therefore
15271 + * it is possible to get a watchpoint trap here from inside the kernel.
15272 + * However, the code in ./ptrace.c has ensured that the user can
15273 + * only set watchpoints on userspace addresses. Therefore the in-kernel
15274 + * watchpoint trap can only occur in code which is reading/writing
15275 + * from user space. Such code must not hold kernel locks (since it
15276 + * can equally take a page fault), therefore it is safe to call
15277 + * force_sig_info even though that claims and releases locks.
15278 + * 
15279 + * Code in ./signal.c ensures that the debug control register
15280 + * is restored before we deliver any signal, and therefore that
15281 + * user code runs with the correct debug control register even though
15282 + * we clear it here.
15283 + *
15284 + * Being careful here means that we don't have to be as careful in a
15285 + * lot of more complicated places (task switching can be a bit lazy
15286 + * about restoring all the debug state, and ptrace doesn't have to
15287 + * find every occurrence of the TF bit that could be saved away even
15288 + * by user code)
15289 + */
15290 +fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
15291 +{
15292 +       unsigned int condition;
15293 +       struct task_struct *tsk = current;
15294 +
15295 +       get_debugreg(condition, 6);
15296 +
15297 +       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
15298 +                                       SIGTRAP) == NOTIFY_STOP)
15299 +               return;
15300 +       /* It's safe to allow irq's after DR6 has been saved */
15301 +       if (regs->eflags & X86_EFLAGS_IF)
15302 +               local_irq_enable();
15303 +
15304 +       /* Mask out spurious debug traps due to lazy DR7 setting */
15305 +       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
15306 +               if (!tsk->thread.debugreg[7])
15307 +                       goto clear_dr7;
15308 +       }
15309 +
15310 +       if (regs->eflags & VM_MASK)
15311 +               goto debug_vm86;
15312 +
15313 +       /* Save debug status register where ptrace can see it */
15314 +       tsk->thread.debugreg[6] = condition;
15315 +
15316 +       /*
15317 +        * Single-stepping through TF: make sure we ignore any events in
15318 +        * kernel space (but re-enable TF when returning to user mode).
15319 +        */
15320 +       if (condition & DR_STEP) {
15321 +               /*
15322 +                * We already checked v86 mode above, so we can
15323 +                * check for kernel mode by just checking the CPL
15324 +                * of CS.
15325 +                */
15326 +               if (!user_mode(regs))
15327 +                       goto clear_TF_reenable;
15328 +       }
15329 +
15330 +       /* Ok, finally something we can handle */
15331 +       send_sigtrap(tsk, regs, error_code);
15332 +
15333 +       /* Disable additional traps. They'll be re-enabled when
15334 +        * the signal is delivered.
15335 +        */
15336 +clear_dr7:
15337 +       set_debugreg(0, 7);
15338 +       return;
15339 +
15340 +debug_vm86:
15341 +       handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
15342 +       return;
15343 +
15344 +clear_TF_reenable:
15345 +       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
15346 +       regs->eflags &= ~TF_MASK;
15347 +       return;
15348 +}
15349 +
15350 +/*
15351 + * Note that we play around with the 'TS' bit in an attempt to get
15352 + * the correct behaviour even in the presence of the asynchronous
15353 + * IRQ13 behaviour
15354 + */
15355 +void math_error(void __user *eip)
15356 +{
15357 +       struct task_struct * task;
15358 +       siginfo_t info;
15359 +       unsigned short cwd, swd;
15360 +
15361 +       /*
15362 +        * Save the info for the exception handler and clear the error.
15363 +        */
15364 +       task = current;
15365 +       save_init_fpu(task);
15366 +       task->thread.trap_no = 16;
15367 +       task->thread.error_code = 0;
15368 +       info.si_signo = SIGFPE;
15369 +       info.si_errno = 0;
15370 +       info.si_code = __SI_FAULT;
15371 +       info.si_addr = eip;
15372 +       /*
15373 +        * (~cwd & swd) will mask out exceptions that are not set to unmasked
15374 +        * status.  0x3f is the exception bits in these regs, 0x200 is the
15375 +        * C1 reg you need in case of a stack fault, 0x040 is the stack
15376 +        * fault bit.  We should only be taking one exception at a time,
15377 +        * so if this combination doesn't produce any single exception,
15378 +        * then we have a bad program that isn't synchronizing its FPU usage
15379 +        * and it will suffer the consequences since we won't be able to
15380 +        * fully reproduce the context of the exception
15381 +        */
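+       /*
+        * Worked example: an unmasked divide-by-zero sets ZE (bit 2,
+        * 0x004) in the status word while ZM (bit 2) is clear in the
+        * control word, so (swd & ~cwd & 0x3f) == 0x004 and we report
+        * FPE_FLTDIV below.
+        */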
15382 +       cwd = get_fpu_cwd(task);
15383 +       swd = get_fpu_swd(task);
15384 +       switch (swd & ~cwd & 0x3f) {
15385 +               case 0x000: /* No unmasked exception */
15386 +                       return;
15387 +               default:    /* Multiple exceptions */
15388 +                       break;
15389 +               case 0x001: /* Invalid Op */
15390 +                       /*
15391 +                        * swd & 0x240 == 0x040: Stack Underflow
15392 +                        * swd & 0x240 == 0x240: Stack Overflow
15393 +                        * User must clear the SF bit (0x40) if set
15394 +                        */
15395 +                       info.si_code = FPE_FLTINV;
15396 +                       break;
15397 +               case 0x002: /* Denormalize */
15398 +               case 0x010: /* Underflow */
15399 +                       info.si_code = FPE_FLTUND;
15400 +                       break;
15401 +               case 0x004: /* Zero Divide */
15402 +                       info.si_code = FPE_FLTDIV;
15403 +                       break;
15404 +               case 0x008: /* Overflow */
15405 +                       info.si_code = FPE_FLTOVF;
15406 +                       break;
15407 +               case 0x020: /* Precision */
15408 +                       info.si_code = FPE_FLTRES;
15409 +                       break;
15410 +       }
15411 +       force_sig_info(SIGFPE, &info, task);
15412 +}
15413 +
15414 +fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
15415 +{
15416 +       ignore_fpu_irq = 1;
15417 +       math_error((void __user *)regs->eip);
15418 +}
15419 +
15420 +static void simd_math_error(void __user *eip)
15421 +{
15422 +       struct task_struct * task;
15423 +       siginfo_t info;
15424 +       unsigned short mxcsr;
15425 +
15426 +       /*
15427 +        * Save the info for the exception handler and clear the error.
15428 +        */
15429 +       task = current;
15430 +       save_init_fpu(task);
15431 +       task->thread.trap_no = 19;
15432 +       task->thread.error_code = 0;
15433 +       info.si_signo = SIGFPE;
15434 +       info.si_errno = 0;
15435 +       info.si_code = __SI_FAULT;
15436 +       info.si_addr = eip;
15437 +       /*
15438 +        * The SIMD FPU exceptions are handled a little differently, as there
15439 +        * is only a single status/control register.  Thus, to determine which
15440 +        * unmasked exception was caught we must mask the exception mask bits
15441 +        * at 0x1f80, and then use these to mask the exception bits at 0x3f.
15442 +        */
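+       /*
+        * E.g. an unmasked SSE overflow sets flag bit 3 (0x008) in
+        * MXCSR while mask bit 10 (OM) is clear, so the expression
+        * below yields 0x008 and we report FPE_FLTOVF.
+        */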
15443 +       mxcsr = get_fpu_mxcsr(task);
15444 +       switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
15445 +               case 0x000:
15446 +               default:
15447 +                       break;
15448 +               case 0x001: /* Invalid Op */
15449 +                       info.si_code = FPE_FLTINV;
15450 +                       break;
15451 +               case 0x002: /* Denormalize */
15452 +               case 0x010: /* Underflow */
15453 +                       info.si_code = FPE_FLTUND;
15454 +                       break;
15455 +               case 0x004: /* Zero Divide */
15456 +                       info.si_code = FPE_FLTDIV;
15457 +                       break;
15458 +               case 0x008: /* Overflow */
15459 +                       info.si_code = FPE_FLTOVF;
15460 +                       break;
15461 +               case 0x020: /* Precision */
15462 +                       info.si_code = FPE_FLTRES;
15463 +                       break;
15464 +       }
15465 +       force_sig_info(SIGFPE, &info, task);
15466 +}
15467 +
15468 +fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
15469 +                                         long error_code)
15470 +{
15471 +       if (cpu_has_xmm) {
15472 +               /* Handle SIMD FPU exceptions on PIII+ processors. */
15473 +               ignore_fpu_irq = 1;
15474 +               simd_math_error((void __user *)regs->eip);
15475 +       } else {
15476 +               /*
15477 +                * Handle strange cache flush from user space exception
15478 +                * in all other cases.  This is undocumented behaviour.
15479 +                */
15480 +               if (regs->eflags & VM_MASK) {
15481 +                       handle_vm86_fault((struct kernel_vm86_regs *)regs,
15482 +                                         error_code);
15483 +                       return;
15484 +               }
15485 +               current->thread.trap_no = 19;
15486 +               current->thread.error_code = error_code;
15487 +               die_if_kernel("cache flush denied", regs, error_code);
15488 +               force_sig(SIGSEGV, current);
15489 +       }
15490 +}
15491 +
15492 +#ifndef CONFIG_XEN
15493 +fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
15494 +                                         long error_code)
15495 +{
15496 +#if 0
15497 +       /* No need to warn about this any longer. */
15498 +       printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
15499 +#endif
15500 +}
15501 +
15502 +fastcall void setup_x86_bogus_stack(unsigned char * stk)
15503 +{
15504 +       unsigned long *switch16_ptr, *switch32_ptr;
15505 +       struct pt_regs *regs;
15506 +       unsigned long stack_top, stack_bot;
15507 +       unsigned short iret_frame16_off;
15508 +       int cpu = smp_processor_id();
15509 +       /* reserve the space on 32bit stack for the magic switch16 pointer */
15510 +       memmove(stk, stk + 8, sizeof(struct pt_regs));
15511 +       switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
15512 +       regs = (struct pt_regs *)stk;
15513 +       /* now the switch32 on 16bit stack */
15514 +       stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
15515 +       stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
15516 +       switch32_ptr = (unsigned long *)(stack_top - 8);
15517 +       iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
15518 +       /* copy iret frame on 16bit stack */
15519 +       memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
15520 +       /* fill in the switch pointers */
15521 +       switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
15522 +       switch16_ptr[1] = __ESPFIX_SS;
15523 +       switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
15524 +               8 - CPU_16BIT_STACK_SIZE;
15525 +       switch32_ptr[1] = __KERNEL_DS;
15526 +}
15527 +
15528 +fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
15529 +{
15530 +       unsigned long *switch32_ptr;
15531 +       unsigned char *stack16, *stack32;
15532 +       unsigned long stack_top, stack_bot;
15533 +       int len;
15534 +       int cpu = smp_processor_id();
15535 +       stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
15536 +       stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
15537 +       switch32_ptr = (unsigned long *)(stack_top - 8);
15538 +       /* copy the data from 16bit stack to 32bit stack */
15539 +       len = CPU_16BIT_STACK_SIZE - 8 - sp;
15540 +       stack16 = (unsigned char *)(stack_bot + sp);
15541 +       stack32 = (unsigned char *)
15542 +               (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
15543 +       memcpy(stack32, stack16, len);
15544 +       return stack32;
15545 +}
15546 +#endif
15547 +
15548 +/*
15549 + *  'math_state_restore()' saves the current math information in the
15550 + * old math state array, and gets the new ones from the current task
15551 + *
15552 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
15553 + * Don't touch unless you *really* know how it works.
15554 + *
15555 + * Must be called with kernel preemption disabled (in this case,
15556 + * local interrupts are disabled at the call-site in entry.S).
15557 + */
15558 +asmlinkage void math_state_restore(struct pt_regs regs)
15559 +{
15560 +       struct thread_info *thread = current_thread_info();
15561 +       struct task_struct *tsk = thread->task;
15562 +
15563 +       /* NB. 'clts' is done for us by Xen during virtual trap. */
15564 +       if (!tsk_used_math(tsk))
15565 +               init_fpu(tsk);
15566 +       restore_fpu(tsk);
15567 +       thread->status |= TS_USEDFPU;   /* So we fnsave on switch_to() */
15568 +}
15569 +
15570 +#ifndef CONFIG_MATH_EMULATION
15571 +
15572 +asmlinkage void math_emulate(long arg)
15573 +{
15574 +       printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
15575 +       printk(KERN_EMERG "killing %s.\n",current->comm);
15576 +       force_sig(SIGFPE,current);
15577 +       schedule();
15578 +}
15579 +
15580 +#endif /* CONFIG_MATH_EMULATION */
15581 +
15582 +#ifdef CONFIG_X86_F00F_BUG
15583 +void __init trap_init_f00f_bug(void)
15584 +{
15585 +       __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
15586 +
15587 +       /*
15588 +        * Update the IDT descriptor and reload the IDT so that
15589 +        * it uses the read-only mapped virtual address.
15590 +        */
15591 +       idt_descr.address = fix_to_virt(FIX_F00F_IDT);
15592 +       load_idt(&idt_descr);
15593 +}
15594 +#endif
15595 +
15596 +
15597 +/*
15598 + * NB. All these are "trap gates" (i.e. events_mask isn't set) except
15599 + * for those that specify <dpl>|4 in the second field.
15600 + */
15601 +static trap_info_t trap_table[] = {
15602 +       {  0, 0, __KERNEL_CS, (unsigned long)divide_error               },
15603 +       {  1, 0|4, __KERNEL_CS, (unsigned long)debug                    },
15604 +       {  3, 3|4, __KERNEL_CS, (unsigned long)int3                     },
15605 +       {  4, 3, __KERNEL_CS, (unsigned long)overflow                   },
15606 +       {  5, 0, __KERNEL_CS, (unsigned long)bounds                     },
15607 +       {  6, 0, __KERNEL_CS, (unsigned long)invalid_op                 },
15608 +       {  7, 0|4, __KERNEL_CS, (unsigned long)device_not_available     },
15609 +       {  9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
15610 +       { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS                },
15611 +       { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present        },
15612 +       { 12, 0, __KERNEL_CS, (unsigned long)stack_segment              },
15613 +       { 13, 0, __KERNEL_CS, (unsigned long)general_protection         },
15614 +       { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault               },
15615 +       { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment          },
15616 +       { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error          },
15617 +       { 17, 0, __KERNEL_CS, (unsigned long)alignment_check            },
15618 +#ifdef CONFIG_X86_MCE
15619 +       { 18, 0, __KERNEL_CS, (unsigned long)machine_check              },
15620 +#endif
15621 +       { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error     },
15622 +       { SYSCALL_VECTOR,  3, __KERNEL_CS, (unsigned long)system_call   },
15623 +       {  0, 0,           0, 0                                         }
15624 +};
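+
+/*
+ * The second field of each entry is the handler's privilege level,
+ * with bit 2 asking Xen to disable event delivery on entry (the
+ * interrupt-gate analogue).  E.g. { 14, 0|4, ... } installs the page
+ * fault handler with DPL 0 and events masked, presumably so that CR2
+ * can be read before interrupts are re-enabled (cf. do_page_fault).
+ */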
15625 +
15626 +void __init trap_init(void)
15627 +{
15628 +       HYPERVISOR_set_trap_table(trap_table);
15629 +
15630 +       if (cpu_has_fxsr) {
15631 +               /*
15632 +                * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
15633 +                * Generates a compile-time "error: zero width for bit-field" if
15634 +                * the alignment is wrong.
15635 +                */
15636 +               struct fxsrAlignAssert {
15637 +                       int _:!(offsetof(struct task_struct,
15638 +                                       thread.i387.fxsave) & 15);
15639 +               };
15640 +
15641 +               printk(KERN_INFO "Enabling fast FPU save and restore... ");
15642 +               set_in_cr4(X86_CR4_OSFXSR);
15643 +               printk("done.\n");
15644 +       }
15645 +       if (cpu_has_xmm) {
15646 +               printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
15647 +                               "support... ");
15648 +               set_in_cr4(X86_CR4_OSXMMEXCPT);
15649 +               printk("done.\n");
15650 +       }
15651 +
15652 +       /*
15653 +        * Should be a barrier for any external CPU state.
15654 +        */
15655 +       cpu_init();
15656 +}
15657 +
15658 +void smp_trap_init(trap_info_t *trap_ctxt)
15659 +{
15660 +       trap_info_t *t = trap_table;
15661 +
15662 +       for (t = trap_table; t->address; t++) {
15663 +               trap_ctxt[t->vector].flags = t->flags;
15664 +               trap_ctxt[t->vector].cs = t->cs;
15665 +               trap_ctxt[t->vector].address = t->address;
15666 +       }
15667 +}
15668 +
15669 +static int __init kstack_setup(char *s)
15670 +{
15671 +       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
15672 +       return 0;
15673 +}
15674 +__setup("kstack=", kstack_setup);
15675 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/traps.c linux-2.6.16/arch/i386/kernel/traps.c
15676 --- linux-2.6.16.orig/arch/i386/kernel/traps.c  2006-06-26 09:49:46.000000000 +0200
15677 +++ linux-2.6.16/arch/i386/kernel/traps.c       2006-06-26 09:51:32.000000000 +0200
15678 @@ -573,18 +573,11 @@
15679  
15680  static void io_check_error(unsigned char reason, struct pt_regs * regs)
15681  {
15682 -       unsigned long i;
15683 -
15684         printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
15685         show_registers(regs);
15686  
15687         /* Re-enable the IOCK line, wait for a few seconds */
15688 -       reason = (reason & 0xf) | 8;
15689 -       outb(reason, 0x61);
15690 -       i = 2000;
15691 -       while (--i) udelay(1000);
15692 -       reason &= ~8;
15693 -       outb(reason, 0x61);
15694 +       clear_io_check_error(reason);
15695  }
15696  
15697  static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
15698 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/vm86.c linux-2.6.16/arch/i386/kernel/vm86.c
15699 --- linux-2.6.16.orig/arch/i386/kernel/vm86.c   2006-06-26 09:49:45.000000000 +0200
15700 +++ linux-2.6.16/arch/i386/kernel/vm86.c        2006-06-26 09:53:15.000000000 +0200
15701 @@ -98,7 +98,9 @@
15702  struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
15703  struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
15704  {
15705 +#ifndef CONFIG_X86_NO_TSS
15706         struct tss_struct *tss;
15707 +#endif
15708         struct pt_regs *ret;
15709         unsigned long tmp;
15710  
15711 @@ -123,7 +125,9 @@
15712                 do_exit(SIGSEGV);
15713         }
15714  
15715 +#ifndef CONFIG_X86_NO_TSS
15716         tss = &per_cpu(init_tss, get_cpu());
15717 +#endif
15718         current->thread.esp0 = current->thread.saved_esp0;
15719         current->thread.sysenter_cs = __KERNEL_CS;
15720         load_esp0(tss, &current->thread);
15721 @@ -252,7 +256,9 @@
15722  
15723  static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
15724  {
15725 +#ifndef CONFIG_X86_NO_TSS
15726         struct tss_struct *tss;
15727 +#endif
15728         long eax;
15729  /*
15730   * make sure the vm86() system call doesn't try to do anything silly
15731 @@ -297,7 +303,9 @@
15732         savesegment(fs, tsk->thread.saved_fs);
15733         savesegment(gs, tsk->thread.saved_gs);
15734  
15735 +#ifndef CONFIG_X86_NO_TSS
15736         tss = &per_cpu(init_tss, get_cpu());
15737 +#endif
15738         tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
15739         if (cpu_has_sep)
15740                 tsk->thread.sysenter_cs = 0;
15741 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/vmlinux.lds.S linux-2.6.16/arch/i386/kernel/vmlinux.lds.S
15742 --- linux-2.6.16.orig/arch/i386/kernel/vmlinux.lds.S    2006-03-20 06:53:29.000000000 +0100
15743 +++ linux-2.6.16/arch/i386/kernel/vmlinux.lds.S 2006-06-26 09:51:32.000000000 +0200
15744 @@ -34,6 +34,13 @@
15745    __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
15746    __stop___ex_table = .;
15747  
15748 +  . = ALIGN(16);
15749 +  __start_smp_alternatives_table = .;
15750 +  __smp_alternatives : AT(ADDR(__smp_alternatives) - LOAD_OFFSET) { *(__smp_alternatives) }
15751 +  __stop_smp_alternatives_table = .;
15752 +
15753 +  __smp_replacements : AT(ADDR(__smp_replacements) - LOAD_OFFSET) { *(__smp_replacements) }
15754 +
15755    RODATA
15756  
15757    /* writeable */
15758 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/vsyscall-note-xen.S linux-2.6.16/arch/i386/kernel/vsyscall-note-xen.S
15759 --- linux-2.6.16.orig/arch/i386/kernel/vsyscall-note-xen.S      1970-01-01 01:00:00.000000000 +0100
15760 +++ linux-2.6.16/arch/i386/kernel/vsyscall-note-xen.S   2006-06-26 09:51:32.000000000 +0200
15761 @@ -0,0 +1,32 @@
15762 +/*
15763 + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
15764 + * Here we can supply some information useful to userland.
15765 + * First we get the vanilla i386 note that supplies the kernel version info.
15766 + */
15767 +
15768 +#include "vsyscall-note.S"
15769 +
15770 +/*
15771 + * Now we add a special note telling glibc's dynamic linker a fake hardware
15772 + * flavor that it will use to choose the search path for libraries in the
15773 + * same way it uses real hardware capabilities like "mmx".
15774 + * We supply "nosegneg" as the fake capability, to indicate that we
15775 + * do not like negative offsets in instructions using segment overrides,
15776 + * since we implement those inefficiently.  This makes it possible to
15777 + * install libraries optimized to avoid those access patterns in someplace
15778 + * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d file
15779 + * corresponding to the bits here is needed to make ldconfig work right.
15780 + * It should contain:
15781 + *     hwcap 0 nosegneg
15782 + * to match the mapping of bit to name that we give here.
15783 + */
15784 +#define NOTE_KERNELCAP_BEGIN(ncaps, mask) \
15785 +       ASM_ELF_NOTE_BEGIN(".note.kernelcap", "a", "GNU", 2) \
15786 +       .long ncaps, mask
15787 +#define NOTE_KERNELCAP(bit, name) \
15788 +       .byte bit; .asciz name
15789 +#define NOTE_KERNELCAP_END ASM_ELF_NOTE_END
15790 +
15791 +NOTE_KERNELCAP_BEGIN(1, 1)
15792 +NOTE_KERNELCAP(1, "nosegneg")  /* Change 1 back to 0 when glibc is fixed! */
15793 +NOTE_KERNELCAP_END
15794 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/kernel/vsyscall.S linux-2.6.16/arch/i386/kernel/vsyscall.S
15795 --- linux-2.6.16.orig/arch/i386/kernel/vsyscall.S       2006-03-20 06:53:29.000000000 +0100
15796 +++ linux-2.6.16/arch/i386/kernel/vsyscall.S    2006-06-26 09:51:32.000000000 +0200
15797 @@ -7,9 +7,11 @@
15798         .incbin "arch/i386/kernel/vsyscall-int80.so"
15799  vsyscall_int80_end:
15800  
15801 +#ifdef CONFIG_X86_SYSENTER
15802         .globl vsyscall_sysenter_start, vsyscall_sysenter_end
15803  vsyscall_sysenter_start:
15804         .incbin "arch/i386/kernel/vsyscall-sysenter.so"
15805  vsyscall_sysenter_end:
15806 +#endif
15807  
15808  __FINIT
15809 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/mach-xen/Makefile linux-2.6.16/arch/i386/mach-xen/Makefile
15810 --- linux-2.6.16.orig/arch/i386/mach-xen/Makefile       1970-01-01 01:00:00.000000000 +0100
15811 +++ linux-2.6.16/arch/i386/mach-xen/Makefile    2006-06-26 09:51:32.000000000 +0200
15812 @@ -0,0 +1,5 @@
15813 +#
15814 +# Makefile for the linux kernel.
15815 +#
15816 +
15817 +obj-y                          := setup.o
15818 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/mach-xen/setup.c linux-2.6.16/arch/i386/mach-xen/setup.c
15819 --- linux-2.6.16.orig/arch/i386/mach-xen/setup.c        1970-01-01 01:00:00.000000000 +0100
15820 +++ linux-2.6.16/arch/i386/mach-xen/setup.c     2006-06-26 09:51:32.000000000 +0200
15821 @@ -0,0 +1,37 @@
15822 +/*
15823 + *     Machine specific setup for generic
15824 + */
15825 +
15826 +#include <linux/config.h>
15827 +#include <linux/smp.h>
15828 +#include <linux/init.h>
15829 +#include <linux/interrupt.h>
15830 +#include <asm/acpi.h>
15831 +#include <asm/arch_hooks.h>
15832 +
15833 +#ifdef CONFIG_HOTPLUG_CPU
15834 +#define DEFAULT_SEND_IPI       (1)
15835 +#else
15836 +#define DEFAULT_SEND_IPI       (0)
15837 +#endif
15838 +
15839 +int no_broadcast=DEFAULT_SEND_IPI;
15840 +
15841 +static __init int no_ipi_broadcast(char *str)
15842 +{
15843 +       get_option(&str, &no_broadcast);
15844 +       printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
15845 +                                                                                       "IPI Broadcast");
15846 +       return 1;
15847 +}
15848 +
15849 +__setup("no_ipi_broadcast", no_ipi_broadcast);
15850 +
15851 +static int __init print_ipi_mode(void)
15852 +{
15853 +       printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
15854 +                                                                                       "Shortcut");
15855 +       return 0;
15856 +}
15857 +
15858 +late_initcall(print_ipi_mode);
15859 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/mm/Makefile linux-2.6.16/arch/i386/mm/Makefile
15860 --- linux-2.6.16.orig/arch/i386/mm/Makefile     2006-03-20 06:53:29.000000000 +0100
15861 +++ linux-2.6.16/arch/i386/mm/Makefile  2006-06-26 09:51:32.000000000 +0200
15862 @@ -8,3 +8,11 @@
15863  obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
15864  obj-$(CONFIG_HIGHMEM) += highmem.o
15865  obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
15866 +
15867 +ifdef CONFIG_XEN
15868 +include $(srctree)/scripts/Makefile.xen
15869 +
15870 +obj-y          += hypervisor.o
15871 +
15872 +obj-y := $(call cherrypickxen, $(obj-y))
15873 +endif
15874 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/mm/fault-xen.c linux-2.6.16/arch/i386/mm/fault-xen.c
15875 --- linux-2.6.16.orig/arch/i386/mm/fault-xen.c  1970-01-01 01:00:00.000000000 +0100
15876 +++ linux-2.6.16/arch/i386/mm/fault-xen.c       2006-06-26 09:51:32.000000000 +0200
15877 @@ -0,0 +1,617 @@
15878 +/*
15879 + *  linux/arch/i386/mm/fault.c
15880 + *
15881 + *  Copyright (C) 1995  Linus Torvalds
15882 + */
15883 +
15884 +#include <linux/signal.h>
15885 +#include <linux/sched.h>
15886 +#include <linux/kernel.h>
15887 +#include <linux/errno.h>
15888 +#include <linux/string.h>
15889 +#include <linux/types.h>
15890 +#include <linux/ptrace.h>
15891 +#include <linux/mman.h>
15892 +#include <linux/mm.h>
15893 +#include <linux/smp.h>
15894 +#include <linux/smp_lock.h>
15895 +#include <linux/interrupt.h>
15896 +#include <linux/init.h>
15897 +#include <linux/tty.h>
15898 +#include <linux/vt_kern.h>             /* For unblank_screen() */
15899 +#include <linux/highmem.h>
15900 +#include <linux/module.h>
15901 +#include <linux/kprobes.h>
15902 +
15903 +#include <asm/system.h>
15904 +#include <asm/uaccess.h>
15905 +#include <asm/desc.h>
15906 +#include <asm/kdebug.h>
15907 +
15908 +extern void die(const char *,struct pt_regs *,long);
15909 +
15910 +/*
15911 + * Unlock any spinlocks which will prevent us from getting the
15912 + * message out 
15913 + */
15914 +void bust_spinlocks(int yes)
15915 +{
15916 +       int loglevel_save = console_loglevel;
15917 +
15918 +       if (yes) {
15919 +               oops_in_progress = 1;
15920 +               return;
15921 +       }
15922 +#ifdef CONFIG_VT
15923 +       unblank_screen();
15924 +#endif
15925 +       oops_in_progress = 0;
15926 +       /*
15927 +        * OK, the message is on the console.  Now we call printk()
15928 +        * without oops_in_progress set so that printk will give klogd
15929 +        * a poke.  Hold onto your hats...
15930 +        */
15931 +       console_loglevel = 15;          /* NMI oopser may have shut the console up */
15932 +       printk(" ");
15933 +       console_loglevel = loglevel_save;
15934 +}
15935 +
15936 +/*
15937 + * Return EIP plus the CS segment base.  The segment limit is also
15938 + * adjusted, clamped to the kernel/user address space (whichever is
15939 + * appropriate), and returned in *eip_limit.
15940 + *
15941 + * The segment is checked, because it might have been changed by another
15942 + * task between the original faulting instruction and here.
15943 + *
15944 + * If CS is no longer a valid code segment, or if EIP is beyond the
15945 + * limit, or if it is a kernel address when CS is not a kernel segment,
15946 + * then the returned value will be greater than *eip_limit.
15947 + * 
15948 + * This is slow, but is very rarely executed.
15949 + */
15950 +static inline unsigned long get_segment_eip(struct pt_regs *regs,
15951 +                                           unsigned long *eip_limit)
15952 +{
15953 +       unsigned long eip = regs->eip;
15954 +       unsigned seg = regs->xcs & 0xffff;
15955 +       u32 seg_ar, seg_limit, base, *desc;
15956 +
15957 +       /* The standard kernel/user address space limit. */
15958 +       *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
15959 +
15960 +       /* Unlikely, but must come before segment checks. */
15961 +       if (unlikely((regs->eflags & VM_MASK) != 0))
15962 +               return eip + (seg << 4);
15963 +       
15964 +       /* By far the most common cases. */
15965 +       if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
15966 +               return eip;
15967 +
15968 +       /* Check the segment exists, is within the current LDT/GDT size,
15969 +          that kernel/user (ring 0..3) has the appropriate privilege,
15970 +          that it's a code segment, and get the limit. */
15971 +       __asm__ ("larl %3,%0; lsll %3,%1"
15972 +                : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
15973 +       if ((~seg_ar & 0x9800) || eip > seg_limit) {
15974 +               *eip_limit = 0;
15975 +               return 1;        /* So that returned eip > *eip_limit. */
15976 +       }
15977 +
15978 +       /* Get the GDT/LDT descriptor base. 
15979 +          When you look for races in this code remember that
15980 +          LDT and other horrors are only used in user space. */
15981 +       if (seg & (1<<2)) {
15982 +               /* Must lock the LDT while reading it. */
15983 +               down(&current->mm->context.sem);
15984 +               desc = current->mm->context.ldt;
15985 +               desc = (void *)desc + (seg & ~7);
15986 +       } else {
15987 +               /* Must disable preemption while reading the GDT. */
15988 +               desc = (u32 *)get_cpu_gdt_table(get_cpu());
15989 +               desc = (void *)desc + (seg & ~7);
15990 +       }
15991 +
15992 +       /* Decode the code segment base from the descriptor */
15993 +       base = get_desc_base((unsigned long *)desc);
15994 +
15995 +       if (seg & (1<<2)) { 
15996 +               up(&current->mm->context.sem);
15997 +       } else
15998 +               put_cpu();
15999 +
16000 +       /* Adjust EIP and segment limit, and clamp at the kernel limit.
16001 +          It's legitimate for segments to wrap at 0xffffffff. */
16002 +       seg_limit += base;
16003 +       if (seg_limit < *eip_limit && seg_limit >= base)
16004 +               *eip_limit = seg_limit;
16005 +       return eip + base;
16006 +}
16007 +
16008 +/* 
16009 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
16010 + * Check that here and ignore it.
16011 + */
16012 +static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
16013 +{ 
16014 +       unsigned long limit;
16015 +       unsigned long instr = get_segment_eip (regs, &limit);
16016 +       int scan_more = 1;
16017 +       int prefetch = 0; 
16018 +       int i;
16019 +
16020 +       for (i = 0; scan_more && i < 15; i++) { 
16021 +               unsigned char opcode;
16022 +               unsigned char instr_hi;
16023 +               unsigned char instr_lo;
16024 +
16025 +               if (instr > limit)
16026 +                       break;
16027 +               if (__get_user(opcode, (unsigned char __user *) instr))
16028 +                       break; 
16029 +
16030 +               instr_hi = opcode & 0xf0; 
16031 +               instr_lo = opcode & 0x0f; 
16032 +               instr++;
16033 +
16034 +               switch (instr_hi) { 
16035 +               case 0x20:
16036 +               case 0x30:
16037 +                       /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
16038 +                       scan_more = ((instr_lo & 7) == 0x6);
16039 +                       break;
16040 +                       
16041 +               case 0x60:
16042 +                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
16043 +                       scan_more = (instr_lo & 0xC) == 0x4;
16044 +                       break;          
16045 +               case 0xF0:
16046 +                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
16047 +                       scan_more = !instr_lo || (instr_lo>>1) == 1;
16048 +                       break;                  
16049 +               case 0x00:
16050 +                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
16051 +                       scan_more = 0;
16052 +                       if (instr > limit)
16053 +                               break;
16054 +                       if (__get_user(opcode, (unsigned char __user *) instr))
16055 +                               break;
16056 +                       prefetch = (instr_lo == 0xF) &&
16057 +                               (opcode == 0x0D || opcode == 0x18);
16058 +                       break;                  
16059 +               default:
16060 +                       scan_more = 0;
16061 +                       break;
16062 +               } 
16063 +       }
16064 +       return prefetch;
16065 +}
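+
+/*
+ * (The loop above decodes at most 15 bytes, the maximum x86
+ * instruction length, skipping recognised prefix bytes, and reports a
+ * prefetch only for the 0x0F 0x0D / 0x0F 0x18 opcode pairs.)
+ */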
16066 +
16067 +static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
16068 +                             unsigned long error_code)
16069 +{
16070 +       if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
16071 +                    boot_cpu_data.x86 >= 6)) {
16072 +               /* Catch an obscure case of prefetch inside an NX page. */
16073 +               if (nx_enabled && (error_code & 16))
16074 +                       return 0;
16075 +               return __is_prefetch(regs, addr);
16076 +       }
16077 +       return 0;
16078 +} 
16079 +
16080 +static noinline void force_sig_info_fault(int si_signo, int si_code,
16081 +       unsigned long address, struct task_struct *tsk)
16082 +{
16083 +       siginfo_t info;
16084 +
16085 +       info.si_signo = si_signo;
16086 +       info.si_errno = 0;
16087 +       info.si_code = si_code;
16088 +       info.si_addr = (void __user *)address;
16089 +       force_sig_info(si_signo, &info, tsk);
16090 +}
16091 +
16092 +fastcall void do_invalid_op(struct pt_regs *, unsigned long);
16093 +
16094 +#ifdef CONFIG_X86_PAE
16095 +static void dump_fault_path(unsigned long address)
16096 +{
16097 +       unsigned long *p, page;
16098 +       unsigned long mfn; 
16099 +
16100 +       page = read_cr3();
16101 +       p  = (unsigned long *)__va(page);
16102 +       p += (address >> 30) * 2;
16103 +       printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]);
16104 +       if (p[0] & 1) {
16105 +               mfn  = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20); 
16106 +               page = mfn_to_pfn(mfn) << PAGE_SHIFT; 
16107 +               p  = (unsigned long *)__va(page);
16108 +               address &= 0x3fffffff;
16109 +               p += (address >> 21) * 2;
16110 +               printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n", 
16111 +                      page, p[1], p[0]);
16112 +#ifndef CONFIG_HIGHPTE
16113 +               if (p[0] & 1) {
16114 +                       mfn  = (p[0] >> PAGE_SHIFT) | ((p[1] & 0x7) << 20); 
16115 +                       page = mfn_to_pfn(mfn) << PAGE_SHIFT; 
16116 +                       p  = (unsigned long *) __va(page);
16117 +                       address &= 0x001fffff;
16118 +                       p += (address >> 12) * 2;
16119 +                       printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n",
16120 +                              page, p[1], p[0]);
16121 +               }
16122 +#endif
16123 +       }
16124 +}
16125 +#else
16126 +static void dump_fault_path(unsigned long address)
16127 +{
16128 +       unsigned long page;
16129 +
16130 +       page = read_cr3();
16131 +       page = ((unsigned long *) __va(page))[address >> 22];
16132 +       printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
16133 +              machine_to_phys(page));
16134 +       /*
16135 +        * We must not directly access the pte in the highpte
16136 +        * case, as the page table might be allocated in highmem.
16137 +        * And let's rather not kmap-atomic the pte, just in case
16138 +        * it's allocated already.
16139 +        */
16140 +#ifndef CONFIG_HIGHPTE
16141 +       if (page & 1) {
16142 +               page &= PAGE_MASK;
16143 +               address &= 0x003ff000;
16144 +               page = machine_to_phys(page);
16145 +               page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
16146 +               printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
16147 +                      machine_to_phys(page));
16148 +       }
16149 +#endif
16150 +}
16151 +#endif
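
/*
 * Hypothetical demo (not part of the patch): the index arithmetic that
 * the non-PAE dump_fault_path() above performs for a two-level i386
 * page walk.  A 32-bit virtual address splits into 10 bits of pde
 * index, 10 bits of pte index and a 12-bit page offset.
 */
#include <stdio.h>

int main(void)
{
        unsigned long address = 0xc0123456UL;   /* arbitrary example */

        unsigned long pde_idx = address >> 22;                  /* top 10 bits */
        unsigned long pte_idx = (address & 0x003ff000UL) >> 12; /* middle 10 */
        unsigned long offset  = address & 0xfffUL;              /* low 12 */

        printf("va %08lx -> pde %lu, pte %lu, offset 0x%03lx\n",
               address, pde_idx, pte_idx, offset);
        /* prints: va c0123456 -> pde 768, pte 291, offset 0x456 */
        return 0;
}
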
16152 +
16153 +
16154 +/*
16155 + * This routine handles page faults.  It determines the address,
16156 + * and the problem, and then passes it off to one of the appropriate
16157 + * routines.
16158 + *
16159 + * error_code:
16160 + *     bit 0 == 0 means no page found, 1 means protection fault
16161 + *     bit 1 == 0 means read, 1 means write
16162 + *     bit 2 == 0 means kernel, 1 means user-mode
16163 + */
16164 +fastcall void __kprobes do_page_fault(struct pt_regs *regs,
16165 +                                     unsigned long error_code)
16166 +{
16167 +       struct task_struct *tsk;
16168 +       struct mm_struct *mm;
16169 +       struct vm_area_struct * vma;
16170 +       unsigned long address;
16171 +       int write, si_code;
16172 +
16173 +       /* get the address */
16174 +        address = read_cr2();
16175 +
16176 +       /* Set the "privileged fault" bit to something sane. */
16177 +       error_code &= ~4;
16178 +       error_code |= (regs->xcs & 2) << 1;
16179 +       if (regs->eflags & X86_EFLAGS_VM)
16180 +               error_code |= 4;
16181 +
16182 +       if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
16183 +                                       SIGSEGV) == NOTIFY_STOP)
16184 +               return;
16185 +       /* It's safe to allow irq's after cr2 has been saved */
16186 +       if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
16187 +               local_irq_enable();
16188 +
16189 +       tsk = current;
16190 +
16191 +       si_code = SEGV_MAPERR;
16192 +
16193 +       /*
16194 +        * We fault-in kernel-space virtual memory on-demand. The
16195 +        * 'reference' page table is init_mm.pgd.
16196 +        *
16197 +        * NOTE! We MUST NOT take any locks for this case. We may
16198 +        * be in an interrupt or a critical region, and should
16199 +        * only copy the information from the master page table,
16200 +        * nothing more.
16201 +        *
16202 +        * This verifies that the fault happens in kernel space
16203 +        * (error_code & 4) == 0, and that the fault was not a
16204 +        * protection error (error_code & 1) == 0.
16205 +        */
16206 +       if (unlikely(address >= TASK_SIZE)) { 
16207 +               if (!(error_code & 5))
16208 +                       goto vmalloc_fault;
16209 +               /* 
16210 +                * Don't take the mm semaphore here. If we fixup a prefetch
16211 +                * fault we could otherwise deadlock.
16212 +                */
16213 +               goto bad_area_nosemaphore;
16214 +       } 
16215 +
16216 +       mm = tsk->mm;
16217 +
16218 +       /*
16219 +        * If we're in an interrupt, have no user context or are running in an
16220 +        * atomic region then we must not take the fault..
16221 +        */
16222 +       if (in_atomic() || !mm)
16223 +               goto bad_area_nosemaphore;
16224 +
16225 +       /* When running in the kernel we expect faults to occur only to
16226 +        * addresses in user space.  All other faults represent errors in the
16227 +        * kernel and should generate an OOPS.  Unfortunately, in the case of an
16228 +        * erroneous fault occurring in a code path which already holds mmap_sem
16229 +        * we will deadlock attempting to validate the fault against the
16230 +        * address space.  Luckily the kernel only validly references user
16231 +        * space from well defined areas of code, which are listed in the
16232 +        * exceptions table.
16233 +        *
16234 +        * As the vast majority of faults will be valid we will only perform
16235 +        * the source reference check when there is a possibility of a deadlock.
16236 +        * Attempt to lock the address space, if we cannot we then validate the
16237 +        * source.  If this is invalid we can skip the address space check,
16238 +        * thus avoiding the deadlock.
16239 +        */
16240 +       if (!down_read_trylock(&mm->mmap_sem)) {
16241 +               if ((error_code & 4) == 0 &&
16242 +                   !search_exception_tables(regs->eip))
16243 +                       goto bad_area_nosemaphore;
16244 +               down_read(&mm->mmap_sem);
16245 +       }
16246 +
16247 +       vma = find_vma(mm, address);
16248 +       if (!vma)
16249 +               goto bad_area;
16250 +       if (vma->vm_start <= address)
16251 +               goto good_area;
16252 +       if (!(vma->vm_flags & VM_GROWSDOWN))
16253 +               goto bad_area;
16254 +       if (error_code & 4) {
16255 +               /*
16256 +                * accessing the stack below %esp is always a bug.
16257 +                * The "+ 32" is there due to some instructions (like
16258 +                * pusha) doing post-decrement on the stack and that
16259 +                * doesn't show up until later..
16260 +                */
16261 +               if (address + 32 < regs->esp)
16262 +                       goto bad_area;
16263 +       }
16264 +       if (expand_stack(vma, address))
16265 +               goto bad_area;
16266 +/*
16267 + * Ok, we have a good vm_area for this memory access, so
16268 + * we can handle it..
16269 + */
16270 +good_area:
16271 +       si_code = SEGV_ACCERR;
16272 +       write = 0;
16273 +       switch (error_code & 3) {
16274 +               default:        /* 3: write, present */
16275 +#ifdef TEST_VERIFY_AREA
16276 +                       if (regs->cs == GET_KERNEL_CS())
16277 +                               printk("WP fault at %08lx\n", regs->eip);
16278 +#endif
16279 +                       /* fall through */
16280 +               case 2:         /* write, not present */
16281 +                       if (!(vma->vm_flags & VM_WRITE))
16282 +                               goto bad_area;
16283 +                       write++;
16284 +                       break;
16285 +               case 1:         /* read, present */
16286 +                       goto bad_area;
16287 +               case 0:         /* read, not present */
16288 +                       if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
16289 +                               goto bad_area;
16290 +       }
16291 +
16292 + survive:
16293 +       /*
16294 +        * If for any reason at all we couldn't handle the fault,
16295 +        * make sure we exit gracefully rather than endlessly redo
16296 +        * the fault.
16297 +        */
16298 +       switch (handle_mm_fault(mm, vma, address, write)) {
16299 +               case VM_FAULT_MINOR:
16300 +                       tsk->min_flt++;
16301 +                       break;
16302 +               case VM_FAULT_MAJOR:
16303 +                       tsk->maj_flt++;
16304 +                       break;
16305 +               case VM_FAULT_SIGBUS:
16306 +                       goto do_sigbus;
16307 +               case VM_FAULT_OOM:
16308 +                       goto out_of_memory;
16309 +               default:
16310 +                       BUG();
16311 +       }
16312 +
16313 +       /*
16314 +        * Did it hit the DOS screen memory VA from vm86 mode?
16315 +        */
16316 +       if (regs->eflags & VM_MASK) {
16317 +               unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
16318 +               if (bit < 32)
16319 +                       tsk->thread.screen_bitmap |= 1 << bit;
16320 +       }
16321 +       up_read(&mm->mmap_sem);
16322 +       return;
16323 +
16324 +/*
16325 + * Something tried to access memory that isn't in our memory map..
16326 + * Fix it, but check if it's kernel or user first..
16327 + */
16328 +bad_area:
16329 +       up_read(&mm->mmap_sem);
16330 +
16331 +bad_area_nosemaphore:
16332 +       /* User mode accesses just cause a SIGSEGV */
16333 +       if (error_code & 4) {
16334 +               /* 
16335 +                * Valid to do another page fault here because this one came 
16336 +                * from user space.
16337 +                */
16338 +               if (is_prefetch(regs, address, error_code))
16339 +                       return;
16340 +
16341 +               tsk->thread.cr2 = address;
16342 +               /* Kernel addresses are always protection faults */
16343 +               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
16344 +               tsk->thread.trap_no = 14;
16345 +               force_sig_info_fault(SIGSEGV, si_code, address, tsk);
16346 +               return;
16347 +       }
16348 +
16349 +#ifdef CONFIG_X86_F00F_BUG
16350 +       /*
16351 +        * Pentium F0 0F C7 C8 bug workaround.
16352 +        */
16353 +       if (boot_cpu_data.f00f_bug) {
16354 +               unsigned long nr;
16355 +               
16356 +               nr = (address - idt_descr.address) >> 3;
16357 +
16358 +               if (nr == 6) {
16359 +                       do_invalid_op(regs, 0);
16360 +                       return;
16361 +               }
16362 +       }
16363 +#endif
16364 +
16365 +no_context:
16366 +       /* Are we prepared to handle this kernel fault?  */
16367 +       if (fixup_exception(regs))
16368 +               return;
16369 +
16370 +       /* 
16371 +        * Valid to do another page fault here, because if this fault
16372 +        * had been triggered by is_prefetch fixup_exception would have 
16373 +        * handled it.
16374 +        */
16375 +       if (is_prefetch(regs, address, error_code))
16376 +               return;
16377 +
16378 +/*
16379 + * Oops. The kernel tried to access some bad page. We'll have to
16380 + * terminate things with extreme prejudice.
16381 + */
16382 +
16383 +       bust_spinlocks(1);
16384 +
16385 +#ifdef CONFIG_X86_PAE
16386 +       if (error_code & 16) {
16387 +               pte_t *pte = lookup_address(address);
16388 +
16389 +               if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
16390 +                       printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
16391 +       }
16392 +#endif
16393 +       if (address < PAGE_SIZE)
16394 +               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
16395 +       else
16396 +               printk(KERN_ALERT "Unable to handle kernel paging request");
16397 +       printk(" at virtual address %08lx\n",address);
16398 +       printk(KERN_ALERT " printing eip:\n");
16399 +       printk("%08lx\n", regs->eip);
16400 +       dump_fault_path(address);
16401 +       tsk->thread.cr2 = address;
16402 +       tsk->thread.trap_no = 14;
16403 +       tsk->thread.error_code = error_code;
16404 +       die("Oops", regs, error_code);
16405 +       bust_spinlocks(0);
16406 +       do_exit(SIGKILL);
16407 +
16408 +/*
16409 + * We ran out of memory, or some other thing happened to us that made
16410 + * us unable to handle the page fault gracefully.
16411 + */
16412 +out_of_memory:
16413 +       up_read(&mm->mmap_sem);
16414 +       if (tsk->pid == 1) {
16415 +               yield();
16416 +               down_read(&mm->mmap_sem);
16417 +               goto survive;
16418 +       }
16419 +       printk("VM: killing process %s\n", tsk->comm);
16420 +       if (error_code & 4)
16421 +               do_exit(SIGKILL);
16422 +       goto no_context;
16423 +
16424 +do_sigbus:
16425 +       up_read(&mm->mmap_sem);
16426 +
16427 +       /* Kernel mode? Handle exceptions or die */
16428 +       if (!(error_code & 4))
16429 +               goto no_context;
16430 +
16431 +       /* User space => ok to do another page fault */
16432 +       if (is_prefetch(regs, address, error_code))
16433 +               return;
16434 +
16435 +       tsk->thread.cr2 = address;
16436 +       tsk->thread.error_code = error_code;
16437 +       tsk->thread.trap_no = 14;
16438 +       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
16439 +       return;
16440 +
16441 +vmalloc_fault:
16442 +       {
16443 +               /*
16444 +                * Synchronize this task's top level page-table
16445 +                * with the 'reference' page table.
16446 +                *
16447 +                * Do _not_ use "tsk" here. We might be inside
16448 +                * an interrupt in the middle of a task switch..
16449 +                */
16450 +               int index = pgd_index(address);
16451 +               unsigned long pgd_paddr;
16452 +               pgd_t *pgd, *pgd_k;
16453 +               pud_t *pud, *pud_k;
16454 +               pmd_t *pmd, *pmd_k;
16455 +               pte_t *pte_k;
16456 +
16457 +               pgd_paddr = read_cr3();
16458 +               pgd = index + (pgd_t *)__va(pgd_paddr);
16459 +               pgd_k = init_mm.pgd + index;
16460 +
16461 +               if (!pgd_present(*pgd_k))
16462 +                       goto no_context;
16463 +
16464 +               /*
16465 +                * set_pgd(pgd, *pgd_k); here would be useless on PAE
16466 +                * and redundant with the set_pmd() on non-PAE. As would
16467 +                * set_pud.
16468 +                */
16469 +
16470 +               pud = pud_offset(pgd, address);
16471 +               pud_k = pud_offset(pgd_k, address);
16472 +               if (!pud_present(*pud_k))
16473 +                       goto no_context;
16474 +               
16475 +               pmd = pmd_offset(pud, address);
16476 +               pmd_k = pmd_offset(pud_k, address);
16477 +               if (!pmd_present(*pmd_k))
16478 +                       goto no_context;
16479 +#ifndef CONFIG_XEN
16480 +               set_pmd(pmd, *pmd_k);
16481 +#else
16482 +               /*
16483 +                * When running on Xen we must launder *pmd_k through
16484 +                * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
16485 +                */
16486 +               set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
16487 +#endif
16488 +
16489 +               pte_k = pte_offset_kernel(pmd_k, address);
16490 +               if (!pte_present(*pte_k))
16491 +                       goto no_context;
16492 +               return;
16493 +       }
16494 +}
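
/*
 * Hypothetical demo (not part of the patch): decoding the page fault
 * error_code bits documented in the comment above do_page_fault().
 * Bit 4 (value 16) is the instruction-fetch bit tested by the NX
 * checks in is_prefetch() and in the oops path.
 */
#include <stdio.h>

static void decode_fault(unsigned long error_code)
{
        printf("error_code %#lx: %s, %s, %s mode%s\n", error_code,
               (error_code & 1) ? "protection fault" : "no page found",
               (error_code & 2) ? "write" : "read",
               (error_code & 4) ? "user" : "kernel",
               (error_code & 16) ? ", instruction fetch" : "");
}

int main(void)
{
        decode_fault(0);   /* kernel read of a missing page */
        decode_fault(7);   /* user write protection fault */
        decode_fault(16);  /* kernel instruction fetch, missing page */
        return 0;
}
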
16495 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/mm/highmem-xen.c linux-2.6.16/arch/i386/mm/highmem-xen.c
16496 --- linux-2.6.16.orig/arch/i386/mm/highmem-xen.c        1970-01-01 01:00:00.000000000 +0100
16497 +++ linux-2.6.16/arch/i386/mm/highmem-xen.c     2006-06-26 09:51:32.000000000 +0200
16498 @@ -0,0 +1,123 @@
16499 +#include <linux/highmem.h>
16500 +#include <linux/module.h>
16501 +
16502 +void *kmap(struct page *page)
16503 +{
16504 +       might_sleep();
16505 +       if (!PageHighMem(page))
16506 +               return page_address(page);
16507 +       return kmap_high(page);
16508 +}
16509 +
16510 +void kunmap(struct page *page)
16511 +{
16512 +       if (in_interrupt())
16513 +               BUG();
16514 +       if (!PageHighMem(page))
16515 +               return;
16516 +       kunmap_high(page);
16517 +}
16518 +
16519 +/*
16520 + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
16521 + * no global lock is needed and because the kmap code must perform a global TLB
16522 + * invalidation when the kmap pool wraps.
16523 + *
16524 + * However, when holding an atomic kmap it is not legal to sleep, so atomic
16525 + * kmaps are appropriate for short, tight code paths only.
16526 + */
16527 +static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
16528 +{
16529 +       enum fixed_addresses idx;
16530 +       unsigned long vaddr;
16531 +
16532 +       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
16533 +       inc_preempt_count();
16534 +       if (!PageHighMem(page))
16535 +               return page_address(page);
16536 +
16537 +       idx = type + KM_TYPE_NR*smp_processor_id();
16538 +       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
16539 +#ifdef CONFIG_DEBUG_HIGHMEM
16540 +       if (!pte_none(*(kmap_pte-idx)))
16541 +               BUG();
16542 +#endif
16543 +       set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
16544 +
16545 +       return (void*) vaddr;
16546 +}
16547 +
16548 +void *kmap_atomic(struct page *page, enum km_type type)
16549 +{
16550 +       return __kmap_atomic(page, type, kmap_prot);
16551 +}
16552 +
16553 +/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
16554 +void *kmap_atomic_pte(struct page *page, enum km_type type)
16555 +{
16556 +       return __kmap_atomic(page, type, PAGE_KERNEL_RO);
16557 +}
16558 +
16559 +void kunmap_atomic(void *kvaddr, enum km_type type)
16560 +{
16561 +#ifdef CONFIG_DEBUG_HIGHMEM
16562 +       unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
16563 +       enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
16564 +
16565 +       if (vaddr < FIXADDR_START) { // FIXME
16566 +               dec_preempt_count();
16567 +               preempt_check_resched();
16568 +               return;
16569 +       }
16570 +
16571 +       if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
16572 +               BUG();
16573 +
16574 +       /*
16575 +        * Force other mappings to Oops if they try to access
16576 +        * this pte without first remapping it.
16577 +        */
16578 +       pte_clear(&init_mm, vaddr, kmap_pte-idx);
16579 +       __flush_tlb_one(vaddr);
16580 +#endif
16581 +
16582 +       dec_preempt_count();
16583 +       preempt_check_resched();
16584 +}
16585 +
16586 +/* This is the same as kmap_atomic() but can map memory that doesn't
16587 + * have a struct page associated with it.
16588 + */
16589 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
16590 +{
16591 +       enum fixed_addresses idx;
16592 +       unsigned long vaddr;
16593 +
16594 +       inc_preempt_count();
16595 +
16596 +       idx = type + KM_TYPE_NR*smp_processor_id();
16597 +       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
16598 +       set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
16599 +       __flush_tlb_one(vaddr);
16600 +
16601 +       return (void*) vaddr;
16602 +}
16603 +
16604 +struct page *kmap_atomic_to_page(void *ptr)
16605 +{
16606 +       unsigned long idx, vaddr = (unsigned long)ptr;
16607 +       pte_t *pte;
16608 +
16609 +       if (vaddr < FIXADDR_START)
16610 +               return virt_to_page(ptr);
16611 +
16612 +       idx = virt_to_fix(vaddr);
16613 +       pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
16614 +       return pte_page(*pte);
16615 +}
16616 +
16617 +EXPORT_SYMBOL(kmap);
16618 +EXPORT_SYMBOL(kunmap);
16619 +EXPORT_SYMBOL(kmap_atomic);
16620 +EXPORT_SYMBOL(kunmap_atomic);
16621 +EXPORT_SYMBOL(kmap_atomic_to_page);
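
/*
 * Hypothetical usage sketch (not part of the patch; assumes a 2.6.16
 * in-kernel context): the short, non-sleeping critical section that
 * kmap_atomic()/kunmap_atomic() above are designed for.  The helper
 * name copy_into_page is invented for illustration.
 */
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/string.h>

static void copy_into_page(struct page *page, const void *src, size_t len)
{
        /* Map the (possibly highmem) page into the per-CPU KM_USER0
         * fixmap slot; we must not sleep until kunmap_atomic(). */
        void *dst = kmap_atomic(page, KM_USER0);

        memcpy(dst, src, len);
        kunmap_atomic(dst, KM_USER0);
}
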
16622 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/mm/hypervisor.c linux-2.6.16/arch/i386/mm/hypervisor.c
16623 --- linux-2.6.16.orig/arch/i386/mm/hypervisor.c 1970-01-01 01:00:00.000000000 +0100
16624 +++ linux-2.6.16/arch/i386/mm/hypervisor.c      2006-06-26 09:51:32.000000000 +0200
16625 @@ -0,0 +1,424 @@
16626 +/******************************************************************************
16627 + * mm/hypervisor.c
16628 + * 
16629 + * Update page tables via the hypervisor.
16630 + * 
16631 + * Copyright (c) 2002-2004, K A Fraser
16632 + * 
16633 + * This program is free software; you can redistribute it and/or
16634 + * modify it under the terms of the GNU General Public License version 2
16635 + * as published by the Free Software Foundation; or, when distributed
16636 + * separately from the Linux kernel or incorporated into other
16637 + * software packages, subject to the following license:
16638 + * 
16639 + * Permission is hereby granted, free of charge, to any person obtaining a copy
16640 + * of this source file (the "Software"), to deal in the Software without
16641 + * restriction, including without limitation the rights to use, copy, modify,
16642 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
16643 + * and to permit persons to whom the Software is furnished to do so, subject to
16644 + * the following conditions:
16645 + * 
16646 + * The above copyright notice and this permission notice shall be included in
16647 + * all copies or substantial portions of the Software.
16648 + * 
16649 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16650 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16651 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16652 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16653 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
16654 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
16655 + * IN THE SOFTWARE.
16656 + */
16657 +
16658 +#include <linux/config.h>
16659 +#include <linux/sched.h>
16660 +#include <linux/mm.h>
16661 +#include <linux/vmalloc.h>
16662 +#include <asm/page.h>
16663 +#include <asm/pgtable.h>
16664 +#include <asm/hypervisor.h>
16665 +#include <xen/balloon.h>
16666 +#include <xen/features.h>
16667 +#include <xen/interface/memory.h>
16668 +#include <linux/module.h>
16669 +#include <linux/percpu.h>
16670 +#include <asm/tlbflush.h>
16671 +
16672 +#ifdef CONFIG_X86_64
16673 +#define pmd_val_ma(v) (v).pmd
16674 +#else
16675 +#ifdef CONFIG_X86_PAE
16676 +# define pmd_val_ma(v) ((v).pmd)
16677 +# define pud_val_ma(v) ((v).pgd.pgd)
16678 +#else
16679 +# define pmd_val_ma(v) ((v).pud.pgd.pgd)
16680 +#endif
16681 +#endif
16682 +
16683 +void xen_l1_entry_update(pte_t *ptr, pte_t val)
16684 +{
16685 +       mmu_update_t u;
16686 +       u.ptr = virt_to_machine(ptr);
16687 +       u.val = pte_val_ma(val);
16688 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16689 +}
16690 +
16691 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
16692 +{
16693 +       mmu_update_t u;
16694 +       u.ptr = virt_to_machine(ptr);
16695 +       u.val = pmd_val_ma(val);
16696 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16697 +}
16698 +
16699 +#ifdef CONFIG_X86_PAE
16700 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
16701 +{
16702 +       mmu_update_t u;
16703 +       u.ptr = virt_to_machine(ptr);
16704 +       u.val = pud_val_ma(val);
16705 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16706 +}
16707 +#endif
16708 +
16709 +#ifdef CONFIG_X86_64
16710 +void xen_l3_entry_update(pud_t *ptr, pud_t val)
16711 +{
16712 +       mmu_update_t u;
16713 +       u.ptr = virt_to_machine(ptr);
16714 +       u.val = val.pud;
16715 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16716 +}
16717 +
16718 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
16719 +{
16720 +       mmu_update_t u;
16721 +       u.ptr = virt_to_machine(ptr);
16722 +       u.val = val.pgd;
16723 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16724 +}
16725 +#endif /* CONFIG_X86_64 */
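
/*
 * Hypothetical sketch (not part of the patch): HYPERVISOR_mmu_update()
 * takes an array and a count, so several updates can be batched into a
 * single hypercall instead of one call per entry as in
 * xen_l1_entry_update() above.  The helper name and the fixed array
 * size are invented for illustration.
 */
static void xen_l1_entry_update_batch(pte_t *ptrs[], pte_t vals[], int n)
{
        mmu_update_t u[16];     /* assume n <= 16 for this sketch */
        int i;

        for (i = 0; i < n; i++) {
                u[i].ptr = virt_to_machine(ptrs[i]);
                u[i].val = pte_val_ma(vals[i]);
        }
        BUG_ON(HYPERVISOR_mmu_update(u, n, NULL, DOMID_SELF) < 0);
}
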
16726 +
16727 +void xen_machphys_update(unsigned long mfn, unsigned long pfn)
16728 +{
16729 +       mmu_update_t u;
16730 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
16731 +               BUG_ON(pfn != mfn);
16732 +               return;
16733 +       }
16734 +       u.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
16735 +       u.val = pfn;
16736 +       BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
16737 +}
16738 +
16739 +void xen_pt_switch(unsigned long ptr)
16740 +{
16741 +       struct mmuext_op op;
16742 +       op.cmd = MMUEXT_NEW_BASEPTR;
16743 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
16744 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16745 +}
16746 +
16747 +void xen_new_user_pt(unsigned long ptr)
16748 +{
16749 +       struct mmuext_op op;
16750 +       op.cmd = MMUEXT_NEW_USER_BASEPTR;
16751 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
16752 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16753 +}
16754 +
16755 +void xen_tlb_flush(void)
16756 +{
16757 +       struct mmuext_op op;
16758 +       op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
16759 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16760 +}
16761 +
16762 +void xen_invlpg(unsigned long ptr)
16763 +{
16764 +       struct mmuext_op op;
16765 +       op.cmd = MMUEXT_INVLPG_LOCAL;
16766 +       op.arg1.linear_addr = ptr & PAGE_MASK;
16767 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16768 +}
16769 +
16770 +#ifdef CONFIG_SMP
16771 +
16772 +void xen_tlb_flush_all(void)
16773 +{
16774 +       struct mmuext_op op;
16775 +       op.cmd = MMUEXT_TLB_FLUSH_ALL;
16776 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16777 +}
16778 +
16779 +void xen_tlb_flush_mask(cpumask_t *mask)
16780 +{
16781 +       struct mmuext_op op;
16782 +       if ( cpus_empty(*mask) )
16783 +               return;
16784 +       op.cmd = MMUEXT_TLB_FLUSH_MULTI;
16785 +       op.arg2.vcpumask = mask->bits;
16786 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16787 +}
16788 +
16789 +void xen_invlpg_all(unsigned long ptr)
16790 +{
16791 +       struct mmuext_op op;
16792 +       op.cmd = MMUEXT_INVLPG_ALL;
16793 +       op.arg1.linear_addr = ptr & PAGE_MASK;
16794 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16795 +}
16796 +
16797 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
16798 +{
16799 +       struct mmuext_op op;
16800 +       if ( cpus_empty(*mask) )
16801 +               return;
16802 +       op.cmd = MMUEXT_INVLPG_MULTI;
16803 +       op.arg1.linear_addr = ptr & PAGE_MASK;
16804 +       op.arg2.vcpumask    = mask->bits;
16805 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16806 +}
16807 +
16808 +#endif /* CONFIG_SMP */
16809 +
16810 +void xen_pgd_pin(unsigned long ptr)
16811 +{
16812 +       struct mmuext_op op;
16813 +#ifdef CONFIG_X86_64
16814 +       op.cmd = MMUEXT_PIN_L4_TABLE;
16815 +#elif defined(CONFIG_X86_PAE)
16816 +       op.cmd = MMUEXT_PIN_L3_TABLE;
16817 +#else
16818 +       op.cmd = MMUEXT_PIN_L2_TABLE;
16819 +#endif
16820 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
16821 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16822 +}
16823 +
16824 +void xen_pgd_unpin(unsigned long ptr)
16825 +{
16826 +       struct mmuext_op op;
16827 +       op.cmd = MMUEXT_UNPIN_TABLE;
16828 +       op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
16829 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16830 +}
16831 +
16832 +void xen_set_ldt(unsigned long ptr, unsigned long len)
16833 +{
16834 +       struct mmuext_op op;
16835 +       op.cmd = MMUEXT_SET_LDT;
16836 +       op.arg1.linear_addr = ptr;
16837 +       op.arg2.nr_ents     = len;
16838 +       BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
16839 +}
16840 +
16841 +/*
16842 + * Bitmap is indexed by page number. If bit is set, the page is part of a
16843 + * xen_create_contiguous_region() area of memory.
16844 + */
16845 +unsigned long *contiguous_bitmap;
16846 +
16847 +static void contiguous_bitmap_set(
16848 +       unsigned long first_page, unsigned long nr_pages)
16849 +{
16850 +       unsigned long start_off, end_off, curr_idx, end_idx;
16851 +
16852 +       curr_idx  = first_page / BITS_PER_LONG;
16853 +       start_off = first_page & (BITS_PER_LONG-1);
16854 +       end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
16855 +       end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);
16856 +
16857 +       if (curr_idx == end_idx) {
16858 +               contiguous_bitmap[curr_idx] |=
16859 +                       ((1UL<<end_off)-1) & -(1UL<<start_off);
16860 +       } else {
16861 +               contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
16862 +               while ( ++curr_idx < end_idx )
16863 +                       contiguous_bitmap[curr_idx] = ~0UL;
16864 +               contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
16865 +       }
16866 +}
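
/*
 * Hypothetical demo (not part of the patch): the mask expression used
 * by contiguous_bitmap_set() when the range starts and ends inside the
 * same word.  ((1UL<<end)-1) keeps the bits below 'end'; -(1UL<<start)
 * is a mask of the bits at or above 'start'; ANDed together they
 * select exactly the bits in [start, end).
 */
#include <stdio.h>

int main(void)
{
        unsigned long start_off = 3, end_off = 9;
        unsigned long mask = ((1UL << end_off) - 1) & -(1UL << start_off);

        printf("bits [%lu,%lu) -> mask %#lx\n", start_off, end_off, mask);
        /* prints: bits [3,9) -> mask 0x1f8 */
        return 0;
}
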
16867 +
16868 +static void contiguous_bitmap_clear(
16869 +       unsigned long first_page, unsigned long nr_pages)
16870 +{
16871 +       unsigned long start_off, end_off, curr_idx, end_idx;
16872 +
16873 +       curr_idx  = first_page / BITS_PER_LONG;
16874 +       start_off = first_page & (BITS_PER_LONG-1);
16875 +       end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
16876 +       end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);
16877 +
16878 +       if (curr_idx == end_idx) {
16879 +               contiguous_bitmap[curr_idx] &=
16880 +                       -(1UL<<end_off) | ((1UL<<start_off)-1);
16881 +       } else {
16882 +               contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
16883 +               while ( ++curr_idx != end_idx )
16884 +                       contiguous_bitmap[curr_idx] = 0;
16885 +               contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
16886 +       }
16887 +}
16888 +
16889 +/* Ensure multi-page extents are contiguous in machine memory. */
16890 +int xen_create_contiguous_region(
16891 +       unsigned long vstart, unsigned int order, unsigned int address_bits)
16892 +{
16893 +       pgd_t         *pgd; 
16894 +       pud_t         *pud; 
16895 +       pmd_t         *pmd;
16896 +       pte_t         *pte;
16897 +       unsigned long  frame, i, flags;
16898 +       struct xen_memory_reservation reservation = {
16899 +               .extent_start = &frame,
16900 +               .nr_extents   = 1,
16901 +               .extent_order = 0,
16902 +               .domid        = DOMID_SELF
16903 +       };
16904 +
16905 +       /*
16906 +        * Currently an auto-translated guest will not perform I/O, nor will
16907 +        * it require PAE page directories below 4GB. Therefore any calls to
16908 +        * this function are redundant and can be ignored.
16909 +        */
16910 +       if (xen_feature(XENFEAT_auto_translated_physmap))
16911 +               return 0;
16912 +
16913 +       scrub_pages(vstart, 1 << order);
16914 +
16915 +       balloon_lock(flags);
16916 +
16917 +       /* 1. Zap current PTEs, giving away the underlying pages. */
16918 +       for (i = 0; i < (1<<order); i++) {
16919 +               pgd = pgd_offset_k(vstart + (i*PAGE_SIZE));
16920 +               pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE)));
16921 +               pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE)));
16922 +               pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
16923 +               frame = pte_mfn(*pte);
16924 +               BUG_ON(HYPERVISOR_update_va_mapping(
16925 +                       vstart + (i*PAGE_SIZE), __pte_ma(0), 0));
16926 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
16927 +                       INVALID_P2M_ENTRY);
16928 +               BUG_ON(HYPERVISOR_memory_op(
16929 +                       XENMEM_decrease_reservation, &reservation) != 1);
16930 +       }
16931 +
16932 +       /* 2. Get a new contiguous memory extent. */
16933 +       reservation.extent_order = order;
16934 +       reservation.address_bits = address_bits;
16935 +       frame = __pa(vstart) >> PAGE_SHIFT;
16936 +       if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
16937 +                                &reservation) != 1)
16938 +               goto fail;
16939 +
16940 +       /* 3. Map the new extent in place of old pages. */
16941 +       for (i = 0; i < (1<<order); i++) {
16942 +               BUG_ON(HYPERVISOR_update_va_mapping(
16943 +                       vstart + (i*PAGE_SIZE),
16944 +                       pfn_pte_ma(frame+i, PAGE_KERNEL), 0));
16945 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame+i);
16946 +       }
16947 +
16948 +       flush_tlb_all();
16949 +
16950 +       contiguous_bitmap_set(__pa(vstart) >> PAGE_SHIFT, 1UL << order);
16951 +
16952 +       balloon_unlock(flags);
16953 +
16954 +       return 0;
16955 +
16956 + fail:
16957 +       reservation.extent_order = 0;
16958 +       reservation.address_bits = 0;
16959 +
16960 +       for (i = 0; i < (1<<order); i++) {
16961 +               frame = (__pa(vstart) >> PAGE_SHIFT) + i;
16962 +               BUG_ON(HYPERVISOR_memory_op(
16963 +                       XENMEM_populate_physmap, &reservation) != 1);
16964 +               BUG_ON(HYPERVISOR_update_va_mapping(
16965 +                       vstart + (i*PAGE_SIZE),
16966 +                       pfn_pte_ma(frame, PAGE_KERNEL), 0));
16967 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
16968 +       }
16969 +
16970 +       flush_tlb_all();
16971 +
16972 +       balloon_unlock(flags);
16973 +
16974 +       return -ENOMEM;
16975 +}
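
/*
 * Hypothetical caller sketch (not part of the patch): a driver needing
 * a machine-contiguous buffer below 4GB for DMA could combine
 * __get_free_pages() with the frame exchange implemented above.  The
 * helper name alloc_dma_contiguous is invented for illustration.
 */
#include <linux/mm.h>
#include <asm/hypervisor.h>

static void *alloc_dma_contiguous(unsigned int order)
{
        unsigned long vstart = __get_free_pages(GFP_KERNEL, order);

        if (!vstart)
                return NULL;
        /* Exchange the backing frames for one machine-contiguous
         * extent addressable with 32 bits. */
        if (xen_create_contiguous_region(vstart, order, 32) != 0) {
                free_pages(vstart, order);
                return NULL;
        }
        return (void *)vstart;
}
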
16976 +
16977 +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
16978 +{
16979 +       pgd_t         *pgd; 
16980 +       pud_t         *pud; 
16981 +       pmd_t         *pmd;
16982 +       pte_t         *pte;
16983 +       unsigned long  frame, i, flags;
16984 +       struct xen_memory_reservation reservation = {
16985 +               .extent_start = &frame,
16986 +               .nr_extents   = 1,
16987 +               .extent_order = 0,
16988 +               .domid        = DOMID_SELF
16989 +       };
16990 +
16991 +       if (xen_feature(XENFEAT_auto_translated_physmap))
16992 +               return;
16993 +
16994 +       scrub_pages(vstart, 1 << order);
16995 +
16996 +       balloon_lock(flags);
16997 +
16998 +       contiguous_bitmap_clear(__pa(vstart) >> PAGE_SHIFT, 1UL << order);
16999 +
17000 +       /* 1. Zap current PTEs, giving away the underlying pages. */
17001 +       for (i = 0; i < (1<<order); i++) {
17002 +               pgd = pgd_offset_k(vstart + (i*PAGE_SIZE));
17003 +               pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE)));
17004 +               pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE)));
17005 +               pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
17006 +               frame = pte_mfn(*pte);
17007 +               BUG_ON(HYPERVISOR_update_va_mapping(
17008 +                       vstart + (i*PAGE_SIZE), __pte_ma(0), 0));
17009 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
17010 +                       INVALID_P2M_ENTRY);
17011 +               BUG_ON(HYPERVISOR_memory_op(
17012 +                       XENMEM_decrease_reservation, &reservation) != 1);
17013 +       }
17014 +
17015 +       /* 2. Map new pages in place of old pages. */
17016 +       for (i = 0; i < (1<<order); i++) {
17017 +               frame = (__pa(vstart) >> PAGE_SHIFT) + i;
17018 +               BUG_ON(HYPERVISOR_memory_op(
17019 +                       XENMEM_populate_physmap, &reservation) != 1);
17020 +               BUG_ON(HYPERVISOR_update_va_mapping(
17021 +                       vstart + (i*PAGE_SIZE),
17022 +                       pfn_pte_ma(frame, PAGE_KERNEL), 0));
17023 +               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
17024 +       }
17025 +
17026 +       flush_tlb_all();
17027 +
17028 +       balloon_unlock(flags);
17029 +}
17030 +
17031 +#ifdef __i386__
17032 +int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
17033 +{
17034 +       __u32 *lp = (__u32 *)((char *)ldt + entry * 8);
17035 +       maddr_t mach_lp = arbitrary_virt_to_machine(lp);
17036 +       return HYPERVISOR_update_descriptor(
17037 +               mach_lp, (u64)entry_a | ((u64)entry_b<<32));
17038 +}
17039 +#endif
17040 +
17041 +/*
17042 + * Local variables:
17043 + *  c-file-style: "linux"
17044 + *  indent-tabs-mode: t
17045 + *  c-indent-level: 8
17046 + *  c-basic-offset: 8
17047 + *  tab-width: 8
17048 + * End:
17049 + */
17050 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/mm/init-xen.c linux-2.6.16/arch/i386/mm/init-xen.c
17051 --- linux-2.6.16.orig/arch/i386/mm/init-xen.c   1970-01-01 01:00:00.000000000 +0100
17052 +++ linux-2.6.16/arch/i386/mm/init-xen.c        2006-06-26 09:51:32.000000000 +0200
17053 @@ -0,0 +1,854 @@
17054 +/*
17055 + *  linux/arch/i386/mm/init.c
17056 + *
17057 + *  Copyright (C) 1995  Linus Torvalds
17058 + *
17059 + *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
17060 + */
17061 +
17062 +#include <linux/config.h>
17063 +#include <linux/module.h>
17064 +#include <linux/signal.h>
17065 +#include <linux/sched.h>
17066 +#include <linux/kernel.h>
17067 +#include <linux/errno.h>
17068 +#include <linux/string.h>
17069 +#include <linux/types.h>
17070 +#include <linux/ptrace.h>
17071 +#include <linux/mman.h>
17072 +#include <linux/mm.h>
17073 +#include <linux/hugetlb.h>
17074 +#include <linux/swap.h>
17075 +#include <linux/smp.h>
17076 +#include <linux/init.h>
17077 +#include <linux/highmem.h>
17078 +#include <linux/pagemap.h>
17079 +#include <linux/bootmem.h>
17080 +#include <linux/slab.h>
17081 +#include <linux/proc_fs.h>
17082 +#include <linux/efi.h>
17083 +#include <linux/memory_hotplug.h>
17084 +#include <linux/initrd.h>
17085 +#include <linux/dma-mapping.h>
17086 +#include <linux/scatterlist.h>
17087 +
17088 +#include <asm/processor.h>
17089 +#include <asm/system.h>
17090 +#include <asm/uaccess.h>
17091 +#include <asm/pgtable.h>
17092 +#include <asm/dma.h>
17093 +#include <asm/fixmap.h>
17094 +#include <asm/e820.h>
17095 +#include <asm/apic.h>
17096 +#include <asm/tlb.h>
17097 +#include <asm/tlbflush.h>
17098 +#include <asm/sections.h>
17099 +#include <asm/hypervisor.h>
17100 +#include <asm/swiotlb.h>
17101 +
17102 +extern unsigned long *contiguous_bitmap;
17103 +
17104 +unsigned int __VMALLOC_RESERVE = 128 << 20;
17105 +
17106 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
17107 +unsigned long highstart_pfn, highend_pfn;
17108 +
17109 +static int noinline do_test_wp_bit(void);
17110 +
17111 +/*
17112 + * Creates a middle page table and puts a pointer to it in the
17113 + * given global directory entry. This only returns the gd entry
17114 + * in non-PAE compilation mode, since the middle layer is folded.
17115 + */
17116 +static pmd_t * __init one_md_table_init(pgd_t *pgd)
17117 +{
17118 +       pud_t *pud;
17119 +       pmd_t *pmd_table;
17120 +               
17121 +#ifdef CONFIG_X86_PAE
17122 +       pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
17123 +       make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
17124 +       set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
17125 +       pud = pud_offset(pgd, 0);
17126 +       if (pmd_table != pmd_offset(pud, 0)) 
17127 +               BUG();
17128 +#else
17129 +       pud = pud_offset(pgd, 0);
17130 +       pmd_table = pmd_offset(pud, 0);
17131 +#endif
17132 +
17133 +       return pmd_table;
17134 +}
17135 +
17136 +/*
17137 + * Create a page table and place a pointer to it in a middle page
17138 + * directory entry.
17139 + */
17140 +static pte_t * __init one_page_table_init(pmd_t *pmd)
17141 +{
17142 +       if (pmd_none(*pmd)) {
17143 +               pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
17144 +               make_lowmem_page_readonly(page_table,
17145 +                                         XENFEAT_writable_page_tables);
17146 +               set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
17147 +               if (page_table != pte_offset_kernel(pmd, 0))
17148 +                       BUG();  
17149 +
17150 +               return page_table;
17151 +       }
17152 +       
17153 +       return pte_offset_kernel(pmd, 0);
17154 +}
17155 +
17156 +/*
17157 + * This function initializes a certain range of kernel virtual memory
17158 + * with new bootmem page tables, wherever page tables are missing in
17159 + * the given range.
17160 + */
17161 +
17162 +/*
17163 + * NOTE: The page tables are allocated contiguously in physical space,
17164 + * so we can cache the place of the first one and move around without
17165 + * checking the pgd every time.
17166 + */
17167 +static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
17168 +{
17169 +       pgd_t *pgd;
17170 +       pud_t *pud;
17171 +       pmd_t *pmd;
17172 +       int pgd_idx, pmd_idx;
17173 +       unsigned long vaddr;
17174 +
17175 +       vaddr = start;
17176 +       pgd_idx = pgd_index(vaddr);
17177 +       pmd_idx = pmd_index(vaddr);
17178 +       pgd = pgd_base + pgd_idx;
17179 +
17180 +       for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
17181 +               if (pgd_none(*pgd)) 
17182 +                       one_md_table_init(pgd);
17183 +               pud = pud_offset(pgd, vaddr);
17184 +               pmd = pmd_offset(pud, vaddr);
17185 +               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
17186 +                       if (vaddr < HYPERVISOR_VIRT_START && pmd_none(*pmd)) 
17187 +                               one_page_table_init(pmd);
17188 +
17189 +                       vaddr += PMD_SIZE;
17190 +               }
17191 +               pmd_idx = 0;
17192 +       }
17193 +}
17194 +
17195 +static inline int is_kernel_text(unsigned long addr)
17196 +{
17197 +       if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
17198 +               return 1;
17199 +       return 0;
17200 +}
17201 +
17202 +/*
17203 + * This maps the physical memory to kernel virtual address space, a total 
17204 + * of max_low_pfn pages, by creating page tables starting from address 
17205 + * PAGE_OFFSET.
17206 + */
17207 +static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
17208 +{
17209 +       unsigned long pfn;
17210 +       pgd_t *pgd;
17211 +       pmd_t *pmd;
17212 +       pte_t *pte;
17213 +       int pgd_idx, pmd_idx, pte_ofs;
17214 +
17215 +       unsigned long max_ram_pfn = xen_start_info->nr_pages;
17216 +       if (max_ram_pfn > max_low_pfn)
17217 +               max_ram_pfn = max_low_pfn;
17218 +
17219 +       pgd_idx = pgd_index(PAGE_OFFSET);
17220 +       pgd = pgd_base + pgd_idx;
17221 +       pfn = 0;
17222 +       pmd_idx = pmd_index(PAGE_OFFSET);
17223 +       pte_ofs = pte_index(PAGE_OFFSET);
17224 +
17225 +       for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
17226 +#ifdef CONFIG_XEN
17227 +               /*
17228 +                * Native Linux doesn't have PAE paging enabled yet
17229 +                * at this point.  When running as a Xen domain we
17230 +                * are already in PAE mode, so we can't simply hook
17231 +                * in an empty pmd.  That would kill the mappings we
17232 +                * are currently using ...
17233 +                */
17234 +               pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
17235 +#else
17236 +               pmd = one_md_table_init(pgd);
17237 +#endif
17238 +               if (pfn >= max_low_pfn)
17239 +                       continue;
17240 +               pmd += pmd_idx;
17241 +               for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
17242 +                       unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
17243 +                       if (address >= HYPERVISOR_VIRT_START)
17244 +                               continue;
17245 +
17246 +                       /* Map with big pages if possible, otherwise create normal page tables. */
17247 +                       if (cpu_has_pse) {
17248 +                               unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
17249 +
17250 +                               if (is_kernel_text(address) || is_kernel_text(address2))
17251 +                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
17252 +                               else
17253 +                                       set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
17254 +                               pfn += PTRS_PER_PTE;
17255 +                       } else {
17256 +                               pte = one_page_table_init(pmd);
17257 +
17258 +                               pte += pte_ofs;
17259 +                               for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
17260 +                                               /* XEN: Only map initial RAM allocation. */
17261 +                                               if ((pfn >= max_ram_pfn) || pte_present(*pte))
17262 +                                                       continue;
17263 +                                               if (is_kernel_text(address))
17264 +                                                       set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
17265 +                                               else
17266 +                                                       set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
17267 +                               }
17268 +                               pte_ofs = 0;
17269 +                       }
17270 +               }
17271 +               pmd_idx = 0;
17272 +       }
17273 +}
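
/*
 * Hypothetical demo (not part of the patch): the pfn-to-virtual
 * translation kernel_physical_mapping_init() relies on, assuming the
 * usual i386 PAGE_OFFSET of 0xC0000000 and 4 KiB pages.
 */
#include <stdio.h>

#define PAGE_SHIFT  12
#define PAGE_SIZE   (1UL << PAGE_SHIFT)
#define PAGE_OFFSET 0xC0000000UL        /* assumed default 3G/1G split */

int main(void)
{
        unsigned long pfn = 0x1234;
        unsigned long address = pfn * PAGE_SIZE + PAGE_OFFSET;

        printf("pfn %#lx maps at va %#lx\n", pfn, address);
        /* With PSE, one pmd entry covers PTRS_PER_PTE (1024) pfns,
         * i.e. a 4 MiB large page. */
        return 0;
}
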
17274 +
17275 +#ifndef CONFIG_XEN
17276 +
17277 +static inline int page_kills_ppro(unsigned long pagenr)
17278 +{
17279 +       if (pagenr >= 0x70000 && pagenr <= 0x7003F)
17280 +               return 1;
17281 +       return 0;
17282 +}
17283 +
17284 +extern int is_available_memory(efi_memory_desc_t *);
17285 +
17286 +int page_is_ram(unsigned long pagenr)
17287 +{
17288 +       int i;
17289 +       unsigned long addr, end;
17290 +
17291 +       if (efi_enabled) {
17292 +               efi_memory_desc_t *md;
17293 +               void *p;
17294 +
17295 +               for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
17296 +                       md = p;
17297 +                       if (!is_available_memory(md))
17298 +                               continue;
17299 +                       addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
17300 +                       end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
17301 +
17302 +                       if ((pagenr >= addr) && (pagenr < end))
17303 +                               return 1;
17304 +               }
17305 +               return 0;
17306 +       }
17307 +
17308 +       for (i = 0; i < e820.nr_map; i++) {
17309 +
17310 +               if (e820.map[i].type != E820_RAM)       /* not usable memory */
17311 +                       continue;
17312 +               /*
17313 +                *      !!!FIXME!!! Some BIOSen report areas as RAM that
17314 +                *      are not. Notably the 640->1Mb area. We need a sanity
17315 +                *      check here.
17316 +                */
17317 +               addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
17318 +               end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
17319 +               if  ((pagenr >= addr) && (pagenr < end))
17320 +                       return 1;
17321 +       }
17322 +       return 0;
17323 +}
17324 +
17325 +#else /* CONFIG_XEN */
17326 +
17327 +#define page_kills_ppro(p)     0
17328 +#define page_is_ram(p)         1
17329 +
17330 +#endif
17331 +
17332 +#ifdef CONFIG_HIGHMEM
17333 +pte_t *kmap_pte;
17334 +pgprot_t kmap_prot;
17335 +
17336 +#define kmap_get_fixmap_pte(vaddr)                                     \
17337 +       pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
17338 +
17339 +static void __init kmap_init(void)
17340 +{
17341 +       unsigned long kmap_vstart;
17342 +
17343 +       /* cache the first kmap pte */
17344 +       kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
17345 +       kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
17346 +
17347 +       kmap_prot = PAGE_KERNEL;
17348 +}
17349 +
17350 +static void __init permanent_kmaps_init(pgd_t *pgd_base)
17351 +{
17352 +       pgd_t *pgd;
17353 +       pud_t *pud;
17354 +       pmd_t *pmd;
17355 +       pte_t *pte;
17356 +       unsigned long vaddr;
17357 +
17358 +       vaddr = PKMAP_BASE;
17359 +       page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
17360 +
17361 +       pgd = swapper_pg_dir + pgd_index(vaddr);
17362 +       pud = pud_offset(pgd, vaddr);
17363 +       pmd = pmd_offset(pud, vaddr);
17364 +       pte = pte_offset_kernel(pmd, vaddr);
17365 +       pkmap_page_table = pte; 
17366 +}
17367 +
17368 +static void __meminit free_new_highpage(struct page *page, int pfn)
17369 +{
17370 +       set_page_count(page, 1);
17371 +       if (pfn < xen_start_info->nr_pages)
17372 +               __free_page(page);
17373 +       totalhigh_pages++;
17374 +}
17375 +
17376 +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
17377 +{
17378 +       if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
17379 +               ClearPageReserved(page);
17380 +               free_new_highpage(page, pfn);
17381 +       } else
17382 +               SetPageReserved(page);
17383 +}
17384 +
17385 +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
17386 +{
17387 +       free_new_highpage(page, pfn);
17388 +       totalram_pages++;
17389 +#ifdef CONFIG_FLATMEM
17390 +       max_mapnr = max(pfn, max_mapnr);
17391 +#endif
17392 +       num_physpages++;
17393 +       return 0;
17394 +}
17395 +
17396 +/*
17397 + * Not currently handling the NUMA case.
17398 + * Assuming a single node, and that all memory
17399 + * added dynamically and onlined here is in
17400 + * HIGHMEM.
17401 + */
17402 +void online_page(struct page *page)
17403 +{
17404 +       ClearPageReserved(page);
17405 +       add_one_highpage_hotplug(page, page_to_pfn(page));
17406 +}
17407 +
17408 +
17409 +#ifdef CONFIG_NUMA
17410 +extern void set_highmem_pages_init(int);
17411 +#else
17412 +static void __init set_highmem_pages_init(int bad_ppro)
17413 +{
17414 +       int pfn;
17415 +       for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
17416 +               add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
17417 +       totalram_pages += totalhigh_pages;
17418 +}
17419 +#endif /* CONFIG_NUMA */
17420 +
17421 +#else
17422 +#define kmap_init() do { } while (0)
17423 +#define permanent_kmaps_init(pgd_base) do { } while (0)
17424 +#define set_highmem_pages_init(bad_ppro) do { } while (0)
17425 +#endif /* CONFIG_HIGHMEM */
17426 +
17427 +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
17428 +EXPORT_SYMBOL(__PAGE_KERNEL);
17429 +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
17430 +
17431 +#ifdef CONFIG_NUMA
17432 +extern void __init remap_numa_kva(void);
17433 +#else
17434 +#define remap_numa_kva() do {} while (0)
17435 +#endif
17436 +
17437 +pgd_t *swapper_pg_dir;
17438 +
17439 +static void __init pagetable_init (void)
17440 +{
17441 +       unsigned long vaddr;
17442 +       pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
17443 +
17444 +       swapper_pg_dir = pgd_base;
17445 +       init_mm.pgd    = pgd_base;
17446 +
17447 +       /* Enable PSE if available */
17448 +       if (cpu_has_pse) {
17449 +               set_in_cr4(X86_CR4_PSE);
17450 +       }
17451 +
17452 +       /* Enable PGE if available */
17453 +       if (cpu_has_pge) {
17454 +               set_in_cr4(X86_CR4_PGE);
17455 +               __PAGE_KERNEL |= _PAGE_GLOBAL;
17456 +               __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
17457 +       }
17458 +
17459 +       kernel_physical_mapping_init(pgd_base);
17460 +       remap_numa_kva();
17461 +
17462 +       /*
17463 +        * Fixed mappings, only the page table structure has to be
17464 +        * created - mappings will be set by set_fixmap():
17465 +        */
17466 +       vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
17467 +       page_table_range_init(vaddr, 0, pgd_base);
17468 +
17469 +       permanent_kmaps_init(pgd_base);
17470 +}
17471 +
17472 +#ifdef CONFIG_SOFTWARE_SUSPEND
17473 +/*
17474 + * Swap suspend & friends need this for resume because things like the intel-agp
17475 + * driver might have split up a kernel 4MB mapping.
17476 + */
17477 +char __nosavedata swsusp_pg_dir[PAGE_SIZE]
17478 +       __attribute__ ((aligned (PAGE_SIZE)));
17479 +
17480 +static inline void save_pg_dir(void)
17481 +{
17482 +       memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
17483 +}
17484 +#else
17485 +static inline void save_pg_dir(void)
17486 +{
17487 +}
17488 +#endif
17489 +
17490 +void zap_low_mappings (void)
17491 +{
17492 +       int i;
17493 +
17494 +       save_pg_dir();
17495 +
17496 +       /*
17497 +        * Zap initial low-memory mappings.
17498 +        *
17499 +        * Note that "pgd_clear()" doesn't do it for
17500 +        * us, because pgd_clear() is a no-op on i386.
17501 +        */
17502 +       for (i = 0; i < USER_PTRS_PER_PGD; i++)
17503 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
17504 +               set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
17505 +#else
17506 +               set_pgd(swapper_pg_dir+i, __pgd(0));
17507 +#endif
17508 +       flush_tlb_all();
17509 +}
17510 +
17511 +static int disable_nx __initdata = 0;
17512 +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
17513 +EXPORT_SYMBOL(__supported_pte_mask);
17514 +
17515 +/*
17516 + * noexec = on|off
17517 + *
17518 + * Control non executable mappings.
17519 + *
17520 + * on      Enable
17521 + * off     Disable
17522 + */
17523 +void __init noexec_setup(const char *str)
17524 +{
17525 +       if (!strncmp(str, "on",2) && cpu_has_nx) {
17526 +               __supported_pte_mask |= _PAGE_NX;
17527 +               disable_nx = 0;
17528 +       } else if (!strncmp(str,"off",3)) {
17529 +               disable_nx = 1;
17530 +               __supported_pte_mask &= ~_PAGE_NX;
17531 +       }
17532 +}
17533 +
17534 +int nx_enabled = 0;
17535 +#ifdef CONFIG_X86_PAE
17536 +
17537 +static void __init set_nx(void)
17538 +{
17539 +       unsigned int v[4], l, h;
17540 +
17541 +       if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
17542 +               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
17543 +               if ((v[3] & (1 << 20)) && !disable_nx) {
17544 +                       rdmsr(MSR_EFER, l, h);
17545 +                       l |= EFER_NX;
17546 +                       wrmsr(MSR_EFER, l, h);
17547 +                       nx_enabled = 1;
17548 +                       __supported_pte_mask |= _PAGE_NX;
17549 +               }
17550 +       }
17551 +}
17552 +
17553 +/*
17554 + * Enables/disables executability of a given kernel page and
17555 + * returns the previous setting.
17556 + */
17557 +int __init set_kernel_exec(unsigned long vaddr, int enable)
17558 +{
17559 +       pte_t *pte;
17560 +       int ret = 1;
17561 +
17562 +       if (!nx_enabled)
17563 +               goto out;
17564 +
17565 +       pte = lookup_address(vaddr);
17566 +       BUG_ON(!pte);
17567 +
17568 +       if (!pte_exec_kernel(*pte))
17569 +               ret = 0;
17570 +
17571 +       if (enable)
17572 +               pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
17573 +       else
17574 +               pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
17575 +       __flush_tlb_all();
17576 +out:
17577 +       return ret;
17578 +}
17579 +
17580 +#endif
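
/*
 * Hypothetical demo (not part of the patch): where the NX bit lives in
 * a PAE pte.  _PAGE_BIT_NX is bit 63 of the 64-bit entry, so in the
 * split pte_low/pte_high representation used by set_kernel_exec()
 * above it is bit 31 of pte_high.
 */
#include <stdio.h>

#define _PAGE_BIT_NX 63

int main(void)
{
        unsigned int high_bit = _PAGE_BIT_NX - 32;

        printf("NX is bit %u of pte_high (mask %#x)\n",
               high_bit, 1u << high_bit);
        /* prints: NX is bit 31 of pte_high (mask 0x80000000) */
        return 0;
}
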
17581 +
17582 +/*
17583 + * paging_init() sets up the page tables - note that the first 8MB are
17584 + * already mapped by head.S.
17585 + *
17586 + * This routine also unmaps the page at virtual kernel address 0, so
17587 + * that we can trap those pesky NULL-reference errors in the kernel.
17588 + */
17589 +void __init paging_init(void)
17590 +{
17591 +       int i;
17592 +
17593 +#ifdef CONFIG_X86_PAE
17594 +       set_nx();
17595 +       if (nx_enabled)
17596 +               printk("NX (Execute Disable) protection: active\n");
17597 +#endif
17598 +
17599 +       pagetable_init();
17600 +
17601 +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
17602 +       /*
17603 +        * We will bail out later - printk doesn't work right now so
17604 +        * the user would just see a hanging kernel.
17605 +        * When running as a Xen domain we are already in PAE mode at
17606 +        * this point.
17607 +        */
17608 +       if (cpu_has_pae)
17609 +               set_in_cr4(X86_CR4_PAE);
17610 +#endif
17611 +       __flush_tlb_all();
17612 +
17613 +       kmap_init();
17614 +
17615 +       if (!xen_feature(XENFEAT_auto_translated_physmap) ||
17616 +           xen_start_info->shared_info >= xen_start_info->nr_pages) {
17617 +               /* Switch to the real shared_info page, and clear the
17618 +                * dummy page. */
17619 +               set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
17620 +               HYPERVISOR_shared_info =
17621 +                       (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
17622 +               memset(empty_zero_page, 0, sizeof(empty_zero_page));
17623 +       }
17624 +
17625 +       /* Setup mapping of lower 1st MB */
17626 +       for (i = 0; i < NR_FIX_ISAMAPS; i++)
17627 +               if (xen_start_info->flags & SIF_PRIVILEGED)
17628 +                       set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
17629 +               else
17630 +                       __set_fixmap(FIX_ISAMAP_BEGIN - i,
17631 +                                    virt_to_machine(empty_zero_page),
17632 +                                    PAGE_KERNEL_RO);
17633 +}
17634 +
17635 +/*
17636 + * Test if the WP bit works in supervisor mode. It isn't supported on 386s
17637 + * and also on some strange 486s (NexGen etc.). All 586+ CPUs are OK. This
17638 + * used to involve black magic jumps to work around some nasty CPU bugs,
17639 + * but fortunately the switch to using exceptions got rid of all that.
17640 + */
17641 +
17642 +static void __init test_wp_bit(void)
17643 +{
17644 +       printk("Checking if this processor honours the WP bit even in supervisor mode... ");
17645 +
17646 +       /* Any page-aligned address will do, the test is non-destructive */
17647 +       __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
17648 +       boot_cpu_data.wp_works_ok = do_test_wp_bit();
17649 +       clear_fixmap(FIX_WP_TEST);
17650 +
17651 +       if (!boot_cpu_data.wp_works_ok) {
17652 +               printk("No.\n");
17653 +#ifdef CONFIG_X86_WP_WORKS_OK
17654 +               panic("This kernel doesn't support CPUs with broken WP. Recompile it for a 386!");
17655 +#endif
17656 +       } else {
17657 +               printk("Ok.\n");
17658 +       }
17659 +}
17660 +
17661 +static void __init set_max_mapnr_init(void)
17662 +{
17663 +#ifdef CONFIG_HIGHMEM
17664 +       num_physpages = highend_pfn;
17665 +#else
17666 +       num_physpages = max_low_pfn;
17667 +#endif
17668 +#ifdef CONFIG_FLATMEM
17669 +       max_mapnr = num_physpages;
17670 +#endif
17671 +}
17672 +
17673 +static struct kcore_list kcore_mem, kcore_vmalloc; 
17674 +
17675 +void __init mem_init(void)
17676 +{
17677 +       extern int ppro_with_ram_bug(void);
17678 +       int codesize, reservedpages, datasize, initsize;
17679 +       int tmp;
17680 +       int bad_ppro;
17681 +       unsigned long pfn;
17682 +
17683 +       contiguous_bitmap = alloc_bootmem_low_pages(
17684 +               (max_low_pfn + 2*BITS_PER_LONG) >> 3);
17685 +       BUG_ON(!contiguous_bitmap);
17686 +       memset(contiguous_bitmap, 0, (max_low_pfn + 2*BITS_PER_LONG) >> 3);
17687 +
17688 +#if defined(CONFIG_SWIOTLB)
17689 +       swiotlb_init(); 
17690 +#endif
17691 +
17692 +#ifdef CONFIG_FLATMEM
17693 +       if (!mem_map)
17694 +               BUG();
17695 +#endif
17696 +       
17697 +       bad_ppro = ppro_with_ram_bug();
17698 +
17699 +#ifdef CONFIG_HIGHMEM
17700 +       /* check that fixmap and pkmap do not overlap */
17701 +       if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
17702 +               printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
17703 +               printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
17704 +                               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
17705 +               BUG();
17706 +       }
17707 +#endif
17708 +
17709 +       set_max_mapnr_init();
17710 +
17711 +#ifdef CONFIG_HIGHMEM
17712 +       high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
17713 +#else
17714 +       high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
17715 +#endif
17716 +       printk("vmalloc area: %lx-%lx, maxmem %lx\n",
17717 +              VMALLOC_START, VMALLOC_END, MAXMEM);
17718 +       BUG_ON(VMALLOC_START > VMALLOC_END);
17719 +       
17720 +       /* this will put all low memory onto the freelists */
17721 +       totalram_pages += free_all_bootmem();
17722 +       /* XEN: init and count low-mem pages outside initial allocation. */
17723 +       for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
17724 +               ClearPageReserved(&mem_map[pfn]);
17725 +               set_page_count(&mem_map[pfn], 1);
17726 +               totalram_pages++;
17727 +       }
17728 +
17729 +       reservedpages = 0;
17730 +       for (tmp = 0; tmp < max_low_pfn; tmp++)
17731 +               /*
17732 +                * Only count reserved RAM pages
17733 +                */
17734 +               if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
17735 +                       reservedpages++;
17736 +
17737 +       set_highmem_pages_init(bad_ppro);
17738 +
17739 +       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
17740 +       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
17741 +       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
17742 +
17743 +       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
17744 +       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
17745 +                  VMALLOC_END-VMALLOC_START);
17746 +
17747 +       printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
17748 +               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
17749 +               num_physpages << (PAGE_SHIFT-10),
17750 +               codesize >> 10,
17751 +               reservedpages << (PAGE_SHIFT-10),
17752 +               datasize >> 10,
17753 +               initsize >> 10,
17754 +               (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
17755 +              );
17756 +
17757 +#ifdef CONFIG_X86_PAE
17758 +       if (!cpu_has_pae)
17759 +               panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
17760 +#endif
17761 +       if (boot_cpu_data.wp_works_ok < 0)
17762 +               test_wp_bit();
17763 +
17764 +       /*
17765 +        * Subtle. SMP is doing its boot stuff late (because it has to
17766 +        * fork idle threads) - but it also needs low mappings for the
17767 +        * protected-mode entry to work. We zap these entries only after
17768 +        * the WP-bit has been tested.
17769 +        */
17770 +#ifndef CONFIG_SMP
17771 +       zap_low_mappings();
17772 +#endif
17773 +
17774 +       set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
17775 +}
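
The "k" figures in mem_init()'s summary printk are page counts shifted by (PAGE_SHIFT - 10), i.e. pages converted to KiB. A trivial standalone illustration of the arithmetic, assuming 4 KiB pages as on i386:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumption: 4 KiB pages, as on i386 */

int main(void)
{
	unsigned long pages = 65536;	/* e.g. 256 MiB worth of pages */

	printf("%lu pages = %luk\n", pages, pages << (PAGE_SHIFT - 10));
	return 0;
}
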
17776 +
17777 +/*
17778 + * this is for the non-NUMA, single node SMP system case.
17779 + * Specifically, in the case of x86, we will always add
17780 + * memory to the highmem for now.
17781 + */
17782 +#ifndef CONFIG_NEED_MULTIPLE_NODES
17783 +int add_memory(u64 start, u64 size)
17784 +{
17785 +       struct pglist_data *pgdata = &contig_page_data;
17786 +       struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
17787 +       unsigned long start_pfn = start >> PAGE_SHIFT;
17788 +       unsigned long nr_pages = size >> PAGE_SHIFT;
17789 +
17790 +       return __add_pages(zone, start_pfn, nr_pages);
17791 +}
17792 +
17793 +int remove_memory(u64 start, u64 size)
17794 +{
17795 +       return -EINVAL;
17796 +}
17797 +#endif
17798 +
17799 +kmem_cache_t *pgd_cache;
17800 +kmem_cache_t *pmd_cache;
17801 +
17802 +void __init pgtable_cache_init(void)
17803 +{
17804 +       if (PTRS_PER_PMD > 1) {
17805 +               pmd_cache = kmem_cache_create("pmd",
17806 +                                       PTRS_PER_PMD*sizeof(pmd_t),
17807 +                                       PTRS_PER_PMD*sizeof(pmd_t),
17808 +                                       0,
17809 +                                       pmd_ctor,
17810 +                                       NULL);
17811 +               if (!pmd_cache)
17812 +                       panic("pgtable_cache_init(): cannot create pmd cache");
17813 +       }
17814 +       pgd_cache = kmem_cache_create("pgd",
17815 +#ifndef CONFIG_XEN
17816 +                               PTRS_PER_PGD*sizeof(pgd_t),
17817 +                               PTRS_PER_PGD*sizeof(pgd_t),
17818 +#else
17819 +                               PAGE_SIZE,
17820 +                               PAGE_SIZE,
17821 +#endif
17822 +                               0,
17823 +                               pgd_ctor,
17824 +                               pgd_dtor);
17825 +       if (!pgd_cache)
17826 +               panic("pgtable_cache_init(): Cannot create pgd cache");
17827 +}
17828 +
17829 +/*
17830 + * This function cannot be __init, since exceptions don't work in that
17831 + * section.  Put this after the callers, so that it cannot be inlined.
17832 + */
17833 +static int noinline do_test_wp_bit(void)
17834 +{
17835 +       char tmp_reg;
17836 +       int flag;
17837 +
17838 +       __asm__ __volatile__(
17839 +               "       movb %0,%1      \n"
17840 +               "1:     movb %1,%0      \n"
17841 +               "       xorl %2,%2      \n"
17842 +               "2:                     \n"
17843 +               ".section __ex_table,\"a\"\n"
17844 +               "       .align 4        \n"
17845 +               "       .long 1b,2b     \n"
17846 +               ".previous              \n"
17847 +               :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
17848 +                "=q" (tmp_reg),
17849 +                "=r" (flag)
17850 +               :"2" (1)
17851 +               :"memory");
17852 +       
17853 +       return flag;
17854 +}
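
do_test_wp_bit() simply attempts the possibly-faulting store and lets the __ex_table fixup resume execution at label 2, leaving the flag untouched. A userspace analogue of the same try-the-write pattern, using a read-only mapping and a SIGSEGV handler in place of the kernel's exception table (illustrative only, not kernel code):

#include <signal.h>
#include <setjmp.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static sigjmp_buf env;

static void on_segv(int sig)
{
	(void)sig;
	siglongjmp(env, 1);		/* plays the role of the __ex_table fixup */
}

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, pagesz, PROT_READ,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int wp_ok = 0;

	if (p == MAP_FAILED)
		return 1;
	signal(SIGSEGV, on_segv);
	if (sigsetjmp(env, 1) == 0)
		*p = 1;			/* faults if the page is truly read-only */
	else
		wp_ok = 1;
	printf("write protection %s\n", wp_ok ? "enforced" : "ignored");
	munmap(p, pagesz);
	return 0;
}
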
17855 +
17856 +void free_initmem(void)
17857 +{
17858 +       unsigned long addr;
17859 +
17860 +       addr = (unsigned long)(&__init_begin);
17861 +       for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
17862 +               ClearPageReserved(virt_to_page(addr));
17863 +               set_page_count(virt_to_page(addr), 1);
17864 +               memset((void *)addr, 0xcc, PAGE_SIZE);
17865 +               free_page(addr);
17866 +               totalram_pages++;
17867 +       }
17868 +       printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10);
17869 +}
17870 +
17871 +#ifdef CONFIG_DEBUG_RODATA
17872 +
17873 +extern char __start_rodata, __end_rodata;
17874 +void mark_rodata_ro(void)
17875 +{
17876 +       unsigned long addr = (unsigned long)&__start_rodata;
17877 +
17878 +       for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
17879 +               change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
17880 +
17881 +       printk ("Write protecting the kernel read-only data: %luk\n",
17882 +                       (unsigned long)(&__end_rodata - &__start_rodata) >> 10);
17883 +
17884 +       /*
17885 +        * change_page_attr() requires a global_flush_tlb() call after it.
17886 +        * We do this after the printk so that if something went wrong in the
17887 +        * change, the printk gets out at least to give a better debug hint
17888 +        * of who is the culprit.
17889 +        */
17890 +       global_flush_tlb();
17891 +}
17892 +#endif
17893 +
17894 +
17895 +#ifdef CONFIG_BLK_DEV_INITRD
17896 +void free_initrd_mem(unsigned long start, unsigned long end)
17897 +{
17898 +       if (start < end)
17899 +               printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
17900 +       for (; start < end; start += PAGE_SIZE) {
17901 +               ClearPageReserved(virt_to_page(start));
17902 +               set_page_count(virt_to_page(start), 1);
17903 +               free_page(start);
17904 +               totalram_pages++;
17905 +       }
17906 +}
17907 +#endif
17908 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/mm/ioremap-xen.c linux-2.6.16/arch/i386/mm/ioremap-xen.c
17909 --- linux-2.6.16.orig/arch/i386/mm/ioremap-xen.c        1970-01-01 01:00:00.000000000 +0100
17910 +++ linux-2.6.16/arch/i386/mm/ioremap-xen.c     2006-06-26 09:51:32.000000000 +0200
17911 @@ -0,0 +1,464 @@
17912 +/*
17913 + * arch/i386/mm/ioremap.c
17914 + *
17915 + * Re-map IO memory to kernel address space so that we can access it.
17916 + * This is needed for high PCI addresses that aren't mapped in the
17917 + * 640k-1MB IO memory area on PC's
17918 + *
17919 + * (C) Copyright 1995 1996 Linus Torvalds
17920 + */
17921 +
17922 +#include <linux/vmalloc.h>
17923 +#include <linux/init.h>
17924 +#include <linux/slab.h>
17925 +#include <linux/module.h>
17926 +#include <asm/io.h>
17927 +#include <asm/fixmap.h>
17928 +#include <asm/cacheflush.h>
17929 +#include <asm/tlbflush.h>
17930 +#include <asm/pgtable.h>
17931 +#include <asm/pgalloc.h>
17932 +
17933 +#define ISA_START_ADDRESS      0x0
17934 +#define ISA_END_ADDRESS                0x100000
17935 +
17936 +#if 0 /* not PAE safe */
17937 +/* These hacky macros avoid phys->machine translations. */
17938 +#define __direct_pte(x) ((pte_t) { (x) } )
17939 +#define __direct_mk_pte(page_nr,pgprot) \
17940 +  __direct_pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
17941 +#define direct_mk_pte_phys(physpage, pgprot) \
17942 +  __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot)
17943 +#endif
17944 +
17945 +static int direct_remap_area_pte_fn(pte_t *pte, 
17946 +                                   struct page *pmd_page,
17947 +                                   unsigned long address, 
17948 +                                   void *data)
17949 +{
17950 +       mmu_update_t **v = (mmu_update_t **)data;
17951 +
17952 +       (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
17953 +                    PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
17954 +       (*v)++;
17955 +
17956 +       return 0;
17957 +}
17958 +
17959 +static int __direct_remap_pfn_range(struct mm_struct *mm,
17960 +                                   unsigned long address, 
17961 +                                   unsigned long mfn,
17962 +                                   unsigned long size, 
17963 +                                   pgprot_t prot,
17964 +                                   domid_t  domid)
17965 +{
17966 +       int rc;
17967 +       unsigned long i, start_address;
17968 +       mmu_update_t *u, *v, *w;
17969 +
17970 +       u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
17971 +       if (u == NULL)
17972 +               return -ENOMEM;
17973 +
17974 +       start_address = address;
17975 +
17976 +       flush_cache_all();
17977 +
17978 +       for (i = 0; i < size; i += PAGE_SIZE) {
17979 +               if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
17980 +                       /* Fill in the PTE pointers. */
17981 +                       rc = apply_to_page_range(mm, start_address, 
17982 +                                                address - start_address,
17983 +                                                direct_remap_area_pte_fn, &w);
17984 +                       if (rc)
17985 +                               goto out;
17986 +                       w = u;
17987 +                       rc = -EFAULT;
17988 +                       if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
17989 +                               goto out;
17990 +                       v = u;
17991 +                       start_address = address;
17992 +               }
17993 +
17994 +               /*
17995 +                * Fill in the machine address; the PTE pointer is filled
17996 +                * in later by apply_to_page_range().
17997 +                */
17998 +               v->val = pte_val_ma(pfn_pte_ma(mfn, prot));
17999 +
18000 +               mfn++;
18001 +               address += PAGE_SIZE; 
18002 +               v++;
18003 +       }
18004 +
18005 +       if (v != u) {
18006 +               /* get the ptep's filled in */
18007 +               rc = apply_to_page_range(mm, start_address,
18008 +                                        address - start_address,
18009 +                                        direct_remap_area_pte_fn, &w);
18010 +               if (rc)
18011 +                       goto out;
18012 +               rc = -EFAULT;
18013 +               if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
18014 +                       goto out;
18015 +       }
18016 +
18017 +       rc = 0;
18018 +
18019 + out:
18020 +       flush_tlb_all();
18021 +
18022 +       free_page((unsigned long)u);
18023 +
18024 +       return rc;
18025 +}
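
The loop above batches up to PAGE_SIZE/sizeof(mmu_update_t) requests per HYPERVISOR_mmu_update hypercall, flushing whenever the page-sized buffer fills and once more for the tail. The generic shape of that batch-and-flush pattern, with made-up names standing in for the hypercall:

#include <stdio.h>

#define BATCH 4	/* stands in for PAGE_SIZE / sizeof(mmu_update_t) */

struct update { unsigned long ptr, val; };

static void flush(const struct update *u, int n)
{
	(void)u;
	printf("flushing %d update(s)\n", n);	/* stand-in for HYPERVISOR_mmu_update() */
}

int main(void)
{
	struct update buf[BATCH];
	int n = 0;

	for (int i = 0; i < 10; i++) {
		if (n == BATCH) {	/* buffer full: issue one batched call */
			flush(buf, n);
			n = 0;
		}
		buf[n].ptr = i;
		buf[n].val = i;
		n++;
	}
	if (n)				/* don't forget the tail */
		flush(buf, n);
	return 0;
}
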
18026 +
18027 +int direct_remap_pfn_range(struct vm_area_struct *vma,
18028 +                          unsigned long address, 
18029 +                          unsigned long mfn,
18030 +                          unsigned long size, 
18031 +                          pgprot_t prot,
18032 +                          domid_t  domid)
18033 +{
18034 +       /* Same as remap_pfn_range(). */
18035 +       vma->vm_flags |= VM_IO | VM_RESERVED;
18036 +
18037 +       if (domid == DOMID_SELF)
18038 +               return -EINVAL;
18039 +
18040 +       return __direct_remap_pfn_range(
18041 +               vma->vm_mm, address, mfn, size, prot, domid);
18042 +}
18043 +EXPORT_SYMBOL(direct_remap_pfn_range);
18044 +
18045 +int direct_kernel_remap_pfn_range(unsigned long address, 
18046 +                                 unsigned long mfn,
18047 +                                 unsigned long size, 
18048 +                                 pgprot_t prot,
18049 +                                 domid_t  domid)
18050 +{
18051 +       return __direct_remap_pfn_range(
18052 +               &init_mm, address, mfn, size, prot, domid);
18053 +}
18054 +EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
18055 +
18056 +static int lookup_pte_fn(
18057 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
18058 +{
18059 +       uint64_t *ptep = (uint64_t *)data;
18060 +       if (ptep)
18061 +               *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
18062 +                        PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
18063 +       return 0;
18064 +}
18065 +
18066 +int create_lookup_pte_addr(struct mm_struct *mm, 
18067 +                          unsigned long address,
18068 +                          uint64_t *ptep)
18069 +{
18070 +       return apply_to_page_range(mm, address, PAGE_SIZE,
18071 +                                  lookup_pte_fn, ptep);
18072 +}
18073 +
18074 +EXPORT_SYMBOL(create_lookup_pte_addr);
18075 +
18076 +static int noop_fn(
18077 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
18078 +{
18079 +       return 0;
18080 +}
18081 +
18082 +int touch_pte_range(struct mm_struct *mm,
18083 +                   unsigned long address,
18084 +                   unsigned long size)
18085 +{
18086 +       return apply_to_page_range(mm, address, size, noop_fn, NULL);
18087 +} 
18088 +
18089 +EXPORT_SYMBOL(touch_pte_range);
18090 +
18091 +/*
18092 + * Does @address reside within a non-highmem page that is local to this virtual
18093 + * machine (i.e., not an I/O page, nor a memory page belonging to another VM)?
18094 + * See the comment that accompanies mfn_to_local_pfn() in page.h to understand
18095 + * why this works.
18096 + */
18097 +static inline int is_local_lowmem(unsigned long address)
18098 +{
18099 +       extern unsigned long max_low_pfn;
18100 +       return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn);
18101 +}
18102 +
18103 +/*
18104 + * Generic mapping function (not visible outside):
18105 + */
18106 +
18107 +/*
18108 + * Remap an arbitrary physical address space into the kernel virtual
18109 + * address space. Needed when the kernel wants to access high addresses
18110 + * directly.
18111 + *
18112 + * NOTE! We need to allow non-page-aligned mappings too: we will obviously
18113 + * have to convert them into an offset in a page-aligned mapping, but the
18114 + * caller shouldn't need to know that small detail.
18115 + */
18116 +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
18117 +{
18118 +       void __iomem * addr;
18119 +       struct vm_struct * area;
18120 +       unsigned long offset, last_addr;
18121 +       domid_t domid = DOMID_IO;
18122 +
18123 +       /* Don't allow wraparound or zero size */
18124 +       last_addr = phys_addr + size - 1;
18125 +       if (!size || last_addr < phys_addr)
18126 +               return NULL;
18127 +
18128 +       /*
18129 +        * Don't remap the low PCI/ISA area, it's always mapped..
18130 +        */
18131 +       if (xen_start_info->flags & SIF_PRIVILEGED &&
18132 +           phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
18133 +               return (void __iomem *) isa_bus_to_virt(phys_addr);
18134 +
18135 +       /*
18136 +        * Don't allow anybody to remap normal RAM that we're using..
18137 +        */
18138 +       if (is_local_lowmem(phys_addr)) {
18139 +               char *t_addr, *t_end;
18140 +               struct page *page;
18141 +
18142 +               t_addr = bus_to_virt(phys_addr);
18143 +               t_end = t_addr + (size - 1);
18144 +          
18145 +               for (page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
18146 +                       if (!PageReserved(page))
18147 +                               return NULL;
18148 +
18149 +               domid = DOMID_SELF;
18150 +       }
18151 +
18152 +       /*
18153 +        * Mappings have to be page-aligned
18154 +        */
18155 +       offset = phys_addr & ~PAGE_MASK;
18156 +       phys_addr &= PAGE_MASK;
18157 +       size = PAGE_ALIGN(last_addr+1) - phys_addr;
18158 +
18159 +       /*
18160 +        * Ok, go for it..
18161 +        */
18162 +       area = get_vm_area(size, VM_IOREMAP | (flags << 20));
18163 +       if (!area)
18164 +               return NULL;
18165 +       area->phys_addr = phys_addr;
18166 +       addr = (void __iomem *) area->addr;
18167 +       flags |= _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
18168 +#ifdef __x86_64__
18169 +       flags |= _PAGE_USER;
18170 +#endif
18171 +       if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
18172 +                                    phys_addr>>PAGE_SHIFT,
18173 +                                    size, __pgprot(flags), domid)) {
18174 +               vunmap((void __force *) addr);
18175 +               return NULL;
18176 +       }
18177 +       return (void __iomem *) (offset + (char __iomem *)addr);
18178 +}
18179 +EXPORT_SYMBOL(__ioremap);
18180 +
18181 +/**
18182 + * ioremap_nocache     -   map bus memory into CPU space
18183 + * @offset:    bus address of the memory
18184 + * @size:      size of the resource to map
18185 + *
18186 + * ioremap_nocache performs a platform specific sequence of operations to
18187 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
18188 + * writew/writel functions and the other mmio helpers. The returned
18189 + * address is not guaranteed to be usable directly as a virtual
18190 + * address. 
18191 + *
18192 + * This version of ioremap ensures that the memory is marked uncachable
18193 + * on the CPU as well as honouring existing caching rules from things like
18194 + * the PCI bus. Note that there are other caches and buffers on many 
18195 + * busses. In particular, driver authors should read up on PCI writes.
18196 + *
18197 + * It's useful if some control registers are in such an area and
18198 + * write combining or read caching is not desirable:
18199 + * 
18200 + * Must be freed with iounmap.
18201 + */
18202 +
18203 +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
18204 +{
18205 +       unsigned long last_addr;
18206 +       void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
18207 +       if (!p) 
18208 +               return p; 
18209 +
18210 +       /* Guaranteed to be > phys_addr, as per __ioremap() */
18211 +       last_addr = phys_addr + size - 1;
18212 +
18213 +       if (is_local_lowmem(last_addr)) { 
18214 +               struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
18215 +               unsigned long npages;
18216 +
18217 +               phys_addr &= PAGE_MASK;
18218 +
18219 +               /* This might overflow and become zero.. */
18220 +               last_addr = PAGE_ALIGN(last_addr);
18221 +
18222 +               /* .. but that's ok, because modulo-2**n arithmetic will make
18223 +               * the page-aligned "last - first" come out right.
18224 +               */
18225 +               npages = (last_addr - phys_addr) >> PAGE_SHIFT;
18226 +
18227 +               if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { 
18228 +                       iounmap(p); 
18229 +                       p = NULL;
18230 +               }
18231 +               global_flush_tlb();
18232 +       }
18233 +
18234 +       return p;                                       
18235 +}
18236 +EXPORT_SYMBOL(ioremap_nocache);
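
Typical driver-side use of the interface exported above: map a device's MMIO region uncached, access registers through the I/O accessors, and release it with iounmap(). A minimal sketch; the physical address and register offset are placeholders, not a real device:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/io.h>

static void __iomem *regs;

static int __init demo_init(void)
{
	/* 0xfebf0000/0x1000 are hypothetical BAR values */
	regs = ioremap_nocache(0xfebf0000, 0x1000);
	if (!regs)
		return -ENOMEM;
	printk(KERN_INFO "demo: reg0 = %08x\n", readl(regs + 0x10));
	return 0;
}

static void __exit demo_exit(void)
{
	iounmap(regs);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
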
18237 +
18238 +/**
18239 + * iounmap - Free an IO remapping
18240 + * @addr: virtual address from ioremap_*
18241 + *
18242 + * Caller must ensure there is only one unmapping for the same pointer.
18243 + */
18244 +void iounmap(volatile void __iomem *addr)
18245 +{
18246 +       struct vm_struct *p, *o;
18247 +
18248 +       if ((void __force *)addr <= high_memory)
18249 +               return;
18250 +
18251 +       /*
18252 +        * __ioremap special-cases the PCI/ISA range by not instantiating a
18253 +        * vm_area and by simply returning an address into the kernel mapping
18254 +        * of ISA space.   So handle that here.
18255 +        */
18256 +       if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
18257 +               return;
18258 +
18259 +       addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
18260 +
18261 +       /* Use the vm area unlocked, assuming the caller
18262 +          ensures there isn't another iounmap for the same address
18263 +          in parallel. Reuse of the virtual address is prevented by
18264 +          leaving it in the global lists until we're done with it.
18265 +          cpa takes care of the direct mappings. */
18266 +       read_lock(&vmlist_lock);
18267 +       for (p = vmlist; p; p = p->next) {
18268 +               if (p->addr == addr)
18269 +                       break;
18270 +       }
18271 +       read_unlock(&vmlist_lock);
18272 +
18273 +       if (!p) {
18274 +               printk("iounmap: bad address %p\n", addr);
18275 +               dump_stack();
18276 +               return;
18277 +       }
18278 +
18279 +       /* Reset the direct mapping. Can block */
18280 +       if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
18281 +               /* p->size includes the guard page, but cpa doesn't like that */
18282 +               change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
18283 +                                (p->size - PAGE_SIZE) >> PAGE_SHIFT,
18284 +                                PAGE_KERNEL);
18285 +               global_flush_tlb();
18286 +       } 
18287 +
18288 +       /* Finally remove it */
18289 +       o = remove_vm_area((void *)addr);
18290 +       BUG_ON(p != o || o == NULL);
18291 +       kfree(p); 
18292 +}
18293 +EXPORT_SYMBOL(iounmap);
18294 +
18295 +#ifdef __i386__
18296 +
18297 +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
18298 +{
18299 +       unsigned long offset, last_addr;
18300 +       unsigned int nrpages;
18301 +       enum fixed_addresses idx;
18302 +
18303 +       /* Don't allow wraparound or zero size */
18304 +       last_addr = phys_addr + size - 1;
18305 +       if (!size || last_addr < phys_addr)
18306 +               return NULL;
18307 +
18308 +       /*
18309 +        * Don't remap the low PCI/ISA area, it's always mapped..
18310 +        */
18311 +       if (xen_start_info->flags & SIF_PRIVILEGED &&
18312 +           phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
18313 +               return isa_bus_to_virt(phys_addr);
18314 +
18315 +       /*
18316 +        * Mappings have to be page-aligned
18317 +        */
18318 +       offset = phys_addr & ~PAGE_MASK;
18319 +       phys_addr &= PAGE_MASK;
18320 +       size = PAGE_ALIGN(last_addr) - phys_addr;
18321 +
18322 +       /*
18323 +        * Mappings have to fit in the FIX_BTMAP area.
18324 +        */
18325 +       nrpages = size >> PAGE_SHIFT;
18326 +       if (nrpages > NR_FIX_BTMAPS)
18327 +               return NULL;
18328 +
18329 +       /*
18330 +        * Ok, go for it..
18331 +        */
18332 +       idx = FIX_BTMAP_BEGIN;
18333 +       while (nrpages > 0) {
18334 +               set_fixmap(idx, phys_addr);
18335 +               phys_addr += PAGE_SIZE;
18336 +               --idx;
18337 +               --nrpages;
18338 +       }
18339 +       return (void *)(offset + fix_to_virt(FIX_BTMAP_BEGIN));
18340 +}
18341 +
18342 +void __init bt_iounmap(void *addr, unsigned long size)
18343 +{
18344 +       unsigned long virt_addr;
18345 +       unsigned long offset;
18346 +       unsigned int nrpages;
18347 +       enum fixed_addresses idx;
18348 +
18349 +       virt_addr = (unsigned long)addr;
18350 +       if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
18351 +               return;
18352 +       if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
18353 +               return;
18354 +       offset = virt_addr & ~PAGE_MASK;
18355 +       nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
18356 +
18357 +       idx = FIX_BTMAP_BEGIN;
18358 +       while (nrpages > 0) {
18359 +               clear_fixmap(idx);
18360 +               --idx;
18361 +               --nrpages;
18362 +       }
18363 +}
18364 +
18365 +#endif /* __i386__ */
18366 +
18367 +/*
18368 + * Local variables:
18369 + *  c-file-style: "linux"
18370 + *  indent-tabs-mode: t
18371 + *  c-indent-level: 8
18372 + *  c-basic-offset: 8
18373 + *  tab-width: 8
18374 + * End:
18375 + */
18376 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/mm/pgtable-xen.c linux-2.6.16/arch/i386/mm/pgtable-xen.c
18377 --- linux-2.6.16.orig/arch/i386/mm/pgtable-xen.c        1970-01-01 01:00:00.000000000 +0100
18378 +++ linux-2.6.16/arch/i386/mm/pgtable-xen.c     2006-06-26 09:51:32.000000000 +0200
18379 @@ -0,0 +1,652 @@
18380 +/*
18381 + *  linux/arch/i386/mm/pgtable.c
18382 + */
18383 +
18384 +#include <linux/config.h>
18385 +#include <linux/sched.h>
18386 +#include <linux/kernel.h>
18387 +#include <linux/errno.h>
18388 +#include <linux/mm.h>
18389 +#include <linux/swap.h>
18390 +#include <linux/smp.h>
18391 +#include <linux/highmem.h>
18392 +#include <linux/slab.h>
18393 +#include <linux/pagemap.h>
18394 +#include <linux/spinlock.h>
18395 +#include <linux/module.h>
18396 +
18397 +#include <asm/system.h>
18398 +#include <asm/pgtable.h>
18399 +#include <asm/pgalloc.h>
18400 +#include <asm/fixmap.h>
18401 +#include <asm/e820.h>
18402 +#include <asm/tlb.h>
18403 +#include <asm/tlbflush.h>
18404 +#include <asm/io.h>
18405 +#include <asm/mmu_context.h>
18406 +
18407 +#include <xen/features.h>
18408 +#include <xen/foreign_page.h>
18409 +#include <asm/hypervisor.h>
18410 +
18411 +static void pgd_test_and_unpin(pgd_t *pgd);
18412 +
18413 +void show_mem(void)
18414 +{
18415 +       int total = 0, reserved = 0;
18416 +       int shared = 0, cached = 0;
18417 +       int highmem = 0;
18418 +       struct page *page;
18419 +       pg_data_t *pgdat;
18420 +       unsigned long i;
18421 +       struct page_state ps;
18422 +       unsigned long flags;
18423 +
18424 +       printk(KERN_INFO "Mem-info:\n");
18425 +       show_free_areas();
18426 +       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
18427 +       for_each_pgdat(pgdat) {
18428 +               pgdat_resize_lock(pgdat, &flags);
18429 +               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
18430 +                       page = pgdat_page_nr(pgdat, i);
18431 +                       total++;
18432 +                       if (PageHighMem(page))
18433 +                               highmem++;
18434 +                       if (PageReserved(page))
18435 +                               reserved++;
18436 +                       else if (PageSwapCache(page))
18437 +                               cached++;
18438 +                       else if (page_count(page))
18439 +                               shared += page_count(page) - 1;
18440 +               }
18441 +               pgdat_resize_unlock(pgdat, &flags);
18442 +       }
18443 +       printk(KERN_INFO "%d pages of RAM\n", total);
18444 +       printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
18445 +       printk(KERN_INFO "%d reserved pages\n", reserved);
18446 +       printk(KERN_INFO "%d pages shared\n", shared);
18447 +       printk(KERN_INFO "%d pages swap cached\n", cached);
18448 +
18449 +       get_page_state(&ps);
18450 +       printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
18451 +       printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
18452 +       printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
18453 +       printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
18454 +       printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
18455 +}
18456 +
18457 +/*
18458 + * Associate a virtual page frame with a given physical page frame 
18459 + * and protection flags for that frame.
18460 + */ 
18461 +static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
18462 +{
18463 +       pgd_t *pgd;
18464 +       pud_t *pud;
18465 +       pmd_t *pmd;
18466 +       pte_t *pte;
18467 +
18468 +       pgd = swapper_pg_dir + pgd_index(vaddr);
18469 +       if (pgd_none(*pgd)) {
18470 +               BUG();
18471 +               return;
18472 +       }
18473 +       pud = pud_offset(pgd, vaddr);
18474 +       if (pud_none(*pud)) {
18475 +               BUG();
18476 +               return;
18477 +       }
18478 +       pmd = pmd_offset(pud, vaddr);
18479 +       if (pmd_none(*pmd)) {
18480 +               BUG();
18481 +               return;
18482 +       }
18483 +       pte = pte_offset_kernel(pmd, vaddr);
18484 +       /* <pfn,flags> stored as-is, to permit clearing entries */
18485 +       set_pte(pte, pfn_pte(pfn, flags));
18486 +
18487 +       /*
18488 +        * It's enough to flush this one mapping.
18489 +        * (PGE mappings get flushed as well)
18490 +        */
18491 +       __flush_tlb_one(vaddr);
18492 +}
18493 +
18494 +/*
18495 + * Associate a virtual page frame with a given machine page frame
18496 + * and protection flags for that frame.
18497 + */ 
18498 +static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn,
18499 +                          pgprot_t flags)
18500 +{
18501 +       pgd_t *pgd;
18502 +       pud_t *pud;
18503 +       pmd_t *pmd;
18504 +       pte_t *pte;
18505 +
18506 +       pgd = swapper_pg_dir + pgd_index(vaddr);
18507 +       if (pgd_none(*pgd)) {
18508 +               BUG();
18509 +               return;
18510 +       }
18511 +       pud = pud_offset(pgd, vaddr);
18512 +       if (pud_none(*pud)) {
18513 +               BUG();
18514 +               return;
18515 +       }
18516 +       pmd = pmd_offset(pud, vaddr);
18517 +       if (pmd_none(*pmd)) {
18518 +               BUG();
18519 +               return;
18520 +       }
18521 +       pte = pte_offset_kernel(pmd, vaddr);
18522 +       /* <pfn,flags> stored as-is, to permit clearing entries */
18523 +       set_pte(pte, pfn_pte_ma(pfn, flags));
18524 +
18525 +       /*
18526 +        * It's enough to flush this one mapping.
18527 +        * (PGE mappings get flushed as well)
18528 +        */
18529 +       __flush_tlb_one(vaddr);
18530 +}
18531 +
18532 +/*
18533 + * Associate a large virtual page frame with a given physical page frame 
18534 + * and protection flags for that frame. pfn is for the base of the page,
18535 + * vaddr is what the page gets mapped to - both must be properly aligned. 
18536 + * The pmd must already be instantiated. Assumes PAE mode.
18537 + */ 
18538 +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
18539 +{
18540 +       pgd_t *pgd;
18541 +       pud_t *pud;
18542 +       pmd_t *pmd;
18543 +
18544 +       if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
18545 +               printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
18546 +               return; /* BUG(); */
18547 +       }
18548 +       if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
18549 +               printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
18550 +               return; /* BUG(); */
18551 +       }
18552 +       pgd = swapper_pg_dir + pgd_index(vaddr);
18553 +       if (pgd_none(*pgd)) {
18554 +               printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
18555 +               return; /* BUG(); */
18556 +       }
18557 +       pud = pud_offset(pgd, vaddr);
18558 +       pmd = pmd_offset(pud, vaddr);
18559 +       set_pmd(pmd, pfn_pmd(pfn, flags));
18560 +       /*
18561 +        * It's enough to flush this one mapping.
18562 +        * (PGE mappings get flushed as well)
18563 +        */
18564 +       __flush_tlb_one(vaddr);
18565 +}
18566 +
18567 +static int nr_fixmaps = 0;
18568 +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
18569 +EXPORT_SYMBOL(__FIXADDR_TOP);
18570 +
18571 +void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
18572 +{
18573 +       unsigned long address = __fix_to_virt(idx);
18574 +
18575 +       if (idx >= __end_of_fixed_addresses) {
18576 +               BUG();
18577 +               return;
18578 +       }
18579 +       switch (idx) {
18580 +       case FIX_WP_TEST:
18581 +#ifdef CONFIG_X86_F00F_BUG
18582 +       case FIX_F00F_IDT:
18583 +#endif
18584 +               set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
18585 +               break;
18586 +       default:
18587 +               set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags);
18588 +               break;
18589 +       }
18590 +       nr_fixmaps++;
18591 +}
18592 +
18593 +void set_fixaddr_top(unsigned long top)
18594 +{
18595 +       BUG_ON(nr_fixmaps > 0);
18596 +       __FIXADDR_TOP = top - PAGE_SIZE;
18597 +}
18598 +
18599 +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
18600 +{
18601 +       pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
18602 +       if (pte)
18603 +               make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
18604 +       return pte;
18605 +}
18606 +
18607 +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
18608 +{
18609 +       struct page *pte;
18610 +
18611 +#ifdef CONFIG_HIGHPTE
18612 +       pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
18613 +#else
18614 +       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
18615 +       if (pte) {
18616 +               SetPageForeign(pte, pte_free);
18617 +               set_page_count(pte, 1);
18618 +       }
18619 +#endif
18620 +       return pte;
18621 +}
18622 +
18623 +void pte_free(struct page *pte)
18624 +{
18625 +       unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
18626 +
18627 +       if (!pte_write(*virt_to_ptep(va)))
18628 +               BUG_ON(HYPERVISOR_update_va_mapping(
18629 +                       va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
18630 +
18631 +       ClearPageForeign(pte);
18632 +       set_page_count(pte, 1);
18633 +
18634 +       __free_page(pte);
18635 +}
18636 +
18637 +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
18638 +{
18639 +       memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18640 +}
18641 +
18642 +/*
18643 + * List of all pgd's needed for non-PAE so it can invalidate entries
18644 + * in both cached and uncached pgd's; not needed for PAE since the
18645 + * kernel pmd is shared. If PAE were not to share the pmd a similar
18646 + * tactic would be needed. This is essentially codepath-based locking
18647 + * against pageattr.c; it is the unique case in which a valid change
18648 + * of kernel pagetables can't be lazily synchronized by vmalloc faults.
18649 + * vmalloc faults work because attached pagetables are never freed.
18650 + * The locking scheme was chosen on the basis of manfred's
18651 + * recommendations and having no core impact whatsoever.
18652 + * -- wli
18653 + */
18654 +DEFINE_SPINLOCK(pgd_lock);
18655 +struct page *pgd_list;
18656 +
18657 +static inline void pgd_list_add(pgd_t *pgd)
18658 +{
18659 +       struct page *page = virt_to_page(pgd);
18660 +       page->index = (unsigned long)pgd_list;
18661 +       if (pgd_list)
18662 +               set_page_private(pgd_list, (unsigned long)&page->index);
18663 +       pgd_list = page;
18664 +       set_page_private(page, (unsigned long)&pgd_list);
18665 +}
18666 +
18667 +static inline void pgd_list_del(pgd_t *pgd)
18668 +{
18669 +       struct page *next, **pprev, *page = virt_to_page(pgd);
18670 +       next = (struct page *)page->index;
18671 +       pprev = (struct page **)page_private(page);
18672 +       *pprev = next;
18673 +       if (next)
18674 +               set_page_private(next, (unsigned long)pprev);
18675 +}
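
pgd_list_add()/pgd_list_del() overload page->index as the "next" pointer and page_private() as a pprev pointer — the address of whatever currently points at this node — which makes unlinking O(1) without storing a full prev pointer. The same shape as a standalone list, for reference:

#include <stdio.h>

struct node {
	struct node *next;
	struct node **pprev;	/* points at whatever points at us */
	int val;
};

static struct node *head;

static void list_add(struct node *n)
{
	n->next = head;
	if (head)
		head->pprev = &n->next;
	head = n;
	n->pprev = &head;
}

static void list_del(struct node *n)
{
	*n->pprev = n->next;	/* works whether we are head or mid-list */
	if (n->next)
		n->next->pprev = n->pprev;
}

int main(void)
{
	struct node a = { .val = 1 }, b = { .val = 2 };

	list_add(&a);
	list_add(&b);
	list_del(&a);
	for (struct node *n = head; n; n = n->next)
		printf("%d\n", n->val);
	return 0;
}
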
18676 +
18677 +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
18678 +{
18679 +       unsigned long flags;
18680 +
18681 +       if (PTRS_PER_PMD > 1) {
18682 +               if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
18683 +                       int rc = xen_create_contiguous_region(
18684 +                               (unsigned long)pgd, 0, 32);
18685 +                       BUG_ON(rc);
18686 +               }
18687 +               if (HAVE_SHARED_KERNEL_PMD)
18688 +                       memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
18689 +                              swapper_pg_dir + USER_PTRS_PER_PGD,
18690 +                              (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
18691 +       } else {
18692 +               spin_lock_irqsave(&pgd_lock, flags);
18693 +               memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
18694 +                      swapper_pg_dir + USER_PTRS_PER_PGD,
18695 +                      (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
18696 +               memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
18697 +               pgd_list_add(pgd);
18698 +               spin_unlock_irqrestore(&pgd_lock, flags);
18699 +       }
18700 +}
18701 +
18702 +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
18703 +{
18704 +       unsigned long flags; /* can be called from interrupt context */
18705 +
18706 +       if (PTRS_PER_PMD > 1) {
18707 +               if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
18708 +                       xen_destroy_contiguous_region((unsigned long)pgd, 0);
18709 +       } else {
18710 +               spin_lock_irqsave(&pgd_lock, flags);
18711 +               pgd_list_del(pgd);
18712 +               spin_unlock_irqrestore(&pgd_lock, flags);
18713 +
18714 +               pgd_test_and_unpin(pgd);
18715 +       }
18716 +}
18717 +
18718 +pgd_t *pgd_alloc(struct mm_struct *mm)
18719 +{
18720 +       int i;
18721 +       pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
18722 +
18723 +       pgd_test_and_unpin(pgd);
18724 +
18725 +       if (PTRS_PER_PMD == 1 || !pgd)
18726 +               return pgd;
18727 +
18728 +       for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
18729 +               pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
18730 +               if (!pmd)
18731 +                       goto out_oom;
18732 +               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
18733 +       }
18734 +
18735 +       if (!HAVE_SHARED_KERNEL_PMD) {
18736 +               unsigned long flags;
18737 +
18738 +               for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18739 +                       pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
18740 +                       if (!pmd)
18741 +                               goto out_oom;
18742 +                       set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
18743 +               }
18744 +
18745 +               spin_lock_irqsave(&pgd_lock, flags);
18746 +               for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18747 +                       unsigned long v = (unsigned long)i << PGDIR_SHIFT;
18748 +                       pgd_t *kpgd = pgd_offset_k(v);
18749 +                       pud_t *kpud = pud_offset(kpgd, v);
18750 +                       pmd_t *kpmd = pmd_offset(kpud, v);
18751 +                       pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
18752 +                       memcpy(pmd, kpmd, PAGE_SIZE);
18753 +                       make_lowmem_page_readonly(
18754 +                               pmd, XENFEAT_writable_page_tables);
18755 +               }
18756 +               pgd_list_add(pgd);
18757 +               spin_unlock_irqrestore(&pgd_lock, flags);
18758 +       }
18759 +
18760 +       return pgd;
18761 +
18762 +out_oom:
18763 +       for (i--; i >= 0; i--)
18764 +               kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
18765 +       kmem_cache_free(pgd_cache, pgd);
18766 +       return NULL;
18767 +}
18768 +
18769 +void pgd_free(pgd_t *pgd)
18770 +{
18771 +       int i;
18772 +
18773 +       pgd_test_and_unpin(pgd);
18774 +
18775 +       /* in the PAE case user pgd entries are overwritten before usage */
18776 +       if (PTRS_PER_PMD > 1) {
18777 +               for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
18778 +                       pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
18779 +                       kmem_cache_free(pmd_cache, pmd);
18780 +               }
18781 +               if (!HAVE_SHARED_KERNEL_PMD) {
18782 +                       unsigned long flags;
18783 +                       spin_lock_irqsave(&pgd_lock, flags);
18784 +                       pgd_list_del(pgd);
18785 +                       spin_unlock_irqrestore(&pgd_lock, flags);
18786 +                       for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
18787 +                               pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
18788 +                               make_lowmem_page_writable(
18789 +                                       pmd, XENFEAT_writable_page_tables);
18790 +                               memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
18791 +                               kmem_cache_free(pmd_cache, pmd);
18792 +                       }
18793 +               }
18794 +       }
18795 +       /* in the non-PAE case, free_pgtables() clears user pgd entries */
18796 +       kmem_cache_free(pgd_cache, pgd);
18797 +}
18798 +
18799 +void make_lowmem_page_readonly(void *va, unsigned int feature)
18800 +{
18801 +       pte_t *pte;
18802 +       int rc;
18803 +
18804 +       if (xen_feature(feature))
18805 +               return;
18806 +
18807 +       pte = virt_to_ptep(va);
18808 +       rc = HYPERVISOR_update_va_mapping(
18809 +               (unsigned long)va, pte_wrprotect(*pte), 0);
18810 +       BUG_ON(rc);
18811 +}
18812 +
18813 +void make_lowmem_page_writable(void *va, unsigned int feature)
18814 +{
18815 +       pte_t *pte;
18816 +       int rc;
18817 +
18818 +       if (xen_feature(feature))
18819 +               return;
18820 +
18821 +       pte = virt_to_ptep(va);
18822 +       rc = HYPERVISOR_update_va_mapping(
18823 +               (unsigned long)va, pte_mkwrite(*pte), 0);
18824 +       BUG_ON(rc);
18825 +}
18826 +
18827 +void make_page_readonly(void *va, unsigned int feature)
18828 +{
18829 +       pte_t *pte;
18830 +       int rc;
18831 +
18832 +       if (xen_feature(feature))
18833 +               return;
18834 +
18835 +       pte = virt_to_ptep(va);
18836 +       rc = HYPERVISOR_update_va_mapping(
18837 +               (unsigned long)va, pte_wrprotect(*pte), 0);
18838 +       if (rc) /* fallback? */
18839 +               xen_l1_entry_update(pte, pte_wrprotect(*pte));
18840 +       if ((unsigned long)va >= (unsigned long)high_memory) {
18841 +               unsigned long pfn = pte_pfn(*pte);
18842 +#ifdef CONFIG_HIGHMEM
18843 +               if (pfn >= highstart_pfn)
18844 +                       kmap_flush_unused(); /* flush stale writable kmaps */
18845 +               else
18846 +#endif
18847 +                       make_lowmem_page_readonly(
18848 +                               phys_to_virt(pfn << PAGE_SHIFT), feature); 
18849 +       }
18850 +}
18851 +
18852 +void make_page_writable(void *va, unsigned int feature)
18853 +{
18854 +       pte_t *pte;
18855 +       int rc;
18856 +
18857 +       if (xen_feature(feature))
18858 +               return;
18859 +
18860 +       pte = virt_to_ptep(va);
18861 +       rc = HYPERVISOR_update_va_mapping(
18862 +               (unsigned long)va, pte_mkwrite(*pte), 0);
18863 +       if (rc) /* fallback? */
18864 +               xen_l1_entry_update(pte, pte_mkwrite(*pte));
18865 +       if ((unsigned long)va >= (unsigned long)high_memory) {
18866 +               unsigned long pfn = pte_pfn(*pte); 
18867 +#ifdef CONFIG_HIGHMEM
18868 +               if (pfn < highstart_pfn)
18869 +#endif
18870 +                       make_lowmem_page_writable(
18871 +                               phys_to_virt(pfn << PAGE_SHIFT), feature);
18872 +       }
18873 +}
18874 +
18875 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
18876 +{
18877 +       if (xen_feature(feature))
18878 +               return;
18879 +
18880 +       while (nr-- != 0) {
18881 +               make_page_readonly(va, feature);
18882 +               va = (void *)((unsigned long)va + PAGE_SIZE);
18883 +       }
18884 +}
18885 +
18886 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
18887 +{
18888 +       if (xen_feature(feature))
18889 +               return;
18890 +
18891 +       while (nr-- != 0) {
18892 +               make_page_writable(va, feature);
18893 +               va = (void *)((unsigned long)va + PAGE_SIZE);
18894 +       }
18895 +}
18896 +
18897 +static inline void pgd_walk_set_prot(void *pt, pgprot_t flags)
18898 +{
18899 +       struct page *page = virt_to_page(pt);
18900 +       unsigned long pfn = page_to_pfn(page);
18901 +
18902 +       if (PageHighMem(page))
18903 +               return;
18904 +       BUG_ON(HYPERVISOR_update_va_mapping(
18905 +               (unsigned long)__va(pfn << PAGE_SHIFT),
18906 +               pfn_pte(pfn, flags), 0));
18907 +}
18908 +
18909 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
18910 +{
18911 +       pgd_t *pgd = pgd_base;
18912 +       pud_t *pud;
18913 +       pmd_t *pmd;
18914 +       pte_t *pte;
18915 +       int    g, u, m;
18916 +
18917 +       if (xen_feature(XENFEAT_auto_translated_physmap))
18918 +               return;
18919 +
18920 +       for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
18921 +               if (pgd_none(*pgd))
18922 +                       continue;
18923 +               pud = pud_offset(pgd, 0);
18924 +               if (PTRS_PER_PUD > 1) /* not folded */
18925 +                       pgd_walk_set_prot(pud, flags);
18926 +               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
18927 +                       if (pud_none(*pud))
18928 +                               continue;
18929 +                       pmd = pmd_offset(pud, 0);
18930 +                       if (PTRS_PER_PMD > 1) /* not folded */
18931 +                               pgd_walk_set_prot(pmd, flags);
18932 +                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
18933 +                               if (pmd_none(*pmd))
18934 +                                       continue;
18935 +                               pte = pte_offset_kernel(pmd, 0);
18936 +                               pgd_walk_set_prot(pte, flags);
18937 +                       }
18938 +               }
18939 +       }
18940 +
18941 +       BUG_ON(HYPERVISOR_update_va_mapping(
18942 +               (unsigned long)pgd_base,
18943 +               pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
18944 +               UVMF_TLB_FLUSH));
18945 +}
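
pgd_walk() descends pgd -> pud -> pmd -> pte, skipping empty slots at each level and tolerating folded levels (PTRS_PER_PUD/PTRS_PER_PMD == 1). The same control flow, reduced to a toy two-level radix tree:

#include <stdio.h>

#define FANOUT 4

struct level1 { int *slots[FANOUT]; };
struct level0 { struct level1 *slots[FANOUT]; };

static void walk(struct level0 *top, void (*fn)(int *leaf))
{
	for (int g = 0; g < FANOUT; g++) {
		struct level1 *l1 = top->slots[g];

		if (!l1)
			continue;	/* analogous to pgd_none() */
		for (int u = 0; u < FANOUT; u++)
			if (l1->slots[u])
				fn(l1->slots[u]);
	}
}

static void print_leaf(int *leaf) { printf("%d\n", *leaf); }

int main(void)
{
	static int v = 42;
	static struct level1 l1;
	static struct level0 l0;

	l1.slots[2] = &v;
	l0.slots[1] = &l1;
	walk(&l0, print_leaf);
	return 0;
}
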
18946 +
18947 +static void __pgd_pin(pgd_t *pgd)
18948 +{
18949 +       pgd_walk(pgd, PAGE_KERNEL_RO);
18950 +       xen_pgd_pin(__pa(pgd));
18951 +       set_bit(PG_pinned, &virt_to_page(pgd)->flags);
18952 +}
18953 +
18954 +static void __pgd_unpin(pgd_t *pgd)
18955 +{
18956 +       xen_pgd_unpin(__pa(pgd));
18957 +       pgd_walk(pgd, PAGE_KERNEL);
18958 +       clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
18959 +}
18960 +
18961 +static void pgd_test_and_unpin(pgd_t *pgd)
18962 +{
18963 +       if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
18964 +               __pgd_unpin(pgd);
18965 +}
18966 +
18967 +void mm_pin(struct mm_struct *mm)
18968 +{
18969 +       if (xen_feature(XENFEAT_writable_page_tables))
18970 +           return;
18971 +       spin_lock(&mm->page_table_lock);
18972 +       __pgd_pin(mm->pgd);
18973 +       spin_unlock(&mm->page_table_lock);
18974 +}
18975 +
18976 +void mm_unpin(struct mm_struct *mm)
18977 +{
18978 +       if (xen_feature(XENFEAT_writable_page_tables))
18979 +           return;
18980 +       spin_lock(&mm->page_table_lock);
18981 +       __pgd_unpin(mm->pgd);
18982 +       spin_unlock(&mm->page_table_lock);
18983 +}
18984 +
18985 +void mm_pin_all(void)
18986 +{
18987 +       struct page *page;
18988 +       if (xen_feature(XENFEAT_writable_page_tables))
18989 +           return;
18990 +       for (page = pgd_list; page; page = (struct page *)page->index) {
18991 +               if (!test_bit(PG_pinned, &page->flags))
18992 +                       __pgd_pin((pgd_t *)page_address(page));
18993 +       }
18994 +}
18995 +
18996 +void _arch_exit_mmap(struct mm_struct *mm)
18997 +{
18998 +       struct task_struct *tsk = current;
18999 +
19000 +       task_lock(tsk);
19001 +
19002 +       /*
19003 +        * We aggressively remove the defunct pgd from cr3: unmap_vmas() runs
19004 +        * *much* faster this way, since avoiding TLB flushes allows bigger wrpt batches.
19005 +        */
19006 +       if (tsk->active_mm == mm) {
19007 +               tsk->active_mm = &init_mm;
19008 +               atomic_inc(&init_mm.mm_count);
19009 +
19010 +               switch_mm(mm, &init_mm, tsk);
19011 +
19012 +               atomic_dec(&mm->mm_count);
19013 +               BUG_ON(atomic_read(&mm->mm_count) == 0);
19014 +       }
19015 +
19016 +       task_unlock(tsk);
19017 +
19018 +       if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
19019 +           (atomic_read(&mm->mm_count) == 1))
19020 +               mm_unpin(mm);
19021 +}
19022 +
19023 +/*
19024 + * Local variables:
19025 + *  c-file-style: "linux"
19026 + *  indent-tabs-mode: t
19027 + *  c-indent-level: 8
19028 + *  c-basic-offset: 8
19029 + *  tab-width: 8
19030 + * End:
19031 + */
19032 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/mm/pgtable.c linux-2.6.16/arch/i386/mm/pgtable.c
19033 --- linux-2.6.16.orig/arch/i386/mm/pgtable.c    2006-03-20 06:53:29.000000000 +0100
19034 +++ linux-2.6.16/arch/i386/mm/pgtable.c 2006-06-26 09:51:32.000000000 +0200
19035 @@ -13,6 +13,7 @@
19036  #include <linux/slab.h>
19037  #include <linux/pagemap.h>
19038  #include <linux/spinlock.h>
19039 +#include <linux/module.h>
19040  
19041  #include <asm/system.h>
19042  #include <asm/pgtable.h>
19043 @@ -138,6 +139,10 @@
19044         __flush_tlb_one(vaddr);
19045  }
19046  
19047 +static int nr_fixmaps = 0;
19048 +unsigned long __FIXADDR_TOP = 0xfffff000;
19049 +EXPORT_SYMBOL(__FIXADDR_TOP);
19050 +
19051  void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
19052  {
19053         unsigned long address = __fix_to_virt(idx);
19054 @@ -147,6 +152,13 @@
19055                 return;
19056         }
19057         set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
19058 +       nr_fixmaps++;
19059 +}
19060 +
19061 +void set_fixaddr_top(unsigned long top)
19062 +{
19063 +       BUG_ON(nr_fixmaps > 0);
19064 +       __FIXADDR_TOP = top - PAGE_SIZE;
19065  }
19066  
19067  pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
19068 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/pci/Makefile linux-2.6.16/arch/i386/pci/Makefile
19069 --- linux-2.6.16.orig/arch/i386/pci/Makefile    2006-03-20 06:53:29.000000000 +0100
19070 +++ linux-2.6.16/arch/i386/pci/Makefile 2006-06-26 09:51:32.000000000 +0200
19071 @@ -4,6 +4,10 @@
19072  obj-$(CONFIG_PCI_MMCONFIG)     += mmconfig.o direct.o
19073  obj-$(CONFIG_PCI_DIRECT)       += direct.o
19074  
19075 +# pcifront should be after pcbios.o, mmconfig.o, and direct.o as it should only
19076 +# take over if direct access to the PCI bus is unavailable
19077 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront.o
19078 +
19079  pci-y                          := fixup.o
19080  pci-$(CONFIG_ACPI)             += acpi.o
19081  pci-y                          += legacy.o irq.o
19082 @@ -12,3 +16,8 @@
19083  pci-$(CONFIG_X86_NUMAQ)                := numa.o irq.o
19084  
19085  obj-y                          += $(pci-y) common.o
19086 +
19087 +ifdef CONFIG_XEN
19088 +include $(srctree)/scripts/Makefile.xen
19089 +obj-y := $(call cherrypickxen, $(obj-y))
19090 +endif
19091 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/pci/irq-xen.c linux-2.6.16/arch/i386/pci/irq-xen.c
19092 --- linux-2.6.16.orig/arch/i386/pci/irq-xen.c   1970-01-01 01:00:00.000000000 +0100
19093 +++ linux-2.6.16/arch/i386/pci/irq-xen.c        2006-06-26 09:51:32.000000000 +0200
19094 @@ -0,0 +1,1202 @@
19095 +/*
19096 + *     Low-Level PCI Support for PC -- Routing of Interrupts
19097 + *
19098 + *     (c) 1999--2000 Martin Mares <mj@ucw.cz>
19099 + */
19100 +
19101 +#include <linux/config.h>
19102 +#include <linux/types.h>
19103 +#include <linux/kernel.h>
19104 +#include <linux/pci.h>
19105 +#include <linux/init.h>
19106 +#include <linux/slab.h>
19107 +#include <linux/interrupt.h>
19108 +#include <linux/dmi.h>
19109 +#include <asm/io.h>
19110 +#include <asm/smp.h>
19111 +#include <asm/io_apic.h>
19112 +#include <linux/irq.h>
19113 +#include <linux/acpi.h>
19114 +
19115 +#include "pci.h"
19116 +
19117 +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
19118 +#define PIRQ_VERSION 0x0100
19119 +
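For reference, PIRQ_SIGNATURE is just the four ASCII bytes "$PIR" read as one 32-bit word, which is how rt->signature sees them on a little-endian host such as x86. A standalone sanity check (assuming a little-endian host):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
        const char sig[4] = { '$', 'P', 'I', 'R' };
        uint32_t expected = ('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24);
        uint32_t found;

        memcpy(&found, sig, sizeof(found));
        assert(found == expected);      /* 0x52495024 on little-endian hosts */
        return 0;
}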
19120 +static int broken_hp_bios_irq9;
19121 +static int acer_tm360_irqrouting;
19122 +
19123 +static struct irq_routing_table *pirq_table;
19124 +
19125 +static int pirq_enable_irq(struct pci_dev *dev);
19126 +
19127 +/*
19128 + * Never use: 0, 1, 2 (timer, keyboard, and cascade)
19129 + * Avoid using: 13, 14 and 15 (FP error and IDE).
19130 + * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
19131 + */
19132 +unsigned int pcibios_irq_mask = 0xfff8;
19133 +
19134 +static int pirq_penalty[16] = {
19135 +       1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
19136 +       0, 0, 0, 0, 1000, 100000, 100000, 100000
19137 +};
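The penalty table above biases IRQ selection away from lines with known ISA users. A sketch of picking the least-penalized IRQ allowed by a routing-entry bitmap, as a hypothetical simplification of the search done later in pcibios_lookup_irq() (ignoring can_request_irq()):

#include <stdio.h>

static int pick_irq(const int penalty[16], unsigned int mask)
{
        int i, best = -1;

        for (i = 0; i < 16; i++)
                if ((mask & (1u << i)) && (best < 0 || penalty[i] < penalty[best]))
                        best = i;
        return best;                    /* -1 if the mask allowed nothing */
}

int main(void)
{
        const int penalty[16] = {
                1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
                0, 0, 0, 0, 1000, 100000, 100000, 100000
        };

        /* 0x0c20 allows IRQs 5, 10 and 11; all carry zero penalty here */
        printf("picked IRQ %d\n", pick_irq(penalty, 0x0c20));
        return 0;
}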
19138 +
19139 +struct irq_router {
19140 +       char *name;
19141 +       u16 vendor, device;
19142 +       int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
19143 +       int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
19144 +};
19145 +
19146 +struct irq_router_handler {
19147 +       u16 vendor;
19148 +       int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
19149 +};
19150 +
19151 +int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
19152 +void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
19153 +
19154 +/*
19155 + *  Check passed address for the PCI IRQ Routing Table signature
19156 + *  and perform checksum verification.
19157 + */
19158 +
19159 +static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr)
19160 +{
19161 +       struct irq_routing_table *rt;
19162 +       int i;
19163 +       u8 sum;
19164 +
19165 +       rt = (struct irq_routing_table *) addr;
19166 +       if (rt->signature != PIRQ_SIGNATURE ||
19167 +           rt->version != PIRQ_VERSION ||
19168 +           rt->size % 16 ||
19169 +           rt->size < sizeof(struct irq_routing_table))
19170 +               return NULL;
19171 +       sum = 0;
19172 +       for (i=0; i < rt->size; i++)
19173 +               sum += addr[i];
19174 +       if (!sum) {
19175 +               DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt);
19176 +               return rt;
19177 +       }
19178 +       return NULL;
19179 +}
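The validity test above relies on the table's bytes summing to 0 mod 256, checksum field included; a table generator would therefore store the negated sum of all other bytes in the checksum slot. A self-contained sketch of both sides, with hypothetical helper names:

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

static uint8_t byte_sum(const uint8_t *buf, size_t len)
{
        uint8_t sum = 0;

        while (len--)
                sum += *buf++;
        return sum;                     /* 0 means the table verifies */
}

int main(void)
{
        uint8_t table[16] = { '$', 'P', 'I', 'R', 1, 2, 3 };

        /* what a generator would do; negation wraps mod 256 on assignment */
        table[15] = -byte_sum(table, 15);
        assert(byte_sum(table, sizeof(table)) == 0);
        return 0;
}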
19180 +
19181 +
19182 +
19183 +/*
19184 + *  Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
19185 + */
19186 +
19187 +static struct irq_routing_table * __init pirq_find_routing_table(void)
19188 +{
19189 +       u8 *addr;
19190 +       struct irq_routing_table *rt;
19191 +
19192 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
19193 +       if (pirq_table_addr) {
19194 +               rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
19195 +               if (rt)
19196 +                       return rt;
19197 +               printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
19198 +       }
19199 +       for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
19200 +               rt = pirq_check_routing_table(addr);
19201 +               if (rt)
19202 +                       return rt;
19203 +       }
19204 +#endif
19205 +       
19206 +       return NULL;
19207 +}
19208 +
19209 +/*
19210 + *  If we have an IRQ routing table, use it to search for peer host
19211 + *  bridges.  It's a gross hack, but since there is no other known
19212 + *  way to get a list of buses, we have to go this way.
19213 + */
19214 +
19215 +static void __init pirq_peer_trick(void)
19216 +{
19217 +       struct irq_routing_table *rt = pirq_table;
19218 +       u8 busmap[256];
19219 +       int i;
19220 +       struct irq_info *e;
19221 +
19222 +       memset(busmap, 0, sizeof(busmap));
19223 +       for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
19224 +               e = &rt->slots[i];
19225 +#ifdef DEBUG
19226 +               {
19227 +                       int j;
19228 +                       DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
19229 +                       for(j=0; j<4; j++)
19230 +                               DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
19231 +                       DBG("\n");
19232 +               }
19233 +#endif
19234 +               busmap[e->bus] = 1;
19235 +       }
19236 +       for(i = 1; i < 256; i++) {
19237 +               if (!busmap[i] || pci_find_bus(0, i))
19238 +                       continue;
19239 +               if (pci_scan_bus(i, &pci_root_ops, NULL))
19240 +                       printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i);
19241 +       }
19242 +       pcibios_last_bus = -1;
19243 +}
19244 +
19245 +/*
19246 + *  Code for querying and setting of IRQ routes on various interrupt routers.
19247 + */
19248 +
19249 +void eisa_set_level_irq(unsigned int irq)
19250 +{
19251 +       unsigned char mask = 1 << (irq & 7);
19252 +       unsigned int port = 0x4d0 + (irq >> 3);
19253 +       unsigned char val;
19254 +       static u16 eisa_irq_mask;
19255 +
19256 +       if (irq >= 16 || (1 << irq) & eisa_irq_mask)
19257 +               return;
19258 +
19259 +       eisa_irq_mask |= (1 << irq);
19260 +       printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
19261 +       val = inb(port);
19262 +       if (!(val & mask)) {
19263 +               DBG(KERN_DEBUG " -> edge");
19264 +               outb(val | mask, port);
19265 +       }
19266 +}
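eisa_set_level_irq() above programs the ELCR (edge/level control) registers: one bit per IRQ spread across two I/O ports at 0x4d0/0x4d1. A tiny sketch of just the addressing arithmetic, with no port I/O:

#include <assert.h>

int main(void)
{
        unsigned int irq = 11;
        unsigned int port = 0x4d0 + (irq >> 3);         /* 0x4d1 covers IRQs 8-15 */
        unsigned char mask = 1 << (irq & 7);            /* bit 3 within that register */

        assert(port == 0x4d1 && mask == 0x08);
        return 0;
}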
19267 +
19268 +/*
19269 + * Common IRQ routing practice: nybbles in config space,
19270 + * offset by some magic constant.
19271 + */
19272 +static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
19273 +{
19274 +       u8 x;
19275 +       unsigned reg = offset + (nr >> 1);
19276 +
19277 +       pci_read_config_byte(router, reg, &x);
19278 +       return (nr & 1) ? (x >> 4) : (x & 0xf);
19279 +}
19280 +
19281 +static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
19282 +{
19283 +       u8 x;
19284 +       unsigned reg = offset + (nr >> 1);
19285 +
19286 +       pci_read_config_byte(router, reg, &x);
19287 +       x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
19288 +       pci_write_config_byte(router, reg, x);
19289 +}
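Several routers below share this nybble convention: link number nr selects byte offset + nr/2, and the low bit of nr selects the high or low nibble within it. A pure-arithmetic sketch decoupled from PCI config accesses, with hypothetical helper names:

#include <assert.h>
#include <stdint.h>

static unsigned int get_nybble(uint8_t x, unsigned int nr)
{
        return (nr & 1) ? (x >> 4) : (x & 0xf);
}

static uint8_t set_nybble(uint8_t x, unsigned int nr, unsigned int val)
{
        return (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
}

int main(void)
{
        uint8_t reg = 0x00;

        reg = set_nybble(reg, 0, 0xa);          /* low nibble */
        reg = set_nybble(reg, 1, 0x5);          /* high nibble, same byte */
        assert(reg == 0x5a);
        assert(get_nybble(reg, 0) == 0xa && get_nybble(reg, 1) == 0x5);
        return 0;
}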
19290 +
19291 +/*
19292 + * ALI pirq entries are damn ugly, and completely undocumented.
19293 + * This has been figured out from pirq tables, and it's not a pretty
19294 + * picture.
19295 + */
19296 +static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19297 +{
19298 +       static unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
19299 +
19300 +       return irqmap[read_config_nybble(router, 0x48, pirq-1)];
19301 +}
19302 +
19303 +static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19304 +{
19305 +       static unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
19306 +       unsigned int val = irqmap[irq];
19307 +               
19308 +       if (val) {
19309 +               write_config_nybble(router, 0x48, pirq-1, val);
19310 +               return 1;
19311 +       }
19312 +       return 0;
19313 +}
19314 +
19315 +/*
19316 + * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
19317 + * just a pointer to the config space.
19318 + */
19319 +static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19320 +{
19321 +       u8 x;
19322 +
19323 +       pci_read_config_byte(router, pirq, &x);
19324 +       return (x < 16) ? x : 0;
19325 +}
19326 +
19327 +static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19328 +{
19329 +       pci_write_config_byte(router, pirq, irq);
19330 +       return 1;
19331 +}
19332 +
19333 +/*
19334 + * The VIA pirq rules are nibble-based, like ALI,
19335 + * but without the ugly irq number munging.
19336 + * However, PIRQD is in the upper instead of lower 4 bits.
19337 + */
19338 +static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19339 +{
19340 +       return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
19341 +}
19342 +
19343 +static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19344 +{
19345 +       write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
19346 +       return 1;
19347 +}
19348 +
19349 +/*
19350 + * The VIA pirq rules are nibble-based, like ALI,
19351 + * but without the ugly irq number munging.
19352 + * However, for the 82C586 the nibble map is different.
19353 + */
19354 +static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19355 +{
19356 +       static unsigned int pirqmap[4] = { 3, 2, 5, 1 };
19357 +       return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
19358 +}
19359 +
19360 +static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19361 +{
19362 +       static unsigned int pirqmap[4] = { 3, 2, 5, 1 };
19363 +       write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
19364 +       return 1;
19365 +}
19366 +
19367 +/*
19368 + * ITE 8330G pirq rules are nibble-based
19369 + * FIXME: pirqmap may be { 1, 0, 3, 2 },
19370 + *       2+3 are both mapped to irq 9 on my system
19371 + */
19372 +static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19373 +{
19374 +       static unsigned char pirqmap[4] = { 1, 0, 2, 3 };
19375 +       return read_config_nybble(router,0x43, pirqmap[pirq-1]);
19376 +}
19377 +
19378 +static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19379 +{
19380 +       static unsigned char pirqmap[4] = { 1, 0, 2, 3 };
19381 +       write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
19382 +       return 1;
19383 +}
19384 +
19385 +/*
19386 + * OPTI: the high four bits are the nibble pointer.
19387 + * I wonder what the low bits do?
19388 + */
19389 +static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19390 +{
19391 +       return read_config_nybble(router, 0xb8, pirq >> 4);
19392 +}
19393 +
19394 +static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19395 +{
19396 +       write_config_nybble(router, 0xb8, pirq >> 4, irq);
19397 +       return 1;
19398 +}
19399 +
19400 +/*
19401 + * Cyrix: nibble offset 0x5C
19402 + * 0x5C bits 7:4 are INTB, bits 3:0 are INTA
19403 + * 0x5D bits 7:4 are INTD, bits 3:0 are INTC
19404 + */
19405 +static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19406 +{
19407 +       return read_config_nybble(router, 0x5C, (pirq-1)^1);
19408 +}
19409 +
19410 +static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19411 +{
19412 +       write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
19413 +       return 1;
19414 +}
19415 +
19416 +/*
19417 + *     PIRQ routing for SiS 85C503 router used in several SiS chipsets.
19418 + *     We have to deal with the following issues here:
19419 + *     - vendors have different ideas about the meaning of link values
19420 + *     - some onboard devices (integrated in the chipset) have special
19421 + *       links and are thus routed differently (i.e. not via PCI INTA-INTD)
19422 + *     - different revision of the router have a different layout for
19423 + *       the routing registers, particularly for the onchip devices
19424 + *
19425 + *     For all routing registers the common thing is we have one byte
19426 + *     per routeable link which is defined as:
19427 + *              bit 7      IRQ mapping enabled (0) or disabled (1)
19428 + *              bits [6:4] reserved (sometimes used for onchip devices)
19429 + *              bits [3:0] IRQ to map to
19430 + *                  allowed: 3-7, 9-12, 14-15
19431 + *                  reserved: 0, 1, 2, 8, 13
19432 + *
19433 + *     The config-space registers located at 0x41/0x42/0x43/0x44 are
19434 + *     always used to route the normal PCI INT A/B/C/D respectively.
19435 + *     Apparently there are systems implementing PCI routing table using
19436 + *     link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
19437 + *     We try our best to handle both link mappings.
19438 + *     
19439 + *     Currently (2003-05-21) it appears most SiS chipsets follow the
19440 + *     definition of routing registers from the SiS-5595 southbridge.
19441 + *     According to the SiS 5595 datasheets the revision IDs of the
19442 + *     router (ISA-bridge) should be 0x01 or 0xb0.
19443 + *
19444 + *     Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
19445 + *     Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
19446 + *     They seem to work with the current routing code. However there is
19447 + *     some concern because of the two USB-OHCI HCs (original SiS 5595
19448 + *     had only one). YMMV.
19449 + *
19450 + *     Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
19451 + *
19452 + *     0x61:   IDEIRQ:
19453 + *             bits [6:5] must be written 01
19454 + *             bit 4 channel-select primary (0), secondary (1)
19455 + *
19456 + *     0x62:   USBIRQ:
19457 + *             bit 6 OHCI function disabled (0), enabled (1)
19458 + *     
19459 + *     0x6a:   ACPI/SCI IRQ: bits 4-6 reserved
19460 + *
19461 + *     0x7e:   Data Acq. Module IRQ - bits 4-6 reserved
19462 + *
19463 + *     We support USBIRQ (in addition to INTA-INTD) and keep the
19464 + *     IDE, ACPI and DAQ routing untouched as set by the BIOS.
19465 + *
19466 + *     Currently the only reported exception is the new SiS 65x chipset
19467 + *     which includes the SiS 69x southbridge. Here we have the 85C503
19468 + *     router revision 0x04 and there are changes in the register layout
19469 + *     mostly related to the different USB HCs with USB 2.0 support.
19470 + *
19471 + *     Onchip routing for router rev-id 0x04 (trial-and-error observation)
19472 + *
19473 + *     0x60/0x61/0x62/0x63:    1xEHCI and 3xOHCI (companion) USB-HCs
19474 + *                             bits 6:4 are probably unused, unlike the 5595
19475 + */
19476 +
19477 +#define PIRQ_SIS_IRQ_MASK      0x0f
19478 +#define PIRQ_SIS_IRQ_DISABLE   0x80
19479 +#define PIRQ_SIS_USB_ENABLE    0x40
19480 +
19481 +static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19482 +{
19483 +       u8 x;
19484 +       int reg;
19485 +
19486 +       reg = pirq;
19487 +       if (reg >= 0x01 && reg <= 0x04)
19488 +               reg += 0x40;
19489 +       pci_read_config_byte(router, reg, &x);
19490 +       return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
19491 +}
19492 +
19493 +static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19494 +{
19495 +       u8 x;
19496 +       int reg;
19497 +
19498 +       reg = pirq;
19499 +       if (reg >= 0x01 && reg <= 0x04)
19500 +               reg += 0x40;
19501 +       pci_read_config_byte(router, reg, &x);
19502 +       x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
19503 +       x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
19504 +       pci_write_config_byte(router, reg, x);
19505 +       return 1;
19506 +}
19507 +
19508 +
19509 +/*
19510 + * VLSI: nibble offset 0x74 - educated guess due to routing table and
19511 + *       config space of VLSI 82C534 PCI-bridge/router (1004:0102)
19512 + *       Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
19513 + *       devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
19514 + *       for the busbridge to the docking station.
19515 + */
19516 +
19517 +static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19518 +{
19519 +       if (pirq > 8) {
19520 +               printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
19521 +               return 0;
19522 +       }
19523 +       return read_config_nybble(router, 0x74, pirq-1);
19524 +}
19525 +
19526 +static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19527 +{
19528 +       if (pirq > 8) {
19529 +               printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
19530 +               return 0;
19531 +       }
19532 +       write_config_nybble(router, 0x74, pirq-1, irq);
19533 +       return 1;
19534 +}
19535 +
19536 +/*
19537 + * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
19538 + * and Redirect I/O registers (0x0c00 and 0x0c01).  The Index register
19539 + * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a.  The Redirect
19540 + * register is a straight binary coding of desired PIC IRQ (low nibble).
19541 + *
19542 + * The 'link' value in the PIRQ table is already in the correct format
19543 + * for the Index register.  There are some special index values:
19544 + * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
19545 + * and 0x03 for SMBus.
19546 + */
19547 +static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19548 +{
19549 +       outb_p(pirq, 0xc00);
19550 +       return inb(0xc01) & 0xf;
19551 +}
19552 +
19553 +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19554 +{
19555 +       outb_p(pirq, 0xc00);
19556 +       outb_p(irq, 0xc01);
19557 +       return 1;
19558 +}
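The ServerWorks pair above speaks the Index/Redirect protocol described in the comment: write the link value to port 0xc00, then read or write the IRQ nibble at 0xc01. A sketch of just the index encoding for the numbered PCI IRQ lines, assuming the form stated in the comment:

#include <assert.h>

int main(void)
{
        unsigned int pciirq = 10;
        unsigned int index = pciirq | 0x10;     /* PCIIRQ10 -> 0x1a, per the comment */

        assert(index == 0x1a);
        return 0;
}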
19559 +
19560 +/* Support for AMD756 PCI IRQ Routing
19561 + * Jhon H. Caicedo <jhcaiced@osso.org.co>
19562 + * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
19563 + * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
19564 + * The AMD756 pirq rules are nibble-based
19565 + * offset 0x56 0-3 PIRQA  4-7  PIRQB
19566 + * offset 0x57 0-3 PIRQC  4-7  PIRQD
19567 + */
19568 +static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
19569 +{
19570 +       u8 irq;
19571 +       irq = 0;
19572 +       if (pirq <= 4)
19573 +       {
19574 +               irq = read_config_nybble(router, 0x56, pirq - 1);
19575 +       }
19576 +       printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
19577 +               dev->vendor, dev->device, pirq, irq);
19578 +       return irq;
19579 +}
19580 +
19581 +static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19582 +{
19583 +       printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", 
19584 +               dev->vendor, dev->device, pirq, irq);
19585 +       if (pirq <= 4)
19586 +       {
19587 +               write_config_nybble(router, 0x56, pirq - 1, irq);
19588 +       }
19589 +       return 1;
19590 +}
19591 +
19592 +#ifdef CONFIG_PCI_BIOS
19593 +
19594 +static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
19595 +{
19596 +       struct pci_dev *bridge;
19597 +       int pin = pci_get_interrupt_pin(dev, &bridge);
19598 +       return pcibios_set_irq_routing(bridge, pin, irq);
19599 +}
19600 +
19601 +#endif
19602 +
19603 +static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19604 +{
19605 +       static struct pci_device_id pirq_440gx[] = {
19606 +               { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
19607 +               { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
19608 +               { },
19609 +       };
19610 +
19611 +       /* 440GX has a proprietary PIRQ router -- don't use it */
19612 +       if (pci_dev_present(pirq_440gx))
19613 +               return 0;
19614 +
19615 +       switch(device)
19616 +       {
19617 +               case PCI_DEVICE_ID_INTEL_82371FB_0:
19618 +               case PCI_DEVICE_ID_INTEL_82371SB_0:
19619 +               case PCI_DEVICE_ID_INTEL_82371AB_0:
19620 +               case PCI_DEVICE_ID_INTEL_82371MX:
19621 +               case PCI_DEVICE_ID_INTEL_82443MX_0:
19622 +               case PCI_DEVICE_ID_INTEL_82801AA_0:
19623 +               case PCI_DEVICE_ID_INTEL_82801AB_0:
19624 +               case PCI_DEVICE_ID_INTEL_82801BA_0:
19625 +               case PCI_DEVICE_ID_INTEL_82801BA_10:
19626 +               case PCI_DEVICE_ID_INTEL_82801CA_0:
19627 +               case PCI_DEVICE_ID_INTEL_82801CA_12:
19628 +               case PCI_DEVICE_ID_INTEL_82801DB_0:
19629 +               case PCI_DEVICE_ID_INTEL_82801E_0:
19630 +               case PCI_DEVICE_ID_INTEL_82801EB_0:
19631 +               case PCI_DEVICE_ID_INTEL_ESB_1:
19632 +               case PCI_DEVICE_ID_INTEL_ICH6_0:
19633 +               case PCI_DEVICE_ID_INTEL_ICH6_1:
19634 +               case PCI_DEVICE_ID_INTEL_ICH7_0:
19635 +               case PCI_DEVICE_ID_INTEL_ICH7_1:
19636 +               case PCI_DEVICE_ID_INTEL_ICH7_30:
19637 +               case PCI_DEVICE_ID_INTEL_ICH7_31:
19638 +               case PCI_DEVICE_ID_INTEL_ESB2_0:
19639 +               case PCI_DEVICE_ID_INTEL_ICH8_0:
19640 +               case PCI_DEVICE_ID_INTEL_ICH8_1:
19641 +               case PCI_DEVICE_ID_INTEL_ICH8_2:
19642 +               case PCI_DEVICE_ID_INTEL_ICH8_3:
19643 +               case PCI_DEVICE_ID_INTEL_ICH8_4:
19644 +                       r->name = "PIIX/ICH";
19645 +                       r->get = pirq_piix_get;
19646 +                       r->set = pirq_piix_set;
19647 +                       return 1;
19648 +       }
19649 +       return 0;
19650 +}
19651 +
19652 +static __init int via_router_probe(struct irq_router *r,
19653 +                               struct pci_dev *router, u16 device)
19654 +{
19655 +       /* FIXME: We should move some of the quirk fixup stuff here */
19656 +
19657 +       /*
19658 +        * work arounds for some buggy BIOSes
19659 +        * Workarounds for some buggy BIOSes
19660 +       if (device == PCI_DEVICE_ID_VIA_82C586_0) {
19661 +               switch(router->device) {
19662 +               case PCI_DEVICE_ID_VIA_82C686:
19663 +                       /*
19664 +                        * Asus k7m bios wrongly reports 82C686A
19665 +                        * as 586-compatible
19666 +                        */
19667 +                       device = PCI_DEVICE_ID_VIA_82C686;
19668 +                       break;
19669 +               case PCI_DEVICE_ID_VIA_8235:
19670 +                       /*
19671 +                        * Asus a7v-x bios wrongly reports 8235
19672 +                        * as 586-compatible
19673 +                        */
19674 +                       device = PCI_DEVICE_ID_VIA_8235;
19675 +                       break;
19676 +               }
19677 +       }
19678 +
19679 +       switch(device) {
19680 +       case PCI_DEVICE_ID_VIA_82C586_0:
19681 +               r->name = "VIA";
19682 +               r->get = pirq_via586_get;
19683 +               r->set = pirq_via586_set;
19684 +               return 1;
19685 +       case PCI_DEVICE_ID_VIA_82C596:
19686 +       case PCI_DEVICE_ID_VIA_82C686:
19687 +       case PCI_DEVICE_ID_VIA_8231:
19688 +       case PCI_DEVICE_ID_VIA_8235:
19689 +               /* FIXME: add new ones for 8233/5 */
19690 +               r->name = "VIA";
19691 +               r->get = pirq_via_get;
19692 +               r->set = pirq_via_set;
19693 +               return 1;
19694 +       }
19695 +       return 0;
19696 +}
19697 +
19698 +static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19699 +{
19700 +       switch(device)
19701 +       {
19702 +               case PCI_DEVICE_ID_VLSI_82C534:
19703 +                       r->name = "VLSI 82C534";
19704 +                       r->get = pirq_vlsi_get;
19705 +                       r->set = pirq_vlsi_set;
19706 +                       return 1;
19707 +       }
19708 +       return 0;
19709 +}
19710 +
19711 +
19712 +static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19713 +{
19714 +       switch(device)
19715 +       {
19716 +               case PCI_DEVICE_ID_SERVERWORKS_OSB4:
19717 +               case PCI_DEVICE_ID_SERVERWORKS_CSB5:
19718 +                       r->name = "ServerWorks";
19719 +                       r->get = pirq_serverworks_get;
19720 +                       r->set = pirq_serverworks_set;
19721 +                       return 1;
19722 +       }
19723 +       return 0;
19724 +}
19725 +
19726 +static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19727 +{
19728 +       if (device != PCI_DEVICE_ID_SI_503)
19729 +               return 0;
19730 +               
19731 +       r->name = "SIS";
19732 +       r->get = pirq_sis_get;
19733 +       r->set = pirq_sis_set;
19734 +       return 1;
19735 +}
19736 +
19737 +static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19738 +{
19739 +       switch(device)
19740 +       {
19741 +               case PCI_DEVICE_ID_CYRIX_5520:
19742 +                       r->name = "NatSemi";
19743 +                       r->get = pirq_cyrix_get;
19744 +                       r->set = pirq_cyrix_set;
19745 +                       return 1;
19746 +       }
19747 +       return 0;
19748 +}
19749 +
19750 +static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19751 +{
19752 +       switch(device)
19753 +       {
19754 +               case PCI_DEVICE_ID_OPTI_82C700:
19755 +                       r->name = "OPTI";
19756 +                       r->get = pirq_opti_get;
19757 +                       r->set = pirq_opti_set;
19758 +                       return 1;
19759 +       }
19760 +       return 0;
19761 +}
19762 +
19763 +static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19764 +{
19765 +       switch(device)
19766 +       {
19767 +               case PCI_DEVICE_ID_ITE_IT8330G_0:
19768 +                       r->name = "ITE";
19769 +                       r->get = pirq_ite_get;
19770 +                       r->set = pirq_ite_set;
19771 +                       return 1;
19772 +       }
19773 +       return 0;
19774 +}
19775 +
19776 +static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19777 +{
19778 +       switch(device)
19779 +       {
19780 +       case PCI_DEVICE_ID_AL_M1533:
19781 +       case PCI_DEVICE_ID_AL_M1563:
19782 +               printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
19783 +               r->name = "ALI";
19784 +               r->get = pirq_ali_get;
19785 +               r->set = pirq_ali_set;
19786 +               return 1;
19787 +       }
19788 +       return 0;
19789 +}
19790 +
19791 +static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
19792 +{
19793 +       switch(device)
19794 +       {
19795 +               case PCI_DEVICE_ID_AMD_VIPER_740B:
19796 +                       r->name = "AMD756";
19797 +                       break;
19798 +               case PCI_DEVICE_ID_AMD_VIPER_7413:
19799 +                       r->name = "AMD766";
19800 +                       break;
19801 +               case PCI_DEVICE_ID_AMD_VIPER_7443:
19802 +                       r->name = "AMD768";
19803 +                       break;
19804 +               default:
19805 +                       return 0;
19806 +       }
19807 +       r->get = pirq_amd756_get;
19808 +       r->set = pirq_amd756_set;
19809 +       return 1;
19810 +}
19811 +               
19812 +static __initdata struct irq_router_handler pirq_routers[] = {
19813 +       { PCI_VENDOR_ID_INTEL, intel_router_probe },
19814 +       { PCI_VENDOR_ID_AL, ali_router_probe },
19815 +       { PCI_VENDOR_ID_ITE, ite_router_probe },
19816 +       { PCI_VENDOR_ID_VIA, via_router_probe },
19817 +       { PCI_VENDOR_ID_OPTI, opti_router_probe },
19818 +       { PCI_VENDOR_ID_SI, sis_router_probe },
19819 +       { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
19820 +       { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
19821 +       { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
19822 +       { PCI_VENDOR_ID_AMD, amd_router_probe },
19823 +       /* Someone with docs needs to add the ATI Radeon IGP */
19824 +       { 0, NULL }
19825 +};
19826 +static struct irq_router pirq_router;
19827 +static struct pci_dev *pirq_router_dev;
19828 +
19829 +
19830 +/*
19831 + *     FIXME: should we have an option to say "generic for
19832 + *     chipset" ?
19833 + */
19834 +
19835 +static void __init pirq_find_router(struct irq_router *r)
19836 +{
19837 +       struct irq_routing_table *rt = pirq_table;
19838 +       struct irq_router_handler *h;
19839 +
19840 +#ifdef CONFIG_PCI_BIOS
19841 +       if (!rt->signature) {
19842 +               printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
19843 +               r->set = pirq_bios_set;
19844 +               r->name = "BIOS";
19845 +               return;
19846 +       }
19847 +#endif
19848 +
19849 +       /* Default unless a driver reloads it */
19850 +       r->name = "default";
19851 +       r->get = NULL;
19852 +       r->set = NULL;
19853 +       
19854 +       DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
19855 +           rt->rtr_vendor, rt->rtr_device);
19856 +
19857 +       pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
19858 +       if (!pirq_router_dev) {
19859 +               DBG(KERN_DEBUG "PCI: Interrupt router not found at "
19860 +                       "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
19861 +               return;
19862 +       }
19863 +
19864 +       for( h = pirq_routers; h->vendor; h++) {
19865 +               /* First look for a router match */
19866 +               if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
19867 +                       break;
19868 +               /* Fall back to a device match */
19869 +               if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
19870 +                       break;
19871 +       }
19872 +       printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
19873 +               pirq_router.name,
19874 +               pirq_router_dev->vendor,
19875 +               pirq_router_dev->device,
19876 +               pci_name(pirq_router_dev));
19877 +}
19878 +
19879 +static struct irq_info *pirq_get_info(struct pci_dev *dev)
19880 +{
19881 +       struct irq_routing_table *rt = pirq_table;
19882 +       int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
19883 +       struct irq_info *info;
19884 +
19885 +       for (info = rt->slots; entries--; info++)
19886 +               if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
19887 +                       return info;
19888 +       return NULL;
19889 +}
19890 +
19891 +static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
19892 +{
19893 +       u8 pin;
19894 +       struct irq_info *info;
19895 +       int i, pirq, newirq;
19896 +       int irq = 0;
19897 +       u32 mask;
19898 +       struct irq_router *r = &pirq_router;
19899 +       struct pci_dev *dev2 = NULL;
19900 +       char *msg = NULL;
19901 +
19902 +       /* Find IRQ pin */
19903 +       pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
19904 +       if (!pin) {
19905 +               DBG(KERN_DEBUG " -> no interrupt pin\n");
19906 +               return 0;
19907 +       }
19908 +       pin = pin - 1;
19909 +
19910 +       /* Find IRQ routing entry */
19911 +
19912 +       if (!pirq_table)
19913 +               return 0;
19914 +       
19915 +       DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
19916 +       info = pirq_get_info(dev);
19917 +       if (!info) {
19918 +               DBG(" -> not found in routing table\n" KERN_DEBUG);
19919 +               return 0;
19920 +       }
19921 +       pirq = info->irq[pin].link;
19922 +       mask = info->irq[pin].bitmap;
19923 +       if (!pirq) {
19924 +               DBG(" -> not routed\n" KERN_DEBUG);
19925 +               return 0;
19926 +       }
19927 +       DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
19928 +       mask &= pcibios_irq_mask;
19929 +
19930 +       /* Work around broken HP Pavilion Notebooks which assign USB to
19931 +          IRQ 9 even though it is actually wired to IRQ 11 */
19932 +
19933 +       if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
19934 +               dev->irq = 11;
19935 +               pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
19936 +               r->set(pirq_router_dev, dev, pirq, 11);
19937 +       }
19938 +
19939 +       /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
19940 +       if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) {
19941 +               pirq = 0x68;
19942 +               mask = 0x400;
19943 +               dev->irq = r->get(pirq_router_dev, dev, pirq);
19944 +               pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
19945 +       }
19946 +
19947 +       /*
19948 +        * Find the best IRQ to assign: use the one
19949 +        * reported by the device if possible.
19950 +        */
19951 +       newirq = dev->irq;
19952 +       if (newirq && !((1 << newirq) & mask)) {
19953 +               if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
19954 +               else printk("\n" KERN_WARNING
19955 +                       "PCI: IRQ %i for device %s doesn't match PIRQ mask "
19956 +                       "- try pci=usepirqmask\n" KERN_DEBUG, newirq,
19957 +                       pci_name(dev));
19958 +       }
19959 +       if (!newirq && assign) {
19960 +               for (i = 0; i < 16; i++) {
19961 +                       if (!(mask & (1 << i)))
19962 +                               continue;
19963 +                       if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, SA_SHIRQ))
19964 +                               newirq = i;
19965 +               }
19966 +       }
19967 +       DBG(" -> newirq=%d", newirq);
19968 +
19969 +       /* Check if it is hardcoded */
19970 +       if ((pirq & 0xf0) == 0xf0) {
19971 +               irq = pirq & 0xf;
19972 +               DBG(" -> hardcoded IRQ %d\n", irq);
19973 +               msg = "Hardcoded";
19974 +       } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
19975 +       ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
19976 +               DBG(" -> got IRQ %d\n", irq);
19977 +               msg = "Found";
19978 +       } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
19979 +               DBG(" -> assigning IRQ %d", newirq);
19980 +               if (r->set(pirq_router_dev, dev, pirq, newirq)) {
19981 +                       eisa_set_level_irq(newirq);
19982 +                       DBG(" ... OK\n");
19983 +                       msg = "Assigned";
19984 +                       irq = newirq;
19985 +               }
19986 +       }
19987 +
19988 +       if (!irq) {
19989 +               DBG(" ... failed\n");
19990 +               if (newirq && mask == (1 << newirq)) {
19991 +                       msg = "Guessed";
19992 +                       irq = newirq;
19993 +               } else
19994 +                       return 0;
19995 +       }
19996 +       printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
19997 +
19998 +       /* Update IRQ for all devices with the same pirq value */
19999 +       while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
20000 +               pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
20001 +               if (!pin)
20002 +                       continue;
20003 +               pin--;
20004 +               info = pirq_get_info(dev2);
20005 +               if (!info)
20006 +                       continue;
20007 +               if (info->irq[pin].link == pirq) {
20008 +                       /* We refuse to override the dev->irq information. Give a warning! */
20009 +                       if ( dev2->irq && dev2->irq != irq && \
20010 +                       (!(pci_probe & PCI_USE_PIRQ_MASK) || \
20011 +                       ((1 << dev2->irq) & mask)) ) {
20012 +#ifndef CONFIG_PCI_MSI
20013 +                               printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
20014 +                                      pci_name(dev2), dev2->irq, irq);
20015 +#endif
20016 +                               continue;
20017 +                       }
20018 +                       dev2->irq = irq;
20019 +                       pirq_penalty[irq]++;
20020 +                       if (dev != dev2)
20021 +                               printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
20022 +               }
20023 +       }
20024 +       return 1;
20025 +}
20026 +
20027 +static void __init pcibios_fixup_irqs(void)
20028 +{
20029 +       struct pci_dev *dev = NULL;
20030 +       u8 pin;
20031 +
20032 +       DBG(KERN_DEBUG "PCI: IRQ fixup\n");
20033 +       while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
20034 +               /*
20035 +                * If the BIOS has set an out-of-range IRQ number, just ignore it.
20036 +                * Also keep track of which IRQs are already in use.
20037 +                */
20038 +               if (dev->irq >= 16) {
20039 +                       DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq);
20040 +                       dev->irq = 0;
20041 +               }
20042 +               /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
20043 +               if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000)
20044 +                       pirq_penalty[dev->irq] = 0;
20045 +               pirq_penalty[dev->irq]++;
20046 +       }
20047 +
20048 +       dev = NULL;
20049 +       while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
20050 +               pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
20051 +#ifdef CONFIG_X86_IO_APIC
20052 +               /*
20053 +                * Recalculate IRQ numbers if we use the I/O APIC.
20054 +                */
20055 +               if (io_apic_assign_pci_irqs)
20056 +               {
20057 +                       int irq;
20058 +
20059 +                       if (pin) {
20060 +                               pin--;          /* interrupt pins are numbered starting from 1 */
20061 +                               irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
20062 +       /*
20063 +        * Busses behind bridges are typically not listed in the MP-table.
20064 +        * In this case we have to look up the IRQ based on the parent bus,
20065 +        * parent slot, and pin number. The SMP code detects such bridged
20066 +        * busses itself so we should get into this branch reliably.
20067 +        */
20068 +                               if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
20069 +                                       struct pci_dev * bridge = dev->bus->self;
20070 +
20071 +                                       pin = (pin + PCI_SLOT(dev->devfn)) % 4;
20072 +                                       irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 
20073 +                                                       PCI_SLOT(bridge->devfn), pin);
20074 +                                       if (irq >= 0)
20075 +                                               printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
20076 +                                                       pci_name(bridge), 'A' + pin, irq);
20077 +                               }
20078 +                               if (irq >= 0) {
20079 +                                       if (use_pci_vector() &&
20080 +                                               !platform_legacy_irq(irq))
20081 +                                               irq = IO_APIC_VECTOR(irq);
20082 +
20083 +                                       printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
20084 +                                               pci_name(dev), 'A' + pin, irq);
20085 +                                       dev->irq = irq;
20086 +                               }
20087 +                       }
20088 +               }
20089 +#endif
20090 +               /*
20091 +                * Still no IRQ? Try to lookup one...
20092 +                */
20093 +               if (pin && !dev->irq)
20094 +                       pcibios_lookup_irq(dev, 0);
20095 +       }
20096 +}
20097 +
20098 +/*
20099 + * Work around broken HP Pavilion Notebooks which assign USB to
20100 + * IRQ 9 even though it is actually wired to IRQ 11
20101 + */
20102 +static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d)
20103 +{
20104 +       if (!broken_hp_bios_irq9) {
20105 +               broken_hp_bios_irq9 = 1;
20106 +               printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
20107 +       }
20108 +       return 0;
20109 +}
20110 +
20111 +/*
20112 + * Work around broken Acer TravelMate 360 Notebooks which assign
20113 + * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
20114 + */
20115 +static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d)
20116 +{
20117 +       if (!acer_tm360_irqrouting) {
20118 +               acer_tm360_irqrouting = 1;
20119 +               printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
20120 +       }
20121 +       return 0;
20122 +}
20123 +
20124 +static struct dmi_system_id __initdata pciirq_dmi_table[] = {
20125 +       {
20126 +               .callback = fix_broken_hp_bios_irq9,
20127 +               .ident = "HP Pavilion N5400 Series Laptop",
20128 +               .matches = {
20129 +                       DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
20130 +                       DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
20131 +                       DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"),
20132 +                       DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
20133 +               },
20134 +       },
20135 +       {
20136 +               .callback = fix_acer_tm360_irqrouting,
20137 +               .ident = "Acer TravelMate 36x Laptop",
20138 +               .matches = {
20139 +                       DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
20140 +                       DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
20141 +               },
20142 +       },
20143 +       { }
20144 +};
20145 +
20146 +static int __init pcibios_irq_init(void)
20147 +{
20148 +       DBG(KERN_DEBUG "PCI: IRQ init\n");
20149 +
20150 +       if (pcibios_enable_irq || raw_pci_ops == NULL)
20151 +               return 0;
20152 +
20153 +       dmi_check_system(pciirq_dmi_table);
20154 +
20155 +       pirq_table = pirq_find_routing_table();
20156 +
20157 +#ifdef CONFIG_PCI_BIOS
20158 +       if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
20159 +               pirq_table = pcibios_get_irq_routing_table();
20160 +#endif
20161 +       if (pirq_table) {
20162 +               pirq_peer_trick();
20163 +               pirq_find_router(&pirq_router);
20164 +               if (pirq_table->exclusive_irqs) {
20165 +                       int i;
20166 +                       for (i=0; i<16; i++)
20167 +                               if (!(pirq_table->exclusive_irqs & (1 << i)))
20168 +                                       pirq_penalty[i] += 100;
20169 +               }
20170 +               /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */
20171 +               if (io_apic_assign_pci_irqs)
20172 +                       pirq_table = NULL;
20173 +       }
20174 +
20175 +       pcibios_enable_irq = pirq_enable_irq;
20176 +
20177 +       pcibios_fixup_irqs();
20178 +       return 0;
20179 +}
20180 +
20181 +subsys_initcall(pcibios_irq_init);
20182 +
20183 +
20184 +static void pirq_penalize_isa_irq(int irq, int active)
20185 +{
20186 +       /*
20187 +        *  If any ISAPnP device reports an IRQ in its list of possible
20188 +        *  IRQ's, we try to avoid assigning it to PCI devices.
20189 +        */
20190 +       if (irq < 16) {
20191 +               if (active)
20192 +                       pirq_penalty[irq] += 1000;
20193 +               else
20194 +                       pirq_penalty[irq] += 100;
20195 +       }
20196 +}
20197 +
20198 +void pcibios_penalize_isa_irq(int irq, int active)
20199 +{
20200 +#ifdef CONFIG_ACPI
20201 +       if (!acpi_noirq)
20202 +               acpi_penalize_isa_irq(irq, active);
20203 +       else
20204 +#endif
20205 +               pirq_penalize_isa_irq(irq, active);
20206 +}
20207 +
20208 +static int pirq_enable_irq(struct pci_dev *dev)
20209 +{
20210 +       u8 pin;
20211 +       struct pci_dev *temp_dev;
20212 +
20213 +       pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
20214 +       if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
20215 +               char *msg = "";
20216 +
20217 +               pin--;          /* interrupt pins are numbered starting from 1 */
20218 +
20219 +               if (io_apic_assign_pci_irqs) {
20220 +                       int irq;
20221 +
20222 +                       irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
20223 +                       /*
20224 +                        * Busses behind bridges are typically not listed in the MP-table.
20225 +                        * In this case we have to look up the IRQ based on the parent bus,
20226 +                        * parent slot, and pin number. The SMP code detects such bridged
20227 +                        * busses itself so we should get into this branch reliably.
20228 +                        */
20229 +                       temp_dev = dev;
20230 +                       while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
20231 +                               struct pci_dev * bridge = dev->bus->self;
20232 +
20233 +                               pin = (pin + PCI_SLOT(dev->devfn)) % 4;
20234 +                               irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 
20235 +                                               PCI_SLOT(bridge->devfn), pin);
20236 +                               if (irq >= 0)
20237 +                                       printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
20238 +                                               pci_name(bridge), 'A' + pin, irq);
20239 +                               dev = bridge;
20240 +                       }
20241 +                       dev = temp_dev;
20242 +                       if (irq >= 0) {
20243 +#ifdef CONFIG_PCI_MSI
20244 +                               if (!platform_legacy_irq(irq))
20245 +                                       irq = IO_APIC_VECTOR(irq);
20246 +#endif
20247 +                               printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
20248 +                                       pci_name(dev), 'A' + pin, irq);
20249 +                               dev->irq = irq;
20250 +                               return 0;
20251 +                       } else
20252 +                               msg = " Probably buggy MP table.";
20253 +               } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
20254 +                       msg = "";
20255 +               else
20256 +                       msg = " Please try using pci=biosirq.";
20257 +
20258 +               /* With legacy IDE devices an IRQ lookup failure is not a problem. */
20259 +               if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5))
20260 +                       return 0;
20261 +
20262 +               printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
20263 +                      'A' + pin, pci_name(dev), msg);
20264 +       }
20265 +       return 0;
20266 +}
20267 +
20268 +int pci_vector_resources(int last, int nr_released)
20269 +{
20270 +       int count = nr_released;
20271 +
20272 +       int next = last;
20273 +       int offset = (last % 8);
20274 +
20275 +       while (next < FIRST_SYSTEM_VECTOR) {
20276 +               next += 8;
20277 +#ifdef CONFIG_X86_64
20278 +               if (next == IA32_SYSCALL_VECTOR)
20279 +                       continue;
20280 +#else
20281 +               if (next == SYSCALL_VECTOR)
20282 +                       continue;
20283 +#endif
20284 +               count++;
20285 +               if (next >= FIRST_SYSTEM_VECTOR) {
20286 +                       if (offset%8) {
20287 +                               next = FIRST_DEVICE_VECTOR + offset;
20288 +                               offset++;
20289 +                               continue;
20290 +                       }
20291 +                       count--;
20292 +               }
20293 +       }
20294 +
20295 +       return count;
20296 +}
20297 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/pci/pcifront.c linux-2.6.16/arch/i386/pci/pcifront.c
20298 --- linux-2.6.16.orig/arch/i386/pci/pcifront.c  1970-01-01 01:00:00.000000000 +0100
20299 +++ linux-2.6.16/arch/i386/pci/pcifront.c       2006-06-26 09:51:32.000000000 +0200
20300 @@ -0,0 +1,55 @@
20301 +/*
20302 + * PCI Frontend Stub - puts some "dummy" functions into the Linux x86 PCI core
20303 + *                     to support the Xen PCI Frontend's operation
20304 + *
20305 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
20306 + */
20307 +#include <linux/module.h>
20308 +#include <linux/init.h>
20309 +#include <linux/pci.h>
20310 +#include <asm/acpi.h>
20311 +#include "pci.h"
20312 +
20313 +static int pcifront_enable_irq(struct pci_dev *dev)
20314 +{
20315 +       u8 irq;
20316 +       pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
20317 +       dev->irq = irq;
20318 +
20319 +       return 0;
20320 +}
20321 +
20322 +extern u8 pci_cache_line_size;
20323 +
20324 +static int __init pcifront_x86_stub_init(void)
20325 +{
20326 +       struct cpuinfo_x86 *c = &boot_cpu_data;
20327 +
20328 +       /* Only install our method if we haven't found real hardware already */
20329 +       if (raw_pci_ops)
20330 +               return 0;
20331 +
20332 +       printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
20333 +
20334 +       /* Copied from arch/i386/pci/common.c */
20335 +       pci_cache_line_size = 32 >> 2;
20336 +       if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
20337 +               pci_cache_line_size = 64 >> 2;  /* K7 & K8 */
20338 +       else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
20339 +               pci_cache_line_size = 128 >> 2; /* P4 */
20340 +
20341 +       /* On x86, we need to disable the normal IRQ routing table and
20342 +        * just ask the backend
20343 +        */
20344 +       pcibios_enable_irq = pcifront_enable_irq;
20345 +       pcibios_disable_irq = NULL;
20346 +
20347 +#ifdef CONFIG_ACPI
20348 +       /* Keep ACPI out of the picture */
20349 +       acpi_noirq = 1;
20350 +#endif
20351 +
20352 +       return 0;
20353 +}
20354 +
20355 +arch_initcall(pcifront_x86_stub_init);
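The cacheline setup copied into the stub above stores the value in 32-bit words, the unit PCI_CACHE_LINE_SIZE registers use, hence the ">> 2". A quick check of the conversion:

#include <assert.h>

int main(void)
{
        unsigned int line_bytes = 128;                  /* the P4 case above */
        unsigned int line_dwords = line_bytes >> 2;     /* units of 32-bit words */

        assert(line_dwords == 32);
        return 0;
}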
20356 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/i386/power/Makefile linux-2.6.16/arch/i386/power/Makefile
20357 --- linux-2.6.16.orig/arch/i386/power/Makefile  2006-06-26 09:49:45.000000000 +0200
20358 +++ linux-2.6.16/arch/i386/power/Makefile       2006-06-26 09:51:32.000000000 +0200
20359 @@ -1,2 +1,4 @@
20360 -obj-$(CONFIG_PM)               += cpu.o
20361 +obj-$(CONFIG_PM_LEGACY)                += cpu.o
20362 +obj-$(CONFIG_SOFTWARE_SUSPEND) += cpu.o
20363 +obj-$(CONFIG_ACPI_SLEEP)       += cpu.o
20364  obj-$(CONFIG_SUSPEND_SHARED)   += swsusp.o
20365 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/Kconfig linux-2.6.16/arch/ia64/Kconfig
20366 --- linux-2.6.16.orig/arch/ia64/Kconfig 2006-06-26 09:49:46.000000000 +0200
20367 +++ linux-2.6.16/arch/ia64/Kconfig      2006-06-26 09:51:32.000000000 +0200
20368 @@ -50,6 +50,52 @@
20369         bool
20370         default y
20371  
20372 +config XEN
20373 +       bool
20374 +       default y
20375 +       help
20376 +         Enable Xen hypervisor support.  The resulting kernel runs
20377 +         both as a guest OS on Xen and natively on hardware.
20378 +
20379 +config ARCH_XEN
20380 +       bool
20381 +       default y
20382 +       help
20383 +         TEMP ONLY. Needs to be on for drivers/xen to build.
20384 +
20385 +config XEN_PRIVILEGED_GUEST
20386 +       bool "Privileged Guest"
20387 +       default n
20388 +       help
20389 +         Used in drivers/xen/privcmd.c.  Should go away?
20390 +
20391 +config XEN_BLKDEV_GRANT
20392 +       depends on XEN
20393 +       bool
20394 +       default y
20395 +
20396 +config XEN_BLKDEV_FRONTEND
20397 +       depends on XEN
20398 +       bool
20399 +       default y
20400 +
20401 +config XEN_BLKDEV_BACKEND
20402 +       depends on XEN
20403 +       bool
20404 +       default y
20405 +
20406 +config XEN_SYSFS
20407 +       bool "Export Xen attributes in sysfs"
20408 +       depends on XEN && SYSFS
20409 +       default y
20410 +       help
20411 +         Xen hypervisor attributes will show up under /sys/hypervisor/.
20412 +
20413 +config XEN_INTERFACE_VERSION
20414 +       hex
20415 +       depends on XEN
20416 +       default 0x00030101
20417 +
20418  config SCHED_NO_NO_OMIT_FRAME_POINTER
20419         bool
20420         default y
20421 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/Makefile linux-2.6.16/arch/ia64/Makefile
20422 --- linux-2.6.16.orig/arch/ia64/Makefile        2006-03-20 06:53:29.000000000 +0100
20423 +++ linux-2.6.16/arch/ia64/Makefile     2006-06-26 09:51:32.000000000 +0200
20424 @@ -42,6 +42,12 @@
20425  endif
20426  
20427  CFLAGS += $(cflags-y)
20428 +
20429 +cppflags-$(CONFIG_XEN) += \
20430 +       -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
20431 +
20432 +CPPFLAGS += $(cppflags-y)
20433 +
20434  head-y := arch/ia64/kernel/head.o arch/ia64/kernel/init_task.o
20435  
20436  libs-y                         += arch/ia64/lib/
20437 @@ -52,9 +58,15 @@
20438  core-$(CONFIG_IA64_HP_ZX1)     += arch/ia64/dig/
20439  core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/
20440  core-$(CONFIG_IA64_SGI_SN2)    += arch/ia64/sn/
20441 +core-$(CONFIG_XEN)             += arch/ia64/xen/
20442  
20443  drivers-$(CONFIG_PCI)          += arch/ia64/pci/
20444 +ifneq ($(CONFIG_XEN),y)
20445  drivers-$(CONFIG_IA64_HP_SIM)  += arch/ia64/hp/sim/
20446 +endif
20447 +ifneq ($(CONFIG_IA64_GENERIC),y)
20448 +drivers-$(CONFIG_XEN)          += arch/ia64/hp/sim/
20449 +endif
20450  drivers-$(CONFIG_IA64_HP_ZX1)  += arch/ia64/hp/common/ arch/ia64/hp/zx1/
20451  drivers-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/hp/common/ arch/ia64/hp/zx1/
20452  drivers-$(CONFIG_IA64_GENERIC) += arch/ia64/hp/common/ arch/ia64/hp/zx1/ arch/ia64/hp/sim/ arch/ia64/sn/
20453 @@ -68,6 +80,8 @@
20454  
20455  compressed: vmlinux.gz
20456  
20457 +vmlinuz: vmlinux.gz
20458 +
20459  vmlinux.gz: vmlinux
20460         $(Q)$(MAKE) $(build)=$(boot) $@
20461  
20462 @@ -82,7 +96,7 @@
20463  boot:  lib/lib.a vmlinux
20464         $(Q)$(MAKE) $(build)=$(boot) $@
20465  
20466 -install: vmlinux.gz
20467 +install:
20468         sh $(srctree)/arch/ia64/install.sh $(KERNELRELEASE) $< System.map "$(INSTALL_PATH)"
20469  
20470  define archhelp
20471 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/hp/sim/Makefile linux-2.6.16/arch/ia64/hp/sim/Makefile
20472 --- linux-2.6.16.orig/arch/ia64/hp/sim/Makefile 2006-03-20 06:53:29.000000000 +0100
20473 +++ linux-2.6.16/arch/ia64/hp/sim/Makefile      2006-06-26 09:51:32.000000000 +0200
20474 @@ -14,3 +14,5 @@
20475  obj-$(CONFIG_HP_SIMSERIAL) += simserial.o
20476  obj-$(CONFIG_HP_SIMSERIAL_CONSOLE) += hpsim_console.o
20477  obj-$(CONFIG_HP_SIMSCSI) += simscsi.o
20478 +obj-$(CONFIG_XEN) += simserial.o
20479 +obj-$(CONFIG_XEN) += hpsim_console.o
20480 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/kernel/entry.S linux-2.6.16/arch/ia64/kernel/entry.S
20481 --- linux-2.6.16.orig/arch/ia64/kernel/entry.S  2006-06-26 09:49:46.000000000 +0200
20482 +++ linux-2.6.16/arch/ia64/kernel/entry.S       2006-06-26 09:51:32.000000000 +0200
20483 @@ -181,7 +181,7 @@
20484   *     called.  The code starting at .map relies on this.  The rest of the code
20485   *     doesn't care about the interrupt masking status.
20486   */
20487 -GLOBAL_ENTRY(ia64_switch_to)
20488 +GLOBAL_ENTRY(__ia64_switch_to)
20489         .prologue
20490         alloc r16=ar.pfs,1,0,0,0
20491         DO_SAVE_SWITCH_STACK
20492 @@ -235,7 +235,7 @@
20493         ;;
20494         srlz.d
20495         br.cond.sptk .done
20496 -END(ia64_switch_to)
20497 +END(__ia64_switch_to)
20498  
20499  /*
20500   * Note that interrupts are enabled during save_switch_stack and load_switch_stack.  This
20501 @@ -376,7 +376,7 @@
20502   *     - b7 holds address to return to
20503   *     - must not touch r8-r11
20504   */
20505 -ENTRY(load_switch_stack)
20506 +GLOBAL_ENTRY(load_switch_stack)
20507         .prologue
20508         .altrp b7
20509  
20510 @@ -511,7 +511,7 @@
20511          * because some system calls (such as ia64_execve) directly
20512          * manipulate ar.pfs.
20513          */
20514 -GLOBAL_ENTRY(ia64_trace_syscall)
20515 +GLOBAL_ENTRY(__ia64_trace_syscall)
20516         PT_REGS_UNWIND_INFO(0)
20517         /*
20518          * We need to preserve the scratch registers f6-f11 in case the system
20519 @@ -583,7 +583,7 @@
20520  (p6)   mov r10=-1
20521  (p6)   mov r8=r9
20522         br.cond.sptk .strace_save_retval
20523 -END(ia64_trace_syscall)
20524 +END(__ia64_trace_syscall)
20525  
20526         /*
20527          * When traced and returning from sigreturn, we invoke syscall_trace but then
20528 @@ -636,8 +636,11 @@
20529         adds r2=PT(R8)+16,sp                    // r2 = &pt_regs.r8
20530         mov r10=r0                              // clear error indication in r10
20531  (p7)   br.cond.spnt handle_syscall_error       // handle potential syscall failure
20532 +       ;;
20533 +       // don't fall through, ia64_leave_syscall may be #define'd
20534 +       br.cond.sptk.few ia64_leave_syscall
20535 +       ;;
20536  END(ia64_ret_from_syscall)
20537 -       // fall through
20538  /*
20539   * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
20540   *     need to switch to bank 0 and doesn't restore the scratch registers.
20541 @@ -682,7 +685,7 @@
20542   *           ar.csd: cleared
20543   *           ar.ssd: cleared
20544   */
20545 -ENTRY(ia64_leave_syscall)
20546 +GLOBAL_ENTRY(__ia64_leave_syscall)
20547         PT_REGS_UNWIND_INFO(0)
20548         /*
20549          * work.need_resched etc. mustn't get changed by this CPU before it returns to
20550 @@ -790,7 +793,7 @@
20551         mov.m ar.ssd=r0                 // M2   clear ar.ssd
20552         mov f11=f0                      // F    clear f11
20553         br.cond.sptk.many rbs_switch    // B
20554 -END(ia64_leave_syscall)
20555 +END(__ia64_leave_syscall)
20556  
20557  #ifdef CONFIG_IA32_SUPPORT
20558  GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
20559 @@ -802,10 +805,13 @@
20560         st8.spill [r2]=r8       // store return value in slot for r8 and set unat bit
20561         .mem.offset 8,0
20562         st8.spill [r3]=r0       // clear error indication in slot for r10 and set unat bit
20563 +       ;;
20564 +       // don't fall through, ia64_leave_kernel may be #define'd
20565 +       br.cond.sptk.few ia64_leave_kernel
20566 +       ;;
20567  END(ia64_ret_from_ia32_execve)
20568 -       // fall through
20569  #endif /* CONFIG_IA32_SUPPORT */
20570 -GLOBAL_ENTRY(ia64_leave_kernel)
20571 +GLOBAL_ENTRY(__ia64_leave_kernel)
20572         PT_REGS_UNWIND_INFO(0)
20573         /*
20574          * work.need_resched etc. mustn't get changed by this CPU before it returns to
20575 @@ -1150,7 +1156,7 @@
20576         ld8 r10=[r3]
20577         br.cond.sptk.many .work_processed_syscall       // re-check
20578  
20579 -END(ia64_leave_kernel)
20580 +END(__ia64_leave_kernel)
20581  
20582  ENTRY(handle_syscall_error)
20583         /*
20584 @@ -1190,7 +1196,7 @@
20585          * be set up by the caller.  We declare 8 input registers so the system call
20586          * args get preserved, in case we need to restart a system call.
20587          */
20588 -ENTRY(notify_resume_user)
20589 +GLOBAL_ENTRY(notify_resume_user)
20590         .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
20591         alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
20592         mov r9=ar.unat
20593 @@ -1278,7 +1284,7 @@
20594         adds sp=16,sp
20595         ;;
20596         ld8 r9=[sp]                             // load new ar.unat
20597 -       mov.sptk b7=r8,ia64_leave_kernel
20598 +       mov.sptk b7=r8,__ia64_leave_kernel
20599         ;;
20600         mov ar.unat=r9
20601         br.many b7
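
The entry.S changes above all serve one pattern: the generic entry points are renamed with a __ia64_ prefix (and load_switch_stack/notify_resume_user are promoted to GLOBAL_ENTRY so the Xen variants added elsewhere in this patch can reach them), while the old fall-throughs become explicit branches because, as the added comments say, the generic names may be #define'd. A hedged sketch of the kind of header that performs that substitution when CONFIG_XEN is set (the real header is not part of this hunk; the xen_* bodies appear in arch/ia64/xen/xenentry.S later in this patch):

    /* illustrative only -- the actual mapping header is outside this hunk */
    #ifdef CONFIG_XEN
    #define ia64_switch_to          xen_switch_to
    #define ia64_trace_syscall      xen_trace_syscall
    #define ia64_leave_syscall      xen_leave_syscall
    #define ia64_leave_kernel       xen_leave_kernel
    #endif

Each xen_* wrapper tests running_on_xen at runtime and tail-branches to the matching __ia64_* body on bare metal, which is why the native versions must keep global, unambiguous names.
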
20602 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/kernel/head.S linux-2.6.16/arch/ia64/kernel/head.S
20603 --- linux-2.6.16.orig/arch/ia64/kernel/head.S   2006-03-20 06:53:29.000000000 +0100
20604 +++ linux-2.6.16/arch/ia64/kernel/head.S        2006-06-26 09:51:32.000000000 +0200
20605 @@ -363,6 +363,12 @@
20606         ;;
20607  (isBP) st8 [r2]=r28            // save the address of the boot param area passed by the bootloader
20608  
20609 +#ifdef CONFIG_XEN
20610 +       //  Note: isBP is used by the subprogram.
20611 +       br.call.sptk.many rp=early_xen_setup
20612 +       ;;
20613 +#endif
20614 +
20615  #ifdef CONFIG_SMP
20616  (isAP) br.call.sptk.many rp=start_secondary
20617  .ret0:
20618 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/kernel/pal.S linux-2.6.16/arch/ia64/kernel/pal.S
20619 --- linux-2.6.16.orig/arch/ia64/kernel/pal.S    2006-03-20 06:53:29.000000000 +0100
20620 +++ linux-2.6.16/arch/ia64/kernel/pal.S 2006-06-26 09:51:32.000000000 +0200
20621 @@ -16,6 +16,7 @@
20622  #include <asm/processor.h>
20623  
20624         .data
20625 +       .globl pal_entry_point
20626  pal_entry_point:
20627         data8 ia64_pal_default_handler
20628         .text
20629 @@ -53,7 +54,7 @@
20630   * in4        1 ==> clear psr.ic,  0 ==> don't clear psr.ic
20631   *
20632   */
20633 -GLOBAL_ENTRY(ia64_pal_call_static)
20634 +GLOBAL_ENTRY(__ia64_pal_call_static)
20635         .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
20636         alloc loc1 = ar.pfs,5,5,0,0
20637         movl loc2 = pal_entry_point
20638 @@ -90,7 +91,7 @@
20639         ;;
20640         srlz.d                          // serialize restoration of psr.l
20641         br.ret.sptk.many b0
20642 -END(ia64_pal_call_static)
20643 +END(__ia64_pal_call_static)
20644  
20645  /*
20646   * Make a PAL call using the stacked registers calling convention.
20647 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/kernel/sal.c linux-2.6.16/arch/ia64/kernel/sal.c
20648 --- linux-2.6.16.orig/arch/ia64/kernel/sal.c    2006-03-20 06:53:29.000000000 +0100
20649 +++ linux-2.6.16/arch/ia64/kernel/sal.c 2006-06-26 09:51:32.000000000 +0200
20650 @@ -336,6 +336,9 @@
20651                 p += SAL_DESC_SIZE(*p);
20652         }
20653  
20654 +#ifdef CONFIG_XEN
20655 +       if (!running_on_xen)
20656 +#endif
20657         check_sal_cache_flush();
20658  }
20659  
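
Note the brace-less guard above: with CONFIG_XEN the if covers exactly the one statement that follows the #endif, and without it the call is unconditional. Spelled out, the two preprocessed results are:

    /* CONFIG_XEN=y: skip the SAL cache-flush probe when under Xen */
    if (!running_on_xen)
            check_sal_cache_flush();

    /* CONFIG_XEN unset: unchanged behaviour */
    check_sal_cache_flush();
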
20660 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/kernel/setup.c linux-2.6.16/arch/ia64/kernel/setup.c
20661 --- linux-2.6.16.orig/arch/ia64/kernel/setup.c  2006-06-26 09:49:46.000000000 +0200
20662 +++ linux-2.6.16/arch/ia64/kernel/setup.c       2006-06-26 09:51:32.000000000 +0200
20663 @@ -62,6 +62,9 @@
20664  #include <asm/system.h>
20665  #include <asm/unistd.h>
20666  #include <asm/system.h>
20667 +#ifdef CONFIG_XEN
20668 +#include <asm/hypervisor.h>
20669 +#endif
20670  
20671  #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
20672  # error "struct cpuinfo_ia64 too big!"
20673 @@ -244,6 +247,14 @@
20674         rsvd_region[n].end   = (unsigned long) ia64_imva(_end);
20675         n++;
20676  
20677 +#ifdef CONFIG_XEN
20678 +       if (running_on_xen) {
20679 +               rsvd_region[n].start = (unsigned long)__va((HYPERVISOR_shared_info->arch.start_info_pfn << PAGE_SHIFT));
20680 +               rsvd_region[n].end   = rsvd_region[n].start + PAGE_SIZE;
20681 +               n++;
20682 +       }
20683 +#endif
20684 +
20685  #ifdef CONFIG_BLK_DEV_INITRD
20686         if (ia64_boot_param->initrd_start) {
20687                 rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start);
20688 @@ -261,6 +272,7 @@
20689         n++;
20690  
20691         num_rsvd_regions = n;
20692 +       BUG_ON(IA64_MAX_RSVD_REGIONS + 1 < n);
20693  
20694         sort_regions(rsvd_region, num_rsvd_regions);
20695  }
20696 @@ -334,6 +346,10 @@
20697  {
20698         int earlycons = 0;
20699  
20700 +#ifdef CONFIG_XEN
20701 +       if (!early_xen_console_setup(cmdline))
20702 +               earlycons++;
20703 +#endif
20704  #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE
20705         {
20706                 extern int sn_serial_console_early_setup(void);
20707 @@ -491,6 +507,22 @@
20708                         conswitchp = &vga_con;
20709  # endif
20710         }
20711 +#ifdef CONFIG_XEN
20712 +       if (running_on_xen) {
20713 +               extern shared_info_t *HYPERVISOR_shared_info;
20714 +
20715 +               /* xen_start_info isn't setup yet, get the flags manually */
20716 +               if (HYPERVISOR_shared_info->arch.flags & SIF_INITDOMAIN) {
20717 +                       if (!(HYPERVISOR_shared_info->arch.flags & SIF_PRIVILEGED))
20718 +                               panic("Xen granted us console access "
20719 +                                     "but not privileged status");
20720 +               } else {
20721 +                       extern int console_use_vt;
20722 +                       conswitchp = NULL;
20723 +                       console_use_vt = 0;
20724 +               }
20725 +       }
20726 +#endif
20727  #endif
20728  
20729         /* enable IA-64 Machine Check Abort Handling unless disabled */
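
The console hunk above reduces to three cases of the start-info flags. Restated as a standalone helper (a sketch: xen_pick_console is an illustrative name, and SIF_*, panic(), conswitchp and console_use_vt are the kernel symbols used in the hunk):

    static void xen_pick_console(unsigned long flags)
    {
            if (flags & SIF_INITDOMAIN) {
                    /* dom0 keeps the VGA console selected earlier; console
                     * access without privilege would be inconsistent */
                    if (!(flags & SIF_PRIVILEGED))
                            panic("Xen granted us console access "
                                  "but not privileged status");
            } else {
                    /* unprivileged guest: no virtual terminal at all */
                    conswitchp = NULL;
                    console_use_vt = 0;
            }
    }
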
20730 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/Makefile linux-2.6.16/arch/ia64/xen/Makefile
20731 --- linux-2.6.16.orig/arch/ia64/xen/Makefile    1970-01-01 01:00:00.000000000 +0100
20732 +++ linux-2.6.16/arch/ia64/xen/Makefile 2006-06-26 09:51:32.000000000 +0200
20733 @@ -0,0 +1,5 @@
20734 +#
20735 +# Makefile for Xen components
20736 +#
20737 +
20738 +obj-y := hypercall.o xenivt.o xenentry.o xensetup.o xenpal.o xenhpski.o xenconsole.o xen_ksyms.o
20739 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/drivers/Makefile linux-2.6.16/arch/ia64/xen/drivers/Makefile
20740 --- linux-2.6.16.orig/arch/ia64/xen/drivers/Makefile    1970-01-01 01:00:00.000000000 +0100
20741 +++ linux-2.6.16/arch/ia64/xen/drivers/Makefile 2006-06-26 09:51:32.000000000 +0200
20742 @@ -0,0 +1,20 @@
20743 +
20744 +obj-y   += util.o
20745 +
20746 +obj-y  += core/
20747 +obj-y  += console/
20748 +obj-y  += evtchn/
20749 +#obj-y += balloon/
20750 +obj-y  += privcmd/
20751 +obj-y  += blkback/
20752 +#obj-y += netback/
20753 +obj-y  += blkfront/
20754 +obj-y  += xenbus/
20755 +#obj-y += netfront/
20756 +#obj-$(CONFIG_XEN_PRIVILEGED_GUEST)    += privcmd/
20757 +#obj-$(CONFIG_XEN_BLKDEV_BACKEND)      += blkback/
20758 +#obj-$(CONFIG_XEN_NETDEV_BACKEND)      += netback/
20759 +#obj-$(CONFIG_XEN_BLKDEV_FRONTEND)     += blkfront/
20760 +#obj-$(CONFIG_XEN_NETDEV_FRONTEND)     += netfront/
20761 +#obj-$(CONFIG_XEN_BLKDEV_TAP)          += blktap/
20762 +
20763 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/drivers/README linux-2.6.16/arch/ia64/xen/drivers/README
20764 --- linux-2.6.16.orig/arch/ia64/xen/drivers/README      1970-01-01 01:00:00.000000000 +0100
20765 +++ linux-2.6.16/arch/ia64/xen/drivers/README   2006-06-26 09:51:32.000000000 +0200
20766 @@ -0,0 +1,2 @@
20767 +This is a temporary location for source/Makefiles that need to be
20768 +patched/reworked in drivers/xen to work with xenlinux/ia64.
20769 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/drivers/coreMakefile linux-2.6.16/arch/ia64/xen/drivers/coreMakefile
20770 --- linux-2.6.16.orig/arch/ia64/xen/drivers/coreMakefile        1970-01-01 01:00:00.000000000 +0100
20771 +++ linux-2.6.16/arch/ia64/xen/drivers/coreMakefile     2006-06-26 09:51:32.000000000 +0200
20772 @@ -0,0 +1,26 @@
20773 +#
20774 +# Makefile for the linux kernel.
20775 +#
20776 +
20777 +XENARCH        := $(subst ",,$(CONFIG_XENARCH))
20778 +
20779 +CPPFLAGS_vmlinux.lds += -U$(XENARCH)
20780 +
20781 +$(obj)/vmlinux.lds.S:
20782 +       @ln -fsn $(srctree)/arch/$(XENARCH)/kernel/vmlinux.lds.S $@
20783 +
20784 +
20785 +obj-y   := gnttab.o features.o
20786 +obj-$(CONFIG_PROC_FS) += xen_proc.o
20787 +
20788 +ifeq ($(ARCH),ia64)
20789 +obj-y   += evtchn_ia64.o
20790 +obj-y   += xenia64_init.o
20791 +else
20792 +extra-y += vmlinux.lds
20793 +obj-y   += reboot.o evtchn.o fixup.o 
20794 +obj-$(CONFIG_SMP)     += smp.o         # setup_profiling_timer def'd in ia64
20795 +obj-$(CONFIG_NET)     += skbuff.o      # until networking is up on ia64
20796 +endif
20797 +obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
20798 +obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
20799 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/drivers/evtchn_ia64.c linux-2.6.16/arch/ia64/xen/drivers/evtchn_ia64.c
20800 --- linux-2.6.16.orig/arch/ia64/xen/drivers/evtchn_ia64.c       1970-01-01 01:00:00.000000000 +0100
20801 +++ linux-2.6.16/arch/ia64/xen/drivers/evtchn_ia64.c    2006-06-26 09:51:32.000000000 +0200
20802 @@ -0,0 +1,273 @@
20803 +/* NOTE: This file split off from evtchn.c because there was
20804 +   some discussion that the mechanism is sufficiently different.
20805 +   It may be possible to merge it back in the future... djm */
20806 +#include <linux/config.h>
20807 +#include <linux/kernel.h>
20808 +#include <asm/hw_irq.h>
20809 +#include <xen/evtchn.h>
20810 +
20811 +#define MAX_EVTCHN 1024
20812 +
20813 +/* Xen will never allocate port zero for any purpose. */
20814 +#define VALID_EVTCHN(_chn) (((_chn) != 0) && ((_chn) < MAX_EVTCHN))
20815 +
20816 +/* Binding types. Note: only IRQT_VIRQ and IRQT_EVTCHN are supported
20817 + * for XEN/IA64 at present. - ktian1
20818 + */
20819 +enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
20820 +
20821 +/* Constructor for packed IRQ information. */
20822 +#define mk_irq_info(type, index, evtchn)                               \
20823 +       (((u32)(type) << 24) | ((u32)(index) << 16) | (u32)(evtchn))
20824 +/* Convenient shorthand for packed representation of an unbound IRQ. */
20825 +#define IRQ_UNBOUND    mk_irq_info(IRQT_UNBOUND, 0, 0)
20826 +/* Accessor macros for packed IRQ information. */
20827 +#define evtchn_from_irq(irq) ((u16)(irq_info[irq]))
20828 +#define index_from_irq(irq)  ((u8)(irq_info[irq] >> 16))
20829 +#define type_from_irq(irq)   ((u8)(irq_info[irq] >> 24))
20830 +
20831 +/* Packed IRQ information: binding type, sub-type index, and event channel. */
20832 +static u32 irq_info[NR_IRQS];
20833 +
20834 +/* Note for XEN/IA64: all event channels are bound to a single physical
20835 + * irq vector, so in this context the event-channel number and the 'irq'
20836 + * vector are interchangeable. - ktian1
20837 + */
20838 +static struct {
20839 +       irqreturn_t (*handler)(int, void *, struct pt_regs *);
20840 +       void *dev_id;
20841 +       char opened;    /* Whether allocated */
20842 +} evtchns[MAX_EVTCHN];
20843 +
20844 +/*
20845 + * This lock protects updates to the following mapping and reference-count
20846 + * arrays. The lock does not need to be acquired to read the mapping tables.
20847 + */
20848 +static spinlock_t irq_mapping_update_lock;
20849 +
20850 +void mask_evtchn(int port)
20851 +{
20852 +       shared_info_t *s = HYPERVISOR_shared_info;
20853 +       synch_set_bit(port, &s->evtchn_mask[0]);
20854 +}
20855 +EXPORT_SYMBOL(mask_evtchn);
20856 +
20857 +void unmask_evtchn(int port)
20858 +{
20859 +       shared_info_t *s = HYPERVISOR_shared_info;
20860 +       unsigned int cpu = smp_processor_id();
20861 +       vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
20862 +
20863 +#if 0  // FIXME: diverged from x86 evtchn.c
20864 +       /* Slow path (hypercall) if this is a non-local port. */
20865 +       if (unlikely(cpu != cpu_from_evtchn(port))) {
20866 +               evtchn_op_t op = { .cmd = EVTCHNOP_unmask,
20867 +                                  .u.unmask.port = port };
20868 +               (void)HYPERVISOR_event_channel_op(&op);
20869 +               return;
20870 +       }
20871 +#endif
20872 +
20873 +       synch_clear_bit(port, &s->evtchn_mask[0]);
20874 +
20875 +       /*
20876 +        * The following is basically the equivalent of 'hw_resend_irq'. Just
20877 +        * like a real IO-APIC we 'lose the interrupt edge' if the channel is
20878 +        * masked.
20879 +        */
20880 +       if (synch_test_bit(port, &s->evtchn_pending[0]) && 
20881 +           !synch_test_and_set_bit(port / BITS_PER_LONG,
20882 +                                   &vcpu_info->evtchn_pending_sel)) {
20883 +               vcpu_info->evtchn_upcall_pending = 1;
20884 +               if (!vcpu_info->evtchn_upcall_mask)
20885 +                       force_evtchn_callback();
20886 +       }
20887 +}
20888 +EXPORT_SYMBOL(unmask_evtchn);
20889 +
20890 +
20891 +#define unbound_irq(e) (VALID_EVTCHN(e) && (!evtchns[(e)].opened))
20892 +int bind_virq_to_irqhandler(
20893 +       unsigned int virq,
20894 +       unsigned int cpu,
20895 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
20896 +       unsigned long irqflags,
20897 +       const char *devname,
20898 +       void *dev_id)
20899 +{
20900 +    evtchn_op_t op;
20901 +    int evtchn;
20902 +
20903 +    spin_lock(&irq_mapping_update_lock);
20904 +
20905 +    op.cmd = EVTCHNOP_bind_virq;
20906 +    op.u.bind_virq.virq = virq;
20907 +    op.u.bind_virq.vcpu = cpu;
20908 +    BUG_ON(HYPERVISOR_event_channel_op(&op) != 0 );
20909 +    evtchn = op.u.bind_virq.port;
20910 +
20911 +    if (!unbound_irq(evtchn)) {
20912 +        evtchn = -EINVAL;
20913 +        goto out;
20914 +    }
20915 +
20916 +    evtchns[evtchn].handler = handler;
20917 +    evtchns[evtchn].dev_id = dev_id;
20918 +    evtchns[evtchn].opened = 1;
20919 +    irq_info[evtchn] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
20920 +
20921 +    unmask_evtchn(evtchn);
20922 +out:
20923 +    spin_unlock(&irq_mapping_update_lock);
20924 +    return evtchn;
20925 +}
20926 +
20927 +int bind_evtchn_to_irqhandler(unsigned int evtchn,
20928 +                   irqreturn_t (*handler)(int, void *, struct pt_regs *),
20929 +                   unsigned long irqflags, const char * devname, void *dev_id)
20930 +{
20931 +    spin_lock(&irq_mapping_update_lock);
20932 +
20933 +    if (!unbound_irq(evtchn)) {
20934 +       evtchn = -EINVAL;
20935 +       goto out;
20936 +    }
20937 +
20938 +    evtchns[evtchn].handler = handler;
20939 +    evtchns[evtchn].dev_id = dev_id;
20940 +    evtchns[evtchn].opened = 1;
20941 +    irq_info[evtchn] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
20942 +
20943 +    unmask_evtchn(evtchn);
20944 +out:
20945 +    spin_unlock(&irq_mapping_update_lock);
20946 +    return evtchn;
20947 +}
20948 +
20949 +int bind_ipi_to_irqhandler(
20950 +       unsigned int ipi,
20951 +       unsigned int cpu,
20952 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
20953 +       unsigned long irqflags,
20954 +       const char *devname,
20955 +       void *dev_id)
20956 +{
20957 +    printk("%s called but not yet supported\n", __FUNCTION__);
20958 +    while(1);
20959 +}
20960 +
20961 +void unbind_from_irqhandler(unsigned int irq, void *dev_id)
20962 +{
20963 +    evtchn_op_t op;
20964 +    int evtchn = evtchn_from_irq(irq);
20965 +
20966 +    spin_lock(&irq_mapping_update_lock);
20967 +
20968 +    if (unbound_irq(irq))
20969 +        goto out;
20970 +
20971 +    op.cmd = EVTCHNOP_close;
20972 +    op.u.close.port = evtchn;
20973 +    BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
20974 +
20975 +    switch (type_from_irq(irq)) {
20976 +       case IRQT_VIRQ:
20977 +           /* Add smp stuff later... */
20978 +           break;
20979 +       case IRQT_IPI:
20980 +           /* Add smp stuff later... */
20981 +           break;
20982 +       default:
20983 +           break;
20984 +    }
20985 +
20986 +    mask_evtchn(evtchn);
20987 +    evtchns[evtchn].handler = NULL;
20988 +    evtchns[evtchn].opened = 0;
20989 +
20990 +out:
20991 +    spin_unlock(&irq_mapping_update_lock);
20992 +}
20993 +
20994 +void notify_remote_via_irq(int irq)
20995 +{
20996 +       int evtchn = evtchn_from_irq(irq);
20997 +
20998 +       if (!unbound_irq(evtchn))
20999 +               notify_remote_via_evtchn(evtchn);
21000 +}
21001 +
21002 +irqreturn_t evtchn_interrupt(int irq, void *dev_id, struct pt_regs *regs)
21003 +{
21004 +    unsigned long  l1, l2;
21005 +    unsigned int   l1i, l2i, port;
21006 +    irqreturn_t (*handler)(int, void *, struct pt_regs *);
21007 +    shared_info_t *s = HYPERVISOR_shared_info;
21008 +    vcpu_info_t   *vcpu_info = &s->vcpu_info[smp_processor_id()];
21009 +
21010 +    vcpu_info->evtchn_upcall_mask = 1;
21011 +    vcpu_info->evtchn_upcall_pending = 0;
21012 +
21013 +    /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
21014 +    l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
21015 +    while ( l1 != 0 )
21016 +    {
21017 +        l1i = __ffs(l1);
21018 +        l1 &= ~(1UL << l1i);
21019 +
21020 +        while ( (l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i]) != 0 )
21021 +        {
21022 +            l2i = __ffs(l2);
21023 +            l2 &= ~(1UL << l2i);
21024 +
21025 +            port = (l1i * BITS_PER_LONG) + l2i;
21026 +            if ( (handler = evtchns[port].handler) != NULL )
21027 +           {
21028 +               clear_evtchn(port);
21029 +                handler(port, evtchns[port].dev_id, regs);
21030 +           }
21031 +            else
21032 +           {
21033 +                evtchn_device_upcall(port);
21034 +           }
21035 +        }
21036 +    }
21037 +    vcpu_info->evtchn_upcall_mask = 0;
21038 +    return IRQ_HANDLED;
21039 +}
21040 +
21041 +void force_evtchn_callback(void)
21042 +{
21043 +       //(void)HYPERVISOR_xen_version(0, NULL);
21044 +}
21045 +
21046 +static struct irqaction evtchn_irqaction = {
21047 +       .handler =      evtchn_interrupt,
21048 +       .flags =        SA_INTERRUPT,
21049 +       .name =         "xen-event-channel"
21050 +};
21051 +
21052 +int evtchn_irq = 0xe9;
21053 +void __init evtchn_init(void)
21054 +{
21055 +    shared_info_t *s = HYPERVISOR_shared_info;
21056 +    vcpu_info_t   *vcpu_info = &s->vcpu_info[smp_processor_id()];
21057 +
21058 +#if 0
21059 +    int ret;
21060 +    irq = assign_irq_vector(AUTO_ASSIGN);
21061 +    ret = request_irq(irq, evtchn_interrupt, 0, "xen-event-channel", NULL);
21062 +    if (ret < 0)
21063 +    {
21064 +       printk("xen-event-channel unable to get irq %d (%d)\n", irq, ret);
21065 +       return;
21066 +    }
21067 +#endif
21068 +    register_percpu_irq(evtchn_irq, &evtchn_irqaction);
21069 +
21070 +    vcpu_info->arch.evtchn_vector = evtchn_irq;
21071 +    printk("xen-event-channel using irq %d\n", evtchn_irq);
21072 +
21073 +    spin_lock_init(&irq_mapping_update_lock);
21074 +    memset(evtchns, 0, sizeof(evtchns));
21075 +}
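
Because XEN/IA64 funnels every event channel through one physical vector, the port number doubles as the 'irq', and the only packing trick in this file is mk_irq_info(). A runnable user-space check of that encoding (plain C, independent of the kernel headers):

    #include <stdio.h>
    #include <stdint.h>

    /* same layout as the kernel macros above: type in bits 31..24,
     * sub-type index in 23..16, event channel in 15..0 */
    #define mk_irq_info(type, index, evtchn) \
            (((uint32_t)(type) << 24) | ((uint32_t)(index) << 16) | (uint32_t)(evtchn))

    int main(void)
    {
            uint32_t info = mk_irq_info(2 /* IRQT_VIRQ */, 1, 0x2a);

            printf("type=%u index=%u evtchn=%u\n",
                   info >> 24, (info >> 16) & 0xff, info & 0xffff);
            return 0;   /* prints: type=2 index=1 evtchn=42 */
    }
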
21076 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/drivers/patches/blkback.c.patch linux-2.6.16/arch/ia64/xen/drivers/patches/blkback.c.patch
21077 --- linux-2.6.16.orig/arch/ia64/xen/drivers/patches/blkback.c.patch     1970-01-01 01:00:00.000000000 +0100
21078 +++ linux-2.6.16/arch/ia64/xen/drivers/patches/blkback.c.patch  2006-06-26 09:51:32.000000000 +0200
21079 @@ -0,0 +1,57 @@
21080 +diff -Naur xen/blkback/blkback.c xen.patched/blkback/blkback.c
21081 +--- xen/blkback/blkback.c      2005-09-23 10:54:50.000000000 -0600
21082 ++++ xen.patched/blkback/blkback.c      2005-09-23 10:57:51.000000000 -0600
21083 +@@ -30,10 +30,16 @@
21084 + static unsigned long mmap_vstart;
21085 + #define MMAP_PAGES                                            \
21086 +       (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
21087 ++#ifdef __ia64__
21088 ++static void *pending_vaddrs[MMAP_PAGES];
21089 ++#define MMAP_VADDR(_idx, _i) \
21090 ++      (unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
21091 ++#else
21092 + #define MMAP_VADDR(_req,_seg)                                         \
21093 +       (mmap_vstart +                                                  \
21094 +        ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
21095 +        ((_seg) * PAGE_SIZE))
21096 ++#endif
21097
21098 + /*
21099 +  * Each outstanding request that we've passed to the lower device layers has a 
21100 +@@ -377,9 +383,13 @@
21101 +                       goto bad_descriptor;
21102 +               }
21103
21104 ++#ifdef __ia64__
21105 ++              MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]);
21106 ++#else
21107 +               phys_to_machine_mapping[__pa(MMAP_VADDR(
21108 +                       pending_idx, i)) >> PAGE_SHIFT] =
21109 +                       FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT);
21110 ++#endif
21111
21112 +               pending_handle(pending_idx, i) = map[i].handle;
21113 +       }
21114 +@@ -500,9 +510,22 @@
21115
21116 +       blkif_interface_init();
21117
21118 ++#ifdef __ia64__
21119 ++    {
21120 ++      extern unsigned long alloc_empty_foreign_map_page_range(unsigned long pages);
21121 ++      int i;
21122 ++
21123 ++      mmap_vstart =  alloc_empty_foreign_map_page_range(MMAP_PAGES);
21124 ++      printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart);
21125 ++      for(i = 0; i < MMAP_PAGES; i++)
21126 ++          pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
21127 ++      BUG_ON(mmap_vstart == NULL);
21128 ++    }
21129 ++#else
21130 +       page = balloon_alloc_empty_page_range(MMAP_PAGES);
21131 +       BUG_ON(page == NULL);
21132 +       mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
21133 ++#endif
21134
21135 +       pending_cons = 0;
21136 +       pending_prod = MAX_PENDING_REQS;
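
The ia64 half of the hunk above replaces x86's contiguous virtual window with a table of per-page addresses, so MMAP_VADDR degenerates to flattening (request, segment) into a single index. A hedged restatement:

    /* restates the ia64 MMAP_VADDR() above; names are illustrative */
    static inline unsigned long mmap_vaddr(void **pending_vaddrs,
                                           unsigned int req, unsigned int seg,
                                           unsigned int segs_per_req)
    {
            /* the page backing segment `seg` of outstanding request `req` */
            return (unsigned long)pending_vaddrs[req * segs_per_req + seg];
    }
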
21137 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/drivers/patches/console.c.patch linux-2.6.16/arch/ia64/xen/drivers/patches/console.c.patch
21138 --- linux-2.6.16.orig/arch/ia64/xen/drivers/patches/console.c.patch     1970-01-01 01:00:00.000000000 +0100
21139 +++ linux-2.6.16/arch/ia64/xen/drivers/patches/console.c.patch  2006-06-26 09:51:32.000000000 +0200
21140 @@ -0,0 +1,18 @@
21141 +--- xen/console/console.c      2005-11-02 14:13:07.000000000 +0100
21142 ++++ xen.patched/console/console.c      2005-11-02 14:21:20.000000000 +0100
21143 +@@ -768,9 +771,15 @@
21144 + #endif
21145
21146 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
21147 ++#ifdef __ia64__
21148 ++              xencons_priv_irq = bind_virq_to_evtchn(VIRQ_CONSOLE);
21149 ++              bind_evtchn_to_irqhandler(xencons_priv_irq,
21150 ++                              xencons_priv_interrupt, 0, "console", NULL);
21151 ++#else
21152 +               xencons_priv_irq = bind_virq_to_irq(VIRQ_CONSOLE, 0);
21153 +               (void)request_irq(xencons_priv_irq,
21154 +                                 xencons_priv_interrupt, 0, "console", NULL);
21155 ++#endif
21156 +       } else {
21157 +               xencons_ring_register_receiver(xencons_rx);
21158 +       }
21159 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/drivers/patches/devmem.c.patch linux-2.6.16/arch/ia64/xen/drivers/patches/devmem.c.patch
21160 --- linux-2.6.16.orig/arch/ia64/xen/drivers/patches/devmem.c.patch      1970-01-01 01:00:00.000000000 +0100
21161 +++ linux-2.6.16/arch/ia64/xen/drivers/patches/devmem.c.patch   2006-06-26 09:51:32.000000000 +0200
21162 @@ -0,0 +1,3 @@
21163 +diff -Naur xen/core/devmem.c xen.patched/core/devmem.c
21164 +--- xen/core/devmem.c  2005-09-23 10:54:50.000000000 -0600
21165 ++++ xen.patched/core/devmem.c  2005-09-23 10:57:51.000000000 -0600
21166 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/drivers/patches/gnttab.c.patch linux-2.6.16/arch/ia64/xen/drivers/patches/gnttab.c.patch
21167 --- linux-2.6.16.orig/arch/ia64/xen/drivers/patches/gnttab.c.patch      1970-01-01 01:00:00.000000000 +0100
21168 +++ linux-2.6.16/arch/ia64/xen/drivers/patches/gnttab.c.patch   2006-06-26 09:51:32.000000000 +0200
21169 @@ -0,0 +1,46 @@
21170 +diff -Naur xen/core/gnttab.c xen.patched/core/gnttab.c
21171 +--- xen/core/gnttab.c  2005-09-23 10:54:50.000000000 -0600
21172 ++++ xen.patched/core/gnttab.c  2005-09-23 10:57:51.000000000 -0600
21173 +@@ -346,6 +350,10 @@
21174 +       if ( hypercall.op != __HYPERVISOR_grant_table_op )
21175 +               return -ENOSYS;
21176
21177 ++
21178 ++#ifdef __ia64__
21179 ++      ret = HYPERVISOR_grant_table_op(hypercall.arg[0], (void *)hypercall.arg[1], hypercall.arg[2]);
21180 ++#else
21181 +       /* hypercall-invoking asm taken from privcmd.c */
21182 +       __asm__ __volatile__ (
21183 +               "pushl %%ebx; pushl %%ecx; pushl %%edx; "
21184 +@@ -359,6 +367,7 @@
21185 +               TRAP_INSTR "; "
21186 +               "popl %%edi; popl %%esi; popl %%edx; popl %%ecx; popl %%ebx"
21187 +               : "=a" (ret) : "0" (&hypercall) : "memory" );
21188 ++#endif
21189
21190 +       return ret;
21191 + }
21192 +@@ -423,8 +432,13 @@
21193 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1));
21194 +       BUG_ON(setup.status != 0);
21195
21196 ++#ifdef __ia64__
21197 ++      shared = __va(frames[0] << PAGE_SHIFT);
21198 ++      printk("grant table at %p\n", shared);
21199 ++#else
21200 +       for (i = 0; i < NR_GRANT_FRAMES; i++)
21201 +               set_fixmap(FIX_GNTTAB_END - i, frames[i] << PAGE_SHIFT);
21202 ++#endif
21203
21204 +       return 0;
21205 + }
21206 +@@ -450,7 +466,9 @@
21207
21208 +       BUG_ON(gnttab_resume());
21209
21210 ++#ifndef __ia64__
21211 +       shared = (grant_entry_t *)fix_to_virt(FIX_GNTTAB_END);
21212 ++#endif
21213
21214 +       for (i = NR_RESERVED_ENTRIES; i < NR_GRANT_ENTRIES; i++)
21215 +               gnttab_list[i] = i + 1;
21216 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/drivers/patches/privcmd.c.patch linux-2.6.16/arch/ia64/xen/drivers/patches/privcmd.c.patch
21217 --- linux-2.6.16.orig/arch/ia64/xen/drivers/patches/privcmd.c.patch     1970-01-01 01:00:00.000000000 +0100
21218 +++ linux-2.6.16/arch/ia64/xen/drivers/patches/privcmd.c.patch  2006-06-26 09:51:32.000000000 +0200
21219 @@ -0,0 +1,43 @@
21220 +diff -Naur xen/privcmd/privcmd.c xen.patched/privcmd/privcmd.c
21221 +--- xen/privcmd/privcmd.c      2005-09-23 10:54:50.000000000 -0600
21222 ++++ xen.patched/privcmd/privcmd.c      2005-09-23 10:57:51.000000000 -0600
21223 +@@ -180,6 +183,15 @@
21224 +               for (i = 0; i < m.num; i++, addr += PAGE_SIZE, p++) {
21225 +                       if (get_user(mfn, p))
21226 +                               return -EFAULT;
21227 ++#ifdef __ia64__
21228 ++                      ret = remap_pfn_range(vma,
21229 ++                                            addr&PAGE_MASK,
21230 ++                                            mfn,
21231 ++                                            1<<PAGE_SHIFT,
21232 ++                                            vma->vm_page_prot);
21233 ++                      if (ret < 0)
21234 ++                          goto batch_err;
21235 ++#else
21236
21237 +                       ret = create_lookup_pte_addr(vma->vm_mm, addr, &ptep);
21238 +                       if (ret)
21239 +@@ -190,6 +202,7 @@
21240
21241 +                       if (HYPERVISOR_mmu_update(&u, 1, NULL, m.dom) < 0)
21242 +                               put_user(0xF0000000 | mfn, p);
21243 ++#endif
21244 +               }
21245
21246 +               ret = 0;
21247 +@@ -205,6 +218,7 @@
21248 +       break;
21249 + #endif
21250
21251 ++#ifndef __ia64__
21252 +       case IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN: {
21253 +               unsigned long m2pv = (unsigned long)machine_to_phys_mapping;
21254 +               pgd_t *pgd = pgd_offset_k(m2pv);
21255 +@@ -216,6 +230,7 @@
21256 +                       -EFAULT: 0;
21257 +       }
21258 +       break;
21259 ++#endif
21260
21261 +       default:
21262 +               ret = -EINVAL;
21263 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/drivers/xenia64_init.c linux-2.6.16/arch/ia64/xen/drivers/xenia64_init.c
21264 --- linux-2.6.16.orig/arch/ia64/xen/drivers/xenia64_init.c      1970-01-01 01:00:00.000000000 +0100
21265 +++ linux-2.6.16/arch/ia64/xen/drivers/xenia64_init.c   2006-06-26 09:51:32.000000000 +0200
21266 @@ -0,0 +1,55 @@
21267 +#ifdef __ia64__
21268 +#include <linux/config.h>
21269 +#include <linux/module.h>
21270 +#include <linux/efi.h>
21271 +#include <asm/sal.h>
21272 +#include <asm/hypervisor.h>
21273 +/* #include <asm-xen/evtchn.h> */
21274 +#include <linux/vmalloc.h>
21275 +
21276 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)0xf100000000000000;
21277 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
21278 +
21279 +static int initialized;
21280 +start_info_t *xen_start_info;
21281 +
21282 +int xen_init(void)
21283 +{
21284 +       shared_info_t *s = HYPERVISOR_shared_info;
21285 +
21286 +       if (initialized)
21287 +               return running_on_xen ? 0 : -1;
21288 +
21289 +       if (!running_on_xen)
21290 +               return -1;
21291 +
21292 +       xen_start_info = __va(s->arch.start_info_pfn << PAGE_SHIFT);
21293 +       xen_start_info->flags = s->arch.flags;
21294 +       printk("Running on Xen! start_info_pfn=0x%lx nr_pages=%ld flags=0x%x\n",
21295 +               s->arch.start_info_pfn, xen_start_info->nr_pages,
21296 +               xen_start_info->flags);
21297 +
21298 +       evtchn_init();
21299 +       initialized = 1;
21300 +       return 0;
21301 +}
21302 +
21303 +/* We just need a range of legal virtual addresses here; ultimately an
21304 + * identity-mapped range is used for the gnttab mapping instead.
21305 + */
21306 +unsigned long alloc_empty_foreign_map_page_range(unsigned long pages)
21307 +{
21308 +       struct vm_struct *vma;
21309 +
21310 +       if ( (vma = get_vm_area(PAGE_SIZE * pages, VM_ALLOC)) == NULL )
21311 +               return 0;
21312 +
21313 +       return (unsigned long)vma->addr;
21314 +}
21315 +
21316 +#if 0
21317 +/* These should be #define'd, but some drivers use them without
21318 + * a convenient arch include. */
21319 +unsigned long mfn_to_pfn(unsigned long mfn) { return mfn; }
21320 +#endif
21321 +#endif
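
alloc_empty_foreign_map_page_range() only reserves virtual address space; callers slice it into page slots themselves, as the blkback patch earlier in this file does. A usage sketch mirroring that hunk, with the failure check written against 0 to match the unsigned long return type:

    /* assumes kernel context: MMAP_PAGES, PAGE_SHIFT, BUG_ON */
    static void *pending_vaddrs[MMAP_PAGES];

    static void reserve_foreign_map_window(void)
    {
            unsigned long mmap_vstart;
            int i;

            mmap_vstart = alloc_empty_foreign_map_page_range(MMAP_PAGES);
            BUG_ON(mmap_vstart == 0);       /* 0, not NULL: it is an address */
            for (i = 0; i < MMAP_PAGES; i++)
                    pending_vaddrs[i] = (void *)(mmap_vstart + (i << PAGE_SHIFT));
    }
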
21322 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/hypercall.S linux-2.6.16/arch/ia64/xen/hypercall.S
21323 --- linux-2.6.16.orig/arch/ia64/xen/hypercall.S 1970-01-01 01:00:00.000000000 +0100
21324 +++ linux-2.6.16/arch/ia64/xen/hypercall.S      2006-06-26 09:51:32.000000000 +0200
21325 @@ -0,0 +1,365 @@
21326 +/*
21327 + * Support routines for Xen hypercalls
21328 + *
21329 + * Copyright (C) 2005 Dan Magenheimer <dan.magenheimer@hp.com>
21330 + */
21331 +
21332 +#include <linux/config.h>
21333 +#include <asm/processor.h>
21334 +#include <asm/asmmacro.h>
21335 +
21336 +GLOBAL_ENTRY(xen_get_ivr)
21337 +       movl r8=running_on_xen;;
21338 +       ld4 r8=[r8];;
21339 +       cmp.eq p7,p0=r8,r0;;
21340 +(p7)   mov r8=cr.ivr;;
21341 +(p7)   br.ret.sptk.many rp
21342 +       ;;
21343 +       movl r9=XSI_PSR_IC
21344 +       ;;
21345 +       ld8 r10=[r9]
21346 +       ;;
21347 +       st8 [r9]=r0
21348 +       ;;
21349 +       XEN_HYPER_GET_IVR
21350 +       ;;
21351 +       st8 [r9]=r10
21352 +       br.ret.sptk.many rp
21353 +       ;;
21354 +END(xen_get_ivr)
21355 +
21356 +GLOBAL_ENTRY(xen_get_tpr)
21357 +       movl r8=running_on_xen;;
21358 +       ld4 r8=[r8];;
21359 +       cmp.eq p7,p0=r8,r0;;
21360 +(p7)   mov r8=cr.tpr;;
21361 +(p7)   br.ret.sptk.many rp
21362 +       ;;
21363 +       movl r9=XSI_PSR_IC
21364 +       ;;
21365 +       ld8 r10=[r9]
21366 +       ;;
21367 +       st8 [r9]=r0
21368 +       ;;
21369 +       XEN_HYPER_GET_TPR
21370 +       ;;
21371 +       st8 [r9]=r10
21372 +       br.ret.sptk.many rp
21373 +       ;;
21374 +END(xen_get_tpr)
21375 +
21376 +GLOBAL_ENTRY(xen_set_tpr)
21377 +       movl r8=running_on_xen;;
21378 +       ld4 r8=[r8];;
21379 +       cmp.eq p7,p0=r8,r0;;
21380 +(p7)   mov cr.tpr=r32;;
21381 +(p7)   br.ret.sptk.many rp
21382 +       ;;
21383 +       movl r9=XSI_PSR_IC
21384 +       mov r8=r32
21385 +       ;;
21386 +       ld8 r10=[r9]
21387 +       ;;
21388 +       st8 [r9]=r0
21389 +       ;;
21390 +       XEN_HYPER_SET_TPR
21391 +       ;;
21392 +       st8 [r9]=r10
21393 +       br.ret.sptk.many rp
21394 +       ;;
21395 +END(xen_set_tpr)
21396 +
21397 +GLOBAL_ENTRY(xen_eoi)
21398 +       movl r8=running_on_xen;;
21399 +       ld4 r8=[r8];;
21400 +       cmp.eq p7,p0=r8,r0;;
21401 +(p7)   mov cr.eoi=r0;;
21402 +(p7)   br.ret.sptk.many rp
21403 +       ;;
21404 +       movl r9=XSI_PSR_IC
21405 +       mov r8=r32
21406 +       ;;
21407 +       ld8 r10=[r9]
21408 +       ;;
21409 +       st8 [r9]=r0
21410 +       ;;
21411 +       XEN_HYPER_EOI
21412 +       ;;
21413 +       st8 [r9]=r10
21414 +       br.ret.sptk.many rp
21415 +       ;;
21416 +END(xen_eoi)
21417 +
21418 +GLOBAL_ENTRY(xen_thash)
21419 +       movl r8=running_on_xen;;
21420 +       ld4 r8=[r8];;
21421 +       cmp.eq p7,p0=r8,r0;;
21422 +(p7)   thash r8=r32;;
21423 +(p7)   br.ret.sptk.many rp
21424 +       ;;
21425 +       movl r9=XSI_PSR_IC
21426 +       mov r8=r32
21427 +       ;;
21428 +       ld8 r10=[r9]
21429 +       ;;
21430 +       st8 [r9]=r0
21431 +       ;;
21432 +       XEN_HYPER_THASH
21433 +       ;;
21434 +       st8 [r9]=r10
21435 +       ;;
21436 +       br.ret.sptk.many rp
21437 +       ;;
21438 +END(xen_thash)
21439 +
21440 +GLOBAL_ENTRY(xen_set_itm)
21441 +       movl r8=running_on_xen;;
21442 +       ld4 r8=[r8];;
21443 +       cmp.eq p7,p0=r8,r0;;
21444 +(p7)   mov cr.itm=r32;;
21445 +(p7)   br.ret.sptk.many rp
21446 +       ;;
21447 +       movl r9=XSI_PSR_IC
21448 +       mov r8=r32
21449 +       ;;
21450 +       ld8 r10=[r9]
21451 +       ;;
21452 +       st8 [r9]=r0
21453 +       ;;
21454 +       XEN_HYPER_SET_ITM
21455 +       ;;
21456 +       st8 [r9]=r10
21457 +       ;;
21458 +       br.ret.sptk.many rp
21459 +       ;;
21460 +END(xen_set_itm)
21461 +
21462 +GLOBAL_ENTRY(xen_ptcga)
21463 +       movl r8=running_on_xen;;
21464 +       ld4 r8=[r8];;
21465 +       cmp.eq p7,p0=r8,r0;;
21466 +(p7)   ptc.ga r32,r33;;
21467 +(p7)   br.ret.sptk.many rp
21468 +       ;;
21469 +       movl r11=XSI_PSR_IC
21470 +       mov r8=r32
21471 +       mov r9=r33
21472 +       ;;
21473 +       ld8 r10=[r11]
21474 +       ;;
21475 +       st8 [r11]=r0
21476 +       ;;
21477 +       XEN_HYPER_PTC_GA
21478 +       ;;
21479 +       st8 [r11]=r10
21480 +       ;;
21481 +       br.ret.sptk.many rp
21482 +       ;;
21483 +END(xen_ptcga)
21484 +
21485 +GLOBAL_ENTRY(xen_get_rr)
21486 +       movl r8=running_on_xen;;
21487 +       ld4 r8=[r8];;
21488 +       cmp.eq p7,p0=r8,r0;;
21489 +(p7)   mov r8=rr[r32];;
21490 +(p7)   br.ret.sptk.many rp
21491 +       ;;
21492 +       movl r9=XSI_PSR_IC
21493 +       mov r8=r32
21494 +       ;;
21495 +       ld8 r10=[r9]
21496 +       ;;
21497 +       st8 [r9]=r0
21498 +       ;;
21499 +       XEN_HYPER_GET_RR
21500 +       ;;
21501 +       st8 [r9]=r10
21502 +       ;;
21503 +       br.ret.sptk.many rp
21504 +       ;;
21505 +END(xen_get_rr)
21506 +
21507 +GLOBAL_ENTRY(xen_set_rr)
21508 +       movl r8=running_on_xen;;
21509 +       ld4 r8=[r8];;
21510 +       cmp.eq p7,p0=r8,r0;;
21511 +(p7)   mov rr[r32]=r33;;
21512 +(p7)   br.ret.sptk.many rp
21513 +       ;;
21514 +       movl r11=XSI_PSR_IC
21515 +       mov r8=r32
21516 +       mov r9=r33
21517 +       ;;
21518 +       ld8 r10=[r11]
21519 +       ;;
21520 +       st8 [r11]=r0
21521 +       ;;
21522 +       XEN_HYPER_SET_RR
21523 +       ;;
21524 +       st8 [r11]=r10
21525 +       ;;
21526 +       br.ret.sptk.many rp
21527 +       ;;
21528 +END(xen_set_rr)
21529 +
21530 +GLOBAL_ENTRY(xen_set_kr)
21531 +       movl r8=running_on_xen;;
21532 +       ld4 r8=[r8];;
21533 +       cmp.ne p7,p0=r8,r0;;
21534 +(p7)   br.cond.spnt.few 1f;
21535 +       ;;
21536 +       cmp.eq p7,p0=r8,r0
21537 +       adds r8=-1,r8;;
21538 +(p7)   mov ar0=r9
21539 +(p7)   br.ret.sptk.many rp;;
21540 +       cmp.eq p7,p0=r8,r0
21541 +       adds r8=-1,r8;;
21542 +(p7)   mov ar1=r9
21543 +(p7)   br.ret.sptk.many rp;;
21544 +       cmp.eq p7,p0=r8,r0
21545 +       adds r8=-1,r8;;
21546 +(p7)   mov ar2=r9
21547 +(p7)   br.ret.sptk.many rp;;
21548 +       cmp.eq p7,p0=r8,r0
21549 +       adds r8=-1,r8;;
21550 +(p7)   mov ar3=r9
21551 +(p7)   br.ret.sptk.many rp;;
21552 +       cmp.eq p7,p0=r8,r0
21553 +       adds r8=-1,r8;;
21554 +(p7)   mov ar4=r9
21555 +(p7)   br.ret.sptk.many rp;;
21556 +       cmp.eq p7,p0=r8,r0
21557 +       adds r8=-1,r8;;
21558 +(p7)   mov ar5=r9
21559 +(p7)   br.ret.sptk.many rp;;
21560 +       cmp.eq p7,p0=r8,r0
21561 +       adds r8=-1,r8;;
21562 +(p7)   mov ar6=r9
21563 +(p7)   br.ret.sptk.many rp;;
21564 +       cmp.eq p7,p0=r8,r0
21565 +       adds r8=-1,r8;;
21566 +(p7)   mov ar7=r9
21567 +(p7)   br.ret.sptk.many rp;;
21568 +
21569 +1:     movl r11=XSI_PSR_IC
21570 +       mov r8=r32
21571 +       mov r9=r33
21572 +       ;;
21573 +       ld8 r10=[r11]
21574 +       ;;
21575 +       st8 [r11]=r0
21576 +       ;;
21577 +       XEN_HYPER_SET_KR
21578 +       ;;
21579 +       st8 [r11]=r10
21580 +       ;;
21581 +       br.ret.sptk.many rp
21582 +END(xen_set_kr)
21583 +
21584 +GLOBAL_ENTRY(xen_fc)
21585 +       movl r8=running_on_xen;;
21586 +       ld4 r8=[r8];;
21587 +       cmp.eq p7,p0=r8,r0;;
21588 +(p7)   fc r32;;
21589 +(p7)   br.ret.sptk.many rp
21590 +       ;;
21591 +       movl r9=XSI_PSR_IC
21592 +       mov r8=r32
21593 +       ;;
21594 +       ld8 r10=[r9]
21595 +       ;;
21596 +       st8 [r9]=r0
21597 +       ;;
21598 +       XEN_HYPER_FC
21599 +       ;;
21600 +       st8 [r9]=r10
21601 +       ;;
21602 +       br.ret.sptk.many rp
21603 +END(xen_fc)
21604 +
21605 +GLOBAL_ENTRY(xen_get_cpuid)
21606 +       movl r8=running_on_xen;;
21607 +       ld4 r8=[r8];;
21608 +       cmp.eq p7,p0=r8,r0;;
21609 +(p7)   mov r8=cpuid[r32];;
21610 +(p7)   br.ret.sptk.many rp
21611 +       ;;
21612 +       movl r9=XSI_PSR_IC
21613 +       mov r8=r32
21614 +       ;;
21615 +       ld8 r10=[r9]
21616 +       ;;
21617 +       st8 [r9]=r0
21618 +       ;;
21619 +       XEN_HYPER_GET_CPUID
21620 +       ;;
21621 +       st8 [r9]=r10
21622 +       ;;
21623 +       br.ret.sptk.many rp
21624 +END(xen_get_cpuid)
21625 +
21626 +GLOBAL_ENTRY(xen_get_pmd)
21627 +       movl r8=running_on_xen;;
21628 +       ld4 r8=[r8];;
21629 +       cmp.eq p7,p0=r8,r0;;
21630 +(p7)   mov r8=pmd[r32];;
21631 +(p7)   br.ret.sptk.many rp
21632 +       ;;
21633 +       movl r9=XSI_PSR_IC
21634 +       mov r8=r32
21635 +       ;;
21636 +       ld8 r10=[r9]
21637 +       ;;
21638 +       st8 [r9]=r0
21639 +       ;;
21640 +       XEN_HYPER_GET_PMD
21641 +       ;;
21642 +       st8 [r9]=r10
21643 +       ;;
21644 +       br.ret.sptk.many rp
21645 +END(xen_get_pmd)
21646 +
21647 +#ifdef CONFIG_IA32_SUPPORT
21648 +GLOBAL_ENTRY(xen_get_eflag)
21649 +       movl r8=running_on_xen;;
21650 +       ld4 r8=[r8];;
21651 +       cmp.eq p7,p0=r8,r0;;
21652 +(p7)   mov r8=ar24;;
21653 +(p7)   br.ret.sptk.many rp
21654 +       ;;
21655 +       movl r9=XSI_PSR_IC
21656 +       mov r8=r32
21657 +       ;;
21658 +       ld8 r10=[r9]
21659 +       ;;
21660 +       st8 [r9]=r0
21661 +       ;;
21662 +       XEN_HYPER_GET_EFLAG
21663 +       ;;
21664 +       st8 [r9]=r10
21665 +       ;;
21666 +       br.ret.sptk.many rp
21667 +END(xen_get_eflag)
21668 +       
21669 +// some bits aren't set if pl!=0, see SDM vol1 3.1.8
21670 +GLOBAL_ENTRY(xen_set_eflag)
21671 +       movl r8=running_on_xen;;
21672 +       ld4 r8=[r8];;
21673 +       cmp.eq p7,p0=r8,r0;;
21674 +(p7)   mov ar24=r32
21675 +(p7)   br.ret.sptk.many rp
21676 +       ;;
21677 +       movl r9=XSI_PSR_IC
21678 +       mov r8=r32
21679 +       ;;
21680 +       ld8 r10=[r9]
21681 +       ;;
21682 +       st8 [r9]=r0
21683 +       ;;
21684 +       XEN_HYPER_SET_EFLAG
21685 +       ;;
21686 +       st8 [r9]=r10
21687 +       ;;
21688 +       br.ret.sptk.many rp
21689 +END(xen_set_eflag)
21690 +#endif
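
Every stub above follows the same template: when running_on_xen is clear, execute the native privileged instruction and return; otherwise save the word at XSI_PSR_IC (the guest's virtual psr.ic), zero it so the hyperprivop is permitted, issue the XEN_HYPER_* transfer, and restore the saved value. In C-flavoured pseudocode (every identifier below except running_on_xen is an illustrative stand-in, not a real API):

    extern int running_on_xen;
    extern unsigned long native_get_ivr(void);      /* the "mov r8=cr.ivr" path */
    extern unsigned long xsi_psr_ic_read(void);     /* ld8 from XSI_PSR_IC */
    extern void xsi_psr_ic_write(unsigned long v);  /* st8 to XSI_PSR_IC */
    extern unsigned long xen_hyper_get_ivr(void);   /* XEN_HYPER_GET_IVR */

    unsigned long xen_get_ivr_sketch(void)
    {
            unsigned long ic, ret;

            if (!running_on_xen)
                    return native_get_ivr();

            ic = xsi_psr_ic_read();     /* save the virtual psr.ic word */
            xsi_psr_ic_write(0);        /* hyperprivops require it off */
            ret = xen_hyper_get_ivr();
            xsi_psr_ic_write(ic);       /* restore it */
            return ret;
    }
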
21691 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/xen_ksyms.c linux-2.6.16/arch/ia64/xen/xen_ksyms.c
21692 --- linux-2.6.16.orig/arch/ia64/xen/xen_ksyms.c 1970-01-01 01:00:00.000000000 +0100
21693 +++ linux-2.6.16/arch/ia64/xen/xen_ksyms.c      2006-06-26 09:51:32.000000000 +0200
21694 @@ -0,0 +1,12 @@
21695 +/*
21696 + * Architecture-specific kernel symbols
21697 + *
21698 + * Don't put any exports here unless it's defined in an assembler file.
21699 + * All other exports should be put directly after the definition.
21700 + */
21701 +
21702 +#include <linux/config.h>
21703 +#include <linux/module.h>
21704 +
21705 +extern int is_running_on_xen(void);
21706 +EXPORT_SYMBOL(is_running_on_xen);
21707 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/xenconsole.c linux-2.6.16/arch/ia64/xen/xenconsole.c
21708 --- linux-2.6.16.orig/arch/ia64/xen/xenconsole.c        1970-01-01 01:00:00.000000000 +0100
21709 +++ linux-2.6.16/arch/ia64/xen/xenconsole.c     2006-06-26 09:51:32.000000000 +0200
21710 @@ -0,0 +1,19 @@
21711 +#include <linux/config.h>
21712 +#include <linux/console.h>
21713 +
21714 +int
21715 +early_xen_console_setup (char *cmdline)
21716 +{
21717 +#ifdef CONFIG_XEN
21718 +#ifndef CONFIG_IA64_HP_SIM
21719 +       extern int running_on_xen;
21720 +       if (running_on_xen) {
21721 +               extern struct console hpsim_cons;
21722 +               hpsim_cons.flags |= CON_BOOT;
21723 +               register_console(&hpsim_cons);
21724 +               return 0;
21725 +       }
21726 +#endif
21727 +#endif
21728 +       return -1;
21729 +}
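
The CON_BOOT flag is what makes the shortcut above safe: a boot console is used for early output and is dropped by the console core once a real console driver registers, so the simulator console never survives into normal operation. The generic shape of the pattern, as a sketch:

    /* hedged sketch of the early-console pattern used above */
    #include <linux/console.h>

    extern struct console hpsim_cons;   /* provided by arch/ia64/hp/sim */

    static int early_sim_console(void)
    {
            hpsim_cons.flags |= CON_BOOT;   /* auto-dropped later */
            register_console(&hpsim_cons);
            return 0;
    }
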
21730 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/xenentry.S linux-2.6.16/arch/ia64/xen/xenentry.S
21731 --- linux-2.6.16.orig/arch/ia64/xen/xenentry.S  1970-01-01 01:00:00.000000000 +0100
21732 +++ linux-2.6.16/arch/ia64/xen/xenentry.S       2006-06-26 09:51:32.000000000 +0200
21733 @@ -0,0 +1,850 @@
21734 +/*
21735 + * ia64/xen/entry.S
21736 + *
21737 + * Alternate kernel routines for Xen.  Heavily leveraged from
21738 + *   ia64/kernel/entry.S
21739 + *
21740 + * Copyright (C) 2005 Hewlett-Packard Co
21741 + *     Dan Magenheimer <dan.magenheimer@hp.com>
21742 + */
21743 +
21744 +#include <linux/config.h>
21745 +
21746 +#include <asm/asmmacro.h>
21747 +#include <asm/cache.h>
21748 +#include <asm/errno.h>
21749 +#include <asm/kregs.h>
21750 +#include <asm/asm-offsets.h>
21751 +#include <asm/pgtable.h>
21752 +#include <asm/percpu.h>
21753 +#include <asm/processor.h>
21754 +#include <asm/thread_info.h>
21755 +#include <asm/unistd.h>
21756 +
21757 +#ifdef CONFIG_XEN
21758 +#include "xenminstate.h"
21759 +#else
21760 +#include "minstate.h"
21761 +#endif
21762 +
21763 +/*
21764 + * prev_task <- ia64_switch_to(struct task_struct *next)
21765 + *     With Ingo's new scheduler, interrupts are disabled when this routine gets
21766 + *     called.  The code starting at .map relies on this.  The rest of the code
21767 + *     doesn't care about the interrupt masking status.
21768 + */
21769 +#ifdef CONFIG_XEN
21770 +GLOBAL_ENTRY(xen_switch_to)
21771 +       .prologue
21772 +       alloc r16=ar.pfs,1,0,0,0
21773 +       movl r22=running_on_xen;;
21774 +       ld4 r22=[r22];;
21775 +       cmp.eq p7,p0=r22,r0
21776 +(p7)   br.cond.sptk.many __ia64_switch_to;;
21777 +#else
21778 +GLOBAL_ENTRY(ia64_switch_to)
21779 +       .prologue
21780 +       alloc r16=ar.pfs,1,0,0,0
21781 +#endif
21782 +       DO_SAVE_SWITCH_STACK
21783 +       .body
21784 +
21785 +       adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13
21786 +       movl r25=init_task
21787 +       mov r27=IA64_KR(CURRENT_STACK)
21788 +       adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0
21789 +       dep r20=0,in0,61,3              // physical address of "next"
21790 +       ;;
21791 +       st8 [r22]=sp                    // save kernel stack pointer of old task
21792 +       shr.u r26=r20,IA64_GRANULE_SHIFT
21793 +       cmp.eq p7,p6=r25,in0
21794 +       ;;
21795 +#ifdef CONFIG_XEN
21796 +       movl r8=XSI_PSR_IC
21797 +       ;;
21798 +       st4 [r8]=r0     // force psr.ic off for hyperprivop(s)
21799 +       ;;
21800 +#endif
21801 +       /*
21802 +        * If we've already mapped this task's page, we can skip doing it again.
21803 +        */
21804 +(p6)   cmp.eq p7,p6=r26,r27
21805 +(p6)   br.cond.dpnt .map
21806 +       ;;
21807 +.done:
21808 +#ifdef CONFIG_XEN
21809 +       // psr.ic already off
21810 +       // update "current" application register
21811 +       mov r8=IA64_KR_CURRENT
21812 +       mov r9=in0;;
21813 +       XEN_HYPER_SET_KR
21814 +       ld8 sp=[r21]                    // load kernel stack pointer of new task
21815 +       movl r27=XSI_PSR_IC
21816 +       mov r8=1
21817 +       ;;
21818 +       st4 [r27]=r8                    // psr.ic back on
21819 +       ;;
21820 +#else
21821 +(p6)   ssm psr.ic                      // if we had to map, reenable the psr.ic bit FIRST!!!
21822 +       ;;
21823 +(p6)   srlz.d
21824 +       ld8 sp=[r21]                    // load kernel stack pointer of new task
21825 +       mov IA64_KR(CURRENT)=in0        // update "current" application register
21826 +#endif
21827 +       mov r8=r13                      // return pointer to previously running task
21828 +       mov r13=in0                     // set "current" pointer
21829 +       ;;
21830 +       DO_LOAD_SWITCH_STACK
21831 +
21832 +#ifdef CONFIG_SMP
21833 +       sync.i                          // ensure "fc"s done by this CPU are visible on other CPUs
21834 +#endif
21835 +       br.ret.sptk.many rp             // boogie on out in new context
21836 +
21837 +.map:
21838 +#ifdef CONFIG_XEN
21839 +       // psr.ic already off
21840 +#else
21841 +       rsm psr.ic                      // interrupts (psr.i) are already disabled here
21842 +#endif
21843 +       movl r25=PAGE_KERNEL
21844 +       ;;
21845 +       srlz.d
21846 +       or r23=r25,r20                  // construct PA | page properties
21847 +       mov r25=IA64_GRANULE_SHIFT<<2
21848 +       ;;
21849 +#ifdef CONFIG_XEN
21850 +       movl r8=XSI_ITIR
21851 +       ;;
21852 +       st8 [r8]=r25
21853 +       ;;
21854 +       movl r8=XSI_IFA
21855 +       ;;
21856 +       st8 [r8]=in0                     // VA of next task...
21857 +       ;;
21858 +       mov r25=IA64_TR_CURRENT_STACK
21859 +       // remember last page we mapped...
21860 +       mov r8=IA64_KR_CURRENT_STACK
21861 +       mov r9=r26;;
21862 +       XEN_HYPER_SET_KR;;
21863 +#else
21864 +       mov cr.itir=r25
21865 +       mov cr.ifa=in0                  // VA of next task...
21866 +       ;;
21867 +       mov r25=IA64_TR_CURRENT_STACK
21868 +       mov IA64_KR(CURRENT_STACK)=r26  // remember last page we mapped...
21869 +#endif
21870 +       ;;
21871 +       itr.d dtr[r25]=r23              // wire in new mapping...
21872 +       br.cond.sptk .done
21873 +#ifdef CONFIG_XEN
21874 +END(xen_switch_to)
21875 +#else
21876 +END(ia64_switch_to)
21877 +#endif
21878 +
21879 +       /*
21880 +        * Invoke a system call, but do some tracing before and after the call.
21881 +        * We MUST preserve the current register frame throughout this routine
21882 +        * because some system calls (such as ia64_execve) directly
21883 +        * manipulate ar.pfs.
21884 +        */
21885 +#ifdef CONFIG_XEN
21886 +GLOBAL_ENTRY(xen_trace_syscall)
21887 +       PT_REGS_UNWIND_INFO(0)
21888 +       movl r16=running_on_xen;;
21889 +       ld4 r16=[r16];;
21890 +       cmp.eq p7,p0=r16,r0
21891 +(p7)   br.cond.sptk.many __ia64_trace_syscall;;
21892 +#else
21893 +GLOBAL_ENTRY(ia64_trace_syscall)
21894 +       PT_REGS_UNWIND_INFO(0)
21895 +#endif
21896 +       /*
21897 +        * We need to preserve the scratch registers f6-f11 in case the system
21898 +        * call is sigreturn.
21899 +        */
21900 +       adds r16=PT(F6)+16,sp
21901 +       adds r17=PT(F7)+16,sp
21902 +       ;;
21903 +       stf.spill [r16]=f6,32
21904 +       stf.spill [r17]=f7,32
21905 +       ;;
21906 +       stf.spill [r16]=f8,32
21907 +       stf.spill [r17]=f9,32
21908 +       ;;
21909 +       stf.spill [r16]=f10
21910 +       stf.spill [r17]=f11
21911 +       br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args
21912 +       adds r16=PT(F6)+16,sp
21913 +       adds r17=PT(F7)+16,sp
21914 +       ;;
21915 +       ldf.fill f6=[r16],32
21916 +       ldf.fill f7=[r17],32
21917 +       ;;
21918 +       ldf.fill f8=[r16],32
21919 +       ldf.fill f9=[r17],32
21920 +       ;;
21921 +       ldf.fill f10=[r16]
21922 +       ldf.fill f11=[r17]
21923 +       // the syscall number may have changed, so re-load it and re-calculate the
21924 +       // syscall entry-point:
21925 +       adds r15=PT(R15)+16,sp                  // r15 = &pt_regs.r15 (syscall #)
21926 +       ;;
21927 +       ld8 r15=[r15]
21928 +       mov r3=NR_syscalls - 1
21929 +       ;;
21930 +       adds r15=-1024,r15
21931 +       movl r16=sys_call_table
21932 +       ;;
21933 +       shladd r20=r15,3,r16                    // r20 = sys_call_table + 8*(syscall-1024)
21934 +       cmp.leu p6,p7=r15,r3
21935 +       ;;
21936 +(p6)   ld8 r20=[r20]                           // load address of syscall entry point
21937 +(p7)   movl r20=sys_ni_syscall
21938 +       ;;
21939 +       mov b6=r20
21940 +       br.call.sptk.many rp=b6                 // do the syscall
21941 +.strace_check_retval:
21942 +       cmp.lt p6,p0=r8,r0                      // syscall failed?
21943 +       adds r2=PT(R8)+16,sp                    // r2 = &pt_regs.r8
21944 +       adds r3=PT(R10)+16,sp                   // r3 = &pt_regs.r10
21945 +       mov r10=0
21946 +(p6)   br.cond.sptk strace_error               // syscall failed ->
21947 +       ;;                                      // avoid RAW on r10
21948 +.strace_save_retval:
21949 +.mem.offset 0,0; st8.spill [r2]=r8             // store return value in slot for r8
21950 +.mem.offset 8,0; st8.spill [r3]=r10            // clear error indication in slot for r10
21951 +       br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
21952 +.ret3: br.cond.sptk .work_pending_syscall_end
21953 +
21954 +strace_error:
21955 +       ld8 r3=[r2]                             // load pt_regs.r8
21956 +       sub r9=0,r8                             // negate return value to get errno value
21957 +       ;;
21958 +       cmp.ne p6,p0=r3,r0                      // is pt_regs.r8!=0?
21959 +       adds r3=16,r2                           // r3=&pt_regs.r10
21960 +       ;;
21961 +(p6)   mov r10=-1
21962 +(p6)   mov r8=r9
21963 +       br.cond.sptk .strace_save_retval
21964 +#ifdef CONFIG_XEN
21965 +END(xen_trace_syscall)
21966 +#else
21967 +END(ia64_trace_syscall)
21968 +#endif
21969 +
21970 +/*
21971 + * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
21972 + *     need to switch to bank 0 and doesn't restore the scratch registers.
21973 + *     To avoid leaking kernel bits, the scratch registers are set to
21974 + *     the following known-to-be-safe values:
21975 + *
21976 + *               r1: restored (global pointer)
21977 + *               r2: cleared
21978 + *               r3: 1 (when returning to user-level)
21979 + *           r8-r11: restored (syscall return value(s))
21980 + *              r12: restored (user-level stack pointer)
21981 + *              r13: restored (user-level thread pointer)
21982 + *              r14: cleared
21983 + *              r15: restored (syscall #)
21984 + *          r16-r17: cleared
21985 + *              r18: user-level b6
21986 + *              r19: cleared
21987 + *              r20: user-level ar.fpsr
21988 + *              r21: user-level b0
21989 + *              r22: cleared
21990 + *              r23: user-level ar.bspstore
21991 + *              r24: user-level ar.rnat
21992 + *              r25: user-level ar.unat
21993 + *              r26: user-level ar.pfs
21994 + *              r27: user-level ar.rsc
21995 + *              r28: user-level ip
21996 + *              r29: user-level psr
21997 + *              r30: user-level cfm
21998 + *              r31: user-level pr
21999 + *           f6-f11: cleared
22000 + *               pr: restored (user-level pr)
22001 + *               b0: restored (user-level rp)
22002 + *               b6: restored
22003 + *               b7: cleared
22004 + *          ar.unat: restored (user-level ar.unat)
22005 + *           ar.pfs: restored (user-level ar.pfs)
22006 + *           ar.rsc: restored (user-level ar.rsc)
22007 + *          ar.rnat: restored (user-level ar.rnat)
22008 + *      ar.bspstore: restored (user-level ar.bspstore)
22009 + *          ar.fpsr: restored (user-level ar.fpsr)
22010 + *           ar.ccv: cleared
22011 + *           ar.csd: cleared
22012 + *           ar.ssd: cleared
22013 + */
22014 +#ifdef CONFIG_XEN
22015 +GLOBAL_ENTRY(xen_leave_syscall)
22016 +       PT_REGS_UNWIND_INFO(0)
22017 +       movl r22=running_on_xen;;
22018 +       ld4 r22=[r22];;
22019 +       cmp.eq p7,p0=r22,r0
22020 +(p7)   br.cond.sptk.many __ia64_leave_syscall;;
22021 +#else
22022 +ENTRY(ia64_leave_syscall)
22023 +       PT_REGS_UNWIND_INFO(0)
22024 +#endif
22025 +       /*
22026 +        * work.need_resched etc. mustn't get changed by this CPU before it returns to
22027 +        * user- or fsys-mode, hence we disable interrupts early on.
22028 +        *
22029 +        * p6 controls whether current_thread_info()->flags needs to be checked for
22030 +        * extra work.  We always check for extra work when returning to user-level.
22031 +        * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
22032 +        * is 0.  After extra work processing has been completed, execution
22033 +        * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
22034 +        * needs to be redone.
22035 +        */
22036 +#ifdef CONFIG_PREEMPT
22037 +       rsm psr.i                               // disable interrupts
22038 +       cmp.eq pLvSys,p0=r0,r0                  // pLvSys=1: leave from syscall
22039 +(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
22040 +       ;;
22041 +       .pred.rel.mutex pUStk,pKStk
22042 +(pKStk) ld4 r21=[r20]                  // r21 <- preempt_count
22043 +(pUStk)        mov r21=0                       // r21 <- 0
22044 +       ;;
22045 +       cmp.eq p6,p0=r21,r0             // p6 <- pUStk || (preempt_count == 0)
22046 +#else /* !CONFIG_PREEMPT */
22047 +#ifdef CONFIG_XEN
22048 +       movl r2=XSI_PSR_I
22049 +       ;;
22050 +(pUStk)        st4 [r2]=r0
22051 +#else
22052 +(pUStk)        rsm psr.i
22053 +#endif
22054 +       cmp.eq pLvSys,p0=r0,r0          // pLvSys=1: leave from syscall
22055 +(pUStk)        cmp.eq.unc p6,p0=r0,r0          // p6 <- pUStk
22056 +#endif
22057 +.work_processed_syscall:
22058 +       adds r2=PT(LOADRS)+16,r12
22059 +       adds r3=PT(AR_BSPSTORE)+16,r12
22060 +       adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
22061 +       ;;
22062 +(p6)   ld4 r31=[r18]                           // load current_thread_info()->flags
22063 +       ld8 r19=[r2],PT(B6)-PT(LOADRS)          // load ar.rsc value for "loadrs"
22064 +       mov b7=r0               // clear b7
22065 +       ;;
22066 +       ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE)    // load ar.bspstore (may be garbage)
22067 +       ld8 r18=[r2],PT(R9)-PT(B6)              // load b6
22068 +(p6)   and r15=TIF_WORK_MASK,r31               // any work other than TIF_SYSCALL_TRACE?
22069 +       ;;
22070 +       mov r16=ar.bsp                          // M2  get existing backing store pointer
22071 +(p6)   cmp4.ne.unc p6,p0=r15, r0               // any special work pending?
22072 +(p6)   br.cond.spnt .work_pending_syscall
22073 +       ;;
22074 +       // start restoring the state saved on the kernel stack (struct pt_regs):
22075 +       ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
22076 +       ld8 r11=[r3],PT(CR_IIP)-PT(R11)
22077 +       mov f6=f0               // clear f6
22078 +       ;;
22079 +       invala                  // M0|1 invalidate ALAT
22080 +#ifdef CONFIG_XEN
22081 +       movl r29=XSI_PSR_IC
22082 +       ;;
22083 +       st8     [r29]=r0        // note: clears both vpsr.i and vpsr.ic!
22084 +       ;;
22085 +#else
22086 +       rsm psr.i | psr.ic      // M2 initiate turning off of interrupt and interruption collection
22087 +#endif
22088 +       mov f9=f0               // clear f9
22089 +
22090 +       ld8 r29=[r2],16         // load cr.ipsr
22091 +       ld8 r28=[r3],16                 // load cr.iip
22092 +       mov f8=f0               // clear f8
22093 +       ;;
22094 +       ld8 r30=[r2],16         // M0|1 load cr.ifs
22095 +       mov.m ar.ssd=r0         // M2 clear ar.ssd
22096 +       cmp.eq p9,p0=r0,r0      // set p9 to indicate that we should restore cr.ifs
22097 +       ;;
22098 +       ld8 r25=[r3],16         // M0|1 load ar.unat
22099 +       mov.m ar.csd=r0         // M2 clear ar.csd
22100 +       mov r22=r0              // clear r22
22101 +       ;;
22102 +       ld8 r26=[r2],PT(B0)-PT(AR_PFS)  // M0|1 load ar.pfs
22103 +(pKStk)        mov r22=psr             // M2 read PSR now that interrupts are disabled
22104 +       mov f10=f0              // clear f10
22105 +       ;;
22106 +       ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // load b0
22107 +       ld8 r27=[r3],PT(PR)-PT(AR_RSC)  // load ar.rsc
22108 +       mov f11=f0              // clear f11
22109 +       ;;
22110 +       ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT)    // load ar.rnat (may be garbage)
22111 +       ld8 r31=[r3],PT(R1)-PT(PR)              // load predicates
22112 +(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
22113 +       ;;
22114 +       ld8 r20=[r2],PT(R12)-PT(AR_FPSR)        // load ar.fpsr
22115 +       ld8.fill r1=[r3],16     // load r1
22116 +(pUStk) mov r17=1
22117 +       ;;
22118 +       srlz.d                  // M0  ensure interruption collection is off
22119 +       ld8.fill r13=[r3],16
22120 +       mov f7=f0               // clear f7
22121 +       ;;
22122 +       ld8.fill r12=[r2]       // restore r12 (sp)
22123 +       ld8.fill r15=[r3]       // restore r15
22124 +       addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0
22125 +       ;;
22126 +(pUStk)        ld4 r3=[r3]             // r3 = cpu_data->phys_stacked_size_p8
22127 +(pUStk) st1 [r14]=r17
22128 +       mov b6=r18              // I0  restore b6
22129 +       ;;
22130 +       mov r14=r0              // clear r14
22131 +       shr.u r18=r19,16        // I0|1 get byte size of existing "dirty" partition
22132 +(pKStk) br.cond.dpnt.many skip_rbs_switch
22133 +
22134 +       mov.m ar.ccv=r0         // clear ar.ccv
22135 +(pNonSys) br.cond.dpnt.many dont_preserve_current_frame
22136 +       br.cond.sptk.many rbs_switch
22137 +#ifdef CONFIG_XEN
22138 +END(xen_leave_syscall)
22139 +#else
22140 +END(ia64_leave_syscall)
22141 +#endif
22142 +
22143 +#ifdef CONFIG_XEN
22144 +GLOBAL_ENTRY(xen_leave_kernel)
22145 +       PT_REGS_UNWIND_INFO(0)
22146 +       movl r22=running_on_xen;;
22147 +       ld4 r22=[r22];;
22148 +       cmp.eq p7,p0=r22,r0
22149 +(p7)   br.cond.sptk.many __ia64_leave_kernel;;
22150 +#else
22151 +GLOBAL_ENTRY(ia64_leave_kernel)
22152 +       PT_REGS_UNWIND_INFO(0)
22153 +#endif
22154 +       /*
22155 +        * work.need_resched etc. mustn't get changed by this CPU before it returns to
22156 +        * user- or fsys-mode, hence we disable interrupts early on.
22157 +        *
22158 +        * p6 controls whether current_thread_info()->flags needs to be checked for
22159 +        * extra work.  We always check for extra work when returning to user-level.
22160 +        * With CONFIG_PREEMPT, we also check for extra work when the preempt_count
22161 +        * is 0.  After extra work processing has been completed, execution
22162 +        * resumes at .work_processed_kernel with p6 set to 1 if the extra-work-check
22163 +        * needs to be redone.
22164 +        */
22165 +#ifdef CONFIG_PREEMPT
22166 +       rsm psr.i                               // disable interrupts
22167 +       cmp.eq p0,pLvSys=r0,r0                  // pLvSys=0: leave from kernel
22168 +(pKStk)        adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
22169 +       ;;
22170 +       .pred.rel.mutex pUStk,pKStk
22171 +(pKStk)        ld4 r21=[r20]                   // r21 <- preempt_count
22172 +(pUStk)        mov r21=0                       // r21 <- 0
22173 +       ;;
22174 +       cmp.eq p6,p0=r21,r0             // p6 <- pUStk || (preempt_count == 0)
22175 +#else
22176 +#ifdef CONFIG_XEN
22177 +(pUStk)        movl r17=XSI_PSR_I
22178 +       ;;
22179 +(pUStk)        st4 [r17]=r0
22180 +       ;;
22181 +#else
22182 +(pUStk)        rsm psr.i
22183 +#endif
22184 +       cmp.eq p0,pLvSys=r0,r0          // pLvSys=0: leave from kernel
22185 +(pUStk)        cmp.eq.unc p6,p0=r0,r0          // p6 <- pUStk
22186 +#endif
22187 +.work_processed_kernel:
22188 +       adds r17=TI_FLAGS+IA64_TASK_SIZE,r13
22189 +       ;;
22190 +(p6)   ld4 r31=[r17]                           // load current_thread_info()->flags
22191 +       adds r21=PT(PR)+16,r12
22192 +       ;;
22193 +
22194 +       lfetch [r21],PT(CR_IPSR)-PT(PR)
22195 +       adds r2=PT(B6)+16,r12
22196 +       adds r3=PT(R16)+16,r12
22197 +       ;;
22198 +       lfetch [r21]
22199 +       ld8 r28=[r2],8          // load b6
22200 +       adds r29=PT(R24)+16,r12
22201 +
22202 +       ld8.fill r16=[r3],PT(AR_CSD)-PT(R16)
22203 +       adds r30=PT(AR_CCV)+16,r12
22204 +(p6)   and r19=TIF_WORK_MASK,r31               // any work other than TIF_SYSCALL_TRACE?
22205 +       ;;
22206 +       ld8.fill r24=[r29]
22207 +       ld8 r15=[r30]           // load ar.ccv
22208 +(p6)   cmp4.ne.unc p6,p0=r19, r0               // any special work pending?
22209 +       ;;
22210 +       ld8 r29=[r2],16         // load b7
22211 +       ld8 r30=[r3],16         // load ar.csd
22212 +(p6)   br.cond.spnt .work_pending
22213 +       ;;
22214 +       ld8 r31=[r2],16         // load ar.ssd
22215 +       ld8.fill r8=[r3],16
22216 +       ;;
22217 +       ld8.fill r9=[r2],16
22218 +       ld8.fill r10=[r3],PT(R17)-PT(R10)
22219 +       ;;
22220 +       ld8.fill r11=[r2],PT(R18)-PT(R11)
22221 +       ld8.fill r17=[r3],16
22222 +       ;;
22223 +       ld8.fill r18=[r2],16
22224 +       ld8.fill r19=[r3],16
22225 +       ;;
22226 +       ld8.fill r20=[r2],16
22227 +       ld8.fill r21=[r3],16
22228 +       mov ar.csd=r30
22229 +       mov ar.ssd=r31
22230 +       ;;
22231 +#ifdef CONFIG_XEN
22232 +       movl r22=XSI_PSR_IC
22233 +       ;;
22234 +       st8 [r22]=r0            // note: clears both vpsr.i and vpsr.ic!
22235 +       ;;
22236 +#else
22237 +       rsm psr.i | psr.ic      // initiate turning off of interrupt and interruption collection
22238 +#endif
22239 +       invala                  // invalidate ALAT
22240 +       ;;
22241 +       ld8.fill r22=[r2],24
22242 +       ld8.fill r23=[r3],24
22243 +       mov b6=r28
22244 +       ;;
22245 +       ld8.fill r25=[r2],16
22246 +       ld8.fill r26=[r3],16
22247 +       mov b7=r29
22248 +       ;;
22249 +       ld8.fill r27=[r2],16
22250 +       ld8.fill r28=[r3],16
22251 +       ;;
22252 +       ld8.fill r29=[r2],16
22253 +       ld8.fill r30=[r3],24
22254 +       ;;
22255 +       ld8.fill r31=[r2],PT(F9)-PT(R31)
22256 +       adds r3=PT(F10)-PT(F6),r3
22257 +       ;;
22258 +       ldf.fill f9=[r2],PT(F6)-PT(F9)
22259 +       ldf.fill f10=[r3],PT(F8)-PT(F10)
22260 +       ;;
22261 +       ldf.fill f6=[r2],PT(F7)-PT(F6)
22262 +       ;;
22263 +       ldf.fill f7=[r2],PT(F11)-PT(F7)
22264 +       ldf.fill f8=[r3],32
22265 +       ;;
22266 +       srlz.i                  // ensure interruption collection is off
22267 +       mov ar.ccv=r15
22268 +       ;;
22269 +       ldf.fill f11=[r2]
22270 +#ifdef CONFIG_XEN
22271 +       ;;
22272 +       // r16-r31 all now hold bank1 values
22273 +       movl r2=XSI_BANK1_R16
22274 +       movl r3=XSI_BANK1_R16+8
22275 +       ;;
22276 +       st8.spill [r2]=r16,16
22277 +       st8.spill [r3]=r17,16
22278 +       ;;
22279 +       st8.spill [r2]=r18,16
22280 +       st8.spill [r3]=r19,16
22281 +       ;;
22282 +       st8.spill [r2]=r20,16
22283 +       st8.spill [r3]=r21,16
22284 +       ;;
22285 +       st8.spill [r2]=r22,16
22286 +       st8.spill [r3]=r23,16
22287 +       ;;
22288 +       st8.spill [r2]=r24,16
22289 +       st8.spill [r3]=r25,16
22290 +       ;;
22291 +       st8.spill [r2]=r26,16
22292 +       st8.spill [r3]=r27,16
22293 +       ;;
22294 +       st8.spill [r2]=r28,16
22295 +       st8.spill [r3]=r29,16
22296 +       ;;
22297 +       st8.spill [r2]=r30,16
22298 +       st8.spill [r3]=r31,16
22299 +       ;;
22300 +       movl r2=XSI_BANKNUM;;
22301 +       st4 [r2]=r0;
22302 +#else
22303 +       bsw.0                   // switch back to bank 0 (no stop bit required beforehand...)
22304 +#endif
22305 +       ;;
22306 +(pUStk)        mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency)
22307 +       adds r16=PT(CR_IPSR)+16,r12
22308 +       adds r17=PT(CR_IIP)+16,r12
22309 +
22310 +(pKStk)        mov r22=psr             // M2 read PSR now that interrupts are disabled
22311 +       nop.i 0
22312 +       nop.i 0
22313 +       ;;
22314 +       ld8 r29=[r16],16        // load cr.ipsr
22315 +       ld8 r28=[r17],16        // load cr.iip
22316 +       ;;
22317 +       ld8 r30=[r16],16        // load cr.ifs
22318 +       ld8 r25=[r17],16        // load ar.unat
22319 +       ;;
22320 +       ld8 r26=[r16],16        // load ar.pfs
22321 +       ld8 r27=[r17],16        // load ar.rsc
22322 +       cmp.eq p9,p0=r0,r0      // set p9 to indicate that we should restore cr.ifs
22323 +       ;;
22324 +       ld8 r24=[r16],16        // load ar.rnat (may be garbage)
22325 +       ld8 r23=[r17],16        // load ar.bspstore (may be garbage)
22326 +       ;;
22327 +       ld8 r31=[r16],16        // load predicates
22328 +       ld8 r21=[r17],16        // load b0
22329 +       ;;
22330 +       ld8 r19=[r16],16        // load ar.rsc value for "loadrs"
22331 +       ld8.fill r1=[r17],16    // load r1
22332 +       ;;
22333 +       ld8.fill r12=[r16],16
22334 +       ld8.fill r13=[r17],16
22335 +(pUStk)        adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
22336 +       ;;
22337 +       ld8 r20=[r16],16        // ar.fpsr
22338 +       ld8.fill r15=[r17],16
22339 +       ;;
22340 +       ld8.fill r14=[r16],16
22341 +       ld8.fill r2=[r17]
22342 +(pUStk)        mov r17=1
22343 +       ;;
22344 +       ld8.fill r3=[r16]
22345 +(pUStk)        st1 [r18]=r17           // restore current->thread.on_ustack
22346 +       shr.u r18=r19,16        // get byte size of existing "dirty" partition
22347 +       ;;
22348 +       mov r16=ar.bsp          // get existing backing store pointer
22349 +       addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
22350 +       ;;
22351 +       ld4 r17=[r17]           // r17 = cpu_data->phys_stacked_size_p8
22352 +(pKStk)        br.cond.dpnt skip_rbs_switch
22353 +
22354 +       /*
22355 +        * Restore user backing store.
22356 +        *
22357 +        * NOTE: alloc, loadrs, and cover can't be predicated.
22358 +        */
22359 +(pNonSys) br.cond.dpnt dont_preserve_current_frame
22360 +
22361 +rbs_switch:
22362 +#ifdef CONFIG_XEN
22363 +       XEN_HYPER_COVER;
22364 +#else
22365 +       cover                           // add current frame into dirty partition and set cr.ifs
22366 +#endif
22367 +       ;;
22368 +       mov r19=ar.bsp                  // get new backing store pointer
22369 +       sub r16=r16,r18                 // krbs = old bsp - size of dirty partition
22370 +       cmp.ne p9,p0=r0,r0              // clear p9 to skip restore of cr.ifs
22371 +       ;;
22372 +       sub r19=r19,r16                 // calculate total byte size of dirty partition
22373 +       add r18=64,r18                  // don't force in0-in7 into memory...
22374 +       ;;
22375 +       shl r19=r19,16                  // shift size of dirty partition into loadrs position
22376 +       ;;
22377 +dont_preserve_current_frame:
22378 +       /*
22379 +        * To prevent leaking bits between the kernel and user-space,
22380 +        * we must clear the stacked registers in the "invalid" partition here.
22381 +        * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
22382 +        * 5 registers/cycle on McKinley).
22383 +        */
22384 +#      define pRecurse p6
22385 +#      define pReturn  p7
22386 +#ifdef CONFIG_ITANIUM
22387 +#      define Nregs    10
22388 +#else
22389 +#      define Nregs    14
22390 +#endif
22391 +       alloc loc0=ar.pfs,2,Nregs-2,2,0
22392 +       shr.u loc1=r18,9                // RNaTslots <= floor(dirtySize / (64*8))
22393 +       sub r17=r17,r18                 // r17 = (physStackedSize + 8) - dirtySize
22394 +       ;;
22395 +       mov ar.rsc=r19                  // load ar.rsc to be used for "loadrs"
22396 +       shladd in0=loc1,3,r17
22397 +       mov in1=0
22398 +       ;;
22399 +       TEXT_ALIGN(32)
22400 +rse_clear_invalid:
22401 +#ifdef CONFIG_ITANIUM
22402 +       // cycle 0
22403 + { .mii
22404 +       alloc loc0=ar.pfs,2,Nregs-2,2,0
22405 +       cmp.lt pRecurse,p0=Nregs*8,in0  // if more than Nregs regs left to clear, (re)curse
22406 +       add out0=-Nregs*8,in0
22407 +}{ .mfb
22408 +       add out1=1,in1                  // increment recursion count
22409 +       nop.f 0
22410 +       nop.b 0                         // can't do br.call here because of alloc (WAW on CFM)
22411 +       ;;
22412 +}{ .mfi        // cycle 1
22413 +       mov loc1=0
22414 +       nop.f 0
22415 +       mov loc2=0
22416 +}{ .mib
22417 +       mov loc3=0
22418 +       mov loc4=0
22419 +(pRecurse) br.call.sptk.many b0=rse_clear_invalid
22420 +
22421 +}{ .mfi        // cycle 2
22422 +       mov loc5=0
22423 +       nop.f 0
22424 +       cmp.ne pReturn,p0=r0,in1        // if recursion count != 0, we need to do a br.ret
22425 +}{ .mib
22426 +       mov loc6=0
22427 +       mov loc7=0
22428 +(pReturn) br.ret.sptk.many b0
22429 +}
22430 +#else /* !CONFIG_ITANIUM */
22431 +       alloc loc0=ar.pfs,2,Nregs-2,2,0
22432 +       cmp.lt pRecurse,p0=Nregs*8,in0  // if more than Nregs regs left to clear, (re)curse
22433 +       add out0=-Nregs*8,in0
22434 +       add out1=1,in1                  // increment recursion count
22435 +       mov loc1=0
22436 +       mov loc2=0
22437 +       ;;
22438 +       mov loc3=0
22439 +       mov loc4=0
22440 +       mov loc5=0
22441 +       mov loc6=0
22442 +       mov loc7=0
22443 +(pRecurse) br.call.sptk.few b0=rse_clear_invalid
22444 +       ;;
22445 +       mov loc8=0
22446 +       mov loc9=0
22447 +       cmp.ne pReturn,p0=r0,in1        // if recursion count != 0, we need to do a br.ret
22448 +       mov loc10=0
22449 +       mov loc11=0
22450 +(pReturn) br.ret.sptk.many b0
22451 +#endif /* !CONFIG_ITANIUM */
22452 +#      undef pRecurse
22453 +#      undef pReturn
22454 +       ;;
22455 +       alloc r17=ar.pfs,0,0,0,0        // drop current register frame
22456 +       ;;
22457 +       loadrs
22458 +       ;;
22459 +skip_rbs_switch:
22460 +       mov ar.unat=r25         // M2
22461 +(pKStk)        extr.u r22=r22,21,1     // I0 extract current value of psr.pp from r22
22462 +(pLvSys)mov r19=r0             // A  clear r19 for leave_syscall, no-op otherwise
22463 +       ;;
22464 +(pUStk)        mov ar.bspstore=r23     // M2
22465 +(pKStk)        dep r29=r22,r29,21,1    // I0 update ipsr.pp with psr.pp
22466 +(pLvSys)mov r16=r0             // A  clear r16 for leave_syscall, no-op otherwise
22467 +       ;;
22468 +#ifdef CONFIG_XEN
22469 +       movl r25=XSI_IPSR
22470 +       ;;
22471 +       st8 [r25]=r29,XSI_IFS-XSI_IPSR
22472 +       ;;
22473 +#else
22474 +       mov cr.ipsr=r29         // M2
22475 +#endif
22476 +       mov ar.pfs=r26          // I0
22477 +(pLvSys)mov r17=r0             // A  clear r17 for leave_syscall, no-op otherwise
22478 +
22479 +#ifdef CONFIG_XEN
22480 +(p9)   st8 [r25]=r30
22481 +       ;;
22482 +       adds r25=XSI_IIP-XSI_IFS,r25
22483 +       ;;
22484 +#else
22485 +(p9)   mov cr.ifs=r30          // M2
22486 +#endif
22487 +       mov b0=r21              // I0
22488 +(pLvSys)mov r18=r0             // A  clear r18 for leave_syscall, no-op otherwise
22489 +
22490 +       mov ar.fpsr=r20         // M2
22491 +#ifdef CONFIG_XEN
22492 +       st8     [r25]=r28
22493 +#else
22494 +       mov cr.iip=r28          // M2
22495 +#endif
22496 +       nop 0
22497 +       ;;
22498 +(pUStk)        mov ar.rnat=r24         // M2 must happen with RSE in lazy mode
22499 +       nop 0
22500 +(pLvSys)mov r2=r0
22501 +
22502 +       mov ar.rsc=r27          // M2
22503 +       mov pr=r31,-1           // I0
22504 +#ifdef CONFIG_XEN
22505 +       ;;
22506 +       XEN_HYPER_RFI;
22507 +#else
22508 +       rfi                     // B
22509 +#endif
22510 +
22511 +       /*
22512 +        * On entry:
22513 +        *      r20 = &current->thread_info->preempt_count (if CONFIG_PREEMPT)
22514 +        *      r31 = current->thread_info->flags
22515 +        * On exit:
22516 +        *      p6 = TRUE if work-pending-check needs to be redone
22517 +        */
22518 +.work_pending_syscall:
22519 +       add r2=-8,r2
22520 +       add r3=-8,r3
22521 +       ;;
22522 +       st8 [r2]=r8
22523 +       st8 [r3]=r10
22524 +.work_pending:
22525 +       tbit.nz p6,p0=r31,TIF_SIGDELAYED                // signal delayed from MCA/INIT/NMI/PMI context?
22526 +(p6)   br.cond.sptk.few .sigdelayed
22527 +       ;;
22528 +       tbit.z p6,p0=r31,TIF_NEED_RESCHED               // current_thread_info()->need_resched==0?
22529 +(p6)   br.cond.sptk.few .notify
22530 +#ifdef CONFIG_PREEMPT
22531 +(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1
22532 +       ;;
22533 +(pKStk) st4 [r20]=r21
22534 +       ssm psr.i               // enable interrupts
22535 +#endif
22536 +       br.call.spnt.many rp=schedule
22537 +.ret9: cmp.eq p6,p0=r0,r0                              // p6 <- 1
22538 +#ifdef CONFIG_XEN
22539 +       movl r2=XSI_PSR_I
22540 +       ;;
22541 +       st4 [r2]=r0
22542 +#else
22543 +       rsm psr.i               // disable interrupts
22544 +#endif
22545 +       ;;
22546 +#ifdef CONFIG_PREEMPT
22547 +(pKStk)        adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
22548 +       ;;
22549 +(pKStk)        st4 [r20]=r0            // preempt_count() <- 0
22550 +#endif
22551 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
22552 +       br.cond.sptk.many .work_processed_kernel        // re-check
22553 +
22554 +.notify:
22555 +(pUStk)        br.call.spnt.many rp=notify_resume_user
22556 +.ret10:        cmp.ne p6,p0=r0,r0                              // p6 <- 0
22557 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
22558 +       br.cond.sptk.many .work_processed_kernel        // don't re-check
22559 +
22560 +// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
22561 +// it could not be delivered.  Deliver it now.  The signal might be for us and
22562 +// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed
22563 +// signal.
22564 +
22565 +.sigdelayed:
22566 +       br.call.sptk.many rp=do_sigdelayed
22567 +       cmp.eq p6,p0=r0,r0                              // p6 <- 1, always re-check
22568 +(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
22569 +       br.cond.sptk.many .work_processed_kernel        // re-check
22570 +
22571 +.work_pending_syscall_end:
22572 +       adds r2=PT(R8)+16,r12
22573 +       adds r3=PT(R10)+16,r12
22574 +       ;;
22575 +       ld8 r8=[r2]
22576 +       ld8 r10=[r3]
22577 +       br.cond.sptk.many .work_processed_syscall       // re-check
22578 +
22579 +#ifdef CONFIG_XEN
22580 +END(xen_leave_kernel)
22581 +#else
22582 +END(ia64_leave_kernel)
22583 +#endif
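
A note on the pattern above: each CONFIG_XEN entry stub (xen_trace_syscall, xen_leave_syscall, xen_leave_kernel) first loads the running_on_xen flag and branches to the native __ia64_* routine when it is zero, so the same kernel image still runs on bare metal. A minimal C sketch of that dispatch, assuming the boot-time running_on_xen flag from the patch; the *_body and *_sketch names are hypothetical, and the real code is the hand-scheduled assembly above:

    extern int running_on_xen;                /* set during early boot */
    extern void __ia64_leave_syscall(void);   /* native exit path (from the patch) */
    extern void xen_leave_syscall_body(void); /* hypothetical: the Xen-specific body */

    static void xen_leave_syscall_sketch(void)
    {
            if (!running_on_xen) {
                    /* not under the hypervisor: take the unmodified native path */
                    __ia64_leave_syscall();
                    return;
            }
            /* under Xen: privileged-register accesses go through XSI_* state
             * and hypercalls instead of cr.* / psr instructions */
            xen_leave_syscall_body();
    }
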
22584 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/xenhpski.c linux-2.6.16/arch/ia64/xen/xenhpski.c
22585 --- linux-2.6.16.orig/arch/ia64/xen/xenhpski.c  1970-01-01 01:00:00.000000000 +0100
22586 +++ linux-2.6.16/arch/ia64/xen/xenhpski.c       2006-06-26 09:51:32.000000000 +0200
22587 @@ -0,0 +1,19 @@
22588 +
22589 +extern unsigned long xen_get_cpuid(int);
22590 +
22591 +int
22592 +running_on_sim(void)
22593 +{
22594 +       int i;
22595 +       long cpuid[6];
22596 +
22597 +       for (i = 0; i < 5; ++i)
22598 +               cpuid[i] = xen_get_cpuid(i);
22599 +       if ((cpuid[0] & 0xff) != 'H') return 0;
22600 +       if ((cpuid[3] & 0xff) != 0x4) return 0;
22601 +       if (((cpuid[3] >> 8) & 0xff) != 0x0) return 0;
22602 +       if (((cpuid[3] >> 16) & 0xff) != 0x0) return 0;
22603 +       if (((cpuid[3] >> 24) & 0x7) != 0x7) return 0;
22604 +       return 1;
22605 +}
22606 +
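
running_on_sim() above recognizes the HP Ski simulator by its CPUID signature: byte 0 of CPUID[0] is the first character of the vendor string, and CPUID[3] packs the version fields. A sketch of the same test with the fields named explicitly; the field names follow the usual IA-64 layout of CPUID register 3, and the helper itself is hypothetical:

    /* Decode the CPUID fields tested by running_on_sim() above. */
    static int looks_like_ski(unsigned long cpuid0, unsigned long cpuid3)
    {
            unsigned int number   =  cpuid3        & 0xff;  /* bits  0..7  */
            unsigned int revision = (cpuid3 >>  8) & 0xff;  /* bits  8..15 */
            unsigned int model    = (cpuid3 >> 16) & 0xff;  /* bits 16..23 */
            unsigned int family   = (cpuid3 >> 24) & 0x07;  /* low family bits */

            return (cpuid0 & 0xff) == 'H' &&  /* vendor string starts with 'H' */
                   number == 0x4 && revision == 0 && model == 0 && family == 0x7;
    }
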
22607 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/xenivt.S linux-2.6.16/arch/ia64/xen/xenivt.S
22608 --- linux-2.6.16.orig/arch/ia64/xen/xenivt.S    1970-01-01 01:00:00.000000000 +0100
22609 +++ linux-2.6.16/arch/ia64/xen/xenivt.S 2006-06-26 09:51:32.000000000 +0200
22610 @@ -0,0 +1,2032 @@
22611 +/*
22612 + * arch/ia64/xen/ivt.S
22613 + *
22614 + * Copyright (C) 2005 Hewlett-Packard Co
22615 + *     Dan Magenheimer <dan.magenheimer@hp.com>
22616 + */
22617 +/*
22618 + * This file defines the interruption vector table used by the CPU.
22619 + * It does not include one entry per possible cause of interruption.
22620 + *
22621 + * The first 20 entries of the table contain 64 bundles each while the
22622 + * remaining 48 entries contain only 16 bundles each.
22623 + *
22624 + * The 64 bundles are used to allow inlining the whole handler for critical
22625 + * interruptions like TLB misses.
22626 + *
22627 + *  For each entry, the comment is as follows:
22628 + *
22629 + *             // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
22630 + *  entry offset ----/     /         /                  /          /
22631 + *  entry number ---------/         /                  /          /
22632 + *  size of the entry -------------/                  /          /
22633 + *  vector name -------------------------------------/          /
22634 + *  interruptions triggering this vector ----------------------/
22635 + *
22636 + * The table is 32KB in size and must be aligned on a 32KB boundary.
22637 + * (The CPU ignores the 15 lower bits of the address)
22638 + *
22639 + * Table is based upon EAS2.6 (Oct 1999)
22640 + */
22641 +
22642 +#include <linux/config.h>
22643 +
22644 +#include <asm/asmmacro.h>
22645 +#include <asm/break.h>
22646 +#include <asm/ia32.h>
22647 +#include <asm/kregs.h>
22648 +#include <asm/asm-offsets.h>
22649 +#include <asm/pgtable.h>
22650 +#include <asm/processor.h>
22651 +#include <asm/ptrace.h>
22652 +#include <asm/system.h>
22653 +#include <asm/thread_info.h>
22654 +#include <asm/unistd.h>
22655 +#include <asm/errno.h>
22656 +
22657 +#ifdef CONFIG_XEN
22658 +#define ia64_ivt xen_ivt
22659 +#endif
22660 +
22661 +#if 1
22662 +# define PSR_DEFAULT_BITS      psr.ac
22663 +#else
22664 +# define PSR_DEFAULT_BITS      0
22665 +#endif
22666 +
22667 +#if 0
22668 +  /*
22669 +   * This lets you track the last eight faults that occurred on the CPU.  Make sure ar.k2 isn't
22670 +   * needed for something else before enabling this...
22671 +   */
22672 +# define DBG_FAULT(i)  mov r16=ar.k2;; shl r16=r16,8;; add r16=(i),r16;;mov ar.k2=r16
22673 +#else
22674 +# define DBG_FAULT(i)
22675 +#endif
22676 +
22677 +#define MINSTATE_VIRT  /* needed by minstate.h */
22678 +#include "xenminstate.h"
22679 +
22680 +#define FAULT(n)                                                                       \
22681 +       mov r31=pr;                                                                     \
22682 +       mov r19=n;;                     /* prepare to save predicates */                \
22683 +       br.sptk.many dispatch_to_fault_handler
22684 +
22685 +       .section .text.ivt,"ax"
22686 +
22687 +       .align 32768    // align on 32KB boundary
22688 +       .global ia64_ivt
22689 +ia64_ivt:
22690 +/////////////////////////////////////////////////////////////////////////////////////////
22691 +// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
22692 +ENTRY(vhpt_miss)
22693 +       DBG_FAULT(0)
22694 +       /*
22695 +        * The VHPT vector is invoked when the TLB entry for the virtual page table
22696 +        * is missing.  This happens only as a result of a previous
22697 +        * (the "original") TLB miss, which may either be caused by an instruction
22698 +        * fetch or a data access (or non-access).
22699 +        *
22700 +        * What we do here is normal TLB miss handling for the _original_ miss, followed
22701 +        * by inserting the TLB entry for the virtual page table page that the VHPT
22702 +        * walker was attempting to access.  The latter gets inserted as long
22703 +        * as both L1 and L2 have valid mappings for the faulting address.
22704 +        * The TLB entry for the original miss gets inserted only if
22705 +        * the L3 entry indicates that the page is present.
22706 +        *
22707 +        * do_page_fault gets invoked in the following cases:
22708 +        *      - the faulting virtual address uses unimplemented address bits
22709 +        *      - the faulting virtual address has no L1, L2, or L3 mapping
22710 +        */
22711 +#ifdef CONFIG_XEN
22712 +       movl r16=XSI_IFA
22713 +       ;;
22714 +       ld8 r16=[r16]
22715 +#ifdef CONFIG_HUGETLB_PAGE
22716 +       movl r18=PAGE_SHIFT
22717 +       movl r25=XSI_ITIR
22718 +       ;;
22719 +       ld8 r25=[r25]
22720 +#endif
22721 +       ;;
22722 +#else
22723 +       mov r16=cr.ifa                          // get address that caused the TLB miss
22724 +#ifdef CONFIG_HUGETLB_PAGE
22725 +       movl r18=PAGE_SHIFT
22726 +       mov r25=cr.itir
22727 +#endif
22728 +#endif
22729 +       ;;
22730 +#ifdef CONFIG_XEN
22731 +       XEN_HYPER_RSM_PSR_DT;
22732 +#else
22733 +       rsm psr.dt                              // use physical addressing for data
22734 +#endif
22735 +       mov r31=pr                              // save the predicate registers
22736 +       mov r19=IA64_KR(PT_BASE)                // get page table base address
22737 +       shl r21=r16,3                           // shift bit 60 into sign bit
22738 +       shr.u r17=r16,61                        // get the region number into r17
22739 +       ;;
22740 +       shr r22=r21,3
22741 +#ifdef CONFIG_HUGETLB_PAGE
22742 +       extr.u r26=r25,2,6
22743 +       ;;
22744 +       cmp.ne p8,p0=r18,r26
22745 +       sub r27=r26,r18
22746 +       ;;
22747 +(p8)   dep r25=r18,r25,2,6
22748 +(p8)   shr r22=r22,r27
22749 +#endif
22750 +       ;;
22751 +       cmp.eq p6,p7=5,r17                      // is IFA pointing into region 5?
22752 +       shr.u r18=r22,PGDIR_SHIFT               // get bits 33-63 of the faulting address
22753 +       ;;
22754 +(p7)   dep r17=r17,r19,(PAGE_SHIFT-3),3        // put region number bits in place
22755 +
22756 +       srlz.d
22757 +       LOAD_PHYSICAL(p6, r19, swapper_pg_dir)  // region 5 is rooted at swapper_pg_dir
22758 +
22759 +       .pred.rel "mutex", p6, p7
22760 +(p6)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
22761 +(p7)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
22762 +       ;;
22763 +(p6)   dep r17=r18,r19,3,(PAGE_SHIFT-3)        // r17=PTA + IFA(33,42)*8
22764 +(p7)   dep r17=r18,r17,3,(PAGE_SHIFT-6)        // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
22765 +       cmp.eq p7,p6=0,r21                      // unused address bits all zeroes?
22766 +       shr.u r18=r22,PMD_SHIFT                 // shift L2 index into position
22767 +       ;;
22768 +       ld8 r17=[r17]                           // fetch the L1 entry (may be 0)
22769 +       ;;
22770 +(p7)   cmp.eq p6,p7=r17,r0                     // was L1 entry NULL?
22771 +       dep r17=r18,r17,3,(PAGE_SHIFT-3)        // compute address of L2 page table entry
22772 +       ;;
22773 +(p7)   ld8 r20=[r17]                           // fetch the L2 entry (may be 0)
22774 +       shr.u r19=r22,PAGE_SHIFT                // shift L3 index into position
22775 +       ;;
22776 +(p7)   cmp.eq.or.andcm p6,p7=r20,r0            // was L2 entry NULL?
22777 +       dep r21=r19,r20,3,(PAGE_SHIFT-3)        // compute address of L3 page table entry
22778 +       ;;
22779 +#ifdef CONFIG_XEN
22780 +(p7)   ld8 r18=[r21]                           // read the L3 PTE
22781 +       movl r19=XSI_ISR
22782 +       ;;
22783 +       ld8 r19=[r19]
22784 +       ;;
22785 +(p7)   tbit.z p6,p7=r18,_PAGE_P_BIT            // page present bit cleared?
22786 +       movl r22=XSI_IHA
22787 +       ;;
22788 +       ld8 r22=[r22]
22789 +       ;;
22790 +#else
22791 +(p7)   ld8 r18=[r21]                           // read the L3 PTE
22792 +       mov r19=cr.isr                          // cr.isr bit 0 tells us if this is an insn miss
22793 +       ;;
22794 +(p7)   tbit.z p6,p7=r18,_PAGE_P_BIT            // page present bit cleared?
22795 +       mov r22=cr.iha                          // get the VHPT address that caused the TLB miss
22796 +       ;;                                      // avoid RAW on p7
22797 +#endif
22798 +(p7)   tbit.nz.unc p10,p11=r19,32              // is it an instruction TLB miss?
22799 +       dep r23=0,r20,0,PAGE_SHIFT              // clear low bits to get page address
22800 +       ;;
22801 +#ifdef CONFIG_XEN
22802 +       mov r24=r8
22803 +       mov r8=r18
22804 +       ;;
22805 +(p10)  XEN_HYPER_ITC_D
22806 +       ;;
22807 +(p11)  XEN_HYPER_ITC_I
22808 +       ;;
22809 +       mov r8=r24
22810 +       ;;
22811 +(p6)   br.cond.spnt.many page_fault            // handle bad address/page not present (page fault)
22812 +       ;;
22813 +       movl r24=XSI_IFA
22814 +       ;;
22815 +       st8 [r24]=r22
22816 +       ;;
22817 +#else
22818 +(p10)  itc.i r18                               // insert the instruction TLB entry
22819 +(p11)  itc.d r18                               // insert the data TLB entry
22820 +(p6)   br.cond.spnt.many page_fault            // handle bad address/page not present (page fault)
22821 +       mov cr.ifa=r22
22822 +#endif
22823 +
22824 +#ifdef CONFIG_HUGETLB_PAGE
22825 +(p8)   mov cr.itir=r25                         // change to default page-size for VHPT
22826 +#endif
22827 +
22828 +       /*
22829 +        * Now compute and insert the TLB entry for the virtual page table.  We never
22830 +        * execute in a page table page so there is no need to set the exception deferral
22831 +        * bit.
22832 +        */
22833 +       adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23
22834 +       ;;
22835 +#ifdef CONFIG_XEN
22836 +(p7)   mov r25=r8
22837 +(p7)   mov r8=r24
22838 +       ;;
22839 +(p7)   XEN_HYPER_ITC_D
22840 +       ;;
22841 +(p7)   mov r8=r25
22842 +       ;;
22843 +#else
22844 +(p7)   itc.d r24
22845 +#endif
22846 +       ;;
22847 +#ifdef CONFIG_SMP
22848 +       /*
22849 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
22850 +        * cannot possibly affect the following loads:
22851 +        */
22852 +       dv_serialize_data
22853 +
22854 +       /*
22855 +        * Re-check L2 and L3 pagetable.  If they changed, we may have received a ptc.g
22856 +        * between reading the pagetable and the "itc".  If so, flush the entry we
22857 +        * inserted and retry.
22858 +        */
22859 +       ld8 r25=[r21]                           // read L3 PTE again
22860 +       ld8 r26=[r17]                           // read L2 entry again
22861 +       ;;
22862 +       cmp.ne p6,p7=r26,r20                    // did L2 entry change
22863 +       mov r27=PAGE_SHIFT<<2
22864 +       ;;
22865 +(p6)   ptc.l r22,r27                           // purge PTE page translation
22866 +(p7)   cmp.ne.or.andcm p6,p7=r25,r18           // did L3 PTE change
22867 +       ;;
22868 +(p6)   ptc.l r16,r27                           // purge translation
22869 +#endif
22870 +
22871 +       mov pr=r31,-1                           // restore predicate registers
22872 +#ifdef CONFIG_XEN
22873 +       XEN_HYPER_RFI;
22874 +#else
22875 +       rfi
22876 +#endif
22877 +END(vhpt_miss)
22878 +
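
For readers not fluent in ia64 assembly, the walk described in the vhpt_miss comment is an ordinary three-level lookup: a NULL L1 or L2 entry, or a clear present bit in the L3 PTE, means do_page_fault. A self-contained C sketch under the same rules; the index/mask macros are illustrative stand-ins for the kernel's PGDIR_SHIFT/PMD_SHIFT machinery, not its real API:

    #define PAGE_SHIFT   14                        /* example: 16KB pages        */
    #define PTRS_BITS    (PAGE_SHIFT - 3)          /* 8-byte entries per page    */
    #define pte_index(a) (((a) >> PAGE_SHIFT) & ((1UL << PTRS_BITS) - 1))
    #define pmd_index(a) (((a) >> (PAGE_SHIFT + PTRS_BITS)) & ((1UL << PTRS_BITS) - 1))
    #define pgd_index(a) (((a) >> (PAGE_SHIFT + 2*PTRS_BITS)) & ((1UL << PTRS_BITS) - 1))
    #define entry_page(e) ((e) & ~((1UL << PAGE_SHIFT) - 1))
    #define _PAGE_P_BIT  0                         /* present bit, as in the patch */

    /* Illustrative three-level walk: L1 (pgd) -> L2 (pmd) -> L3 (pte). */
    unsigned long *walk_l3(unsigned long *l1_base, unsigned long addr)
    {
            unsigned long l1 = l1_base[pgd_index(addr)];
            if (l1 == 0)
                    return 0;                      /* no L1 entry -> page_fault */

            unsigned long l2 = ((unsigned long *)entry_page(l1))[pmd_index(addr)];
            if (l2 == 0)
                    return 0;                      /* no L2 entry -> page_fault */

            unsigned long *l3 = (unsigned long *)entry_page(l2) + pte_index(addr);
            return (*l3 & (1UL << _PAGE_P_BIT)) ? l3 : 0;  /* present bit set? */
    }
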
22879 +       .org ia64_ivt+0x400
22880 +/////////////////////////////////////////////////////////////////////////////////////////
22881 +// 0x0400 Entry 1 (size 64 bundles) ITLB (21)
22882 +ENTRY(itlb_miss)
22883 +       DBG_FAULT(1)
22884 +       /*
22885 +        * The ITLB handler accesses the L3 PTE via the virtually mapped linear
22886 +        * page table.  If a nested TLB miss occurs, we switch into physical
22887 +        * mode, walk the page table, and then re-execute the L3 PTE read
22888 +        * and go on normally after that.
22889 +        */
22890 +#ifdef CONFIG_XEN
22891 +       movl r16=XSI_IFA
22892 +       ;;
22893 +       ld8 r16=[r16]
22894 +#else
22895 +       mov r16=cr.ifa                          // get virtual address
22896 +#endif
22897 +       mov r29=b0                              // save b0
22898 +       mov r31=pr                              // save predicates
22899 +.itlb_fault:
22900 +#ifdef CONFIG_XEN
22901 +       movl r17=XSI_IHA
22902 +       ;;
22903 +       ld8 r17=[r17]                           // get virtual address of L3 PTE
22904 +#else
22905 +       mov r17=cr.iha                          // get virtual address of L3 PTE
22906 +#endif
22907 +       movl r30=1f                             // load nested fault continuation point
22908 +       ;;
22909 +1:     ld8 r18=[r17]                           // read L3 PTE
22910 +       ;;
22911 +       mov b0=r29
22912 +       tbit.z p6,p0=r18,_PAGE_P_BIT            // page present bit cleared?
22913 +(p6)   br.cond.spnt page_fault
22914 +       ;;
22915 +#ifdef CONFIG_XEN
22916 +       mov r19=r8
22917 +       mov r8=r18
22918 +       ;;
22919 +       XEN_HYPER_ITC_I
22920 +       ;;
22921 +       mov r8=r19
22922 +#else
22923 +       itc.i r18
22924 +#endif
22925 +       ;;
22926 +#ifdef CONFIG_SMP
22927 +       /*
22928 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
22929 +        * cannot possibly affect the following loads:
22930 +        */
22931 +       dv_serialize_data
22932 +
22933 +       ld8 r19=[r17]                           // read L3 PTE again and see if same
22934 +       mov r20=PAGE_SHIFT<<2                   // setup page size for purge
22935 +       ;;
22936 +       cmp.ne p7,p0=r18,r19
22937 +       ;;
22938 +(p7)   ptc.l r16,r20
22939 +#endif
22940 +       mov pr=r31,-1
22941 +#ifdef CONFIG_XEN
22942 +       XEN_HYPER_RFI;
22943 +#else
22944 +       rfi
22945 +#endif
22946 +END(itlb_miss)
22947 +
22948 +       .org ia64_ivt+0x0800
22949 +/////////////////////////////////////////////////////////////////////////////////////////
22950 +// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
22951 +ENTRY(dtlb_miss)
22952 +       DBG_FAULT(2)
22953 +       /*
22954 +        * The DTLB handler accesses the L3 PTE via the virtually mapped linear
22955 +        * page table.  If a nested TLB miss occurs, we switch into physical
22956 +        * mode, walk the page table, and then re-execute the L3 PTE read
22957 +        * and go on normally after that.
22958 +        */
22959 +#ifdef CONFIG_XEN
22960 +       movl r16=XSI_IFA
22961 +       ;;
22962 +       ld8 r16=[r16]
22963 +#else
22964 +       mov r16=cr.ifa                          // get virtual address
22965 +#endif
22966 +       mov r29=b0                              // save b0
22967 +       mov r31=pr                              // save predicates
22968 +dtlb_fault:
22969 +#ifdef CONFIG_XEN
22970 +       movl r17=XSI_IHA
22971 +       ;;
22972 +       ld8 r17=[r17]                           // get virtual address of L3 PTE
22973 +#else
22974 +       mov r17=cr.iha                          // get virtual address of L3 PTE
22975 +#endif
22976 +       movl r30=1f                             // load nested fault continuation point
22977 +       ;;
22978 +1:     ld8 r18=[r17]                           // read L3 PTE
22979 +       ;;
22980 +       mov b0=r29
22981 +       tbit.z p6,p0=r18,_PAGE_P_BIT            // page present bit cleared?
22982 +(p6)   br.cond.spnt page_fault
22983 +       ;;
22984 +#ifdef CONFIG_XEN
22985 +       mov r19=r8
22986 +       mov r8=r18
22987 +       ;;
22988 +       XEN_HYPER_ITC_D
22989 +       ;;
22990 +       mov r8=r19
22991 +       ;;
22992 +#else
22993 +       itc.d r18
22994 +#endif
22995 +       ;;
22996 +#ifdef CONFIG_SMP
22997 +       /*
22998 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
22999 +        * cannot possibly affect the following loads:
23000 +        */
23001 +       dv_serialize_data
23002 +
23003 +       ld8 r19=[r17]                           // read L3 PTE again and see if same
23004 +       mov r20=PAGE_SHIFT<<2                   // setup page size for purge
23005 +       ;;
23006 +       cmp.ne p7,p0=r18,r19
23007 +       ;;
23008 +(p7)   ptc.l r16,r20
23009 +#endif
23010 +       mov pr=r31,-1
23011 +#ifdef CONFIG_XEN
23012 +       XEN_HYPER_RFI;
23013 +#else
23014 +       rfi
23015 +#endif
23016 +END(dtlb_miss)
23017 +
23018 +       .org ia64_ivt+0x0c00
23019 +/////////////////////////////////////////////////////////////////////////////////////////
23020 +// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
23021 +ENTRY(alt_itlb_miss)
23022 +       DBG_FAULT(3)
23023 +#ifdef CONFIG_XEN
23024 +       movl r31=XSI_IPSR
23025 +       ;;
23026 +       ld8 r21=[r31],XSI_IFA-XSI_IPSR  // get ipsr, point to ifa
23027 +       movl r17=PAGE_KERNEL
23028 +       ;;
23029 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
23030 +       ;;
23031 +       ld8 r16=[r31]           // get ifa
23032 +       mov r31=pr
23033 +       ;;
23034 +#else
23035 +       mov r16=cr.ifa          // get address that caused the TLB miss
23036 +       movl r17=PAGE_KERNEL
23037 +       mov r21=cr.ipsr
23038 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
23039 +       mov r31=pr
23040 +       ;;
23041 +#endif
23042 +#ifdef CONFIG_DISABLE_VHPT
23043 +       shr.u r22=r16,61                        // get the region number into r22
23044 +       ;;
23045 +       cmp.gt p8,p0=6,r22                      // user mode
23046 +       ;;
23047 +#ifndef CONFIG_XEN
23048 +(p8)   thash r17=r16
23049 +       ;;
23050 +(p8)   mov cr.iha=r17
23051 +#endif
23052 +(p8)   mov r29=b0                              // save b0
23053 +(p8)   br.cond.dptk .itlb_fault
23054 +#endif
23055 +       extr.u r23=r21,IA64_PSR_CPL0_BIT,2      // extract psr.cpl
23056 +       and r19=r19,r16         // clear ed, reserved bits, and PTE control bits
23057 +       shr.u r18=r16,57        // move address bit 61 to bit 4
23058 +       ;;
23059 +       andcm r18=0x10,r18      // bit 4=~address-bit(61)
23060 +       cmp.ne p8,p0=r0,r23     // psr.cpl != 0?
23061 +       or r19=r17,r19          // insert PTE control bits into r19
23062 +       ;;
23063 +       or r19=r19,r18          // set bit 4 (uncached) if the access was to region 6
23064 +(p8)   br.cond.spnt page_fault
23065 +       ;;
23066 +#ifdef CONFIG_XEN
23067 +       mov r18=r8
23068 +       mov r8=r19
23069 +       ;;
23070 +       XEN_HYPER_ITC_I
23071 +       ;;
23072 +       mov r8=r18
23073 +       ;;
23074 +       mov pr=r31,-1
23075 +       ;;
23076 +       XEN_HYPER_RFI;
23077 +#else
23078 +       itc.i r19               // insert the TLB entry
23079 +       mov pr=r31,-1
23080 +       rfi
23081 +#endif
23082 +END(alt_itlb_miss)
23083 +
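
The bit-twiddling in alt_itlb_miss ("move address bit 61 to bit 4", "bit 4=~address-bit(61)") builds identity-mapped PTEs for the kernel's regions 6 and 7, with region 6 uncacheable: bit 4 of the PTE's memory-attribute field is set exactly when address bit 61 is clear, i.e. for region 6 (110) but not region 7 (111). A small C sketch of that PTE construction; the parameter names are placeholders for PAGE_KERNEL and the physical-address mask used above:

    /* Build the identity-mapped PTE inserted by the alternate TLB miss
     * handlers for kernel regions 6 and 7. */
    unsigned long alt_tlb_pte(unsigned long ifa,
                              unsigned long page_kernel_bits,
                              unsigned long phys_mask) /* ((1<<MAX_PHYS_BITS)-1) & ~0xfff */
    {
            unsigned long pte = (ifa & phys_mask) | page_kernel_bits;

            if (!(ifa & (1UL << 61)))   /* region 6, not region 7 */
                    pte |= 0x10;        /* bit 4: uncached attribute */
            return pte;
    }
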
23084 +       .org ia64_ivt+0x1000
23085 +/////////////////////////////////////////////////////////////////////////////////////////
23086 +// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
23087 +ENTRY(alt_dtlb_miss)
23088 +       DBG_FAULT(4)
23089 +#ifdef CONFIG_XEN
23090 +       movl r31=XSI_IPSR
23091 +       ;;
23092 +       ld8 r21=[r31],XSI_ISR-XSI_IPSR  // get ipsr, point to isr
23093 +       movl r17=PAGE_KERNEL
23094 +       ;;
23095 +       ld8 r20=[r31],XSI_IFA-XSI_ISR   // get isr, point to ifa
23096 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
23097 +       ;;
23098 +       ld8 r16=[r31]           // get ifa
23099 +       mov r31=pr
23100 +       ;;
23101 +#else
23102 +       mov r16=cr.ifa          // get address that caused the TLB miss
23103 +       movl r17=PAGE_KERNEL
23104 +       mov r20=cr.isr
23105 +       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
23106 +       mov r21=cr.ipsr
23107 +       mov r31=pr
23108 +       ;;
23109 +#endif
23110 +#ifdef CONFIG_DISABLE_VHPT
23111 +       shr.u r22=r16,61                        // get the region number into r22
23112 +       ;;
23113 +       cmp.gt p8,p0=6,r22                      // access to region 0-5
23114 +       ;;
23115 +#ifndef CONFIG_XEN
23116 +(p8)   thash r17=r16
23117 +       ;;
23118 +(p8)   mov cr.iha=r17
23119 +#endif
23120 +(p8)   mov r29=b0                              // save b0
23121 +(p8)   br.cond.dptk dtlb_fault
23122 +#endif
23123 +       extr.u r23=r21,IA64_PSR_CPL0_BIT,2      // extract psr.cpl
23124 +       and r22=IA64_ISR_CODE_MASK,r20          // get the isr.code field
23125 +       tbit.nz p6,p7=r20,IA64_ISR_SP_BIT       // is speculation bit on?
23126 +       shr.u r18=r16,57                        // move address bit 61 to bit 4
23127 +       and r19=r19,r16                         // clear ed, reserved bits, and PTE control bits
23128 +       tbit.nz p9,p0=r20,IA64_ISR_NA_BIT       // is non-access bit on?
23129 +       ;;
23130 +       andcm r18=0x10,r18      // bit 4=~address-bit(61)
23131 +       cmp.ne p8,p0=r0,r23
23132 +(p9)   cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22  // check isr.code field
23133 +(p8)   br.cond.spnt page_fault
23134 +
23135 +       dep r21=-1,r21,IA64_PSR_ED_BIT,1
23136 +       or r19=r19,r17          // insert PTE control bits into r19
23137 +       ;;
23138 +       or r19=r19,r18          // set bit 4 (uncached) if the access was to region 6
23139 +(p6)   mov cr.ipsr=r21
23140 +       ;;
23141 +#ifdef CONFIG_XEN
23142 +(p7)   mov r18=r8
23143 +(p7)   mov r8=r19
23144 +       ;;
23145 +(p7)   XEN_HYPER_ITC_D
23146 +       ;;
23147 +(p7)   mov r8=r18
23148 +       ;;
23149 +       mov pr=r31,-1
23150 +       ;;
23151 +       XEN_HYPER_RFI;
23152 +#else
23153 +(p7)   itc.d r19               // insert the TLB entry
23154 +       mov pr=r31,-1
23155 +       rfi
23156 +#endif
23157 +END(alt_dtlb_miss)
23158 +
23159 +       .org ia64_ivt+0x1400
23160 +/////////////////////////////////////////////////////////////////////////////////////////
23161 +// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
23162 +ENTRY(nested_dtlb_miss)
23163 +       /*
23164 +        * In the absence of kernel bugs, we get here when the virtually mapped linear
23165 +        * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction
23166 +        * Access-bit, or Data Access-bit faults).  If the DTLB entry for the virtual page
23167 +        * table is missing, a nested TLB miss fault is triggered and control is
23168 +        * transferred to this point.  When this happens, we lookup the pte for the
23169 +        * transferred to this point.  When this happens, we look up the pte for the
23170 +        * continuation point passed in register r30 (or call page_fault if the address is
23171 +        * not mapped).
23172 +        *
23173 +        * Input:       r16:    faulting address
23174 +        *              r29:    saved b0
23175 +        *              r30:    continuation address
23176 +        *              r31:    saved pr
23177 +        *
23178 +        * Output:      r17:    physical address of L3 PTE of faulting address
23179 +        *              r29:    saved b0
23180 +        *              r30:    continuation address
23181 +        *              r31:    saved pr
23182 +        *
23183 +        * Clobbered:   b0, r18, r19, r21, psr.dt (cleared)
23184 +        */
23185 +#ifdef CONFIG_XEN
23186 +       XEN_HYPER_RSM_PSR_DT;
23187 +#else
23188 +       rsm psr.dt                              // switch to using physical data addressing
23189 +#endif
23190 +       mov r19=IA64_KR(PT_BASE)                // get the page table base address
23191 +       shl r21=r16,3                           // shift bit 60 into sign bit
23192 +       ;;
23193 +       shr.u r17=r16,61                        // get the region number into r17
23194 +       ;;
23195 +       cmp.eq p6,p7=5,r17                      // is faulting address in region 5?
23196 +       shr.u r18=r16,PGDIR_SHIFT               // get bits 33-63 of faulting address
23197 +       ;;
23198 +(p7)   dep r17=r17,r19,(PAGE_SHIFT-3),3        // put region number bits in place
23199 +
23200 +       srlz.d
23201 +       LOAD_PHYSICAL(p6, r19, swapper_pg_dir)  // region 5 is rooted at swapper_pg_dir
23202 +
23203 +       .pred.rel "mutex", p6, p7
23204 +(p6)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
23205 +(p7)   shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
23206 +       ;;
23207 +(p6)   dep r17=r18,r19,3,(PAGE_SHIFT-3)        // r17=PTA + IFA(33,42)*8
23208 +(p7)   dep r17=r18,r17,3,(PAGE_SHIFT-6)        // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
23209 +       cmp.eq p7,p6=0,r21                      // unused address bits all zeroes?
23210 +       shr.u r18=r16,PMD_SHIFT                 // shift L2 index into position
23211 +       ;;
23212 +       ld8 r17=[r17]                           // fetch the L1 entry (may be 0)
23213 +       ;;
23214 +(p7)   cmp.eq p6,p7=r17,r0                     // was L1 entry NULL?
23215 +       dep r17=r18,r17,3,(PAGE_SHIFT-3)        // compute address of L2 page table entry
23216 +       ;;
23217 +(p7)   ld8 r17=[r17]                           // fetch the L2 entry (may be 0)
23218 +       shr.u r19=r16,PAGE_SHIFT                // shift L3 index into position
23219 +       ;;
23220 +(p7)   cmp.eq.or.andcm p6,p7=r17,r0            // was L2 entry NULL?
23221 +       dep r17=r19,r17,3,(PAGE_SHIFT-3)        // compute address of L3 page table entry
23222 +(p6)   br.cond.spnt page_fault
23223 +       mov b0=r30
23224 +       br.sptk.many b0                         // return to continuation point
23225 +END(nested_dtlb_miss)
23226 +
23227 +       .org ia64_ivt+0x1800
23228 +/////////////////////////////////////////////////////////////////////////////////////////
23229 +// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
23230 +ENTRY(ikey_miss)
23231 +       DBG_FAULT(6)
23232 +       FAULT(6)
23233 +END(ikey_miss)
23234 +
23235 +       //-----------------------------------------------------------------------------------
23236 +       // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
23237 +ENTRY(page_fault)
23238 +#ifdef CONFIG_XEN
23239 +       XEN_HYPER_SSM_PSR_DT;
23240 +#else
23241 +       ssm psr.dt
23242 +       ;;
23243 +       srlz.i
23244 +#endif
23245 +       ;;
23246 +       SAVE_MIN_WITH_COVER
23247 +       alloc r15=ar.pfs,0,0,3,0
23248 +#ifdef CONFIG_XEN
23249 +       movl r3=XSI_ISR
23250 +       ;;
23251 +       ld8 out1=[r3],XSI_IFA-XSI_ISR           // get vcr.isr, point to ifa
23252 +       ;;
23253 +       ld8 out0=[r3]                           // get vcr.ifa
23254 +       mov r14=1
23255 +       ;;
23256 +       add r3=XSI_PSR_IC-XSI_IFA, r3           // point to vpsr.ic
23257 +       ;;
23258 +       st4 [r3]=r14                            // vpsr.ic = 1
23259 +       adds r3=8,r2                            // set up second base pointer
23260 +       ;;
23261 +#else
23262 +       mov out0=cr.ifa
23263 +       mov out1=cr.isr
23264 +       adds r3=8,r2                            // set up second base pointer
23265 +       ;;
23266 +       ssm psr.ic | PSR_DEFAULT_BITS
23267 +       ;;
23268 +       srlz.i                                  // guarantee that interruption collection is on
23269 +       ;;
23270 +#endif
23271 +#ifdef CONFIG_XEN
23272 +       br.cond.sptk.many       xen_page_fault
23273 +       ;;
23274 +done_xen_page_fault:
23275 +#endif
23276 +(p15)  ssm psr.i                               // restore psr.i
23277 +       movl r14=ia64_leave_kernel
23278 +       ;;
23279 +       SAVE_REST
23280 +       mov rp=r14
23281 +       ;;
23282 +       adds out2=16,r12                        // out2 = pointer to pt_regs
23283 +       br.call.sptk.many b6=ia64_do_page_fault // ignore return address
23284 +END(page_fault)
23285 +
23286 +       .org ia64_ivt+0x1c00
23287 +/////////////////////////////////////////////////////////////////////////////////////////
23288 +// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
23289 +ENTRY(dkey_miss)
23290 +       DBG_FAULT(7)
23291 +       FAULT(7)
23292 +#ifdef CONFIG_XEN
23293 +       // Leaving this code inline above results in an IVT section overflow
23294 +       // There is no particular reason for this code to be here...
23295 +xen_page_fault:
23296 +(p15)  movl r3=XSI_PSR_I
23297 +       ;;
23298 +(p15)  st4 [r3]=r14,XSI_PEND-XSI_PSR_I         // if (p15) vpsr.i = 1
23299 +       mov r14=r0
23300 +       ;;
23301 +(p15)  ld4 r14=[r3]                            // if (pending_interrupts)
23302 +       adds r3=8,r2                            // re-set up second base pointer
23303 +       ;;
23304 +(p15)  cmp.ne  p15,p0=r14,r0
23305 +       ;;
23306 +       br.cond.sptk.many done_xen_page_fault
23307 +       ;;
23308 +#endif
23309 +END(dkey_miss)
23310 +
23311 +       .org ia64_ivt+0x2000
23312 +/////////////////////////////////////////////////////////////////////////////////////////
23313 +// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
23314 +ENTRY(dirty_bit)
23315 +       DBG_FAULT(8)
23316 +       /*
23317 +        * What we do here is to simply turn on the dirty bit in the PTE.  We need to
23318 +        * update both the page-table and the TLB entry.  To efficiently access the PTE,
23319 +        * we address it through the virtual page table.  Most likely, the TLB entry for
23320 +        * the relevant virtual page table page is still present in the TLB so we can
23321 +        * normally do this without additional TLB misses.  In case the necessary virtual
23322 +        * page table TLB entry isn't present, we take a nested TLB miss hit where we look
23323 +        * up the physical address of the L3 PTE and then continue at label 1 below.
23324 +        */
23325 +#ifdef CONFIG_XEN
23326 +       movl r16=XSI_IFA
23327 +       ;;
23328 +       ld8 r16=[r16]
23329 +       ;;
23330 +#else
23331 +       mov r16=cr.ifa                          // get the address that caused the fault
23332 +#endif
23333 +       movl r30=1f                             // load continuation point in case of nested fault
23334 +       ;;
23335 +#ifdef CONFIG_XEN
23336 +       mov r18=r8;
23337 +       mov r8=r16;
23338 +       XEN_HYPER_THASH;;
23339 +       mov r17=r8;
23340 +       mov r8=r18;;
23341 +#else
23342 +       thash r17=r16                           // compute virtual address of L3 PTE
23343 +#endif
23344 +       mov r29=b0                              // save b0 in case of nested fault
23345 +       mov r31=pr                              // save pr
23346 +#ifdef CONFIG_SMP
23347 +       mov r28=ar.ccv                          // save ar.ccv
23348 +       ;;
23349 +1:     ld8 r18=[r17]
23350 +       ;;                                      // avoid RAW on r18
23351 +       mov ar.ccv=r18                          // set compare value for cmpxchg
23352 +       or r25=_PAGE_D|_PAGE_A,r18              // set the dirty and accessed bits
23353 +       ;;
23354 +       cmpxchg8.acq r26=[r17],r25,ar.ccv
23355 +       mov r24=PAGE_SHIFT<<2
23356 +       ;;
23357 +       cmp.eq p6,p7=r26,r18
23358 +       ;;
23359 +(p6)   itc.d r25                               // install updated PTE
23360 +       ;;
23361 +       /*
23362 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
23363 +        * cannot possibly affect the following loads:
23364 +        */
23365 +       dv_serialize_data
23366 +
23367 +       ld8 r18=[r17]                           // read PTE again
23368 +       ;;
23369 +       cmp.eq p6,p7=r18,r25                    // is it same as the newly installed
23370 +       ;;
23371 +(p7)   ptc.l r16,r24
23372 +       mov b0=r29                              // restore b0
23373 +       mov ar.ccv=r28
23374 +#else
23375 +       ;;
23376 +1:     ld8 r18=[r17]
23377 +       ;;                                      // avoid RAW on r18
23378 +       or r18=_PAGE_D|_PAGE_A,r18              // set the dirty and accessed bits
23379 +       mov b0=r29                              // restore b0
23380 +       ;;
23381 +       st8 [r17]=r18                           // store back updated PTE
23382 +       itc.d r18                               // install updated PTE
23383 +#endif
23384 +       mov pr=r31,-1                           // restore pr
23385 +#ifdef CONFIG_XEN
23386 +       XEN_HYPER_RFI;
23387 +#else
23388 +       rfi
23389 +#endif
23390 +END(dirty_bit)
23391 +
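On SMP the dirty-bit update above has to be atomic against other CPUs modifying the same PTE, hence the cmpxchg8.acq, the itc.d guarded by the comparison, and the re-read that purges the freshly inserted translation if the PTE changed underneath. In rough C, with install_dtlb() and purge_tlb_local() as hypothetical stand-ins for itc.d and ptc.l, and illustrative bit positions for the PTE flags:

    #include <stdint.h>

    #define _PAGE_A (1UL << 5)      /* illustrative bit positions only */
    #define _PAGE_D (1UL << 6)

    extern void install_dtlb(uint64_t pte);                   /* itc.d */
    extern void purge_tlb_local(uint64_t va, unsigned bits);  /* ptc.l */

    /* Atomic compare-and-swap standing in for cmpxchg8.acq. */
    static uint64_t pte_cmpxchg(volatile uint64_t *p, uint64_t old, uint64_t new)
    {
            __atomic_compare_exchange_n(p, &old, new, 0,
                                        __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
            return old;             /* previous value, like r26 above */
    }

    void set_dirty_and_accessed(volatile uint64_t *pte, uint64_t va)
    {
            uint64_t old = *pte;
            uint64_t new = old | _PAGE_D | _PAGE_A;

            if (pte_cmpxchg(pte, old, new) == old)
                    install_dtlb(new);              /* (p6) itc.d r25 */
            if (*pte != new)                        /* PTE changed under us? */
                    purge_tlb_local(va, 14 /* PAGE_SHIFT */ << 2);
    }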
23392 +       .org ia64_ivt+0x2400
23393 +/////////////////////////////////////////////////////////////////////////////////////////
23394 +// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
23395 +ENTRY(iaccess_bit)
23396 +       DBG_FAULT(9)
23397 +       // Like Entry 8, except for instruction access
23398 +#ifdef CONFIG_XEN
23399 +       movl r16=XSI_IFA
23400 +       ;;
23401 +       ld8 r16=[r16]
23402 +       ;;
23403 +#else
23404 +       mov r16=cr.ifa                          // get the address that caused the fault
23405 +#endif
23406 +       movl r30=1f                             // load continuation point in case of nested fault
23407 +       mov r31=pr                              // save predicates
23408 +#ifdef CONFIG_ITANIUM
23409 +       /*
23410 +        * Erratum 10 (IFA may contain incorrect address) has "NoFix" status.
23411 +        */
23412 +       mov r17=cr.ipsr
23413 +       ;;
23414 +       mov r18=cr.iip
23415 +       tbit.z p6,p0=r17,IA64_PSR_IS_BIT        // IA64 instruction set?
23416 +       ;;
23417 +(p6)   mov r16=r18                             // if so, use cr.iip instead of cr.ifa
23418 +#endif /* CONFIG_ITANIUM */
23419 +       ;;
23420 +#ifdef CONFIG_XEN
23421 +       mov r18=r8;
23422 +       mov r8=r16;
23423 +       XEN_HYPER_THASH;;
23424 +       mov r17=r8;
23425 +       mov r8=r18;;
23426 +#else
23427 +       thash r17=r16                           // compute virtual address of L3 PTE
23428 +#endif
23429 +       mov r29=b0                              // save b0 in case of nested fault
23430 +#ifdef CONFIG_SMP
23431 +       mov r28=ar.ccv                          // save ar.ccv
23432 +       ;;
23433 +1:     ld8 r18=[r17]
23434 +       ;;
23435 +       mov ar.ccv=r18                          // set compare value for cmpxchg
23436 +       or r25=_PAGE_A,r18                      // set the accessed bit
23437 +       ;;
23438 +       cmpxchg8.acq r26=[r17],r25,ar.ccv
23439 +       mov r24=PAGE_SHIFT<<2
23440 +       ;;
23441 +       cmp.eq p6,p7=r26,r18
23442 +       ;;
23443 +#ifdef CONFIG_XEN
23444 +       mov r26=r8
23445 +       mov r8=r25
23446 +       ;;
23447 +(p6)   XEN_HYPER_ITC_I
23448 +       ;;
23449 +       mov r8=r26
23450 +       ;;
23451 +#else
23452 +(p6)   itc.i r25                               // install updated PTE
23453 +#endif
23454 +       ;;
23455 +       /*
23456 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
23457 +        * cannot possibly affect the following loads:
23458 +        */
23459 +       dv_serialize_data
23460 +
23461 +       ld8 r18=[r17]                           // read PTE again
23462 +       ;;
23463 +       cmp.eq p6,p7=r18,r25                    // is it same as the newly installed
23464 +       ;;
23465 +(p7)   ptc.l r16,r24
23466 +       mov b0=r29                              // restore b0
23467 +       mov ar.ccv=r28
23468 +#else /* !CONFIG_SMP */
23469 +       ;;
23470 +1:     ld8 r18=[r17]
23471 +       ;;
23472 +       or r18=_PAGE_A,r18                      // set the accessed bit
23473 +       mov b0=r29                              // restore b0
23474 +       ;;
23475 +       st8 [r17]=r18                           // store back updated PTE
23476 +       itc.i r18                               // install updated PTE
23477 +#endif /* !CONFIG_SMP */
23478 +       mov pr=r31,-1
23479 +#ifdef CONFIG_XEN
23480 +       XEN_HYPER_RFI;
23481 +#else
23482 +       rfi
23483 +#endif
23484 +END(iaccess_bit)
23485 +
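The mov r18=r8 / mov r8=r16 shuffling around XEN_HYPER_THASH, here and in the neighbouring handlers, is pure calling convention: the hyperprivop takes its argument and returns its result in r8, so the live value of r8 has to be parked and restored around it. Schematically, with xen_hyperprivop_thash() as a hypothetical C name for that call:

    #include <stdint.h>

    extern uint64_t xen_hyperprivop_thash(uint64_t va);  /* hypothetical wrapper */

    /* Compute the virtual address of the L3 PTE for 'va', as the
     * XEN_HYPER_THASH sequence above does (plain thash on bare metal). */
    static uint64_t pte_vaddr_for(uint64_t va)
    {
            return xen_hyperprivop_thash(va);  /* arg and result both travel in r8 */
    }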
23486 +       .org ia64_ivt+0x2800
23487 +/////////////////////////////////////////////////////////////////////////////////////////
23488 +// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
23489 +ENTRY(daccess_bit)
23490 +       DBG_FAULT(10)
23491 +       // Like Entry 8, except for data access
23492 +#ifdef CONFIG_XEN
23493 +       movl r16=XSI_IFA
23494 +       ;;
23495 +       ld8 r16=[r16]
23496 +       ;;
23497 +#else
23498 +       mov r16=cr.ifa                          // get the address that caused the fault
23499 +#endif
23500 +       movl r30=1f                             // load continuation point in case of nested fault
23501 +       ;;
23502 +#ifdef CONFIG_XEN
23503 +       mov r18=r8;
23504 +       mov r8=r16;
23505 +       XEN_HYPER_THASH;;
23506 +       mov r17=r8;
23507 +       mov r8=r18;;
23508 +#else
23509 +       thash r17=r16                           // compute virtual address of L3 PTE
23510 +#endif
23511 +       mov r31=pr
23512 +       mov r29=b0                              // save b0 in case of nested fault
23513 +#ifdef CONFIG_SMP
23514 +       mov r28=ar.ccv                          // save ar.ccv
23515 +       ;;
23516 +1:     ld8 r18=[r17]
23517 +       ;;                                      // avoid RAW on r18
23518 +       mov ar.ccv=r18                          // set compare value for cmpxchg
23519 +       or r25=_PAGE_A,r18                      // set the accessed bit
23520 +       ;;
23521 +       cmpxchg8.acq r26=[r17],r25,ar.ccv
23522 +       mov r24=PAGE_SHIFT<<2
23523 +       ;;
23524 +       cmp.eq p6,p7=r26,r18
23525 +       ;;
23526 +#ifdef CONFIG_XEN
23527 +       mov r26=r8
23528 +       mov r8=r25
23529 +       ;;
23530 +(p6)   XEN_HYPER_ITC_D
23531 +       ;;
23532 +       mov r8=r26
23533 +       ;;
23534 +#else
23535 +(p6)   itc.d r25                               // install updated PTE
23536 +#endif
23537 +       /*
23538 +        * Tell the assembler's dependency-violation checker that the above "itc" instructions
23539 +        * cannot possibly affect the following loads:
23540 +        */
23541 +       dv_serialize_data
23542 +       ;;
23543 +       ld8 r18=[r17]                           // read PTE again
23544 +       ;;
23545 +       cmp.eq p6,p7=r18,r25                    // is it same as the newly installed
23546 +       ;;
23547 +(p7)   ptc.l r16,r24
23548 +       mov ar.ccv=r28
23549 +#else
23550 +       ;;
23551 +1:     ld8 r18=[r17]
23552 +       ;;                                      // avoid RAW on r18
23553 +       or r18=_PAGE_A,r18                      // set the accessed bit
23554 +       ;;
23555 +       st8 [r17]=r18                           // store back updated PTE
23556 +       itc.d r18                               // install updated PTE
23557 +#endif
23558 +       mov b0=r29                              // restore b0
23559 +       mov pr=r31,-1
23560 +#ifdef CONFIG_XEN
23561 +       XEN_HYPER_RFI;
23562 +#else
23563 +       rfi
23564 +#endif
23565 +END(daccess_bit)
23566 +
23567 +       .org ia64_ivt+0x2c00
23568 +/////////////////////////////////////////////////////////////////////////////////////////
23569 +// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
23570 +ENTRY(break_fault)
23571 +       /*
23572 +        * The streamlined system call entry/exit paths only save/restore the initial part
23573 +        * of pt_regs.  This implies that the callers of system-calls must adhere to the
23574 +        * normal procedure calling conventions.
23575 +        *
23576 +        *   Registers to be saved & restored:
23577 +        *      CR registers: cr.ipsr, cr.iip, cr.ifs
23578 +        *      AR registers: ar.unat, ar.pfs, ar.rsc, ar.rnat, ar.bspstore, ar.fpsr
23579 +        *      others: pr, b0, b6, loadrs, r1, r11, r12, r13, r15
23580 +        *   Registers to be restored only:
23581 +        *      r8-r11: output value from the system call.
23582 +        *
23583 +        * During system call exit, scratch registers (including r15) are modified/cleared
23584 +        * to prevent leaking bits from kernel to user level.
23585 +        */
23586 +       DBG_FAULT(11)
23587 +       mov r16=IA64_KR(CURRENT)                // r16 = current task; 12 cycle read lat.
23588 +#ifdef CONFIG_XEN
23589 +       movl r31=XSI_IPSR
23590 +       ;;
23591 +       ld8 r29=[r31],XSI_IIP-XSI_IPSR          // get ipsr, point to iip
23592 +       mov r18=__IA64_BREAK_SYSCALL
23593 +       mov r21=ar.fpsr
23594 +       ;;
23595 +       ld8 r28=[r31],XSI_IIM-XSI_IIP           // get iip, point to iim
23596 +       mov r19=b6
23597 +       mov r25=ar.unat
23598 +       ;;
23599 +       ld8 r17=[r31]                           // get iim
23600 +       mov r27=ar.rsc
23601 +       mov r26=ar.pfs
23602 +       ;;
23603 +#else
23604 +       mov r17=cr.iim
23605 +       mov r18=__IA64_BREAK_SYSCALL
23606 +       mov r21=ar.fpsr
23607 +       mov r29=cr.ipsr
23608 +       mov r19=b6
23609 +       mov r25=ar.unat
23610 +       mov r27=ar.rsc
23611 +       mov r26=ar.pfs
23612 +       mov r28=cr.iip
23613 +#endif
23614 +       mov r31=pr                              // prepare to save predicates
23615 +       mov r20=r1
23616 +       ;;
23617 +       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16
23618 +       cmp.eq p0,p7=r18,r17                    // is this a system call? (p7 <- false, if so)
23619 +(p7)   br.cond.spnt non_syscall
23620 +       ;;
23621 +       ld1 r17=[r16]                           // load current->thread.on_ustack flag
23622 +       st1 [r16]=r0                            // clear current->thread.on_ustack flag
23623 +       add r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16   // set r1 for MINSTATE_START_SAVE_MIN_VIRT
23624 +       ;;
23625 +       invala
23626 +
23627 +       /* adjust return address so we skip over the break instruction: */
23628 +
23629 +       extr.u r8=r29,41,2                      // extract ei field from cr.ipsr
23630 +       ;;
23631 +       cmp.eq p6,p7=2,r8                       // ipsr.ei==2?
23632 +       mov r2=r1                               // setup r2 for ia64_syscall_setup
23633 +       ;;
23634 +(p6)   mov r8=0                                // clear ei to 0
23635 +(p6)   adds r28=16,r28                         // switch cr.iip to next bundle (cr.ipsr.ei wrapped)
23636 +(p7)   adds r8=1,r8                            // increment ei to next slot
23637 +       ;;
23638 +       cmp.eq pKStk,pUStk=r0,r17               // are we in kernel mode already?
23639 +       dep r29=r8,r29,41,2                     // insert new ei into cr.ipsr
23640 +       ;;
23641 +
23642 +       // switch from user to kernel RBS:
23643 +       MINSTATE_START_SAVE_MIN_VIRT
23644 +       br.call.sptk.many b7=ia64_syscall_setup
23645 +       ;;
23646 +#ifdef CONFIG_XEN
23647 +       mov r2=b0; br.call.sptk b0=xen_bsw1;; mov b0=r2;;
23648 +#else
23649 +       MINSTATE_END_SAVE_MIN_VIRT              // switch to bank 1
23650 +#endif
23651 +#ifdef CONFIG_XEN
23652 +       movl r3=XSI_PSR_IC
23653 +       mov r16=1
23654 +       ;;
23655 +#if 1
23656 +       st4 [r3]=r16,XSI_PSR_I-XSI_PSR_IC       // vpsr.ic = 1
23657 +       ;;
23658 +(p15)  st4 [r3]=r16,XSI_PEND-XSI_PSR_I         // if (p15) vpsr.i = 1
23659 +       mov r16=r0
23660 +       ;;
23661 +(p15)  ld4 r16=[r3]                            // if (pending_interrupts)
23662 +       ;;
23663 +       cmp.ne  p6,p0=r16,r0
23664 +       ;;
23665 +(p6)   ssm     psr.i                           //   do a real ssm psr.i
23666 +       ;;
23667 +#else
23668 +//     st4 [r3]=r16,XSI_PSR_I-XSI_PSR_IC       // vpsr.ic = 1
23669 +       adds r3=XSI_PSR_I-XSI_PSR_IC,r3         // SKIP vpsr.ic = 1
23670 +       ;;
23671 +(p15)  st4 [r3]=r16,XSI_PEND-XSI_PSR_I         // if (p15) vpsr.i = 1
23672 +       mov r16=r0
23673 +       ;;
23674 +(p15)  ld4 r16=[r3]                            // if (pending_interrupts)
23675 +       ;;
23676 +       cmp.ne  p6,p0=r16,r0
23677 +       ;;
23678 +//(p6) ssm     psr.i                           //   do a real ssm psr.i
23679 +//(p6) XEN_HYPER_SSM_I;
23680 +(p6)   break 0x7;
23681 +       ;;
23682 +#endif
23683 +       mov r3=NR_syscalls - 1
23684 +       ;;
23685 +#else
23686 +       ssm psr.ic | PSR_DEFAULT_BITS
23687 +       ;;
23688 +       srlz.i                                  // guarantee that interruption collection is on
23689 +       mov r3=NR_syscalls - 1
23690 +       ;;
23691 +(p15)  ssm psr.i                               // restore psr.i
23692 +#endif
23693 +       // p10==true means there are more than 8 out registers or r15's NaT is true
23694 +(p10)  br.cond.spnt.many ia64_ret_from_syscall
23695 +       ;;
23696 +       movl r16=sys_call_table
23697 +
23698 +       adds r15=-1024,r15                      // r15 contains the syscall number---subtract 1024
23699 +       movl r2=ia64_ret_from_syscall
23700 +       ;;
23701 +       shladd r20=r15,3,r16                    // r20 = sys_call_table + 8*(syscall-1024)
23702 +       cmp.leu p6,p7=r15,r3                    // (syscall > 0 && syscall < 1024 + NR_syscalls) ?
23703 +       mov rp=r2                               // set the real return addr
23704 +       ;;
23705 +(p6)   ld8 r20=[r20]                           // load address of syscall entry point
23706 +(p7)   movl r20=sys_ni_syscall
23707 +
23708 +       add r2=TI_FLAGS+IA64_TASK_SIZE,r13
23709 +       ;;
23710 +       ld4 r2=[r2]                             // r2 = current_thread_info()->flags
23711 +       ;;
23712 +       and r2=_TIF_SYSCALL_TRACEAUDIT,r2       // mask trace or audit
23713 +       ;;
23714 +       cmp.eq p8,p0=r2,r0
23715 +       mov b6=r20
23716 +       ;;
23717 +(p8)   br.call.sptk.many b6=b6                 // ignore this return addr
23718 +       br.cond.sptk ia64_trace_syscall
23719 +       // NOT REACHED
23720 +END(break_fault)
23721 +
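Stripped of the register juggling, the dispatch at the end of break_fault is a bounds-checked table lookup with a trace-flag fast path. A C sketch, with the table size, flag mask, and ia64_trace_syscall_path() as illustrative stand-ins only:

    typedef long (*syscall_fn)(long, long, long, long, long, long, long, long);

    extern syscall_fn sys_call_table[];
    extern long sys_ni_syscall(void);
    extern long ia64_trace_syscall_path(syscall_fn fn, long *a);  /* hypothetical */

    #define NR_SYSCALLS 285                  /* illustrative count only */
    #define TIF_TRACEAUDIT_MASK 0x9          /* illustrative mask only  */

    long break_fault_dispatch(unsigned long r15, long *a, unsigned long ti_flags)
    {
            unsigned long nr = r15 - 1024;   /* syscall numbers are biased by 1024 */
            syscall_fn fn = (nr <= NR_SYSCALLS - 1) ? sys_call_table[nr]
                                                    : (syscall_fn)sys_ni_syscall;
            if (ti_flags & TIF_TRACEAUDIT_MASK)
                    return ia64_trace_syscall_path(fn, a);  /* ia64_trace_syscall */
            return fn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
    }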
23722 +       .org ia64_ivt+0x3000
23723 +/////////////////////////////////////////////////////////////////////////////////////////
23724 +// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
23725 +ENTRY(interrupt)
23726 +       DBG_FAULT(12)
23727 +       mov r31=pr              // prepare to save predicates
23728 +       ;;
23729 +       SAVE_MIN_WITH_COVER     // uses r31; defines r2 and r3
23730 +#ifdef CONFIG_XEN
23731 +       movl r3=XSI_PSR_IC
23732 +       mov r14=1
23733 +       ;;
23734 +       st4 [r3]=r14
23735 +#else
23736 +       ssm psr.ic | PSR_DEFAULT_BITS
23737 +#endif
23738 +       ;;
23739 +       adds r3=8,r2            // set up second base pointer for SAVE_REST
23740 +       srlz.i                  // ensure everybody knows psr.ic is back on
23741 +       ;;
23742 +       SAVE_REST
23743 +       ;;
23744 +       alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
23745 +#ifdef CONFIG_XEN
23746 +       ;;
23747 +       br.call.sptk.many rp=xen_get_ivr
23748 +       ;;
23749 +       mov out0=r8             // pass cr.ivr as first arg
23750 +#else
23751 +       mov out0=cr.ivr         // pass cr.ivr as first arg
23752 +#endif
23753 +       add out1=16,sp          // pass pointer to pt_regs as second arg
23754 +       ;;
23755 +       srlz.d                  // make sure we see the effect of cr.ivr
23756 +       movl r14=ia64_leave_kernel
23757 +       ;;
23758 +       mov rp=r14
23759 +       br.call.sptk.many b6=ia64_handle_irq
23760 +END(interrupt)
23761 +
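Two paravirtualizations appear in this entry: interruption collection is re-enabled by storing 1 to XSI_PSR_IC instead of ssm psr.ic, and the vector is obtained through xen_get_ivr rather than by reading cr.ivr. Compressed into C (ssm_psr_ic() and read_cr_ivr() are hypothetical names for the native operations):

    #include <stdint.h>

    extern uint64_t xen_get_ivr(void);       /* provided by the patch */
    extern uint64_t read_cr_ivr(void);       /* hypothetical: mov rX=cr.ivr */
    extern void ssm_psr_ic(void);            /* hypothetical: ssm psr.ic; srlz.i */
    extern void ia64_handle_irq(uint64_t vector, void *regs);

    void interrupt_entry(volatile uint32_t *xsi_psr_ic, void *regs, int on_xen)
    {
            if (on_xen)
                    *xsi_psr_ic = 1;         /* st4 [XSI_PSR_IC]=r14 above */
            else
                    ssm_psr_ic();
            ia64_handle_irq(on_xen ? xen_get_ivr() : read_cr_ivr(), regs);
    }                                        /* returns via ia64_leave_kernel */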
23762 +       .org ia64_ivt+0x3400
23763 +/////////////////////////////////////////////////////////////////////////////////////////
23764 +// 0x3400 Entry 13 (size 64 bundles) Reserved
23765 +       DBG_FAULT(13)
23766 +       FAULT(13)
23767 +
23768 +       .org ia64_ivt+0x3800
23769 +/////////////////////////////////////////////////////////////////////////////////////////
23770 +// 0x3800 Entry 14 (size 64 bundles) Reserved
23771 +       DBG_FAULT(14)
23772 +       FAULT(14)
23773 +
23774 +       /*
23775 +        * There is no particular reason for this code to be here, other than that
23776 +        * there happens to be space here that would go unused otherwise.  If this
23777 +        * fault ever gets "unreserved", simply move the following code to a more
23778 +        * suitable spot...
23779 +        *
23780 +        * ia64_syscall_setup() is a separate subroutine so that it can
23781 +        *      allocate stacked registers and safely demine any
23782 +        *      potential NaT values from the input registers.
23783 +        *
23784 +        * On entry:
23785 +        *      - executing on bank 0 or bank 1 register set (doesn't matter)
23786 +        *      -  r1: stack pointer
23787 +        *      -  r2: current task pointer
23788 +        *      -  r3: preserved
23789 +        *      - r11: original contents (saved ar.pfs to be saved)
23790 +        *      - r12: original contents (sp to be saved)
23791 +        *      - r13: original contents (tp to be saved)
23792 +        *      - r15: original contents (syscall # to be saved)
23793 +        *      - r18: saved bsp (after switching to kernel stack)
23794 +        *      - r19: saved b6
23795 +        *      - r20: saved r1 (gp)
23796 +        *      - r21: saved ar.fpsr
23797 +        *      - r22: kernel's register backing store base (krbs_base)
23798 +        *      - r23: saved ar.bspstore
23799 +        *      - r24: saved ar.rnat
23800 +        *      - r25: saved ar.unat
23801 +        *      - r26: saved ar.pfs
23802 +        *      - r27: saved ar.rsc
23803 +        *      - r28: saved cr.iip
23804 +        *      - r29: saved cr.ipsr
23805 +        *      - r31: saved pr
23806 +        *      -  b0: original contents (to be saved)
23807 +        * On exit:
23808 +        *      - executing on bank 1 registers
23809 +        *      - psr.ic enabled, interrupts restored
23810 +        *      -  p10: TRUE if syscall is invoked with more than 8 out
23811 +        *              registers or r15's Nat is true
23812 +        *      -  r1: kernel's gp
23813 +        *      -  r3: preserved (same as on entry)
23814 +        *      -  r8: -EINVAL if p10 is true
23815 +        *      - r12: points to kernel stack
23816 +        *      - r13: points to current task
23817 +        *      - p15: TRUE if interrupts need to be re-enabled
23818 +        *      - ar.fpsr: set to kernel settings
23819 +        */
23820 +#ifndef CONFIG_XEN
23821 +GLOBAL_ENTRY(ia64_syscall_setup)
23822 +#if PT(B6) != 0
23823 +# error This code assumes that b6 is the first field in pt_regs.
23824 +#endif
23825 +       st8 [r1]=r19                            // save b6
23826 +       add r16=PT(CR_IPSR),r1                  // initialize first base pointer
23827 +       add r17=PT(R11),r1                      // initialize second base pointer
23828 +       ;;
23829 +       alloc r19=ar.pfs,8,0,0,0                // ensure in0-in7 are writable
23830 +       st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR)    // save cr.ipsr
23831 +       tnat.nz p8,p0=in0
23832 +
23833 +       st8.spill [r17]=r11,PT(CR_IIP)-PT(R11)  // save r11
23834 +       tnat.nz p9,p0=in1
23835 +(pKStk)        mov r18=r0                              // make sure r18 isn't NaT
23836 +       ;;
23837 +
23838 +       st8 [r16]=r26,PT(CR_IFS)-PT(AR_PFS)     // save ar.pfs
23839 +       st8 [r17]=r28,PT(AR_UNAT)-PT(CR_IIP)    // save cr.iip
23840 +       mov r28=b0                              // save b0 (2 cyc)
23841 +       ;;
23842 +
23843 +       st8 [r17]=r25,PT(AR_RSC)-PT(AR_UNAT)    // save ar.unat
23844 +       dep r19=0,r19,38,26                     // clear all bits but 0..37 [I0]
23845 +(p8)   mov in0=-1
23846 +       ;;
23847 +
23848 +       st8 [r16]=r19,PT(AR_RNAT)-PT(CR_IFS)    // store ar.pfs.pfm in cr.ifs
23849 +       extr.u r11=r19,7,7      // I0           // get sol of ar.pfs
23850 +       and r8=0x7f,r19         // A            // get sof of ar.pfs
23851 +
23852 +       st8 [r17]=r27,PT(AR_BSPSTORE)-PT(AR_RSC)// save ar.rsc
23853 +       tbit.nz p15,p0=r29,IA64_PSR_I_BIT // I0
23854 +(p9)   mov in1=-1
23855 +       ;;
23856 +
23857 +(pUStk) sub r18=r18,r22                                // r18=RSE.ndirty*8
23858 +       tnat.nz p10,p0=in2
23859 +       add r11=8,r11
23860 +       ;;
23861 +(pKStk) adds r16=PT(PR)-PT(AR_RNAT),r16                // skip over ar_rnat field
23862 +(pKStk) adds r17=PT(B0)-PT(AR_BSPSTORE),r17    // skip over ar_bspstore field
23863 +       tnat.nz p11,p0=in3
23864 +       ;;
23865 +(p10)  mov in2=-1
23866 +       tnat.nz p12,p0=in4                              // [I0]
23867 +(p11)  mov in3=-1
23868 +       ;;
23869 +(pUStk) st8 [r16]=r24,PT(PR)-PT(AR_RNAT)       // save ar.rnat
23870 +(pUStk) st8 [r17]=r23,PT(B0)-PT(AR_BSPSTORE)   // save ar.bspstore
23871 +       shl r18=r18,16                          // compute ar.rsc to be used for "loadrs"
23872 +       ;;
23873 +       st8 [r16]=r31,PT(LOADRS)-PT(PR)         // save predicates
23874 +       st8 [r17]=r28,PT(R1)-PT(B0)             // save b0
23875 +       tnat.nz p13,p0=in5                              // [I0]
23876 +       ;;
23877 +       st8 [r16]=r18,PT(R12)-PT(LOADRS)        // save ar.rsc value for "loadrs"
23878 +       st8.spill [r17]=r20,PT(R13)-PT(R1)      // save original r1
23879 +(p12)  mov in4=-1
23880 +       ;;
23881 +
23882 +.mem.offset 0,0; st8.spill [r16]=r12,PT(AR_FPSR)-PT(R12)       // save r12
23883 +.mem.offset 8,0; st8.spill [r17]=r13,PT(R15)-PT(R13)           // save r13
23884 +(p13)  mov in5=-1
23885 +       ;;
23886 +       st8 [r16]=r21,PT(R8)-PT(AR_FPSR)        // save ar.fpsr
23887 +       tnat.nz p14,p0=in6
23888 +       cmp.lt p10,p9=r11,r8    // frame size can't be more than local+8
23889 +       ;;
23890 +       stf8 [r16]=f1           // ensure pt_regs.r8 != 0 (see handle_syscall_error)
23891 +(p9)   tnat.nz p10,p0=r15
23892 +       adds r12=-16,r1         // switch to kernel memory stack (with 16 bytes of scratch)
23893 +
23894 +       st8.spill [r17]=r15                     // save r15
23895 +       tnat.nz p8,p0=in7
23896 +       nop.i 0
23897 +
23898 +       mov r13=r2                              // establish `current'
23899 +       movl r1=__gp                            // establish kernel global pointer
23900 +       ;;
23901 +(p14)  mov in6=-1
23902 +(p8)   mov in7=-1
23903 +       nop.i 0
23904 +
23905 +       cmp.eq pSys,pNonSys=r0,r0               // set pSys=1, pNonSys=0
23906 +       movl r17=FPSR_DEFAULT
23907 +       ;;
23908 +       mov.m ar.fpsr=r17                       // set ar.fpsr to kernel default value
23909 +(p10)  mov r8=-EINVAL
23910 +       br.ret.sptk.many b7
23911 +END(ia64_syscall_setup)
23912 +#endif
23913 +
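The tnat.nz / mov inN=-1 pairs threaded through ia64_syscall_setup are the "demining" its header comment promises: any argument register whose NaT bit is set is replaced with -1 so the later st8.spill cannot raise a NaT-consumption fault (a NaT r15, like an oversized frame, additionally sets p10 and yields -EINVAL). In scalar C terms, with reg_is_nat() as a hypothetical tnat.nz equivalent:

    #include <stdbool.h>
    #include <stdint.h>

    extern bool reg_is_nat(int in_reg);  /* hypothetical: tnat.nz pX,p0=inN */

    /* Demine in0..in7: a NaT argument becomes -1 instead of faulting later. */
    void demine_syscall_args(int64_t in[8])
    {
            for (int i = 0; i < 8; i++)
                    if (reg_is_nat(i))
                            in[i] = -1;
    }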
23914 +       .org ia64_ivt+0x3c00
23915 +/////////////////////////////////////////////////////////////////////////////////////////
23916 +// 0x3c00 Entry 15 (size 64 bundles) Reserved
23917 +       DBG_FAULT(15)
23918 +       FAULT(15)
23919 +
23920 +       /*
23921 +        * Squatting in this space ...
23922 +        *
23923 +        * This special case dispatcher for illegal operation faults allows preserved
23924 +        * registers to be modified through a callback function (asm only) that is handed
23925 +        * back from the fault handler in r8. Up to three arguments can be passed to the
23926 +        * callback function by returning an aggregate with the callback as its first
23927 +        * element, followed by the arguments.
23928 +        */
23929 +ENTRY(dispatch_illegal_op_fault)
23930 +       SAVE_MIN_WITH_COVER
23931 +       ssm psr.ic | PSR_DEFAULT_BITS
23932 +       ;;
23933 +       srlz.i          // guarantee that interruption collection is on
23934 +       ;;
23935 +(p15)  ssm psr.i       // restore psr.i
23936 +       adds r3=8,r2    // set up second base pointer for SAVE_REST
23937 +       ;;
23938 +       alloc r14=ar.pfs,0,0,1,0        // must be first in insn group
23939 +       mov out0=ar.ec
23940 +       ;;
23941 +       SAVE_REST
23942 +       ;;
23943 +       br.call.sptk.many rp=ia64_illegal_op_fault
23944 +.ret0: ;;
23945 +       alloc r14=ar.pfs,0,0,3,0        // must be first in insn group
23946 +       mov out0=r9
23947 +       mov out1=r10
23948 +       mov out2=r11
23949 +       movl r15=ia64_leave_kernel
23950 +       ;;
23951 +       mov rp=r15
23952 +       mov b6=r8
23953 +       ;;
23954 +       cmp.ne p6,p0=0,r8
23955 +(p6)   br.call.dpnt.many b6=b6         // call returns to ia64_leave_kernel
23956 +       br.sptk.many ia64_leave_kernel
23957 +END(dispatch_illegal_op_fault)
23958 +
23959 +       .org ia64_ivt+0x4000
23960 +/////////////////////////////////////////////////////////////////////////////////////////
23961 +// 0x4000 Entry 16 (size 64 bundles) Reserved
23962 +       DBG_FAULT(16)
23963 +       FAULT(16)
23964 +
23965 +       .org ia64_ivt+0x4400
23966 +/////////////////////////////////////////////////////////////////////////////////////////
23967 +// 0x4400 Entry 17 (size 64 bundles) Reserved
23968 +       DBG_FAULT(17)
23969 +       FAULT(17)
23970 +
23971 +ENTRY(non_syscall)
23972 +       SAVE_MIN_WITH_COVER
23973 +
23974 +       // There is no particular reason for this code to be here, other than that
23975 +       // there happens to be space here that would go unused otherwise.  If this
23976 +       // fault ever gets "unreserved", simply move the following code to a more
23977 +       // suitable spot...
23978 +
23979 +       alloc r14=ar.pfs,0,0,2,0
23980 +       mov out0=cr.iim
23981 +       add out1=16,sp
23982 +       adds r3=8,r2                    // set up second base pointer for SAVE_REST
23983 +
23984 +       ssm psr.ic | PSR_DEFAULT_BITS
23985 +       ;;
23986 +       srlz.i                          // guarantee that interruption collection is on
23987 +       ;;
23988 +(p15)  ssm psr.i                       // restore psr.i
23989 +       movl r15=ia64_leave_kernel
23990 +       ;;
23991 +       SAVE_REST
23992 +       mov rp=r15
23993 +       ;;
23994 +       br.call.sptk.many b6=ia64_bad_break     // avoid WAW on CFM and ignore return addr
23995 +END(non_syscall)
23996 +
23997 +       .org ia64_ivt+0x4800
23998 +/////////////////////////////////////////////////////////////////////////////////////////
23999 +// 0x4800 Entry 18 (size 64 bundles) Reserved
24000 +       DBG_FAULT(18)
24001 +       FAULT(18)
24002 +
24003 +       /*
24004 +        * There is no particular reason for this code to be here, other than that
24005 +        * there happens to be space here that would go unused otherwise.  If this
24006 +        * fault ever gets "unreserved", simply move the following code to a more
24007 +        * suitable spot...
24008 +        */
24009 +
24010 +ENTRY(dispatch_unaligned_handler)
24011 +       SAVE_MIN_WITH_COVER
24012 +       ;;
24013 +       alloc r14=ar.pfs,0,0,2,0                // now it's safe (must be first in insn group!)
24014 +       mov out0=cr.ifa
24015 +       adds out1=16,sp
24016 +
24017 +       ssm psr.ic | PSR_DEFAULT_BITS
24018 +       ;;
24019 +       srlz.i                                  // guarantee that interruption collection is on
24020 +       ;;
24021 +(p15)  ssm psr.i                               // restore psr.i
24022 +       adds r3=8,r2                            // set up second base pointer
24023 +       ;;
24024 +       SAVE_REST
24025 +       movl r14=ia64_leave_kernel
24026 +       ;;
24027 +       mov rp=r14
24028 +       br.sptk.many ia64_prepare_handle_unaligned
24029 +END(dispatch_unaligned_handler)
24030 +
24031 +       .org ia64_ivt+0x4c00
24032 +/////////////////////////////////////////////////////////////////////////////////////////
24033 +// 0x4c00 Entry 19 (size 64 bundles) Reserved
24034 +       DBG_FAULT(19)
24035 +       FAULT(19)
24036 +
24037 +       /*
24038 +        * There is no particular reason for this code to be here, other than that
24039 +        * there happens to be space here that would go unused otherwise.  If this
24040 +        * fault ever gets "unreserved", simply move the following code to a more
24041 +        * suitable spot...
24042 +        */
24043 +
24044 +ENTRY(dispatch_to_fault_handler)
24045 +       /*
24046 +        * Input:
24047 +        *      psr.ic: off
24048 +        *      r19:    fault vector number (e.g., 24 for General Exception)
24049 +        *      r31:    contains saved predicates (pr)
24050 +        */
24051 +       SAVE_MIN_WITH_COVER_R19
24052 +       alloc r14=ar.pfs,0,0,5,0
24053 +       mov out0=r15
24054 +#ifdef CONFIG_XEN
24055 +       movl out1=XSI_ISR
24056 +       ;;
24057 +       adds out2=XSI_IFA-XSI_ISR,out1
24058 +       adds out3=XSI_IIM-XSI_ISR,out1
24059 +       adds out4=XSI_ITIR-XSI_ISR,out1
24060 +       ;;
24061 +       ld8 out1=[out1]
24062 +       ld8 out2=[out2]
24063 +       ld8 out3=[out3]
24064 +       ld8 out4=[out4]
24065 +       ;;
24066 +#else
24067 +       mov out1=cr.isr
24068 +       mov out2=cr.ifa
24069 +       mov out3=cr.iim
24070 +       mov out4=cr.itir
24071 +       ;;
24072 +#endif
24073 +       ssm psr.ic | PSR_DEFAULT_BITS
24074 +       ;;
24075 +       srlz.i                                  // guarantee that interruption collection is on
24076 +       ;;
24077 +(p15)  ssm psr.i                               // restore psr.i
24078 +       adds r3=8,r2                            // set up second base pointer for SAVE_REST
24079 +       ;;
24080 +       SAVE_REST
24081 +       movl r14=ia64_leave_kernel
24082 +       ;;
24083 +       mov rp=r14
24084 +       br.call.sptk.many b6=ia64_fault
24085 +END(dispatch_to_fault_handler)
24086 +
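dispatch_to_fault_handler boils down to gathering the interruption state and handing it to the common C handler; under Xen that state is read from the XSI shadow area rather than from the cr registers. Schematically, with read_isr() and friends as hypothetical accessors covering both cases and ia64_fault() shown with a simplified signature:

    extern void ia64_fault(unsigned long vector, unsigned long isr,
                           unsigned long ifa, unsigned long iim,
                           unsigned long itir);   /* simplified signature */

    extern unsigned long read_isr(void), read_ifa(void),
                         read_iim(void), read_itir(void);  /* cr.* or XSI_* */

    void dispatch_to_fault(unsigned long vector)  /* vector arrives in r19 */
    {
            ia64_fault(vector, read_isr(), read_ifa(), read_iim(), read_itir());
    }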
24087 +//
24088 +// --- End of long entries, Beginning of short entries
24089 +//
24090 +
24091 +       .org ia64_ivt+0x5000
24092 +/////////////////////////////////////////////////////////////////////////////////////////
24093 +// 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49)
24094 +ENTRY(page_not_present)
24095 +       DBG_FAULT(20)
24096 +       mov r16=cr.ifa
24097 +       rsm psr.dt
24098 +       /*
24099 +        * The Linux page fault handler doesn't expect non-present pages to be in
24100 +        * the TLB.  Flush the existing entry now, so we meet that expectation.
24101 +        */
24102 +       mov r17=PAGE_SHIFT<<2
24103 +       ;;
24104 +       ptc.l r16,r17
24105 +       ;;
24106 +       mov r31=pr
24107 +       srlz.d
24108 +       br.sptk.many page_fault
24109 +END(page_not_present)
24110 +
24111 +       .org ia64_ivt+0x5100
24112 +/////////////////////////////////////////////////////////////////////////////////////////
24113 +// 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52)
24114 +ENTRY(key_permission)
24115 +       DBG_FAULT(21)
24116 +       mov r16=cr.ifa
24117 +       rsm psr.dt
24118 +       mov r31=pr
24119 +       ;;
24120 +       srlz.d
24121 +       br.sptk.many page_fault
24122 +END(key_permission)
24123 +
24124 +       .org ia64_ivt+0x5200
24125 +/////////////////////////////////////////////////////////////////////////////////////////
24126 +// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
24127 +ENTRY(iaccess_rights)
24128 +       DBG_FAULT(22)
24129 +       mov r16=cr.ifa
24130 +       rsm psr.dt
24131 +       mov r31=pr
24132 +       ;;
24133 +       srlz.d
24134 +       br.sptk.many page_fault
24135 +END(iaccess_rights)
24136 +
24137 +       .org ia64_ivt+0x5300
24138 +/////////////////////////////////////////////////////////////////////////////////////////
24139 +// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
24140 +ENTRY(daccess_rights)
24141 +       DBG_FAULT(23)
24142 +#ifdef CONFIG_XEN
24143 +       movl r16=XSI_IFA
24144 +       ;;
24145 +       ld8 r16=[r16]
24146 +       ;;
24147 +       XEN_HYPER_RSM_PSR_DT;
24148 +#else
24149 +       mov r16=cr.ifa
24150 +       rsm psr.dt
24151 +#endif
24152 +       mov r31=pr
24153 +       ;;
24154 +       srlz.d
24155 +       br.sptk.many page_fault
24156 +END(daccess_rights)
24157 +
24158 +       .org ia64_ivt+0x5400
24159 +/////////////////////////////////////////////////////////////////////////////////////////
24160 +// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
24161 +ENTRY(general_exception)
24162 +       DBG_FAULT(24)
24163 +       mov r16=cr.isr
24164 +       mov r31=pr
24165 +       ;;
24166 +       cmp4.eq p6,p0=0,r16
24167 +(p6)   br.sptk.many dispatch_illegal_op_fault
24168 +       ;;
24169 +       mov r19=24              // fault number
24170 +       br.sptk.many dispatch_to_fault_handler
24171 +END(general_exception)
24172 +
24173 +       .org ia64_ivt+0x5500
24174 +/////////////////////////////////////////////////////////////////////////////////////////
24175 +// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
24176 +ENTRY(disabled_fp_reg)
24177 +       DBG_FAULT(25)
24178 +       rsm psr.dfh             // ensure we can access fph
24179 +       ;;
24180 +       srlz.d
24181 +       mov r31=pr
24182 +       mov r19=25
24183 +       br.sptk.many dispatch_to_fault_handler
24184 +END(disabled_fp_reg)
24185 +
24186 +       .org ia64_ivt+0x5600
24187 +/////////////////////////////////////////////////////////////////////////////////////////
24188 +// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
24189 +ENTRY(nat_consumption)
24190 +       DBG_FAULT(26)
24191 +       FAULT(26)
24192 +END(nat_consumption)
24193 +
24194 +       .org ia64_ivt+0x5700
24195 +/////////////////////////////////////////////////////////////////////////////////////////
24196 +// 0x5700 Entry 27 (size 16 bundles) Speculation (40)
24197 +ENTRY(speculation_vector)
24198 +       DBG_FAULT(27)
24199 +       /*
24200 +        * A [f]chk.[as] instruction needs to take the branch to the recovery code but
24201 +        * this part of the architecture is not implemented in hardware on some CPUs, such
24202 +        * as Itanium.  Thus, in general we need to emulate the behavior.  IIM contains
24203 +        * the relative target (not yet sign extended).  So after sign extending it we
24204 +        * simply add it to IIP.  We also need to reset the EI field of the IPSR to zero,
24205 +        * i.e., the slot to restart into.
24206 +        *
24207 +        * cr.iim contains zero_ext(imm21)
24208 +        */
24209 +       mov r18=cr.iim
24210 +       ;;
24211 +       mov r17=cr.iip
24212 +       shl r18=r18,43                  // put sign bit in position (43=64-21)
24213 +       ;;
24214 +
24215 +       mov r16=cr.ipsr
24216 +       shr r18=r18,39                  // sign extend (39=43-4)
24217 +       ;;
24218 +
24219 +       add r17=r17,r18                 // now add the offset
24220 +       ;;
24221 +       mov cr.iip=r17
24222 +       dep r16=0,r16,41,2              // clear EI
24223 +       ;;
24224 +
24225 +       mov cr.ipsr=r16
24226 +       ;;
24227 +
24228 +#ifdef CONFIG_XEN
24229 +       XEN_HYPER_RFI;
24230 +#else
24231 +       rfi
24232 +#endif
24233 +END(speculation_vector)
24234 +
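The shl/shr pair in speculation_vector is the sign extension the comment asks for: bit 20 of imm21 is moved up to bit 63 and an arithmetic shift brings it back down, leaving a net shift of 4, so the offset is sign_ext(imm21) scaled by the 16-byte bundle size. The same arithmetic in C (assuming the usual two's-complement arithmetic right shift):

    #include <stdint.h>

    /* Sign-extend imm21 from cr.iim and scale to bytes (16-byte bundles). */
    static int64_t speculation_offset(uint64_t iim)
    {
            int64_t t = (int64_t)(iim << 43);  /* imm21 sign bit -> bit 63 */
            return t >> 39;                    /* back down; net <<4 keeps bundle scale */
    }

    /* new_iip = old_iip + speculation_offset(iim); cr.ipsr.ei is cleared. */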
24235 +       .org ia64_ivt+0x5800
24236 +/////////////////////////////////////////////////////////////////////////////////////////
24237 +// 0x5800 Entry 28 (size 16 bundles) Reserved
24238 +       DBG_FAULT(28)
24239 +       FAULT(28)
24240 +
24241 +       .org ia64_ivt+0x5900
24242 +/////////////////////////////////////////////////////////////////////////////////////////
24243 +// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
24244 +ENTRY(debug_vector)
24245 +       DBG_FAULT(29)
24246 +       FAULT(29)
24247 +END(debug_vector)
24248 +
24249 +       .org ia64_ivt+0x5a00
24250 +/////////////////////////////////////////////////////////////////////////////////////////
24251 +// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
24252 +ENTRY(unaligned_access)
24253 +       DBG_FAULT(30)
24254 +       mov r16=cr.ipsr
24255 +       mov r31=pr              // prepare to save predicates
24256 +       ;;
24257 +       br.sptk.many dispatch_unaligned_handler
24258 +END(unaligned_access)
24259 +
24260 +       .org ia64_ivt+0x5b00
24261 +/////////////////////////////////////////////////////////////////////////////////////////
24262 +// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
24263 +ENTRY(unsupported_data_reference)
24264 +       DBG_FAULT(31)
24265 +       FAULT(31)
24266 +END(unsupported_data_reference)
24267 +
24268 +       .org ia64_ivt+0x5c00
24269 +/////////////////////////////////////////////////////////////////////////////////////////
24270 +// 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64)
24271 +ENTRY(floating_point_fault)
24272 +       DBG_FAULT(32)
24273 +       FAULT(32)
24274 +END(floating_point_fault)
24275 +
24276 +       .org ia64_ivt+0x5d00
24277 +/////////////////////////////////////////////////////////////////////////////////////////
24278 +// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
24279 +ENTRY(floating_point_trap)
24280 +       DBG_FAULT(33)
24281 +       FAULT(33)
24282 +END(floating_point_trap)
24283 +
24284 +       .org ia64_ivt+0x5e00
24285 +/////////////////////////////////////////////////////////////////////////////////////////
24286 +// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
24287 +ENTRY(lower_privilege_trap)
24288 +       DBG_FAULT(34)
24289 +       FAULT(34)
24290 +END(lower_privilege_trap)
24291 +
24292 +       .org ia64_ivt+0x5f00
24293 +/////////////////////////////////////////////////////////////////////////////////////////
24294 +// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
24295 +ENTRY(taken_branch_trap)
24296 +       DBG_FAULT(35)
24297 +       FAULT(35)
24298 +END(taken_branch_trap)
24299 +
24300 +       .org ia64_ivt+0x6000
24301 +/////////////////////////////////////////////////////////////////////////////////////////
24302 +// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
24303 +ENTRY(single_step_trap)
24304 +       DBG_FAULT(36)
24305 +       FAULT(36)
24306 +END(single_step_trap)
24307 +
24308 +       .org ia64_ivt+0x6100
24309 +/////////////////////////////////////////////////////////////////////////////////////////
24310 +// 0x6100 Entry 37 (size 16 bundles) Reserved
24311 +       DBG_FAULT(37)
24312 +       FAULT(37)
24313 +
24314 +       .org ia64_ivt+0x6200
24315 +/////////////////////////////////////////////////////////////////////////////////////////
24316 +// 0x6200 Entry 38 (size 16 bundles) Reserved
24317 +       DBG_FAULT(38)
24318 +       FAULT(38)
24319 +
24320 +       .org ia64_ivt+0x6300
24321 +/////////////////////////////////////////////////////////////////////////////////////////
24322 +// 0x6300 Entry 39 (size 16 bundles) Reserved
24323 +       DBG_FAULT(39)
24324 +       FAULT(39)
24325 +
24326 +       .org ia64_ivt+0x6400
24327 +/////////////////////////////////////////////////////////////////////////////////////////
24328 +// 0x6400 Entry 40 (size 16 bundles) Reserved
24329 +       DBG_FAULT(40)
24330 +       FAULT(40)
24331 +
24332 +       .org ia64_ivt+0x6500
24333 +/////////////////////////////////////////////////////////////////////////////////////////
24334 +// 0x6500 Entry 41 (size 16 bundles) Reserved
24335 +       DBG_FAULT(41)
24336 +       FAULT(41)
24337 +
24338 +       .org ia64_ivt+0x6600
24339 +/////////////////////////////////////////////////////////////////////////////////////////
24340 +// 0x6600 Entry 42 (size 16 bundles) Reserved
24341 +       DBG_FAULT(42)
24342 +       FAULT(42)
24343 +
24344 +       .org ia64_ivt+0x6700
24345 +/////////////////////////////////////////////////////////////////////////////////////////
24346 +// 0x6700 Entry 43 (size 16 bundles) Reserved
24347 +       DBG_FAULT(43)
24348 +       FAULT(43)
24349 +
24350 +       .org ia64_ivt+0x6800
24351 +/////////////////////////////////////////////////////////////////////////////////////////
24352 +// 0x6800 Entry 44 (size 16 bundles) Reserved
24353 +       DBG_FAULT(44)
24354 +       FAULT(44)
24355 +
24356 +       .org ia64_ivt+0x6900
24357 +/////////////////////////////////////////////////////////////////////////////////////////
24358 +// 0x6900 Entry 45 (size 16 bundles) IA-32 Exception (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
24359 +ENTRY(ia32_exception)
24360 +       DBG_FAULT(45)
24361 +       FAULT(45)
24362 +END(ia32_exception)
24363 +
24364 +       .org ia64_ivt+0x6a00
24365 +/////////////////////////////////////////////////////////////////////////////////////////
24366 +// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept  (30,31,59,70,71)
24367 +ENTRY(ia32_intercept)
24368 +       DBG_FAULT(46)
24369 +#ifdef CONFIG_IA32_SUPPORT
24370 +       mov r31=pr
24371 +       mov r16=cr.isr
24372 +       ;;
24373 +       extr.u r17=r16,16,8     // get ISR.code
24374 +       mov r18=ar.eflag
24375 +       mov r19=cr.iim          // old eflag value
24376 +       ;;
24377 +       cmp.ne p6,p0=2,r17
24378 +(p6)   br.cond.spnt 1f         // not a system flag fault
24379 +       xor r16=r18,r19
24380 +       ;;
24381 +       extr.u r17=r16,18,1     // get the eflags.ac bit
24382 +       ;;
24383 +       cmp.eq p6,p0=0,r17
24384 +(p6)   br.cond.spnt 1f         // eflags.ac bit didn't change
24385 +       ;;
24386 +       mov pr=r31,-1           // restore predicate registers
24387 +#ifdef CONFIG_XEN
24388 +       XEN_HYPER_RFI;
24389 +#else
24390 +       rfi
24391 +#endif
24392 +
24393 +1:
24394 +#endif // CONFIG_IA32_SUPPORT
24395 +       FAULT(46)
24396 +END(ia32_intercept)
24397 +
24398 +       .org ia64_ivt+0x6b00
24399 +/////////////////////////////////////////////////////////////////////////////////////////
24400 +// 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt  (74)
24401 +ENTRY(ia32_interrupt)
24402 +       DBG_FAULT(47)
24403 +#ifdef CONFIG_IA32_SUPPORT
24404 +       mov r31=pr
24405 +       br.sptk.many dispatch_to_ia32_handler
24406 +#else
24407 +       FAULT(47)
24408 +#endif
24409 +END(ia32_interrupt)
24410 +
24411 +       .org ia64_ivt+0x6c00
24412 +/////////////////////////////////////////////////////////////////////////////////////////
24413 +// 0x6c00 Entry 48 (size 16 bundles) Reserved
24414 +       DBG_FAULT(48)
24415 +       FAULT(48)
24416 +
24417 +       .org ia64_ivt+0x6d00
24418 +/////////////////////////////////////////////////////////////////////////////////////////
24419 +// 0x6d00 Entry 49 (size 16 bundles) Reserved
24420 +       DBG_FAULT(49)
24421 +       FAULT(49)
24422 +
24423 +       .org ia64_ivt+0x6e00
24424 +/////////////////////////////////////////////////////////////////////////////////////////
24425 +// 0x6e00 Entry 50 (size 16 bundles) Reserved
24426 +       DBG_FAULT(50)
24427 +       FAULT(50)
24428 +
24429 +       .org ia64_ivt+0x6f00
24430 +/////////////////////////////////////////////////////////////////////////////////////////
24431 +// 0x6f00 Entry 51 (size 16 bundles) Reserved
24432 +       DBG_FAULT(51)
24433 +       FAULT(51)
24434 +
24435 +       .org ia64_ivt+0x7000
24436 +/////////////////////////////////////////////////////////////////////////////////////////
24437 +// 0x7000 Entry 52 (size 16 bundles) Reserved
24438 +       DBG_FAULT(52)
24439 +       FAULT(52)
24440 +
24441 +       .org ia64_ivt+0x7100
24442 +/////////////////////////////////////////////////////////////////////////////////////////
24443 +// 0x7100 Entry 53 (size 16 bundles) Reserved
24444 +       DBG_FAULT(53)
24445 +       FAULT(53)
24446 +
24447 +       .org ia64_ivt+0x7200
24448 +/////////////////////////////////////////////////////////////////////////////////////////
24449 +// 0x7200 Entry 54 (size 16 bundles) Reserved
24450 +       DBG_FAULT(54)
24451 +       FAULT(54)
24452 +
24453 +       .org ia64_ivt+0x7300
24454 +/////////////////////////////////////////////////////////////////////////////////////////
24455 +// 0x7300 Entry 55 (size 16 bundles) Reserved
24456 +       DBG_FAULT(55)
24457 +       FAULT(55)
24458 +
24459 +       .org ia64_ivt+0x7400
24460 +/////////////////////////////////////////////////////////////////////////////////////////
24461 +// 0x7400 Entry 56 (size 16 bundles) Reserved
24462 +       DBG_FAULT(56)
24463 +       FAULT(56)
24464 +
24465 +       .org ia64_ivt+0x7500
24466 +/////////////////////////////////////////////////////////////////////////////////////////
24467 +// 0x7500 Entry 57 (size 16 bundles) Reserved
24468 +       DBG_FAULT(57)
24469 +       FAULT(57)
24470 +
24471 +       .org ia64_ivt+0x7600
24472 +/////////////////////////////////////////////////////////////////////////////////////////
24473 +// 0x7600 Entry 58 (size 16 bundles) Reserved
24474 +       DBG_FAULT(58)
24475 +       FAULT(58)
24476 +
24477 +       .org ia64_ivt+0x7700
24478 +/////////////////////////////////////////////////////////////////////////////////////////
24479 +// 0x7700 Entry 59 (size 16 bundles) Reserved
24480 +       DBG_FAULT(59)
24481 +       FAULT(59)
24482 +
24483 +       .org ia64_ivt+0x7800
24484 +/////////////////////////////////////////////////////////////////////////////////////////
24485 +// 0x7800 Entry 60 (size 16 bundles) Reserved
24486 +       DBG_FAULT(60)
24487 +       FAULT(60)
24488 +
24489 +       .org ia64_ivt+0x7900
24490 +/////////////////////////////////////////////////////////////////////////////////////////
24491 +// 0x7900 Entry 61 (size 16 bundles) Reserved
24492 +       DBG_FAULT(61)
24493 +       FAULT(61)
24494 +
24495 +       .org ia64_ivt+0x7a00
24496 +/////////////////////////////////////////////////////////////////////////////////////////
24497 +// 0x7a00 Entry 62 (size 16 bundles) Reserved
24498 +       DBG_FAULT(62)
24499 +       FAULT(62)
24500 +
24501 +       .org ia64_ivt+0x7b00
24502 +/////////////////////////////////////////////////////////////////////////////////////////
24503 +// 0x7b00 Entry 63 (size 16 bundles) Reserved
24504 +       DBG_FAULT(63)
24505 +       FAULT(63)
24506 +
24507 +       .org ia64_ivt+0x7c00
24508 +/////////////////////////////////////////////////////////////////////////////////////////
24509 +// 0x7c00 Entry 64 (size 16 bundles) Reserved
24510 +       DBG_FAULT(64)
24511 +       FAULT(64)
24512 +
24513 +       .org ia64_ivt+0x7d00
24514 +/////////////////////////////////////////////////////////////////////////////////////////
24515 +// 0x7d00 Entry 65 (size 16 bundles) Reserved
24516 +       DBG_FAULT(65)
24517 +       FAULT(65)
24518 +
24519 +       .org ia64_ivt+0x7e00
24520 +/////////////////////////////////////////////////////////////////////////////////////////
24521 +// 0x7e00 Entry 66 (size 16 bundles) Reserved
24522 +       DBG_FAULT(66)
24523 +       FAULT(66)
24524 +
24525 +#ifdef CONFIG_XEN
24526 +       /*
24527 +        * There is no particular reason for this code to be here, other than that
24528 +        * there happens to be space here that would go unused otherwise.  If this
24529 +        * fault ever gets "unreserved", simply move the following code to a more
24530 +        * suitable spot...
24531 +        */
24532 +
24533 +GLOBAL_ENTRY(xen_bsw1)
24534 +       /* FIXME: THIS CODE IS NOT NaT SAFE! */
24535 +       movl r30=XSI_BANKNUM;
24536 +       mov r31=1;;
24537 +       st4 [r30]=r31;
24538 +       movl r30=XSI_BANK1_R16;
24539 +       movl r31=XSI_BANK1_R16+8;;
24540 +       ld8 r16=[r30],16; ld8 r17=[r31],16;;
24541 +       ld8 r18=[r30],16; ld8 r19=[r31],16;;
24542 +       ld8 r20=[r30],16; ld8 r21=[r31],16;;
24543 +       ld8 r22=[r30],16; ld8 r23=[r31],16;;
24544 +       ld8 r24=[r30],16; ld8 r25=[r31],16;;
24545 +       ld8 r26=[r30],16; ld8 r27=[r31],16;;
24546 +       ld8 r28=[r30],16; ld8 r29=[r31],16;;
24547 +       ld8 r30=[r30]; ld8 r31=[r31];;
24548 +       br.ret.sptk.many b0
24549 +#endif
24550 +
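xen_bsw1 emulates the bsw.1 bank switch: it tells the hypervisor that bank 1 is active (XSI_BANKNUM) and then reloads r16-r31 from the bank-1 save area in the shared page, two registers per iteration via the paired ld8s. A structural C sketch (the struct layout is hypothetical, and, as the FIXME notes, NaT bits are lost along the way):

    #include <stdint.h>

    struct xsi_banks {                 /* hypothetical view of the XSI area */
            uint32_t banknum;          /* XSI_BANKNUM */
            uint64_t bank1_r16[16];    /* XSI_BANK1_R16 .. r31 */
    };

    void xen_bsw1_sketch(volatile struct xsi_banks *xsi, uint64_t bank1[16])
    {
            xsi->banknum = 1;                      /* st4 [XSI_BANKNUM]=1 */
            for (int i = 0; i < 16; i++)
                    bank1[i] = xsi->bank1_r16[i];  /* reload r16..r31 */
    }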
24551 +       .org ia64_ivt+0x7f00
24552 +/////////////////////////////////////////////////////////////////////////////////////////
24553 +// 0x7f00 Entry 67 (size 16 bundles) Reserved
24554 +       DBG_FAULT(67)
24555 +       FAULT(67)
24556 +
24557 +#ifdef CONFIG_IA32_SUPPORT
24558 +
24559 +       /*
24560 +        * There is no particular reason for this code to be here, other than that
24561 +        * there happens to be space here that would go unused otherwise.  If this
24562 +        * fault ever gets "unreserved", simply move the following code to a more
24563 +        * suitable spot...
24564 +        */
24565 +
24566 +       // IA32 interrupt entry point
24567 +
24568 +ENTRY(dispatch_to_ia32_handler)
24569 +       SAVE_MIN
24570 +       ;;
24571 +       mov r14=cr.isr
24572 +       ssm psr.ic | PSR_DEFAULT_BITS
24573 +       ;;
24574 +       srlz.i                                  // guarantee that interruption collection is on
24575 +       ;;
24576 +(p15)  ssm psr.i
24577 +       adds r3=8,r2            // Base pointer for SAVE_REST
24578 +       ;;
24579 +       SAVE_REST
24580 +       ;;
24581 +       mov r15=0x80
24582 +       shr r14=r14,16          // Get interrupt number
24583 +       ;;
24584 +       cmp.ne p6,p0=r14,r15
24585 +(p6)   br.call.dpnt.many b6=non_ia32_syscall
24586 +
24587 +       adds r14=IA64_PT_REGS_R8_OFFSET + 16,sp // 16 byte hole per SW conventions
24588 +       adds r15=IA64_PT_REGS_R1_OFFSET + 16,sp
24589 +       ;;
24590 +       cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0
24591 +       ld8 r8=[r14]            // get r8
24592 +       ;;
24593 +       st8 [r15]=r8            // save original EAX in r1 (IA32 procs don't use the GP)
24594 +       ;;
24595 +       alloc r15=ar.pfs,0,0,6,0        // must be first in an insn group
24596 +       ;;
24597 +       ld4 r8=[r14],8          // r8 == eax (syscall number)
24598 +       mov r15=IA32_NR_syscalls
24599 +       ;;
24600 +       cmp.ltu.unc p6,p7=r8,r15
24601 +       ld4 out1=[r14],8        // r9 == ecx
24602 +       ;;
24603 +       ld4 out2=[r14],8        // r10 == edx
24604 +       ;;
24605 +       ld4 out0=[r14]          // r11 == ebx
24606 +       adds r14=(IA64_PT_REGS_R13_OFFSET) + 16,sp
24607 +       ;;
24608 +       ld4 out5=[r14],PT(R14)-PT(R13)  // r13 == ebp
24609 +       ;;
24610 +       ld4 out3=[r14],PT(R15)-PT(R14)  // r14 == esi
24611 +       adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
24612 +       ;;
24613 +       ld4 out4=[r14]          // r15 == edi
24614 +       movl r16=ia32_syscall_table
24615 +       ;;
24616 +(p6)   shladd r16=r8,3,r16     // force ni_syscall if not valid syscall number
24617 +       ld4 r2=[r2]             // r2 = current_thread_info()->flags
24618 +       ;;
24619 +       ld8 r16=[r16]
24620 +       and r2=_TIF_SYSCALL_TRACEAUDIT,r2       // mask trace or audit
24621 +       ;;
24622 +       mov b6=r16
24623 +       movl r15=ia32_ret_from_syscall
24624 +       cmp.eq p8,p0=r2,r0
24625 +       ;;
24626 +       mov rp=r15
24627 +(p8)   br.call.sptk.many b6=b6
24628 +       br.cond.sptk ia32_trace_syscall
24629 +
24630 +non_ia32_syscall:
24631 +       alloc r15=ar.pfs,0,0,2,0
24632 +       mov out0=r14                            // interrupt #
24633 +       add out1=16,sp                          // pointer to pt_regs
24634 +       ;;                      // avoid WAW on CFM
24635 +       br.call.sptk.many rp=ia32_bad_interrupt
24636 +.ret1: movl r15=ia64_leave_kernel
24637 +       ;;
24638 +       mov rp=r15
24639 +       br.ret.sptk.many rp
24640 +END(dispatch_to_ia32_handler)
24641 +
24642 +#endif /* CONFIG_IA32_SUPPORT */
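dispatch_to_ia32_handler mirrors the native syscall path: vector 0x80 means an int 0x80 syscall, whose number arrives in eax (the spilled r8) and is bounds-checked against IA32_NR_syscalls before indexing ia32_syscall_table; anything else is routed to ia32_bad_interrupt. A compressed C sketch with illustrative constants (an out-of-range number falls back to table entry 0, which the comment above identifies as ni_syscall):

    typedef long (*ia32_fn)(long ebx, long ecx, long edx,
                            long esi, long edi, long ebp);

    extern ia32_fn ia32_syscall_table[];
    extern long ia32_bad_interrupt(long vec, void *regs);

    #define IA32_NR_SYSCALLS 285               /* illustrative count only */

    long ia32_dispatch(long vector, unsigned eax, long *r, void *regs)
    {
            if (vector != 0x80)
                    return ia32_bad_interrupt(vector, regs);
            ia32_fn fn = (eax < IA32_NR_SYSCALLS) ? ia32_syscall_table[eax]
                                                  : ia32_syscall_table[0];
            return fn(r[0], r[1], r[2], r[3], r[4], r[5]);
    }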
24643 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/xenminstate.h linux-2.6.16/arch/ia64/xen/xenminstate.h
24644 --- linux-2.6.16.orig/arch/ia64/xen/xenminstate.h       1970-01-01 01:00:00.000000000 +0100
24645 +++ linux-2.6.16/arch/ia64/xen/xenminstate.h    2006-06-26 09:51:32.000000000 +0200
24646 @@ -0,0 +1,367 @@
24647 +#include <linux/config.h>
24648 +
24649 +#include <asm/cache.h>
24650 +
24651 +#ifdef CONFIG_XEN
24652 +#include "../kernel/entry.h"
24653 +#else
24654 +#include "entry.h"
24655 +#endif
24656 +
24657 +/*
24658 + * For ivt.S we want to access the stack virtually so we don't have to disable translation
24659 + * on interrupts.
24660 + *
24661 + *  On entry:
24662 + *     r1:     pointer to current task (ar.k6)
24663 + */
24664 +#define MINSTATE_START_SAVE_MIN_VIRT                                                           \
24665 +(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */     \
24666 +       ;;                                                                                      \
24667 +(pUStk)        mov.m r24=ar.rnat;                                                                      \
24668 +(pUStk)        addl r22=IA64_RBS_OFFSET,r1;                    /* compute base of RBS */               \
24669 +(pKStk) mov r1=sp;                                     /* get sp  */                           \
24670 +       ;;                                                                                      \
24671 +(pUStk) lfetch.fault.excl.nt1 [r22];                                                           \
24672 +(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base of memory stack */      \
24673 +(pUStk)        mov r23=ar.bspstore;                            /* save ar.bspstore */                  \
24674 +       ;;                                                                                      \
24675 +(pUStk)        mov ar.bspstore=r22;                            /* switch to kernel RBS */              \
24676 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;                 /* if in kernel mode, use sp (r12) */   \
24677 +       ;;                                                                                      \
24678 +(pUStk)        mov r18=ar.bsp;                                                                         \
24679 +(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, loadrs=0 */             \
24680 +
24681 +#define MINSTATE_END_SAVE_MIN_VIRT                                                             \
24682 +       bsw.1;                  /* switch back to bank 1 (must be last in insn group) */        \
24683 +       ;;
24684 +
24685 +/*
24686 + * For mca_asm.S we want to access the stack physically since the state is saved before we
24687 + * go virtual and don't want to destroy the iip or ipsr.
24688 + */
24689 +#define MINSTATE_START_SAVE_MIN_PHYS                                                           \
24690 +(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;                                                         \
24691 +(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;                                                   \
24692 +(pKStk) ld8 r3 = [r3];;                                                                                \
24693 +(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;                                            \
24694 +(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;                                          \
24695 +(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */     \
24696 +(pUStk)        addl r22=IA64_RBS_OFFSET,r1;            /* compute base of register backing store */    \
24697 +       ;;                                                                                      \
24698 +(pUStk)        mov r24=ar.rnat;                                                                        \
24699 +(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base of memory stack */      \
24700 +(pUStk)        mov r23=ar.bspstore;                            /* save ar.bspstore */                  \
24701 +(pUStk)        dep r22=-1,r22,61,3;                    /* compute kernel virtual addr of RBS */        \
24702 +       ;;                                                                                      \
24703 +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;         /* if in kernel mode, use sp (r12) */           \
24704 +(pUStk)        mov ar.bspstore=r22;                    /* switch to kernel RBS */                      \
24705 +       ;;                                                                                      \
24706 +(pUStk)        mov r18=ar.bsp;                                                                         \
24707 +(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, loadrs=0 */             \
24708 +
24709 +#define MINSTATE_END_SAVE_MIN_PHYS                                                             \
24710 +       dep r12=-1,r12,61,3;            /* make sp a kernel virtual address */                  \
24711 +       ;;
24712 +
24713 +#ifdef MINSTATE_VIRT
24714 +# define MINSTATE_GET_CURRENT(reg)     mov reg=IA64_KR(CURRENT)
24715 +# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_VIRT
24716 +# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_VIRT
24717 +#endif
24718 +
24719 +#ifdef MINSTATE_PHYS
24720 +# define MINSTATE_GET_CURRENT(reg)     mov reg=IA64_KR(CURRENT);; tpa reg=reg
24721 +# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_PHYS
24722 +# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_PHYS
24723 +#endif
24724 +
24725 +/*
24726 + * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
24727 + * the minimum state necessary that allows us to turn psr.ic back
24728 + * on.
24729 + *
24730 + * Assumed state upon entry:
24731 + *     psr.ic: off
24732 + *     r31:    contains saved predicates (pr)
24733 + *
24734 + * Upon exit, the state is as follows:
24735 + *     psr.ic: off
24736 + *      r2 = points to &pt_regs.r16
24737 + *      r8 = contents of ar.ccv
24738 + *      r9 = contents of ar.csd
24739 + *     r10 = contents of ar.ssd
24740 + *     r11 = FPSR_DEFAULT
24741 + *     r12 = kernel sp (kernel virtual address)
24742 + *     r13 = points to current task_struct (kernel virtual address)
24743 + *     p15 = TRUE if psr.i is set in cr.ipsr
24744 + *     predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
24745 + *             preserved
24746 + * CONFIG_XEN note: p6/p7 are not preserved
24747 + *
24748 + * Note that psr.ic is NOT turned on by this macro.  This is so that
24749 + * we can pass interruption state as arguments to a handler.
24750 + */
24751 +#ifdef CONFIG_XEN
24752 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                                                      \
24753 +       MINSTATE_GET_CURRENT(r16);      /* M (or M;;I) */                                       \
24754 +       mov r27=ar.rsc;                 /* M */                                                 \
24755 +       mov r20=r1;                     /* A */                                                 \
24756 +       mov r25=ar.unat;                /* M */                                                 \
24757 +       /* mov r29=cr.ipsr;             M */                                                    \
24758 +       movl r29=XSI_IPSR;;                                                                     \
24759 +       ld8 r29=[r29];;                                                                         \
24760 +       mov r26=ar.pfs;                 /* I */                                                 \
24761 +       /* mov r28=cr.iip;              M */                                                    \
24762 +       movl r28=XSI_IIP;;                                                                      \
24763 +       ld8 r28=[r28];;                                                                         \
24764 +       mov r21=ar.fpsr;                /* M */                                                 \
24765 +       COVER;                  /* B;; (or nothing) */                                  \
24766 +       ;;                                                                                      \
24767 +       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;                                         \
24768 +       ;;                                                                                      \
24769 +       ld1 r17=[r16];                          /* load current->thread.on_ustack flag */       \
24770 +       st1 [r16]=r0;                           /* clear current->thread.on_ustack flag */      \
24771 +       adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16                                          \
24772 +       /* switch from user to kernel RBS: */                                                   \
24773 +       ;;                                                                                      \
24774 +       invala;                         /* M */                                                 \
24775 +       /* SAVE_IFS; see xen special handling below */                                          \
24776 +       cmp.eq pKStk,pUStk=r0,r17;              /* are we in kernel mode already? */            \
24777 +       ;;                                                                                      \
24778 +       MINSTATE_START_SAVE_MIN                                                                 \
24779 +       adds r17=2*L1_CACHE_BYTES,r1;           /* really: biggest cache-line size */           \
24780 +       adds r16=PT(CR_IPSR),r1;                                                                \
24781 +       ;;                                                                                      \
24782 +       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;                                             \
24783 +       st8 [r16]=r29;          /* save cr.ipsr */                                              \
24784 +       ;;                                                                                      \
24785 +       lfetch.fault.excl.nt1 [r17];                                                            \
24786 +       tbit.nz p15,p0=r29,IA64_PSR_I_BIT;                                                      \
24787 +       mov r29=b0                                                                              \
24788 +       ;;                                                                                      \
24789 +       adds r16=PT(R8),r1;     /* initialize first base pointer */                             \
24790 +       adds r17=PT(R9),r1;     /* initialize second base pointer */                            \
24791 +(pKStk)        mov r18=r0;             /* make sure r18 isn't NaT */                                   \
24792 +       ;;                                                                                      \
24793 +.mem.offset 0,0; st8.spill [r16]=r8,16;                                                                \
24794 +.mem.offset 8,0; st8.spill [r17]=r9,16;                                                                \
24795 +        ;;                                                                                     \
24796 +.mem.offset 0,0; st8.spill [r16]=r10,24;                                                       \
24797 +.mem.offset 8,0; st8.spill [r17]=r11,24;                                                       \
24798 +        ;;                                                                                     \
24799 +       /* xen special handling for possibly lazy cover */                                      \
24800 +       movl r8=XSI_INCOMPL_REGFR;                                                              \
24801 +       ;;                                                                                      \
24802 +       ld4 r30=[r8];                                                                           \
24803 +       ;;                                                                                      \
24804 +       cmp.eq  p6,p7=r30,r0;                                                                   \
24805 +       ;; /* not sure if this stop bit is necessary */                                         \
24806 +(p6)   adds r8=XSI_PRECOVER_IFS-XSI_INCOMPL_REGFR,r8;                                          \
24807 +(p7)   adds r8=XSI_IFS-XSI_INCOMPL_REGFR,r8;                                                   \
24808 +       ;;                                                                                      \
24809 +       ld8 r30=[r8];                                                                           \
24810 +       ;;                                                                                      \
24811 +       st8 [r16]=r28,16;       /* save cr.iip */                                               \
24812 +       st8 [r17]=r30,16;       /* save cr.ifs */                                               \
24813 +(pUStk)        sub r18=r18,r22;        /* r18=RSE.ndirty*8 */                                          \
24814 +       mov r8=ar.ccv;                                                                          \
24815 +       mov r9=ar.csd;                                                                          \
24816 +       mov r10=ar.ssd;                                                                         \
24817 +       movl r11=FPSR_DEFAULT;   /* L-unit */                                                   \
24818 +       ;;                                                                                      \
24819 +       st8 [r16]=r25,16;       /* save ar.unat */                                              \
24820 +       st8 [r17]=r26,16;       /* save ar.pfs */                                               \
24821 +       shl r18=r18,16;         /* compute ar.rsc to be used for "loadrs" */                    \
24822 +       ;;                                                                                      \
24823 +       st8 [r16]=r27,16;       /* save ar.rsc */                                               \
24824 +(pUStk)        st8 [r17]=r24,16;       /* save ar.rnat */                                              \
24825 +(pKStk)        adds r17=16,r17;        /* skip over ar_rnat field */                                   \
24826 +       ;;                      /* avoid RAW on r16 & r17 */                                    \
24827 +(pUStk)        st8 [r16]=r23,16;       /* save ar.bspstore */                                          \
24828 +       st8 [r17]=r31,16;       /* save predicates */                                           \
24829 +(pKStk)        adds r16=16,r16;        /* skip over ar_bspstore field */                               \
24830 +       ;;                                                                                      \
24831 +       st8 [r16]=r29,16;       /* save b0 */                                                   \
24832 +       st8 [r17]=r18,16;       /* save ar.rsc value for "loadrs" */                            \
24833 +       cmp.eq pNonSys,pSys=r0,r0       /* initialize pSys=0, pNonSys=1 */                      \
24834 +       ;;                                                                                      \
24835 +.mem.offset 0,0; st8.spill [r16]=r20,16;       /* save original r1 */                          \
24836 +.mem.offset 8,0; st8.spill [r17]=r12,16;                                                       \
24837 +       adds r12=-16,r1;        /* switch to kernel memory stack (with 16 bytes of scratch) */  \
24838 +       ;;                                                                                      \
24839 +.mem.offset 0,0; st8.spill [r16]=r13,16;                                                       \
24840 +.mem.offset 8,0; st8.spill [r17]=r21,16;       /* save ar.fpsr */                              \
24841 +       mov r13=IA64_KR(CURRENT);       /* establish `current' */                               \
24842 +       ;;                                                                                      \
24843 +.mem.offset 0,0; st8.spill [r16]=r15,16;                                                       \
24844 +.mem.offset 8,0; st8.spill [r17]=r14,16;                                                       \
24845 +       ;;                                                                                      \
24846 +.mem.offset 0,0; st8.spill [r16]=r2,16;                                                                \
24847 +.mem.offset 8,0; st8.spill [r17]=r3,16;                                                                \
24848 +       ;;                                                                                      \
24849 +       EXTRA;                                                                                  \
24850 +       mov r2=b0; br.call.sptk b0=xen_bsw1;; mov b0=r2;                                        \
24851 +       adds r2=IA64_PT_REGS_R16_OFFSET,r1;                                                     \
24852 +       ;;                                                                                      \
24853 +       movl r1=__gp;           /* establish kernel global pointer */                           \
24854 +       ;;                                                                                      \
24855 +       /* MINSTATE_END_SAVE_MIN */
24856 +#else
24857 +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                                                      \
24858 +       MINSTATE_GET_CURRENT(r16);      /* M (or M;;I) */                                       \
24859 +       mov r27=ar.rsc;                 /* M */                                                 \
24860 +       mov r20=r1;                     /* A */                                                 \
24861 +       mov r25=ar.unat;                /* M */                                                 \
24862 +       mov r29=cr.ipsr;                /* M */                                                 \
24863 +       mov r26=ar.pfs;                 /* I */                                                 \
24864 +       mov r28=cr.iip;                 /* M */                                                 \
24865 +       mov r21=ar.fpsr;                /* M */                                                 \
24866 +       COVER;                          /* B;; (or nothing) */                                  \
24867 +       ;;                                                                                      \
24868 +       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;                                         \
24869 +       ;;                                                                                      \
24870 +       ld1 r17=[r16];                          /* load current->thread.on_ustack flag */       \
24871 +       st1 [r16]=r0;                           /* clear current->thread.on_ustack flag */      \
24872 +       adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16                                          \
24873 +       /* switch from user to kernel RBS: */                                                   \
24874 +       ;;                                                                                      \
24875 +       invala;                         /* M */                                                 \
24876 +       SAVE_IFS;                                                                               \
24877 +       cmp.eq pKStk,pUStk=r0,r17;              /* are we in kernel mode already? */            \
24878 +       ;;                                                                                      \
24879 +       MINSTATE_START_SAVE_MIN                                                                 \
24880 +       adds r17=2*L1_CACHE_BYTES,r1;           /* really: biggest cache-line size */           \
24881 +       adds r16=PT(CR_IPSR),r1;                                                                \
24882 +       ;;                                                                                      \
24883 +       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;                                             \
24884 +       st8 [r16]=r29;          /* save cr.ipsr */                                              \
24885 +       ;;                                                                                      \
24886 +       lfetch.fault.excl.nt1 [r17];                                                            \
24887 +       tbit.nz p15,p0=r29,IA64_PSR_I_BIT;                                                      \
24888 +       mov r29=b0                                                                              \
24889 +       ;;                                                                                      \
24890 +       adds r16=PT(R8),r1;     /* initialize first base pointer */                             \
24891 +       adds r17=PT(R9),r1;     /* initialize second base pointer */                            \
24892 +(pKStk)        mov r18=r0;             /* make sure r18 isn't NaT */                                   \
24893 +       ;;                                                                                      \
24894 +.mem.offset 0,0; st8.spill [r16]=r8,16;                                                                \
24895 +.mem.offset 8,0; st8.spill [r17]=r9,16;                                                                \
24896 +        ;;                                                                                     \
24897 +.mem.offset 0,0; st8.spill [r16]=r10,24;                                                       \
24898 +.mem.offset 8,0; st8.spill [r17]=r11,24;                                                       \
24899 +        ;;                                                                                     \
24900 +       st8 [r16]=r28,16;       /* save cr.iip */                                               \
24901 +       st8 [r17]=r30,16;       /* save cr.ifs */                                               \
24902 +(pUStk)        sub r18=r18,r22;        /* r18=RSE.ndirty*8 */                                          \
24903 +       mov r8=ar.ccv;                                                                          \
24904 +       mov r9=ar.csd;                                                                          \
24905 +       mov r10=ar.ssd;                                                                         \
24906 +       movl r11=FPSR_DEFAULT;   /* L-unit */                                                   \
24907 +       ;;                                                                                      \
24908 +       st8 [r16]=r25,16;       /* save ar.unat */                                              \
24909 +       st8 [r17]=r26,16;       /* save ar.pfs */                                               \
24910 +       shl r18=r18,16;         /* compute ar.rsc to be used for "loadrs" */                    \
24911 +       ;;                                                                                      \
24912 +       st8 [r16]=r27,16;       /* save ar.rsc */                                               \
24913 +(pUStk)        st8 [r17]=r24,16;       /* save ar.rnat */                                              \
24914 +(pKStk)        adds r17=16,r17;        /* skip over ar_rnat field */                                   \
24915 +       ;;                      /* avoid RAW on r16 & r17 */                                    \
24916 +(pUStk)        st8 [r16]=r23,16;       /* save ar.bspstore */                                          \
24917 +       st8 [r17]=r31,16;       /* save predicates */                                           \
24918 +(pKStk)        adds r16=16,r16;        /* skip over ar_bspstore field */                               \
24919 +       ;;                                                                                      \
24920 +       st8 [r16]=r29,16;       /* save b0 */                                                   \
24921 +       st8 [r17]=r18,16;       /* save ar.rsc value for "loadrs" */                            \
24922 +       cmp.eq pNonSys,pSys=r0,r0       /* initialize pSys=0, pNonSys=1 */                      \
24923 +       ;;                                                                                      \
24924 +.mem.offset 0,0; st8.spill [r16]=r20,16;       /* save original r1 */                          \
24925 +.mem.offset 8,0; st8.spill [r17]=r12,16;                                                       \
24926 +       adds r12=-16,r1;        /* switch to kernel memory stack (with 16 bytes of scratch) */  \
24927 +       ;;                                                                                      \
24928 +.mem.offset 0,0; st8.spill [r16]=r13,16;                                                       \
24929 +.mem.offset 8,0; st8.spill [r17]=r21,16;       /* save ar.fpsr */                              \
24930 +       mov r13=IA64_KR(CURRENT);       /* establish `current' */                               \
24931 +       ;;                                                                                      \
24932 +.mem.offset 0,0; st8.spill [r16]=r15,16;                                                       \
24933 +.mem.offset 8,0; st8.spill [r17]=r14,16;                                                       \
24934 +       ;;                                                                                      \
24935 +.mem.offset 0,0; st8.spill [r16]=r2,16;                                                                \
24936 +.mem.offset 8,0; st8.spill [r17]=r3,16;                                                                \
24937 +       adds r2=IA64_PT_REGS_R16_OFFSET,r1;                                                     \
24938 +       ;;                                                                                      \
24939 +       EXTRA;                                                                                  \
24940 +       movl r1=__gp;           /* establish kernel global pointer */                           \
24941 +       ;;                                                                                      \
24942 +       MINSTATE_END_SAVE_MIN
24943 +#endif
24944 +
24945 +/*
24946 + * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
24947 + *
24948 + * Assumed state upon entry:
24949 + *     psr.ic: on
24950 + *     r2:     points to &pt_regs.r16
24951 + *     r3:     points to &pt_regs.r17
24952 + *     r8:     contents of ar.ccv
24953 + *     r9:     contents of ar.csd
24954 + *     r10:    contents of ar.ssd
24955 + *     r11:    FPSR_DEFAULT
24956 + *
24957 + * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
24958 + */
24959 +#define SAVE_REST                              \
24960 +.mem.offset 0,0; st8.spill [r2]=r16,16;                \
24961 +.mem.offset 8,0; st8.spill [r3]=r17,16;                \
24962 +       ;;                                      \
24963 +.mem.offset 0,0; st8.spill [r2]=r18,16;                \
24964 +.mem.offset 8,0; st8.spill [r3]=r19,16;                \
24965 +       ;;                                      \
24966 +.mem.offset 0,0; st8.spill [r2]=r20,16;                \
24967 +.mem.offset 8,0; st8.spill [r3]=r21,16;                \
24968 +       mov r18=b6;                             \
24969 +       ;;                                      \
24970 +.mem.offset 0,0; st8.spill [r2]=r22,16;                \
24971 +.mem.offset 8,0; st8.spill [r3]=r23,16;                \
24972 +       mov r19=b7;                             \
24973 +       ;;                                      \
24974 +.mem.offset 0,0; st8.spill [r2]=r24,16;                \
24975 +.mem.offset 8,0; st8.spill [r3]=r25,16;                \
24976 +       ;;                                      \
24977 +.mem.offset 0,0; st8.spill [r2]=r26,16;                \
24978 +.mem.offset 8,0; st8.spill [r3]=r27,16;                \
24979 +       ;;                                      \
24980 +.mem.offset 0,0; st8.spill [r2]=r28,16;                \
24981 +.mem.offset 8,0; st8.spill [r3]=r29,16;                \
24982 +       ;;                                      \
24983 +.mem.offset 0,0; st8.spill [r2]=r30,16;                \
24984 +.mem.offset 8,0; st8.spill [r3]=r31,32;                \
24985 +       ;;                                      \
24986 +       mov ar.fpsr=r11;        /* M-unit */    \
24987 +       st8 [r2]=r8,8;          /* ar.ccv */    \
24988 +       adds r24=PT(B6)-PT(F7),r3;              \
24989 +       ;;                                      \
24990 +       stf.spill [r2]=f6,32;                   \
24991 +       stf.spill [r3]=f7,32;                   \
24992 +       ;;                                      \
24993 +       stf.spill [r2]=f8,32;                   \
24994 +       stf.spill [r3]=f9,32;                   \
24995 +       ;;                                      \
24996 +       stf.spill [r2]=f10;                     \
24997 +       stf.spill [r3]=f11;                     \
24998 +       adds r25=PT(B7)-PT(F11),r3;             \
24999 +       ;;                                      \
25000 +       st8 [r24]=r18,16;       /* b6 */        \
25001 +       st8 [r25]=r19,16;       /* b7 */        \
25002 +       ;;                                      \
25003 +       st8 [r24]=r9;           /* ar.csd */    \
25004 +       st8 [r25]=r10;          /* ar.ssd */    \
25005 +       ;;
25006 +
25007 +#define SAVE_MIN_WITH_COVER    DO_SAVE_MIN(cover, mov r30=cr.ifs,)
25008 +#define SAVE_MIN_WITH_COVER_R19        DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
25009 +#ifdef CONFIG_XEN
25010 +#define SAVE_MIN               break 0;; /* FIXME: non-cover version only for ia32 support? */
25011 +#else
25012 +#define SAVE_MIN               DO_SAVE_MIN(     , mov r30=r0, )
25013 +#endif
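
The Xen flavour of DO_SAVE_MIN above replaces privileged "mov rX=cr.*" reads with plain ld8 loads from the XSI_* offsets of the shared state that the hypervisor keeps up to date for the guest. A minimal, self-contained C sketch of that substitution follows; the xsi_state layout and helper names are illustrative stand-ins, not the real asm-xsi-offsets.h definitions.

	#include <stdio.h>

	/* Hypothetical mirror of the Xen shared interruption state. */
	struct xsi_state {
		unsigned long ipsr;	/* stands in for the word at XSI_IPSR */
		unsigned long iip;	/* stands in for the word at XSI_IIP  */
		unsigned long ifs;	/* stands in for the word at XSI_IFS  */
	};

	/* On bare metal cr.ipsr is a privileged register read; under Xen
	 * the macro instead loads the mirrored value, as sketched here. */
	static unsigned long read_ipsr(const volatile struct xsi_state *xsi)
	{
		return xsi->ipsr;	/* "movl r29=XSI_IPSR;; ld8 r29=[r29]" */
	}

	int main(void)
	{
		struct xsi_state xsi = { .ipsr = 0x1000, .iip = 0xe000000000010000UL };
		printf("ipsr=%#lx iip=%#lx\n", read_ipsr(&xsi), xsi.iip);
		return 0;
	}
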
25014 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/xenpal.S linux-2.6.16/arch/ia64/xen/xenpal.S
25015 --- linux-2.6.16.orig/arch/ia64/xen/xenpal.S    1970-01-01 01:00:00.000000000 +0100
25016 +++ linux-2.6.16/arch/ia64/xen/xenpal.S 2006-06-26 09:51:32.000000000 +0200
25017 @@ -0,0 +1,73 @@
25018 +/*
25019 + * ia64/xen/xenpal.S
25020 + *
25021 + * Alternate PAL  routines for Xen.  Heavily leveraged from
25022 + *   ia64/kernel/pal.S
25023 + *
25024 + * Copyright (C) 2005 Hewlett-Packard Co
25025 + *     Dan Magenheimer <dan.magenheimer@hp.com>
25026 + */
25027 +
25028 +#include <asm/asmmacro.h>
25029 +#include <asm/processor.h>
25030 +
25031 +GLOBAL_ENTRY(xen_pal_call_static)
25032 +       .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
25033 +       alloc loc1 = ar.pfs,5,5,0,0
25034 +#ifdef CONFIG_XEN
25035 +       movl r22=running_on_xen;;
25036 +       ld4 r22=[r22];;
25037 +       cmp.eq p7,p0=r22,r0
25038 +(p7)   br.cond.spnt.many __ia64_pal_call_static;;
25039 +#endif
25040 +       movl loc2 = pal_entry_point
25041 +1:     {
25042 +         mov r28 = in0
25043 +         mov r29 = in1
25044 +         mov r8 = ip
25045 +       }
25046 +       ;;
25047 +       ld8 loc2 = [loc2]               // loc2 <- entry point
25048 +       tbit.nz p6,p7 = in4, 0
25049 +       adds r8 = 1f-1b,r8
25050 +       mov loc4=ar.rsc                 // save RSE configuration
25051 +       ;;
25052 +       mov ar.rsc=0                    // put RSE in enforced lazy, LE mode
25053 +       mov loc3 = psr
25054 +       mov loc0 = rp
25055 +       .body
25056 +       mov r30 = in2
25057 +
25058 +#ifdef CONFIG_XEN
25059 +       // this is low priority for paravirtualization, but is called
25060 +       // from the idle loop, so it confuses privop counting
25061 +       movl r31=XSI_PSR_IC
25062 +       ;;
25063 +(p6)   st8 [r31]=r0
25064 +       ;;
25065 +(p7)   adds r31=XSI_PSR_I-XSI_PSR_IC,r31
25066 +       ;;
25067 +(p7)   st4 [r31]=r0
25068 +       ;;
25069 +       mov r31 = in3
25070 +       mov b7 = loc2
25071 +       ;;
25072 +#else
25073 +(p6)   rsm psr.i | psr.ic
25074 +       mov r31 = in3
25075 +       mov b7 = loc2
25076 +
25077 +(p7)   rsm psr.i
25078 +       ;;
25079 +(p6)   srlz.i
25080 +#endif
25081 +       mov rp = r8
25082 +       br.cond.sptk.many b7
25083 +1:     mov psr.l = loc3
25084 +       mov ar.rsc = loc4               // restore RSE configuration
25085 +       mov ar.pfs = loc1
25086 +       mov rp = loc0
25087 +       ;;
25088 +       srlz.d                          // serialize restoration of psr.l
25089 +       br.ret.sptk.many b0
25090 +END(xen_pal_call_static)
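
The #ifdef CONFIG_XEN prologue above is a runtime rather than compile-time switch: the same binary falls back to the native __ia64_pal_call_static when the running_on_xen flag (set at boot, see xensetup.S below) is clear. A C sketch of the dispatch, with placeholder bodies standing in for the real PAL calls:

	#include <stdio.h>

	static int running_on_xen;		/* set once at early boot */

	/* Stand-ins for __ia64_pal_call_static and the Xen-aware body. */
	static long native_pal_call(unsigned long idx) { return (long)idx; }
	static long xen_pal_call(unsigned long idx)    { return -(long)idx; }

	/* Mirrors the branch at the top of xen_pal_call_static. */
	static long pal_call_static(unsigned long idx)
	{
		if (!running_on_xen)
			return native_pal_call(idx);	/* br.cond.spnt.many */
		return xen_pal_call(idx);
	}

	int main(void)
	{
		printf("%ld\n", pal_call_static(6));	/* bare metal */
		running_on_xen = 1;
		printf("%ld\n", pal_call_static(6));	/* under Xen */
		return 0;
	}
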
25091 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen/xensetup.S linux-2.6.16/arch/ia64/xen/xensetup.S
25092 --- linux-2.6.16.orig/arch/ia64/xen/xensetup.S  1970-01-01 01:00:00.000000000 +0100
25093 +++ linux-2.6.16/arch/ia64/xen/xensetup.S       2006-06-26 09:51:32.000000000 +0200
25094 @@ -0,0 +1,35 @@
25095 +/*
25096 + * Support routines for Xen
25097 + *
25098 + * Copyright (C) 2005 Dan Magenheimer <dan.magenheimer@hp.com>
25099 + */
25100 +
25101 +#include <linux/config.h>
25102 +#include <asm/processor.h>
25103 +#include <asm/asmmacro.h>
25104 +
25105 +       .data
25106 +       .align 8
25107 +       .globl running_on_xen
25108 +running_on_xen:
25109 +       data4 0
25110 +
25111 +#define isBP   p3      // are we the Bootstrap Processor?
25112 +
25113 +       .text
25114 +GLOBAL_ENTRY(early_xen_setup)
25115 +       mov r8=cr.dcr
25116 +(isBP) movl r9=running_on_xen;;
25117 +       extr.u r8=r8,63,1;;
25118 +       cmp.ne p7,p0=r8,r0;;
25119 +(isBP) st4 [r9]=r8
25120 +(p7)   movl r10=xen_ivt;;
25121 +(p7)   mov cr.iva=r10
25122 +       br.ret.sptk.many rp;;
25123 +END(early_xen_setup)
25124 +
25125 +GLOBAL_ENTRY(is_running_on_xen)
25126 +       movl r9=running_on_xen;;
25127 +       ld4 r8=[r9]
25128 +       br.ret.sptk.many rp;;
25129 +END(is_running_on_xen)
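
In C terms, early_xen_setup treats bit 63 of cr.dcr as the "running on Xen" indicator and, when it is set, redirects cr.iva to the paravirtualized interruption vector table. A sketch of the same logic, with plain variables standing in for the control registers:

	#include <stdio.h>

	static unsigned long cr_dcr;		/* stand-in for cr.dcr */
	static const void *cr_iva;		/* stand-in for cr.iva */
	static int running_on_xen;
	static const char xen_ivt[1];		/* stand-in for the Xen IVT */

	static void early_xen_setup(void)
	{
		running_on_xen = (int)(cr_dcr >> 63);	/* extr.u r8=r8,63,1 */
		if (running_on_xen)
			cr_iva = xen_ivt;		/* mov cr.iva=r10 */
	}

	int main(void)
	{
		cr_dcr = 1UL << 63;
		early_xen_setup();
		printf("running_on_xen=%d ivt=%p\n", running_on_xen, (void *)cr_iva);
		return 0;
	}
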
25130 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen-mkbuildtree-post linux-2.6.16/arch/ia64/xen-mkbuildtree-post
25131 --- linux-2.6.16.orig/arch/ia64/xen-mkbuildtree-post    1970-01-01 01:00:00.000000000 +0100
25132 +++ linux-2.6.16/arch/ia64/xen-mkbuildtree-post 2006-06-26 09:51:32.000000000 +0200
25133 @@ -0,0 +1,2 @@
25134 +#!/bin/bash
25135 +echo 'NOTHING YET IN ' ${0}
25136 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/ia64/xen-mkbuildtree-pre linux-2.6.16/arch/ia64/xen-mkbuildtree-pre
25137 --- linux-2.6.16.orig/arch/ia64/xen-mkbuildtree-pre     1970-01-01 01:00:00.000000000 +0100
25138 +++ linux-2.6.16/arch/ia64/xen-mkbuildtree-pre  2006-06-26 09:51:32.000000000 +0200
25139 @@ -0,0 +1,50 @@
25140 +#!/bin/bash
25141 +# restructure directories to match future drivers/xen plan
25142 +# and move aside xen/x86 specific changes
25143 +# WARNING: This directory movement really confuses hg, which makes
25144 +# it difficult to do development in a directory that is being used
25145 +# for building (all files in mv'd directories are thought by hg
25146 +# to have been deleted).  I don't know how to avoid this right now,
25147 +# but if someone has a better way, I'm all ears.
25148 +if [ ! -e mm.xen-x86 ]
25149 +then
25150 +       mv mm mm.xen-x86
25151 +       mkdir mm
25152 +       mv net net.xen-x86
25153 +       mv kernel kernel.xen-x86
25154 +       mv drivers/acpi/tables.c drivers/acpi/tables.c.xen-x86
25155 +#      mv arch/xen/kernel drivers/xen/core
25156 +#      mv arch/xen arch/xen.xen-x86
25157 +#      mkdir arch/xen
25158 +#      mv arch/xen.xen-x86/configs arch/xen
25159 +#      mv include/asm-generic include/asm-generic.xen-x86
25160 +       mv include/linux include/linux.xen-x86
25161 +       mkdir include/linux
25162 +fi
25163 +
25164 +# need to grab a couple of xen-modified files for generic_page_range and
25165 +# typedef pte_fn_t which are used by driver/xen blkif
25166 +cp mm.xen-x86/memory.c mm/memory.c
25167 +cp include/linux.xen-x86/mm.h include/linux/mm.h
25168 +
25169 +#eventually asm-xsi-offsets needs to be part of hypervisor.h/hypercall.h
25170 +cp ../xen/include/asm-ia64/asm-xsi-offsets.h include/asm-ia64/xen/
25171 +
25172 +#ia64 drivers/xen isn't fully functional yet, workaround...
25173 +#also ignore core/evtchn.c which uses a different irq mechanism than ia64
25174 +#(warning: there be dragons here if these files diverge)
25175 +cp arch/ia64/xen/drivers/Makefile drivers/xen/Makefile
25176 +cp arch/ia64/xen/drivers/coreMakefile drivers/xen/core/Makefile
25177 +
25178 +#not sure where these ia64-specific files will end up in the future
25179 +cp arch/ia64/xen/drivers/xenia64_init.c drivers/xen/core
25180 +cp arch/ia64/xen/drivers/evtchn_ia64.c drivers/xen/core
25181 +
25182 +#still a few x86-ism's in various drivers/xen files, patch them
25183 +#cd drivers/xen
25184 +#if [ ! -e ia64.patch.semaphore ]
25185 +#then
25186 +#      cat ../../arch/ia64/xen/drivers/patches/* | patch -p1 -b
25187 +#fi
25188 +#touch ia64.patch.semaphore
25189 +#cd ../..
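
The generic_page_range/pte_fn_t interface mentioned above is a walk-a-range-and-apply-a-callback helper: per the comment, the Xen blkif driver uses it to run a function over every page-table entry in a virtual range. A simplified, self-contained C sketch of the callback pattern (types and signatures are flattened here; the real kernel version walks actual page tables and takes an mm_struct):

	#include <stdio.h>

	typedef int (*pte_fn_t)(unsigned long pfn, void *data);

	/* Apply fn to each page in [start_pfn, start_pfn + npages). */
	static int apply_to_range(unsigned long start_pfn, unsigned long npages,
				  pte_fn_t fn, void *data)
	{
		for (unsigned long i = 0; i < npages; i++) {
			int err = fn(start_pfn + i, data);
			if (err)
				return err;	/* stop on first failure */
		}
		return 0;
	}

	static int map_one(unsigned long pfn, void *data)
	{
		int *count = data;	/* blkif would install a mapping here */
		++*count;
		return 0;
	}

	int main(void)
	{
		int mapped = 0;
		apply_to_range(0x1000, 4, map_one, &mapped);
		printf("mapped %d pages\n", mapped);
		return 0;
	}
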
25190 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/um/kernel/physmem.c linux-2.6.16/arch/um/kernel/physmem.c
25191 --- linux-2.6.16.orig/arch/um/kernel/physmem.c  2006-03-20 06:53:29.000000000 +0100
25192 +++ linux-2.6.16/arch/um/kernel/physmem.c       2006-06-26 09:51:32.000000000 +0200
25193 @@ -225,7 +225,7 @@
25194  EXPORT_SYMBOL(physmem_remove_mapping);
25195  EXPORT_SYMBOL(physmem_subst_mapping);
25196  
25197 -void arch_free_page(struct page *page, int order)
25198 +int arch_free_page(struct page *page, int order)
25199  {
25200         void *virt;
25201         int i;
25202 @@ -234,6 +234,8 @@
25203                 virt = __va(page_to_phys(page + i));
25204                 physmem_remove_mapping(virt);
25205         }
25206 +
25207 +       return 0;
25208  }
25209  
25210  int is_remapped(void *virt)
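
The hunk above tracks an interface change made elsewhere in this patch: arch_free_page now returns an int, evidently so an architecture hook can signal that it has fully handled the free (the Xen side wants this for pages that must not go back to the buddy allocator); UML has nothing to intercept and always returns 0. A sketch of that convention, with free_pages_core as an illustrative stand-in for the generic caller:

	#include <stdio.h>

	struct page { int foreign; };

	/* Returns nonzero when the arch hook consumed the page. */
	static int arch_free_page(struct page *page, int order)
	{
		(void)order;
		return page->foreign;
	}

	/* Illustrative stand-in for the generic free path. */
	static void free_pages_core(struct page *page, int order)
	{
		if (arch_free_page(page, order))
			return;		/* arch swallowed the free */
		printf("page returned to the buddy allocator (order %d)\n", order);
	}

	int main(void)
	{
		struct page normal = { 0 }, foreign = { 1 };
		free_pages_core(&normal, 0);
		free_pages_core(&foreign, 0);
		return 0;
	}
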
25211 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/Kconfig linux-2.6.16/arch/x86_64/Kconfig
25212 --- linux-2.6.16.orig/arch/x86_64/Kconfig       2006-06-26 09:49:46.000000000 +0200
25213 +++ linux-2.6.16/arch/x86_64/Kconfig    2006-06-26 09:51:32.000000000 +0200
25214 @@ -119,6 +119,22 @@
25215  
25216  endchoice
25217  
25218 +config X86_64_XEN
25219 +       bool "Enable Xen compatible kernel"
25220 +       select SWIOTLB
25221 +       help
25222 +         This option will compile a kernel compatible with the Xen hypervisor.
25223 +
25224 +config X86_NO_TSS
25225 +       bool
25226 +       depends on X86_64_XEN
25227 +       default y
25228 +
25229 +config X86_NO_IDT
25230 +       bool
25231 +       depends on X86_64_XEN
25232 +       default y
25233 +
25234  #
25235  # Define implied options from the CPU selection here
25236  #
25237 @@ -134,6 +150,7 @@
25238  
25239  config X86_TSC
25240         bool
25241 +       depends on !X86_64_XEN
25242         default y
25243  
25244  config X86_GOOD_APIC
25245 @@ -176,7 +193,7 @@
25246  
25247  config X86_HT
25248         bool
25249 -       depends on SMP && !MK8
25250 +       depends on SMP && !MK8 && !X86_64_XEN
25251         default y
25252  
25253  config MATH_EMULATION
25254 @@ -190,14 +207,22 @@
25255  
25256  config X86_IO_APIC
25257         bool
25258 +       depends on !XEN_UNPRIVILEGED_GUEST
25259         default y
25260  
25261 +config X86_XEN_GENAPIC
25262 +       bool
25263 +       depends on X86_64_XEN
25264 +       default XEN_PRIVILEGED_GUEST || SMP
25265 +
25266  config X86_LOCAL_APIC
25267         bool
25268 +       depends on !XEN_UNPRIVILEGED_GUEST
25269         default y
25270  
25271  config MTRR
25272         bool "MTRR (Memory Type Range Register) support"
25273 +       depends on !XEN_UNPRIVILEGED_GUEST
25274         ---help---
25275           On Intel P6 family processors (Pentium Pro, Pentium II and later)
25276           the Memory Type Range Registers (MTRRs) may be used to control
25277 @@ -238,7 +263,7 @@
25278  
25279  config SCHED_SMT
25280         bool "SMT (Hyperthreading) scheduler support"
25281 -       depends on SMP
25282 +       depends on SMP && !X86_64_XEN
25283         default n
25284         help
25285           SMT scheduler support improves the CPU scheduler's decision making
25286 @@ -250,7 +275,7 @@
25287  
25288  config NUMA
25289         bool "Non Uniform Memory Access (NUMA) Support"
25290 -       depends on SMP
25291 +       depends on SMP && !X86_64_XEN
25292         help
25293          Enable NUMA (Non Uniform Memory Access) support. The kernel 
25294          will try to allocate memory used by a CPU on the local memory 
25295 @@ -325,6 +350,7 @@
25296         int "Maximum number of CPUs (2-256)"
25297         range 2 256
25298         depends on SMP
25299 +       default "16" if X86_64_XEN
25300         default "8"
25301         help
25302           This allows you to specify the maximum number of CPUs which this
25303 @@ -345,6 +371,7 @@
25304  
25305  config HPET_TIMER
25306         bool
25307 +       depends on !X86_64_XEN
25308         default y
25309         help
25310           Use the IA-PC HPET (High Precision Event Timer) to manage
25311 @@ -362,7 +389,7 @@
25312         bool "K8 GART IOMMU support"
25313         default y
25314         select SWIOTLB
25315 -       depends on PCI
25316 +       depends on PCI && !X86_64_XEN
25317         help
25318           Support the IOMMU. Needed to run systems with more than 3GB of memory
25319           properly with 32-bit PCI devices that do not support DAC (Double Address
25320 @@ -380,6 +407,7 @@
25321  
25322  config X86_MCE
25323         bool "Machine check support" if EMBEDDED
25324 +       depends on !X86_64_XEN
25325         default y
25326         help
25327            Include a machine check error handler to report hardware errors.
25328 @@ -405,7 +433,7 @@
25329  
25330  config KEXEC
25331         bool "kexec system call (EXPERIMENTAL)"
25332 -       depends on EXPERIMENTAL
25333 +       depends on EXPERIMENTAL && !X86_64_XEN
25334         help
25335           kexec is a system call that implements the ability to shutdown your
25336           current kernel, and to start another kernel.  It is like a reboot
25337 @@ -488,8 +516,11 @@
25338         default y
25339  
25340  menu "Power management options"
25341 +       depends on !XEN_UNPRIVILEGED_GUEST
25342  
25343 +if !X86_64_XEN
25344  source kernel/power/Kconfig
25345 +endif
25346  
25347  source "drivers/acpi/Kconfig"
25348  
25349 @@ -512,6 +543,21 @@
25350         bool "Support mmconfig PCI config space access"
25351         depends on PCI && ACPI
25352  
25353 +config XEN_PCIDEV_FRONTEND
25354 +       bool "Xen PCI Frontend"
25355 +       depends on PCI && X86_64_XEN
25356 +       default y
25357 +       help
25358 +         The PCI device frontend driver allows the kernel to import arbitrary
25359 +         PCI devices from a PCI backend to support PCI driver domains.
25360 +
25361 +config XEN_PCIDEV_FE_DEBUG
25362 +       bool "Xen PCI Frontend Debugging"
25363 +       depends on XEN_PCIDEV_FRONTEND
25364 +       default n
25365 +       help
25366 +         Enables some debug statements within the PCI Frontend.
25367 +
25368  config UNORDERED_IO
25369         bool "Unordered IO mapping access"
25370         depends on EXPERIMENTAL
25371 @@ -522,6 +568,7 @@
25372          from i386. Requires that the driver writer used memory barriers
25373          properly.
25374  
25375 +
25376  source "drivers/pci/pcie/Kconfig"
25377  
25378  source "drivers/pci/Kconfig"
25379 @@ -529,6 +576,7 @@
25380  source "drivers/pcmcia/Kconfig"
25381  
25382  source "drivers/pci/hotplug/Kconfig"
25383 +
25384  
25385  endmenu
25386  
25387 @@ -594,4 +642,6 @@
25388  
25389  source "crypto/Kconfig"
25390  
25391 +source "drivers/xen/Kconfig"
25392 +
25393  source "lib/Kconfig"
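
The new symbols above are consumed with ordinary #ifdef tests in the arch code: selecting X86_64_XEN pulls in X86_NO_TSS and X86_NO_IDT, which are evidently meant to compile out the native TSS/IDT setup. A compressed sketch of that pattern (the function body is illustrative, not the real cpu_init):

	#include <stdio.h>

	#define CONFIG_X86_64_XEN 1	/* pretend "Enable Xen compatible kernel" = y */
	#ifdef CONFIG_X86_64_XEN
	#define CONFIG_X86_NO_TSS 1	/* default y when X86_64_XEN */
	#define CONFIG_X86_NO_IDT 1
	#endif

	static void cpu_init(void)
	{
	#ifndef CONFIG_X86_NO_TSS
		printf("loading per-CPU TSS\n");	/* native kernels only */
	#endif
	#ifndef CONFIG_X86_NO_IDT
		printf("loading IDT\n");		/* native kernels only */
	#endif
		printf("common initialization\n");
	}

	int main(void) { cpu_init(); return 0; }
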
25394 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/Makefile linux-2.6.16/arch/x86_64/Makefile
25395 --- linux-2.6.16.orig/arch/x86_64/Makefile      2006-03-20 06:53:29.000000000 +0100
25396 +++ linux-2.6.16/arch/x86_64/Makefile   2006-06-26 09:51:32.000000000 +0200
25397 @@ -31,6 +31,10 @@
25398  cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
25399  CFLAGS += $(cflags-y)
25400  
25401 +cppflags-$(CONFIG_XEN) += \
25402 +       -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION)
25403 +CPPFLAGS += $(cppflags-y)
25404 +
25405  CFLAGS += -m64
25406  CFLAGS += -mno-red-zone
25407  CFLAGS += -mcmodel=kernel
25408 @@ -70,6 +74,21 @@
25409  .PHONY: bzImage bzlilo install archmrproper \
25410         fdimage fdimage144 fdimage288 archclean
25411  
25412 +ifdef CONFIG_XEN
25413 +CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS)
25414 +head-y := arch/x86_64/kernel/head-xen.o arch/x86_64/kernel/head64-xen.o arch/x86_64/kernel/init_task.o
25415 +LDFLAGS_vmlinux := -e _start
25416 +boot := arch/i386/boot-xen
25417 +.PHONY: vmlinuz
25418 +#Default target when executing "make"
25419 +all: vmlinuz
25420 +
25421 +vmlinuz: vmlinux
25422 +       $(Q)$(MAKE) $(build)=$(boot) $@
25423 +
25424 +install:
25425 +       $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@
25426 +else
25427  #Default target when executing "make"
25428  all: bzImage
25429  
25430 @@ -90,6 +109,7 @@
25431  
25432  install:
25433         $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ 
25434 +endif
25435  
25436  archclean:
25437         $(Q)$(MAKE) $(clean)=$(boot)
25438 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/ia32/Makefile linux-2.6.16/arch/x86_64/ia32/Makefile
25439 --- linux-2.6.16.orig/arch/x86_64/ia32/Makefile 2006-06-26 09:49:45.000000000 +0200
25440 +++ linux-2.6.16/arch/x86_64/ia32/Makefile      2006-06-26 09:54:59.000000000 +0200
25441 @@ -23,9 +23,25 @@
25442                            -Wl,-soname=linux-gate.so.1 -o $@ \
25443                            -Wl,-T,$(filter-out FORCE,$^)
25444  
25445 +$(obj)/vsyscall-int80.so \
25446  $(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
25447  $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
25448         $(call if_changed,syscall)
25449  
25450 -AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
25451 -AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
25452 +AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 -Iarch/i386/kernel
25453 +AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 -Iarch/i386/kernel
25454 +
25455 +ifdef CONFIG_XEN
25456 +AFLAGS_vsyscall-int80.o = -m32 -Iarch/i386/kernel
25457 +CFLAGS_syscall32-xen.o += -DUSE_INT80
25458 +AFLAGS_syscall32_syscall-xen.o += -DUSE_INT80
25459 +
25460 +$(obj)/syscall32_syscall-xen.o: \
25461 +       $(foreach F,int80 sysenter syscall,$(obj)/vsyscall-$F.so)
25462 +
25463 +targets := $(foreach F,int80 sysenter syscall,vsyscall-$F.o vsyscall-$F.so)
25464 +
25465 +include $(srctree)/scripts/Makefile.xen
25466 +
25467 +obj-y := $(call cherrypickxen, $(obj-y))
25468 +endif
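
cherrypickxen (defined in scripts/Makefile.xen, pulled in just above) rewrites an object list so that foo.o is replaced by foo-xen.o whenever a Xen variant of the source exists. A C sketch of that substitution rule; the file-existence probe is faked here and the object names are only examples:

	#include <stdio.h>
	#include <string.h>

	/* Stand-in for "does foo-xen.c or foo-xen.S exist in the tree?" */
	static int has_xen_variant(const char *obj)
	{
		return strcmp(obj, "syscall32.o") == 0;
	}

	/* Map foo.o -> foo-xen.o when a -xen source variant exists. */
	static void cherrypick(const char *obj, char *out, size_t len)
	{
		if (has_xen_variant(obj)) {
			int stem = (int)strlen(obj) - 2;	/* strip ".o" */
			snprintf(out, len, "%.*s-xen.o", stem, obj);
		} else {
			snprintf(out, len, "%s", obj);
		}
	}

	int main(void)
	{
		const char *objs[] = { "syscall32.o", "sys_ia32.o" };
		for (unsigned i = 0; i < 2; i++) {
			char picked[64];
			cherrypick(objs[i], picked, sizeof(picked));
			printf("%s -> %s\n", objs[i], picked);
		}
		return 0;
	}
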
25469 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/ia32/ia32entry-xen.S linux-2.6.16/arch/x86_64/ia32/ia32entry-xen.S
25470 --- linux-2.6.16.orig/arch/x86_64/ia32/ia32entry-xen.S  1970-01-01 01:00:00.000000000 +0100
25471 +++ linux-2.6.16/arch/x86_64/ia32/ia32entry-xen.S       2006-06-26 09:51:32.000000000 +0200
25472 @@ -0,0 +1,721 @@
25473 +/*
25474 + * Compatibility mode system call entry point for x86-64. 
25475 + *             
25476 + * Copyright 2000-2002 Andi Kleen, SuSE Labs.
25477 + */             
25478 +
25479 +#include <asm/dwarf2.h>
25480 +#include <asm/calling.h>
25481 +#include <asm/asm-offsets.h>
25482 +#include <asm/current.h>
25483 +#include <asm/errno.h>
25484 +#include <asm/ia32_unistd.h>   
25485 +#include <asm/thread_info.h>   
25486 +#include <asm/segment.h>
25487 +#include <asm/vsyscall32.h>
25488 +#include <linux/linkage.h>
25489 +
25490 +#define __XEN_X86_64 1
25491 +       
25492 +       .macro IA32_ARG_FIXUP noebp=0
25493 +       movl    %edi,%r8d
25494 +       .if \noebp
25495 +       .else
25496 +       movl    %ebp,%r9d
25497 +       .endif
25498 +       xchg    %ecx,%esi
25499 +       movl    %ebx,%edi
25500 +       movl    %edx,%edx       /* zero extension */
25501 +       .endm 
25502 +
25503 +       /* clobbers %eax */     
25504 +       .macro  CLEAR_RREGS
25505 +       xorl    %eax,%eax
25506 +       movq    %rax,R11(%rsp)
25507 +       movq    %rax,R10(%rsp)
25508 +       movq    %rax,R9(%rsp)
25509 +       movq    %rax,R8(%rsp)
25510 +       .endm
25511 +
25512 +#if defined (__XEN_X86_64)
25513 +#include "../kernel/xen_entry.S"
25514 +               
25515 +#define        __swapgs
25516 +#define __cli
25517 +#define __sti  
25518 +#else
25519 +/*
25520 + * Use the native instructions
25521 + */    
25522 +#define        __swapgs        swapgs
25523 +#define __cli          cli
25524 +#define __sti          sti     
25525 +#endif                 
25526 +
25527 +       .macro CFI_STARTPROC32 simple
25528 +       CFI_STARTPROC   \simple
25529 +       CFI_UNDEFINED   r8
25530 +       CFI_UNDEFINED   r9
25531 +       CFI_UNDEFINED   r10
25532 +       CFI_UNDEFINED   r11
25533 +       CFI_UNDEFINED   r12
25534 +       CFI_UNDEFINED   r13
25535 +       CFI_UNDEFINED   r14
25536 +       CFI_UNDEFINED   r15
25537 +       .endm
25538 +
25539 +/*
25540 + * 32bit SYSENTER instruction entry.
25541 + *
25542 + * Arguments:
25543 + * %eax        System call number.
25544 + * %ebx Arg1
25545 + * %ecx Arg2
25546 + * %edx Arg3
25547 + * %esi Arg4
25548 + * %edi Arg5
25549 + * %ebp user stack
25550 + * 0(%ebp) Arg6        
25551 + *     
25552 + * Interrupts off.
25553 + *     
25554 + * This is purely a fast path. For anything complicated we use the int 0x80
25555 + * path below. Set up a complete hardware stack frame to share code
25556 + * with the int 0x80 path.
25557 + */    
25558 +ENTRY(ia32_sysenter_target)
25559 +       CFI_STARTPROC32 simple
25560 +       CFI_DEF_CFA     rsp,0
25561 +       CFI_REGISTER    rsp,rbp
25562 +       __swapgs 
25563 +       movq    %gs:pda_kernelstack, %rsp
25564 +       addq    $(PDA_STACKOFFSET),%rsp
25565 +       XEN_UNBLOCK_EVENTS(%r11)        
25566 +       __sti
25567 +       movl    %ebp,%ebp               /* zero extension */
25568 +       pushq   $__USER32_DS
25569 +       CFI_ADJUST_CFA_OFFSET 8
25570 +       /*CFI_REL_OFFSET ss,0*/
25571 +       pushq   %rbp
25572 +       CFI_ADJUST_CFA_OFFSET 8
25573 +       CFI_REL_OFFSET rsp,0
25574 +       pushfq
25575 +       CFI_ADJUST_CFA_OFFSET 8
25576 +       /*CFI_REL_OFFSET rflags,0*/
25577 +       movl    $VSYSCALL32_SYSEXIT, %r10d
25578 +       CFI_REGISTER rip,r10
25579 +       pushq   $__USER32_CS
25580 +       CFI_ADJUST_CFA_OFFSET 8
25581 +       /*CFI_REL_OFFSET cs,0*/
25582 +       movl    %eax, %eax
25583 +       pushq   %r10
25584 +       CFI_ADJUST_CFA_OFFSET 8
25585 +       CFI_REL_OFFSET rip,0
25586 +       pushq   %rax
25587 +       CFI_ADJUST_CFA_OFFSET 8
25588 +       cld
25589 +       SAVE_ARGS 0,0,1
25590 +       /* no need to do an access_ok check here because rbp has been
25591 +          32bit zero extended */ 
25592 +1:     movl    (%rbp),%r9d
25593 +       .section __ex_table,"a"
25594 +       .quad 1b,ia32_badarg
25595 +       .previous       
25596 +       GET_THREAD_INFO(%r10)
25597 +       orl    $TS_COMPAT,threadinfo_status(%r10)
25598 +       testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
25599 +       CFI_REMEMBER_STATE
25600 +       jnz  sysenter_tracesys
25601 +sysenter_do_call:      
25602 +       cmpl    $(IA32_NR_syscalls),%eax
25603 +       jae     ia32_badsys
25604 +       IA32_ARG_FIXUP 1
25605 +       call    *ia32_sys_call_table(,%rax,8)
25606 +       movq    %rax,RAX-ARGOFFSET(%rsp)
25607 +       GET_THREAD_INFO(%r10)
25608 +       XEN_BLOCK_EVENTS(%r11)  
25609 +       __cli
25610 +       testl   $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
25611 +       jnz     int_ret_from_sys_call
25612 +       andl    $~TS_COMPAT,threadinfo_status(%r10)
25613 +       /* clear IF, that popfq doesn't enable interrupts early */
25614 +       andl  $~0x200,EFLAGS-R11(%rsp) 
25615 +       RESTORE_ARGS 1,24,1,1,1,1
25616 +       popfq
25617 +       CFI_ADJUST_CFA_OFFSET -8
25618 +       /*CFI_RESTORE rflags*/
25619 +       popq    %rcx                            /* User %esp */
25620 +       CFI_ADJUST_CFA_OFFSET -8
25621 +       CFI_REGISTER rsp,rcx
25622 +       movl    $VSYSCALL32_SYSEXIT,%edx        /* User %eip */
25623 +       CFI_REGISTER rip,rdx
25624 +       __swapgs
25625 +       XEN_UNBLOCK_EVENTS(%r11)                
25626 +       __sti           /* sti only takes effect after the next instruction */
25627 +       /* sysexit */
25628 +       .byte   0xf, 0x35  /* TBD */
25629 +
25630 +sysenter_tracesys:
25631 +       CFI_RESTORE_STATE
25632 +       SAVE_REST
25633 +       CLEAR_RREGS
25634 +       movq    $-ENOSYS,RAX(%rsp)      /* really needed? */
25635 +       movq    %rsp,%rdi        /* &pt_regs -> arg1 */
25636 +       call    syscall_trace_enter
25637 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
25638 +       RESTORE_REST
25639 +       movl    %ebp, %ebp
25640 +       /* no need to do an access_ok check here because rbp has been
25641 +          32bit zero extended */ 
25642 +1:     movl    (%rbp),%r9d
25643 +       .section __ex_table,"a"
25644 +       .quad 1b,ia32_badarg
25645 +       .previous
25646 +       jmp     sysenter_do_call
25647 +       CFI_ENDPROC
25648 +
25649 +/*
25650 + * 32bit SYSCALL instruction entry.
25651 + *
25652 + * Arguments:
25653 + * %eax        System call number.
25654 + * %ebx Arg1
25655 + * %ecx return EIP 
25656 + * %edx Arg3
25657 + * %esi Arg4
25658 + * %edi Arg5
25659 + * %ebp Arg2    [note: not saved in the stack frame, should not be touched]
25660 + * %esp user stack 
25661 + * 0(%esp) Arg6
25662 + *     
25663 + * Interrupts off.
25664 + *     
25665 + * This is purely a fast path. For anything complicated we use the int 0x80
25666 + * path below. Set up a complete hardware stack frame to share code
25667 + * with the int 0x80 path.     
25668 + */    
25669 +ENTRY(ia32_cstar_target)
25670 +       CFI_STARTPROC32 simple
25671 +       CFI_DEF_CFA     rsp,0
25672 +       CFI_REGISTER    rip,rcx
25673 +       /*CFI_REGISTER  rflags,r11*/
25674 +       __swapgs
25675 +       movl    %esp,%r8d
25676 +       CFI_REGISTER    rsp,r8
25677 +       movq    %gs:pda_kernelstack,%rsp
25678 +       XEN_UNBLOCK_EVENTS(%r11)        
25679 +       __sti
25680 +       SAVE_ARGS 8,1,1
25681 +       movl    %eax,%eax       /* zero extension */
25682 +       movq    %rax,ORIG_RAX-ARGOFFSET(%rsp)
25683 +       movq    %rcx,RIP-ARGOFFSET(%rsp)
25684 +       CFI_REL_OFFSET rip,RIP-ARGOFFSET
25685 +       movq    %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
25686 +       movl    %ebp,%ecx
25687 +       movq    $__USER32_CS,CS-ARGOFFSET(%rsp)
25688 +       movq    $__USER32_DS,SS-ARGOFFSET(%rsp)
25689 +       movq    %r11,EFLAGS-ARGOFFSET(%rsp)
25690 +       /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
25691 +       movq    %r8,RSP-ARGOFFSET(%rsp) 
25692 +       CFI_REL_OFFSET rsp,RSP-ARGOFFSET
25693 +       /* no need to do an access_ok check here because r8 has been
25694 +          32bit zero extended */ 
25695 +       /* hardware stack frame is complete now */      
25696 +1:     movl    (%r8),%r9d
25697 +       .section __ex_table,"a"
25698 +       .quad 1b,ia32_badarg
25699 +       .previous       
25700 +       GET_THREAD_INFO(%r10)
25701 +       orl   $TS_COMPAT,threadinfo_status(%r10)
25702 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
25703 +       CFI_REMEMBER_STATE
25704 +       jnz   cstar_tracesys
25705 +cstar_do_call: 
25706 +       cmpl $IA32_NR_syscalls,%eax
25707 +       jae  ia32_badsys
25708 +       IA32_ARG_FIXUP 1
25709 +       call *ia32_sys_call_table(,%rax,8)
25710 +       movq %rax,RAX-ARGOFFSET(%rsp)
25711 +       GET_THREAD_INFO(%r10)
25712 +       XEN_BLOCK_EVENTS(%r11)          
25713 +       __cli
25714 +       testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
25715 +       jnz  int_ret_from_sys_call
25716 +       andl $~TS_COMPAT,threadinfo_status(%r10)
25717 +       RESTORE_ARGS 1,-ARG_SKIP,1,1,1
25718 +       movl RIP-ARGOFFSET(%rsp),%ecx
25719 +       CFI_REGISTER rip,rcx
25720 +       movl EFLAGS-ARGOFFSET(%rsp),%r11d       
25721 +       /*CFI_REGISTER rflags,r11*/
25722 +       movl RSP-ARGOFFSET(%rsp),%esp
25723 +       CFI_RESTORE rsp
25724 +       __swapgs
25725 +       sysretl  /* TBD */
25726 +       
25727 +cstar_tracesys:        
25728 +       CFI_RESTORE_STATE
25729 +       SAVE_REST
25730 +       CLEAR_RREGS
25731 +       movq $-ENOSYS,RAX(%rsp) /* really needed? */
25732 +       movq %rsp,%rdi        /* &pt_regs -> arg1 */
25733 +       call syscall_trace_enter
25734 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
25735 +       RESTORE_REST
25736 +       movl RSP-ARGOFFSET(%rsp), %r8d
25737 +       /* no need to do an access_ok check here because r8 has been
25738 +          32bit zero extended */ 
25739 +1:     movl    (%r8),%r9d
25740 +       .section __ex_table,"a"
25741 +       .quad 1b,ia32_badarg
25742 +       .previous
25743 +       jmp cstar_do_call
25744 +                               
25745 +ia32_badarg:
25746 +       movq $-EFAULT,%rax
25747 +       jmp ia32_sysret
25748 +       CFI_ENDPROC
25749 +
25750 +/* 
25751 + * Emulated IA32 system calls via int 0x80. 
25752 + *
25753 + * Arguments:   
25754 + * %eax        System call number.
25755 + * %ebx Arg1
25756 + * %ecx Arg2
25757 + * %edx Arg3
25758 + * %esi Arg4
25759 + * %edi Arg5
25760 + * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
25761 + *
25762 + * Notes:
25763 + * Uses the same stack frame as the x86-64 version.    
25764 + * All registers except %eax must be saved (but ptrace may violate that)
25765 + * Arguments are zero extended. For system calls that want sign extension and
25766 + * take long arguments a wrapper is needed. Most calls can just be called
25767 + * directly.
25768 + * Assumes it is only called from user space and entered with interrupts off.  
25769 + */                            
25770 +
25771 +ENTRY(ia32_syscall)
25772 +       CFI_STARTPROC   simple
25773 +       CFI_DEF_CFA     rsp,SS+8-RIP
25774 +       /*CFI_REL_OFFSET        ss,SS-RIP*/
25775 +       CFI_REL_OFFSET  rsp,RSP-RIP
25776 +       /*CFI_REL_OFFSET        rflags,EFLAGS-RIP*/
25777 +       /*CFI_REL_OFFSET        cs,CS-RIP*/
25778 +       CFI_REL_OFFSET  rip,RIP-RIP
25779 +       __swapgs
25780 +       XEN_UNBLOCK_EVENTS(%r11)
25781 +       __sti
25782 +       movq (%rsp),%rcx
25783 +       movq 8(%rsp),%r11
25784 +        addq $0x10,%rsp /* skip rcx and r11 */
25785 +       movl %eax,%eax
25786 +       pushq %rax
25787 +       CFI_ADJUST_CFA_OFFSET 8
25788 +       cld
25789 +/* 1:  jmp 1b   */
25790 +       /* note the registers are not zero extended to the stack frame.
25791 +          this could be a problem. */
25792 +       SAVE_ARGS 0,0,1
25793 +       GET_THREAD_INFO(%r10)
25794 +       orl   $TS_COMPAT,threadinfo_status(%r10)
25795 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
25796 +       jnz ia32_tracesys
25797 +ia32_do_syscall:       
25798 +       cmpl $(IA32_NR_syscalls),%eax
25799 +       jae  ia32_badsys
25800 +       IA32_ARG_FIXUP
25801 +       call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
25802 +ia32_sysret:
25803 +       movq %rax,RAX-ARGOFFSET(%rsp)
25804 +       jmp int_ret_from_sys_call 
25805 +
25806 +ia32_tracesys:                  
25807 +       SAVE_REST
25808 +       movq $-ENOSYS,RAX(%rsp) /* really needed? */
25809 +       movq %rsp,%rdi        /* &pt_regs -> arg1 */
25810 +       call syscall_trace_enter
25811 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
25812 +       RESTORE_REST
25813 +       jmp ia32_do_syscall
25814 +
25815 +ia32_badsys:
25816 +       movq $0,ORIG_RAX-ARGOFFSET(%rsp)
25817 +       movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
25818 +       jmp int_ret_from_sys_call
25819 +
25820 +ni_syscall:
25821 +       movq %rax,%rdi
25822 +       jmp  sys32_ni_syscall                   
25823 +
25824 +quiet_ni_syscall:
25825 +       movq $-ENOSYS,%rax
25826 +       ret
25827 +       CFI_ENDPROC
25828 +       
25829 +       .macro PTREGSCALL label, func, arg
25830 +       .globl \label
25831 +\label:
25832 +       leaq \func(%rip),%rax
25833 +       leaq -ARGOFFSET+8(%rsp),\arg    /* 8 for return address */
25834 +       jmp  ia32_ptregs_common 
25835 +       .endm
25836 +
25837 +       CFI_STARTPROC32
25838 +
25839 +       PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
25840 +       PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
25841 +       PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
25842 +       PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
25843 +       PTREGSCALL stub32_execve, sys32_execve, %rcx
25844 +       PTREGSCALL stub32_fork, sys_fork, %rdi
25845 +       PTREGSCALL stub32_clone, sys32_clone, %rdx
25846 +       PTREGSCALL stub32_vfork, sys_vfork, %rdi
25847 +       PTREGSCALL stub32_iopl, sys_iopl, %rsi
25848 +       PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
25849 +
25850 +ENTRY(ia32_ptregs_common)
25851 +       popq %r11
25852 +       CFI_ENDPROC
25853 +       CFI_STARTPROC32 simple
25854 +       CFI_DEF_CFA     rsp,SS+8-ARGOFFSET
25855 +       CFI_REL_OFFSET  rax,RAX-ARGOFFSET
25856 +       CFI_REL_OFFSET  rcx,RCX-ARGOFFSET
25857 +       CFI_REL_OFFSET  rdx,RDX-ARGOFFSET
25858 +       CFI_REL_OFFSET  rsi,RSI-ARGOFFSET
25859 +       CFI_REL_OFFSET  rdi,RDI-ARGOFFSET
25860 +       CFI_REL_OFFSET  rip,RIP-ARGOFFSET
25861 +/*     CFI_REL_OFFSET  cs,CS-ARGOFFSET*/
25862 +/*     CFI_REL_OFFSET  rflags,EFLAGS-ARGOFFSET*/
25863 +       CFI_REL_OFFSET  rsp,RSP-ARGOFFSET
25864 +/*     CFI_REL_OFFSET  ss,SS-ARGOFFSET*/
25865 +       SAVE_REST
25866 +       call *%rax
25867 +       RESTORE_REST
25868 +       jmp  ia32_sysret        /* misbalances the return cache */
25869 +       CFI_ENDPROC
25870 +
25871 +       .section .rodata,"a"
25872 +       .align 8
25873 +       .globl ia32_sys_call_table
25874 +ia32_sys_call_table:
25875 +       .quad sys_restart_syscall
25876 +       .quad sys_exit
25877 +       .quad stub32_fork
25878 +       .quad sys_read
25879 +       .quad sys_write
25880 +       .quad compat_sys_open           /* 5 */
25881 +       .quad sys_close
25882 +       .quad sys32_waitpid
25883 +       .quad sys_creat
25884 +       .quad sys_link
25885 +       .quad sys_unlink                /* 10 */
25886 +       .quad stub32_execve
25887 +       .quad sys_chdir
25888 +       .quad compat_sys_time
25889 +       .quad sys_mknod
25890 +       .quad sys_chmod         /* 15 */
25891 +       .quad sys_lchown16
25892 +       .quad quiet_ni_syscall                  /* old break syscall holder */
25893 +       .quad sys_stat
25894 +       .quad sys32_lseek
25895 +       .quad sys_getpid                /* 20 */
25896 +       .quad compat_sys_mount  /* mount  */
25897 +       .quad sys_oldumount     /* old_umount  */
25898 +       .quad sys_setuid16
25899 +       .quad sys_getuid16
25900 +       .quad compat_sys_stime  /* stime */             /* 25 */
25901 +       .quad sys32_ptrace      /* ptrace */
25902 +       .quad sys_alarm
25903 +       .quad sys_fstat /* (old)fstat */
25904 +       .quad sys_pause
25905 +       .quad compat_sys_utime  /* 30 */
25906 +       .quad quiet_ni_syscall  /* old stty syscall holder */
25907 +       .quad quiet_ni_syscall  /* old gtty syscall holder */
25908 +       .quad sys_access
25909 +       .quad sys_nice  
25910 +       .quad quiet_ni_syscall  /* 35 */        /* old ftime syscall holder */
25911 +       .quad sys_sync
25912 +       .quad sys32_kill
25913 +       .quad sys_rename
25914 +       .quad sys_mkdir
25915 +       .quad sys_rmdir         /* 40 */
25916 +       .quad sys_dup
25917 +       .quad sys32_pipe
25918 +       .quad compat_sys_times
25919 +       .quad quiet_ni_syscall                  /* old prof syscall holder */
25920 +       .quad sys_brk           /* 45 */
25921 +       .quad sys_setgid16
25922 +       .quad sys_getgid16
25923 +       .quad sys_signal
25924 +       .quad sys_geteuid16
25925 +       .quad sys_getegid16     /* 50 */
25926 +       .quad sys_acct
25927 +       .quad sys_umount                        /* new_umount */
25928 +       .quad quiet_ni_syscall                  /* old lock syscall holder */
25929 +       .quad compat_sys_ioctl
25930 +       .quad compat_sys_fcntl64                /* 55 */
25931 +       .quad quiet_ni_syscall                  /* old mpx syscall holder */
25932 +       .quad sys_setpgid
25933 +       .quad quiet_ni_syscall                  /* old ulimit syscall holder */
25934 +       .quad sys32_olduname
25935 +       .quad sys_umask         /* 60 */
25936 +       .quad sys_chroot
25937 +       .quad sys32_ustat
25938 +       .quad sys_dup2
25939 +       .quad sys_getppid
25940 +       .quad sys_getpgrp               /* 65 */
25941 +       .quad sys_setsid
25942 +       .quad sys32_sigaction
25943 +       .quad sys_sgetmask
25944 +       .quad sys_ssetmask
25945 +       .quad sys_setreuid16    /* 70 */
25946 +       .quad sys_setregid16
25947 +       .quad stub32_sigsuspend
25948 +       .quad compat_sys_sigpending
25949 +       .quad sys_sethostname
25950 +       .quad compat_sys_setrlimit      /* 75 */
25951 +       .quad compat_sys_old_getrlimit  /* old_getrlimit */
25952 +       .quad compat_sys_getrusage
25953 +       .quad sys32_gettimeofday
25954 +       .quad sys32_settimeofday
25955 +       .quad sys_getgroups16   /* 80 */
25956 +       .quad sys_setgroups16
25957 +       .quad sys32_old_select
25958 +       .quad sys_symlink
25959 +       .quad sys_lstat
25960 +       .quad sys_readlink              /* 85 */
25961 +#ifdef CONFIG_IA32_AOUT
25962 +       .quad sys_uselib
25963 +#else
25964 +       .quad quiet_ni_syscall
25965 +#endif
25966 +       .quad sys_swapon
25967 +       .quad sys_reboot
25968 +       .quad compat_sys_old_readdir
25969 +       .quad sys32_mmap                /* 90 */
25970 +       .quad sys_munmap
25971 +       .quad sys_truncate
25972 +       .quad sys_ftruncate
25973 +       .quad sys_fchmod
25974 +       .quad sys_fchown16              /* 95 */
25975 +       .quad sys_getpriority
25976 +       .quad sys_setpriority
25977 +       .quad quiet_ni_syscall                  /* old profil syscall holder */
25978 +       .quad compat_sys_statfs
25979 +       .quad compat_sys_fstatfs                /* 100 */
25980 +       .quad sys_ioperm
25981 +       .quad compat_sys_socketcall
25982 +       .quad sys_syslog
25983 +       .quad compat_sys_setitimer
25984 +       .quad compat_sys_getitimer      /* 105 */
25985 +       .quad compat_sys_newstat
25986 +       .quad compat_sys_newlstat
25987 +       .quad compat_sys_newfstat
25988 +       .quad sys32_uname
25989 +       .quad stub32_iopl               /* 110 */
25990 +       .quad sys_vhangup
25991 +       .quad quiet_ni_syscall  /* old "idle" system call */
25992 +       .quad sys32_vm86_warning        /* vm86old */ 
25993 +       .quad compat_sys_wait4
25994 +       .quad sys_swapoff               /* 115 */
25995 +       .quad sys32_sysinfo
25996 +       .quad sys32_ipc
25997 +       .quad sys_fsync
25998 +       .quad stub32_sigreturn
25999 +       .quad stub32_clone              /* 120 */
26000 +       .quad sys_setdomainname
26001 +       .quad sys_uname
26002 +       .quad sys_modify_ldt
26003 +       .quad sys32_adjtimex
26004 +       .quad sys32_mprotect            /* 125 */
26005 +       .quad compat_sys_sigprocmask
26006 +       .quad quiet_ni_syscall          /* create_module */
26007 +       .quad sys_init_module
26008 +       .quad sys_delete_module
26009 +       .quad quiet_ni_syscall          /* 130  get_kernel_syms */
26010 +       .quad sys_quotactl
26011 +       .quad sys_getpgid
26012 +       .quad sys_fchdir
26013 +       .quad quiet_ni_syscall  /* bdflush */
26014 +       .quad sys_sysfs         /* 135 */
26015 +       .quad sys_personality
26016 +       .quad quiet_ni_syscall  /* for afs_syscall */
26017 +       .quad sys_setfsuid16
26018 +       .quad sys_setfsgid16
26019 +       .quad sys_llseek                /* 140 */
26020 +       .quad compat_sys_getdents
26021 +       .quad compat_sys_select
26022 +       .quad sys_flock
26023 +       .quad sys_msync
26024 +       .quad compat_sys_readv          /* 145 */
26025 +       .quad compat_sys_writev
26026 +       .quad sys_getsid
26027 +       .quad sys_fdatasync
26028 +       .quad sys32_sysctl      /* sysctl */
26029 +       .quad sys_mlock         /* 150 */
26030 +       .quad sys_munlock
26031 +       .quad sys_mlockall
26032 +       .quad sys_munlockall
26033 +       .quad sys_sched_setparam
26034 +       .quad sys_sched_getparam   /* 155 */
26035 +       .quad sys_sched_setscheduler
26036 +       .quad sys_sched_getscheduler
26037 +       .quad sys_sched_yield
26038 +       .quad sys_sched_get_priority_max
26039 +       .quad sys_sched_get_priority_min  /* 160 */
26040 +       .quad sys_sched_rr_get_interval
26041 +       .quad compat_sys_nanosleep
26042 +       .quad sys_mremap
26043 +       .quad sys_setresuid16
26044 +       .quad sys_getresuid16   /* 165 */
26045 +       .quad sys32_vm86_warning        /* vm86 */ 
26046 +       .quad quiet_ni_syscall  /* query_module */
26047 +       .quad sys_poll
26048 +       .quad compat_sys_nfsservctl
26049 +       .quad sys_setresgid16   /* 170 */
26050 +       .quad sys_getresgid16
26051 +       .quad sys_prctl
26052 +       .quad stub32_rt_sigreturn
26053 +       .quad sys32_rt_sigaction
26054 +       .quad sys32_rt_sigprocmask      /* 175 */
26055 +       .quad sys32_rt_sigpending
26056 +       .quad compat_sys_rt_sigtimedwait
26057 +       .quad sys32_rt_sigqueueinfo
26058 +       .quad stub32_rt_sigsuspend
26059 +       .quad sys32_pread               /* 180 */
26060 +       .quad sys32_pwrite
26061 +       .quad sys_chown16
26062 +       .quad sys_getcwd
26063 +       .quad sys_capget
26064 +       .quad sys_capset
26065 +       .quad stub32_sigaltstack
26066 +       .quad sys32_sendfile
26067 +       .quad quiet_ni_syscall          /* streams1 */
26068 +       .quad quiet_ni_syscall          /* streams2 */
26069 +       .quad stub32_vfork            /* 190 */
26070 +       .quad compat_sys_getrlimit
26071 +       .quad sys32_mmap2
26072 +       .quad sys32_truncate64
26073 +       .quad sys32_ftruncate64
26074 +       .quad sys32_stat64              /* 195 */
26075 +       .quad sys32_lstat64
26076 +       .quad sys32_fstat64
26077 +       .quad sys_lchown
26078 +       .quad sys_getuid
26079 +       .quad sys_getgid                /* 200 */
26080 +       .quad sys_geteuid
26081 +       .quad sys_getegid
26082 +       .quad sys_setreuid
26083 +       .quad sys_setregid
26084 +       .quad sys_getgroups     /* 205 */
26085 +       .quad sys_setgroups
26086 +       .quad sys_fchown
26087 +       .quad sys_setresuid
26088 +       .quad sys_getresuid
26089 +       .quad sys_setresgid     /* 210 */
26090 +       .quad sys_getresgid
26091 +       .quad sys_chown
26092 +       .quad sys_setuid
26093 +       .quad sys_setgid
26094 +       .quad sys_setfsuid              /* 215 */
26095 +       .quad sys_setfsgid
26096 +       .quad sys_pivot_root
26097 +       .quad sys_mincore
26098 +       .quad sys_madvise
26099 +       .quad compat_sys_getdents64     /* 220 getdents64 */
26100 +       .quad compat_sys_fcntl64        
26101 +       .quad quiet_ni_syscall          /* tux */
26102 +       .quad quiet_ni_syscall          /* security */
26103 +       .quad sys_gettid        
26104 +       .quad sys_readahead     /* 225 */
26105 +       .quad sys_setxattr
26106 +       .quad sys_lsetxattr
26107 +       .quad sys_fsetxattr
26108 +       .quad sys_getxattr
26109 +       .quad sys_lgetxattr     /* 230 */
26110 +       .quad sys_fgetxattr
26111 +       .quad sys_listxattr
26112 +       .quad sys_llistxattr
26113 +       .quad sys_flistxattr
26114 +       .quad sys_removexattr   /* 235 */
26115 +       .quad sys_lremovexattr
26116 +       .quad sys_fremovexattr
26117 +       .quad sys_tkill
26118 +       .quad sys_sendfile64 
26119 +       .quad compat_sys_futex          /* 240 */
26120 +       .quad compat_sys_sched_setaffinity
26121 +       .quad compat_sys_sched_getaffinity
26122 +       .quad sys32_set_thread_area
26123 +       .quad sys32_get_thread_area
26124 +       .quad compat_sys_io_setup       /* 245 */
26125 +       .quad sys_io_destroy
26126 +       .quad compat_sys_io_getevents
26127 +       .quad compat_sys_io_submit
26128 +       .quad sys_io_cancel
26129 +       .quad sys_fadvise64             /* 250 */
26130 +       .quad quiet_ni_syscall  /* free_huge_pages */
26131 +       .quad sys_exit_group
26132 +       .quad sys32_lookup_dcookie
26133 +       .quad sys_epoll_create
26134 +       .quad sys_epoll_ctl             /* 255 */
26135 +       .quad sys_epoll_wait
26136 +       .quad sys_remap_file_pages
26137 +       .quad sys_set_tid_address
26138 +       .quad compat_sys_timer_create
26139 +       .quad compat_sys_timer_settime  /* 260 */
26140 +       .quad compat_sys_timer_gettime
26141 +       .quad sys_timer_getoverrun
26142 +       .quad sys_timer_delete
26143 +       .quad compat_sys_clock_settime
26144 +       .quad compat_sys_clock_gettime  /* 265 */
26145 +       .quad compat_sys_clock_getres
26146 +       .quad compat_sys_clock_nanosleep
26147 +       .quad compat_sys_statfs64
26148 +       .quad compat_sys_fstatfs64
26149 +       .quad sys_tgkill                /* 270 */
26150 +       .quad compat_sys_utimes
26151 +       .quad sys32_fadvise64_64
26152 +       .quad quiet_ni_syscall  /* sys_vserver */
26153 +       .quad sys_mbind
26154 +       .quad compat_sys_get_mempolicy  /* 275 */
26155 +       .quad sys_set_mempolicy
26156 +       .quad compat_sys_mq_open
26157 +       .quad sys_mq_unlink
26158 +       .quad compat_sys_mq_timedsend
26159 +       .quad compat_sys_mq_timedreceive        /* 280 */
26160 +       .quad compat_sys_mq_notify
26161 +       .quad compat_sys_mq_getsetattr
26162 +       .quad compat_sys_kexec_load     /* reserved for kexec */
26163 +       .quad compat_sys_waitid
26164 +       .quad quiet_ni_syscall          /* 285: sys_altroot */
26165 +       .quad sys_add_key
26166 +       .quad sys_request_key
26167 +       .quad sys_keyctl
26168 +       .quad sys_ioprio_set
26169 +       .quad sys_ioprio_get            /* 290 */
26170 +       .quad sys_inotify_init
26171 +       .quad sys_inotify_add_watch
26172 +       .quad sys_inotify_rm_watch
26173 +       .quad sys_migrate_pages
26174 +       .quad compat_sys_openat         /* 295 */
26175 +       .quad sys_mkdirat
26176 +       .quad sys_mknodat
26177 +       .quad sys_fchownat
26178 +       .quad compat_sys_futimesat
26179 +       .quad sys32_fstatat             /* 300 */
26180 +       .quad sys_unlinkat
26181 +       .quad sys_renameat
26182 +       .quad sys_linkat
26183 +       .quad sys_symlinkat
26184 +       .quad sys_readlinkat            /* 305 */
26185 +       .quad sys_fchmodat
26186 +       .quad sys_faccessat
26187 +       .quad sys_ni_syscall            /* pselect6 for now */
26188 +       .quad sys_ni_syscall            /* ppoll for now */
26189 +       .quad sys_unshare               /* 310 */
26190 +ia32_syscall_end:              
26191 +       .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
26192 +               .quad ni_syscall
26193 +       .endr
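(Editor's aside, not part of the patch: in C terms, the dispatch that
ia32_do_syscall and the .rept padding above implement is roughly the
following sketch.  IA32_NR_syscalls and the 8-byte table slots come from
the assembly; the function and parameter names are illustrative.)

        /* Bounds-check the syscall number, then call through the table.
         * Slots past the last populated entry were filled with ni_syscall
         * by the .rept directive, so every index below the limit is safe. */
        typedef long (*ia32_sys_ptr)(unsigned long, unsigned long,
                                     unsigned long, unsigned long,
                                     unsigned long, unsigned long);

        long ia32_dispatch(ia32_sys_ptr table[], unsigned int nr_syscalls,
                           unsigned int eax, unsigned long arg[6])
        {
                if (eax >= nr_syscalls)
                        return -38;     /* -ENOSYS, the ia32_badsys path */
                return table[eax](arg[0], arg[1], arg[2],
                                  arg[3], arg[4], arg[5]);
        }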
26194 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/ia32/syscall32-xen.c linux-2.6.16/arch/x86_64/ia32/syscall32-xen.c
26195 --- linux-2.6.16.orig/arch/x86_64/ia32/syscall32-xen.c  1970-01-01 01:00:00.000000000 +0100
26196 +++ linux-2.6.16/arch/x86_64/ia32/syscall32-xen.c       2006-06-26 09:51:32.000000000 +0200
26197 @@ -0,0 +1,128 @@
26198 +/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
26199 +
26200 +/* vsyscall handling for 32-bit processes. Map a stub page into the
26201 +   process on demand because 32-bit code cannot reach the kernel's fixmaps */
26202 +
26203 +#include <linux/mm.h>
26204 +#include <linux/string.h>
26205 +#include <linux/kernel.h>
26206 +#include <linux/gfp.h>
26207 +#include <linux/init.h>
26208 +#include <linux/stringify.h>
26209 +#include <linux/security.h>
26210 +#include <asm/proto.h>
26211 +#include <asm/tlbflush.h>
26212 +#include <asm/ia32_unistd.h>
26213 +
26214 +#ifdef USE_INT80
26215 +extern unsigned char syscall32_int80[], syscall32_int80_end[];
26216 +#endif
26217 +extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
26218 +extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
26219 +extern int sysctl_vsyscall32;
26220 +
26221 +char *syscall32_page; 
26222 +#ifndef USE_INT80
26223 +static int use_sysenter = -1;
26224 +#endif
26225 +
26226 +static struct page *
26227 +syscall32_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
26228 +{
26229 +       struct page *p = virt_to_page(adr - vma->vm_start + syscall32_page);
26230 +       get_page(p);
26231 +       return p;
26232 +}
26233 +
26234 +/* Prevent VMA merging */
26235 +static void syscall32_vma_close(struct vm_area_struct *vma)
26236 +{
26237 +}
26238 +
26239 +static struct vm_operations_struct syscall32_vm_ops = {
26240 +       .close = syscall32_vma_close,
26241 +       .nopage = syscall32_nopage,
26242 +};
26243 +
26244 +struct linux_binprm;
26245 +
26246 +/* Set up a VMA at program startup for the vsyscall page */
26247 +int syscall32_setup_pages(struct linux_binprm *bprm, int exstack)
26248 +{
26249 +       int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT;
26250 +       struct vm_area_struct *vma;
26251 +       struct mm_struct *mm = current->mm;
26252 +       int ret;
26253 +
26254 +       vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
26255 +       if (!vma)
26256 +               return -ENOMEM;
26257 +
26258 +       memset(vma, 0, sizeof(struct vm_area_struct));
26259 +       /* Could randomize here */
26260 +       vma->vm_start = VSYSCALL32_BASE;
26261 +       vma->vm_end = VSYSCALL32_END;
26262 +       /* MAYWRITE to allow gdb to COW and set breakpoints */
26263 +       vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
26264 +       vma->vm_flags |= mm->def_flags;
26265 +       vma->vm_page_prot = protection_map[vma->vm_flags & 7];
26266 +       vma->vm_ops = &syscall32_vm_ops;
26267 +       vma->vm_mm = mm;
26268 +
26269 +       down_write(&mm->mmap_sem);
26270 +       if ((ret = insert_vm_struct(mm, vma))) {
26271 +               up_write(&mm->mmap_sem);
26272 +               kmem_cache_free(vm_area_cachep, vma);
26273 +               return ret;
26274 +       }
26275 +       mm->total_vm += npages;
26276 +       up_write(&mm->mmap_sem);
26277 +       return 0;
26278 +}
26279 +
26280 +static int __init init_syscall32(void)
26281 +{ 
26282 +       syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); 
26283 +       if (!syscall32_page) 
26284 +               panic("Cannot allocate syscall32 page"); 
26285 +
26286 +#ifdef USE_INT80
26287 +       /*
26288 +        * At this point we use int 0x80.
26289 +        */
26290 +       memcpy(syscall32_page, syscall32_int80,
26291 +              syscall32_int80_end - syscall32_int80);
26292 +#else
26293 +       if (use_sysenter > 0) {
26294 +               memcpy(syscall32_page, syscall32_sysenter,
26295 +                      syscall32_sysenter_end - syscall32_sysenter);
26296 +       } else {
26297 +               memcpy(syscall32_page, syscall32_syscall,
26298 +                      syscall32_syscall_end - syscall32_syscall);
26299 +       }       
26300 +#endif
26301 +       return 0;
26302 +} 
26303 +
26304 +/*
26305 + * This must be done early in case we have an initrd containing 32-bit
26306 + * binaries (e.g., hotplug). This could be pushed upstream to arch/x86_64.
26307 + */    
26308 +core_initcall(init_syscall32); 
26309 +
26310 +/* May not be __init: called during resume */
26311 +void syscall32_cpu_init(void)
26312 +{
26313 +#ifndef USE_INT80
26314 +       if (use_sysenter < 0)
26315 +               use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
26316 +
26317 +       /* Load these always in case some future AMD CPU supports
26318 +          SYSENTER from compat mode too. */
26319 +       checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
26320 +       checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
26321 +       checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
26322 +
26323 +       wrmsrl(MSR_CSTAR, ia32_cstar_target);
26324 +#endif
26325 +}
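(Editor's aside, not part of the patch: a sketch of how 32-bit user code
uses the page set up above.  The stub behaves like an ordinary function,
so a compat process enters the kernel without caring whether
init_syscall32() copied in the int80, sysenter or syscall variant.  The
entry address and syscall number below are illustrative; real code takes
the address from its AT_SYSINFO auxv entry.)

        /* Hypothetical 32-bit caller: eax carries the syscall number
         * (20 is __NR_getpid on i386); the stub supplies the actual
         * kernel-entry instruction. */
        static long vsyscall_getpid(void *kernel_vsyscall)
        {
                long ret;
                asm volatile("call *%1"
                             : "=a" (ret)
                             : "r" (kernel_vsyscall), "a" (20)
                             : "memory");
                return ret;
        }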
26326 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/ia32/syscall32_syscall-xen.S linux-2.6.16/arch/x86_64/ia32/syscall32_syscall-xen.S
26327 --- linux-2.6.16.orig/arch/x86_64/ia32/syscall32_syscall-xen.S  1970-01-01 01:00:00.000000000 +0100
26328 +++ linux-2.6.16/arch/x86_64/ia32/syscall32_syscall-xen.S       2006-06-26 09:51:32.000000000 +0200
26329 @@ -0,0 +1,28 @@
26330 +/* 32bit VDSOs mapped into user space. */
26331 +
26332 +       .section ".init.data","aw"
26333 +
26334 +#ifdef USE_INT80
26335 +
26336 +       .globl syscall32_int80
26337 +       .globl syscall32_int80_end
26338 +
26339 +syscall32_int80:
26340 +       .incbin "arch/x86_64/ia32/vsyscall-int80.so"
26341 +syscall32_int80_end:
26342 +
26343 +#endif
26344 +
26345 +       .globl syscall32_syscall
26346 +       .globl syscall32_syscall_end
26347 +
26348 +syscall32_syscall:
26349 +       .incbin "arch/x86_64/ia32/vsyscall-syscall.so"
26350 +syscall32_syscall_end:
26351 +
26352 +       .globl syscall32_sysenter
26353 +       .globl syscall32_sysenter_end
26354 +
26355 +syscall32_sysenter:
26356 +       .incbin "arch/x86_64/ia32/vsyscall-sysenter.so"
26357 +syscall32_sysenter_end:
26358 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/ia32/vsyscall-int80.S linux-2.6.16/arch/x86_64/ia32/vsyscall-int80.S
26359 --- linux-2.6.16.orig/arch/x86_64/ia32/vsyscall-int80.S 1970-01-01 01:00:00.000000000 +0100
26360 +++ linux-2.6.16/arch/x86_64/ia32/vsyscall-int80.S      2006-06-26 09:51:32.000000000 +0200
26361 @@ -0,0 +1,58 @@
26362 +/*
26363 + * Code for the vsyscall page.  This version uses the old int $0x80 method.
26364 + *
26365 + * NOTE:
26366 + * 1) __kernel_vsyscall _must_ be first in this page.
26367 + * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
26368 + *    for details.
26369 + */
26370 +#include <asm/ia32_unistd.h>
26371 +#include <asm/asm-offsets.h>
26372 +
26373 +       .code32
26374 +       .text
26375 +       .section .text.vsyscall,"ax"
26376 +       .globl __kernel_vsyscall
26377 +       .type __kernel_vsyscall,@function
26378 +__kernel_vsyscall:
26379 +.LSTART_vsyscall:
26380 +       int $0x80
26381 +       ret
26382 +.LEND_vsyscall:
26383 +       .size __kernel_vsyscall,.-.LSTART_vsyscall
26384 +       .previous
26385 +
26386 +       .section .eh_frame,"a",@progbits
26387 +.LSTARTFRAME:
26388 +       .long .LENDCIE-.LSTARTCIE
26389 +.LSTARTCIE:
26390 +       .long 0                 /* CIE ID */
26391 +       .byte 1                 /* Version number */
26392 +       .string "zR"            /* NUL-terminated augmentation string */
26393 +       .uleb128 1              /* Code alignment factor */
26394 +       .sleb128 -4             /* Data alignment factor */
26395 +       .byte 8                 /* Return address register column */
26396 +       .uleb128 1              /* Augmentation value length */
26397 +       .byte 0x1b              /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
26398 +       .byte 0x0c              /* DW_CFA_def_cfa */
26399 +       .uleb128 4
26400 +       .uleb128 4
26401 +       .byte 0x88              /* DW_CFA_offset, column 0x8 */
26402 +       .uleb128 1
26403 +       .align 4
26404 +.LENDCIE:
26405 +
26406 +       .long .LENDFDE1-.LSTARTFDE1     /* Length FDE */
26407 +.LSTARTFDE1:
26408 +       .long .LSTARTFDE1-.LSTARTFRAME  /* CIE pointer */
26409 +       .long .LSTART_vsyscall-.        /* PC-relative start address */
26410 +       .long .LEND_vsyscall-.LSTART_vsyscall
26411 +       .uleb128 0                      /* Augmentation length */
26412 +       .align 4
26413 +.LENDFDE1:
26414 +               
26415 +/*
26416 + * Get the common code for the sigreturn entry points.
26417 + */
26418 +#define SYSCALL_ENTER_KERNEL    int $0x80
26419 +#include "vsyscall-sigreturn.S"
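(Editor's aside, not part of the patch: the .uleb128/.sleb128 directives
in the CIE above emit DWARF variable-length integers.  A sketch of the
unsigned encoding the assembler performs:)

        /* ULEB128: 7 payload bits per byte, high bit set on every byte
         * except the last.  E.g. 1 encodes as 0x01, 300 as 0xac 0x02. */
        static unsigned int uleb128_encode(unsigned long value,
                                           unsigned char *out)
        {
                unsigned int n = 0;
                do {
                        unsigned char byte = value & 0x7f;
                        value >>= 7;
                        if (value)
                                byte |= 0x80;   /* more bytes follow */
                        out[n++] = byte;
                } while (value);
                return n;
        }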
26420 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/ia32/vsyscall-sigreturn.S linux-2.6.16/arch/x86_64/ia32/vsyscall-sigreturn.S
26421 --- linux-2.6.16.orig/arch/x86_64/ia32/vsyscall-sigreturn.S     2006-03-20 06:53:29.000000000 +0100
26422 +++ linux-2.6.16/arch/x86_64/ia32/vsyscall-sigreturn.S  2006-06-26 09:51:32.000000000 +0200
26423 @@ -120,5 +120,5 @@
26424         .align 4
26425  .LENDFDE3:
26426  
26427 -#include "../../i386/kernel/vsyscall-note.S"
26428 +#include <vsyscall-note.S>
26429  
26430 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/Makefile linux-2.6.16/arch/x86_64/kernel/Makefile
26431 --- linux-2.6.16.orig/arch/x86_64/kernel/Makefile       2006-06-26 09:49:45.000000000 +0200
26432 +++ linux-2.6.16/arch/x86_64/kernel/Makefile    2006-06-26 09:56:01.000000000 +0200
26433 @@ -20,11 +20,13 @@
26434  obj-$(CONFIG_X86_CPUID)                += cpuid.o
26435  obj-$(CONFIG_SMP)              += smp.o smpboot.o trampoline.o
26436  obj-$(CONFIG_X86_LOCAL_APIC)   += apic.o  nmi.o
26437 +obj-$(CONFIG_X86_XEN_GENAPIC)  += genapic.o genapic_xen.o
26438  obj-$(CONFIG_X86_IO_APIC)      += io_apic.o mpparse.o \
26439                 genapic.o genapic_cluster.o genapic_flat.o
26440  obj-$(CONFIG_KEXEC)            += machine_kexec.o relocate_kernel.o crash.o
26441  obj-$(CONFIG_CRASH_DUMP)       += crash_dump.o
26442 -obj-$(CONFIG_PM)               += suspend.o
26443 +obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
26444 +obj-$(CONFIG_ACPI_SLEEP)       += suspend.o
26445  obj-$(CONFIG_SUSPEND_SHARED)   += suspend_asm.o
26446  obj-$(CONFIG_CPU_FREQ)         += cpufreq/
26447  obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
26448 @@ -51,3 +53,17 @@
26449  msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../i386/kernel/msr.o
26450  dmi_scan-y                     += ../../i386/kernel/dmi_scan.o
26451  
26452 +ifdef CONFIG_XEN
26453 +time-y                         += ../../i386/kernel/time-xen.o
26454 +pci-dma-y                      += ../../i386/kernel/pci-dma-xen.o
26455 +microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
26456 +quirks-y                       := ../../i386/kernel/quirks-xen.o
26457 +
26458 +n-obj-xen := i8259.o reboot.o i8237.o smpboot.o trampoline.o
26459 +
26460 +include $(srctree)/scripts/Makefile.xen
26461 +
26462 +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen))
26463 +obj-y := $(call cherrypickxen, $(obj-y))
26464 +extra-y := $(call cherrypickxen, $(extra-y))
26465 +endif
26466 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/acpi/Makefile linux-2.6.16/arch/x86_64/kernel/acpi/Makefile
26467 --- linux-2.6.16.orig/arch/x86_64/kernel/acpi/Makefile  2006-03-20 06:53:29.000000000 +0100
26468 +++ linux-2.6.16/arch/x86_64/kernel/acpi/Makefile       2006-06-26 09:51:32.000000000 +0200
26469 @@ -6,3 +6,4 @@
26470  obj-y                  += processor.o
26471  endif
26472  
26473 +boot-$(CONFIG_XEN)             := ../../../i386/kernel/acpi/boot-xen.o
26474 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/apic-xen.c linux-2.6.16/arch/x86_64/kernel/apic-xen.c
26475 --- linux-2.6.16.orig/arch/x86_64/kernel/apic-xen.c     1970-01-01 01:00:00.000000000 +0100
26476 +++ linux-2.6.16/arch/x86_64/kernel/apic-xen.c  2006-06-26 09:51:32.000000000 +0200
26477 @@ -0,0 +1,198 @@
26478 +/*
26479 + *     Local APIC handling, local APIC timers
26480 + *
26481 + *     (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
26482 + *
26483 + *     Fixes
26484 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
26485 + *                                     thanks to Eric Gilmore
26486 + *                                     and Rolf G. Tews
26487 + *                                     for testing these extensively.
26488 + *     Maciej W. Rozycki       :       Various updates and fixes.
26489 + *     Mikael Pettersson       :       Power Management for UP-APIC.
26490 + *     Pavel Machek and
26491 + *     Mikael Pettersson       :       PM converted to driver model.
26492 + */
26493 +
26494 +#include <linux/config.h>
26495 +#include <linux/init.h>
26496 +
26497 +#include <linux/mm.h>
26498 +#include <linux/delay.h>
26499 +#include <linux/bootmem.h>
26500 +#include <linux/smp_lock.h>
26501 +#include <linux/interrupt.h>
26502 +#include <linux/mc146818rtc.h>
26503 +#include <linux/kernel_stat.h>
26504 +#include <linux/sysdev.h>
26505 +#include <linux/module.h>
26506 +
26507 +#include <asm/atomic.h>
26508 +#include <asm/smp.h>
26509 +#include <asm/mtrr.h>
26510 +#include <asm/mpspec.h>
26511 +#include <asm/desc.h>
26512 +#include <asm/arch_hooks.h>
26513 +#include <asm/hpet.h>
26514 +#include <asm/idle.h>
26515 +
26516 +int apic_verbosity;
26517 +
26518 +/*
26519 + * 'what should we do if we get a hw irq event on an illegal vector'.
26520 + * Each architecture has to answer this itself.
26521 + */
26522 +void ack_bad_irq(unsigned int irq)
26523 +{
26524 +       printk("unexpected IRQ trap at vector %02x\n", irq);
26525 +       /*
26526 +        * Currently unexpected vectors happen only on SMP and APIC.
26527 +        * We _must_ ack these because every local APIC has only N
26528 +        * irq slots per priority level, and a 'hanging, unacked' IRQ
26529 +        * holds up an irq slot - in excessive cases (when multiple
26530 +        * unexpected vectors occur) that might lock up the APIC
26531 +        * completely.
26532 +        * But don't ack when the APIC is disabled. -AK
26533 +        */
26534 +       if (!disable_apic)
26535 +               ack_APIC_irq();
26536 +}
26537 +
26538 +int setup_profiling_timer(unsigned int multiplier)
26539 +{
26540 +       return -EINVAL;
26541 +}
26542 +
26543 +void smp_local_timer_interrupt(struct pt_regs *regs)
26544 +{
26545 +       profile_tick(CPU_PROFILING, regs);
26546 +#ifndef CONFIG_XEN
26547 +#ifdef CONFIG_SMP
26548 +       update_process_times(user_mode(regs));
26549 +#endif
26550 +#endif
26551 +       /*
26552 +        * We take the 'long' return path, and there every subsystem
26553 +        * grabs the appropriate locks (kernel lock / irq lock).
26554 +        *
26555 +        * We might want to decouple profiling from the 'long path'
26556 +        * and do the profiling entirely in assembly.
26557 +        *
26558 +        * Currently this isn't too much of an issue (performance-wise);
26559 +        * we can take more than 100K local irqs per second on a 100 MHz P5.
26560 +        */
26561 +}
26562 +
26563 +/*
26564 + * Local APIC timer interrupt. This is the most natural way for doing
26565 + * local interrupts, but local timer interrupts can be emulated by
26566 + * broadcast interrupts too. [in case the hw doesn't support APIC timers]
26567 + *
26568 + * [ if a single-CPU system runs an SMP kernel then we call the local
26569 + *   interrupt as well. Thus we cannot inline the local irq ... ]
26570 + */
26571 +void smp_apic_timer_interrupt(struct pt_regs *regs)
26572 +{
26573 +       /*
26574 +        * the NMI deadlock-detector uses this.
26575 +        */
26576 +       add_pda(apic_timer_irqs, 1);
26577 +
26578 +       /*
26579 +        * NOTE! We'd better ACK the irq immediately,
26580 +        * because timer handling can be slow.
26581 +        */
26582 +       ack_APIC_irq();
26583 +       /*
26584 +        * update_process_times() expects us to have done irq_enter().
26585 +        * Besides, if we don't, timer interrupts ignore the global
26586 +        * interrupt lock, which is the WrongThing (tm) to do.
26587 +        */
26588 +       exit_idle();
26589 +       irq_enter();
26590 +       smp_local_timer_interrupt(regs);
26591 +       irq_exit();
26592 +}
26593 +
26594 +/*
26595 + * This interrupt should _never_ happen with our APIC/SMP architecture
26596 + */
26597 +asmlinkage void smp_spurious_interrupt(void)
26598 +{
26599 +       unsigned int v;
26600 +       exit_idle();
26601 +       irq_enter();
26602 +       /*
26603 +        * Check if this really is a spurious interrupt and ACK it
26604 +        * if it is a vectored one.  Just in case...
26605 +        * Spurious interrupts should not be ACKed.
26606 +        */
26607 +       v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
26608 +       if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
26609 +               ack_APIC_irq();
26610 +
26611 +#if 0
26612 +       static unsigned long last_warning; 
26613 +       static unsigned long skipped; 
26614 +
26615 +       /* see sw-dev-man vol 3, chapter 7.4.13.5 */
26616 +       if (time_before(last_warning+30*HZ,jiffies)) { 
26617 +               printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
26618 +                      smp_processor_id(), skipped);
26619 +               last_warning = jiffies; 
26620 +               skipped = 0;
26621 +       } else { 
26622 +               skipped++; 
26623 +       } 
26624 +#endif 
26625 +       irq_exit();
26626 +}
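(Editor's aside, not part of the patch: the APIC_ISR arithmetic above
indexes a 256-bit in-service bitmap spread across registers spaced 16
bytes apart, 32 vectors per register.  A sketch of the address math:)

        /* For vector v the bit lives at APIC_ISR + (v / 32) * 0x10,
         * bit (v % 32); the kernel writes the offset as
         * (v & ~0x1f) >> 1, which is the same value.
         * E.g. v = 0xff: register APIC_ISR + 0x70, bit 31. */
        static unsigned int isr_reg_offset(unsigned int vector)
        {
                return (vector & ~0x1fU) >> 1;  /* 16-byte register stride */
        }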
26627 +
26628 +/*
26629 + * This interrupt should never happen with our APIC/SMP architecture
26630 + */
26631 +
26632 +asmlinkage void smp_error_interrupt(void)
26633 +{
26634 +       unsigned int v, v1;
26635 +
26636 +       exit_idle();
26637 +       irq_enter();
26638 +       /* First tickle the hardware, only then report what went on. -- REW */
26639 +       v = apic_read(APIC_ESR);
26640 +       apic_write(APIC_ESR, 0);
26641 +       v1 = apic_read(APIC_ESR);
26642 +       ack_APIC_irq();
26643 +       atomic_inc(&irq_err_count);
26644 +
26645 +       /* Here is what the APIC error bits mean:
26646 +          0: Send CS error
26647 +          1: Receive CS error
26648 +          2: Send accept error
26649 +          3: Receive accept error
26650 +          4: Reserved
26651 +          5: Send illegal vector
26652 +          6: Received illegal vector
26653 +          7: Illegal register address
26654 +       */
26655 +       printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
26656 +               smp_processor_id(), v, v1);
26657 +       irq_exit();
26658 +}
26659 +
26660 +int disable_apic;
26661 +
26662 +/*
26663 + * This initializes the IO-APIC and APIC hardware if this is
26664 + * a UP kernel.
26665 + */
26666 +int __init APIC_init_uniprocessor (void)
26667 +{
26668 +#ifdef CONFIG_X86_IO_APIC
26669 +       if (smp_found_config)
26670 +               if (!skip_ioapic_setup && nr_ioapics)
26671 +                       setup_IO_APIC();
26672 +#endif
26673 +
26674 +       return 0;
26675 +}
26676 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/asm-offsets.c linux-2.6.16/arch/x86_64/kernel/asm-offsets.c
26677 --- linux-2.6.16.orig/arch/x86_64/kernel/asm-offsets.c  2006-03-20 06:53:29.000000000 +0100
26678 +++ linux-2.6.16/arch/x86_64/kernel/asm-offsets.c       2006-06-26 09:51:32.000000000 +0200
26679 @@ -66,7 +66,9 @@
26680         DEFINE(pbe_address, offsetof(struct pbe, address));
26681         DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
26682         DEFINE(pbe_next, offsetof(struct pbe, next));
26683 +#ifndef CONFIG_X86_NO_TSS
26684         BLANK();
26685         DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
26686 +#endif
26687         return 0;
26688  }
26689 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/e820-xen.c linux-2.6.16/arch/x86_64/kernel/e820-xen.c
26690 --- linux-2.6.16.orig/arch/x86_64/kernel/e820-xen.c     1970-01-01 01:00:00.000000000 +0100
26691 +++ linux-2.6.16/arch/x86_64/kernel/e820-xen.c  2006-06-26 09:51:32.000000000 +0200
26692 @@ -0,0 +1,744 @@
26693 +/* 
26694 + * Handle the memory map.
26695 + * The functions here do the job until bootmem takes over.
26696 + * $Id$
26697 + *
26698 + *  Getting sanitize_e820_map() in sync with i386 version by applying change:
26699 + *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
26700 + *     Alex Achenbach <xela@slit.de>, December 2002.
26701 + *  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
26702 + *
26703 + */
26704 +#include <linux/config.h>
26705 +#include <linux/kernel.h>
26706 +#include <linux/types.h>
26707 +#include <linux/init.h>
26708 +#include <linux/bootmem.h>
26709 +#include <linux/ioport.h>
26710 +#include <linux/string.h>
26711 +#include <linux/kexec.h>
26712 +#include <linux/module.h>
26713 +
26714 +#include <asm/page.h>
26715 +#include <asm/e820.h>
26716 +#include <asm/proto.h>
26717 +#include <asm/bootsetup.h>
26718 +#include <asm/sections.h>
26719 +#include <xen/interface/memory.h>
26720 +
26721 +unsigned long pci_mem_start = 0xaeedbabe;
26722 +
26723 +/* 
26724 + * PFN of last memory page.
26725 + */
26726 +unsigned long end_pfn; 
26727 +EXPORT_SYMBOL(end_pfn);
26728 +unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;  
26729 +unsigned long end_pfn_map; 
26730 +
26731 +/* 
26732 + * Add a memory region to the kernel e820 map.
26733 + */ 
26734 +void __init add_memory_region(unsigned long start, unsigned long size, int type)
26735 +{
26736 +       int x = e820.nr_map;
26737 +
26738 +       if (x == E820MAX) {
26739 +               printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
26740 +               return;
26741 +       }
26742 +
26743 +       e820.map[x].addr = start;
26744 +       e820.map[x].size = size;
26745 +       e820.map[x].type = type;
26746 +       e820.nr_map++;
26747 +}
26748 +
26749 +#ifndef CONFIG_XEN
26750 +
26751 +/* 
26752 + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
26753 + * The direct mapping extends to end_pfn_map, so that we can directly access
26754 + * apertures, ACPI and other tables without having to play with fixmaps.
26755 + */ 
26756 +
26757 +/* 
26758 + * Last pfn which the user wants to use.
26759 + */
26760 +
26761 +extern struct resource code_resource, data_resource;
26762 +
26763 +/* Check for some hardcoded bad areas that early boot is not allowed to touch */ 
26764 +static inline int bad_addr(unsigned long *addrp, unsigned long size)
26765 +{ 
26766 +       unsigned long addr = *addrp, last = addr + size; 
26767 +
26768 +       /* various gunk below that is needed for SMP startup */
26769 +       if (addr < 0x8000) { 
26770 +               *addrp = 0x8000;
26771 +               return 1; 
26772 +       }
26773 +
26774 +       /* direct mapping tables of the kernel */
26775 +       if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { 
26776 +               *addrp = table_end << PAGE_SHIFT; 
26777 +               return 1;
26778 +       } 
26779 +
26780 +       /* initrd */ 
26781 +#ifdef CONFIG_BLK_DEV_INITRD
26782 +       if (LOADER_TYPE && INITRD_START && last >= INITRD_START && 
26783 +           addr < INITRD_START+INITRD_SIZE) { 
26784 +               *addrp = INITRD_START + INITRD_SIZE; 
26785 +               return 1;
26786 +       } 
26787 +#endif
26788 +       /* kernel code + 640k memory hole (later should not be needed, but 
26789 +          be paranoid for now) */
26790 +       if (last >= 640*1024 && addr < __pa_symbol(&_end)) { 
26791 +               *addrp = __pa_symbol(&_end);
26792 +               return 1;
26793 +       }
26794 +       /* XXX ramdisk image here? */ 
26795 +       return 0;
26796 +} 
26797 +
26798 +int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) 
26799 +{ 
26800 +       int i;
26801 +       for (i = 0; i < e820.nr_map; i++) { 
26802 +               struct e820entry *ei = &e820.map[i]; 
26803 +               if (type && ei->type != type) 
26804 +                       continue;
26805 +               if (ei->addr >= end || ei->addr + ei->size <= start)
26806 +                       continue; 
26807 +               return 1; 
26808 +       } 
26809 +       return 0;
26810 +}
26811 +
26812 +/* 
26813 + * Find a free area in a specific range. 
26814 + */ 
26815 +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) 
26816 +{ 
26817 +       int i; 
26818 +       for (i = 0; i < e820.nr_map; i++) { 
26819 +               struct e820entry *ei = &e820.map[i]; 
26820 +               unsigned long addr = ei->addr, last; 
26821 +               if (ei->type != E820_RAM) 
26822 +                       continue; 
26823 +               if (addr < start) 
26824 +                       addr = start;
26825 +               if (addr > ei->addr + ei->size) 
26826 +                       continue; 
26827 +               while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size)
26828 +                       ;
26829 +               last = addr + size;
26830 +               if (last > ei->addr + ei->size)
26831 +                       continue;
26832 +               if (last > end) 
26833 +                       continue;
26834 +               return addr; 
26835 +       } 
26836 +       return -1UL;            
26837 +} 
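(Editor's sketch, not part of the patch: the intended use of
find_e820_area() together with bad_addr().  The window, size and caller
name are illustrative.)

        /* Hypothetical early-boot caller: carve a 64 KiB table out of
         * RAM in the first 16 MiB, skipping the areas bad_addr() vetoes.
         * find_e820_area() returns -1UL when nothing fits. */
        static void __init place_boot_table(void)
        {
                unsigned long addr =
                        find_e820_area(0x100000, 16UL << 20, 0x10000);
                if (addr == -1UL)
                        panic("no usable RAM window for boot table");
                /* addr is not necessarily page-aligned; align as needed. */
        }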
26838 +
26839 +/* 
26840 + * Free bootmem based on the e820 table for a node.
26841 + */
26842 +void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
26843 +{
26844 +       int i;
26845 +       for (i = 0; i < e820.nr_map; i++) {
26846 +               struct e820entry *ei = &e820.map[i]; 
26847 +               unsigned long last, addr;
26848 +
26849 +               if (ei->type != E820_RAM || 
26850 +                   ei->addr+ei->size <= start || 
26851 +                   ei->addr >= end)
26852 +                       continue;
26853 +
26854 +               addr = round_up(ei->addr, PAGE_SIZE);
26855 +               if (addr < start) 
26856 +                       addr = start;
26857 +
26858 +               last = round_down(ei->addr + ei->size, PAGE_SIZE); 
26859 +               if (last >= end)
26860 +                       last = end; 
26861 +
26862 +               if (last > addr && last-addr >= PAGE_SIZE)
26863 +                       free_bootmem_node(pgdat, addr, last-addr);
26864 +       }
26865 +}
26866 +
26867 +/*
26868 + * Find the highest page frame number we have available
26869 + */
26870 +unsigned long __init e820_end_of_ram(void)
26871 +{
26872 +       int i;
26873 +       unsigned long end_pfn = 0;
26874 +       
26875 +       for (i = 0; i < e820.nr_map; i++) {
26876 +               struct e820entry *ei = &e820.map[i]; 
26877 +               unsigned long start, end;
26878 +
26879 +               start = round_up(ei->addr, PAGE_SIZE); 
26880 +               end = round_down(ei->addr + ei->size, PAGE_SIZE); 
26881 +               if (start >= end)
26882 +                       continue;
26883 +               if (ei->type == E820_RAM) {
26884 +                       if (end > end_pfn<<PAGE_SHIFT)
26885 +                               end_pfn = end>>PAGE_SHIFT;
26886 +               } else { 
26887 +                       if (end > end_pfn_map<<PAGE_SHIFT) 
26888 +                               end_pfn_map = end>>PAGE_SHIFT;
26889 +               } 
26890 +       }
26891 +
26892 +       if (end_pfn > end_pfn_map) 
26893 +               end_pfn_map = end_pfn;
26894 +       if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
26895 +               end_pfn_map = MAXMEM>>PAGE_SHIFT;
26896 +       if (end_pfn > end_user_pfn)
26897 +               end_pfn = end_user_pfn;
26898 +       if (end_pfn > end_pfn_map) 
26899 +               end_pfn = end_pfn_map; 
26900 +
26901 +       return end_pfn; 
26902 +}
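(Editor's aside, not part of the patch: the clamping at the tail of
e820_end_of_ram(), worked through.  Suppose 2 GiB of RAM with a reserved
ACPI region ending at 2 GiB + 1 MiB and no mem= override: the RAM scan
gives end_pfn = 0x80000, the full map gives end_pfn_map = 0x80100;
end_pfn_map is already >= end_pfn and below MAXMEM, and end_pfn is below
both end_user_pfn and end_pfn_map, so the function returns 0x80000 while
end_pfn_map keeps covering the ACPI table for the direct mapping.)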
26903 +
26904 +/* 
26905 + * Compute how much memory is missing in a range.
26906 + * Unlike the other functions in this file, the arguments are in page numbers.
26907 + */
26908 +unsigned long __init
26909 +e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
26910 +{
26911 +       unsigned long ram = 0;
26912 +       unsigned long start = start_pfn << PAGE_SHIFT;
26913 +       unsigned long end = end_pfn << PAGE_SHIFT;
26914 +       int i;
26915 +       for (i = 0; i < e820.nr_map; i++) {
26916 +               struct e820entry *ei = &e820.map[i];
26917 +               unsigned long last, addr;
26918 +
26919 +               if (ei->type != E820_RAM ||
26920 +                   ei->addr+ei->size <= start ||
26921 +                   ei->addr >= end)
26922 +                       continue;
26923 +
26924 +               addr = round_up(ei->addr, PAGE_SIZE);
26925 +               if (addr < start)
26926 +                       addr = start;
26927 +
26928 +               last = round_down(ei->addr + ei->size, PAGE_SIZE);
26929 +               if (last >= end)
26930 +                       last = end;
26931 +
26932 +               if (last > addr)
26933 +                       ram += last - addr;
26934 +       }
26935 +       return ((end - start) - ram) >> PAGE_SHIFT;
26936 +}
26937 +
26938 +/*
26939 + * Mark e820 reserved areas as busy for the resource manager.
26940 + */
26941 +void __init e820_reserve_resources(void)
26942 +{
26943 +       int i;
26944 +       for (i = 0; i < e820.nr_map; i++) {
26945 +               struct resource *res;
26946 +               res = alloc_bootmem_low(sizeof(struct resource));
26947 +               switch (e820.map[i].type) {
26948 +               case E820_RAM:  res->name = "System RAM"; break;
26949 +               case E820_ACPI: res->name = "ACPI Tables"; break;
26950 +               case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
26951 +               default:        res->name = "reserved";
26952 +               }
26953 +               res->start = e820.map[i].addr;
26954 +               res->end = res->start + e820.map[i].size - 1;
26955 +               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
26956 +               request_resource(&iomem_resource, res);
26957 +               if (e820.map[i].type == E820_RAM) {
26958 +                       /*
26959 +                        *  We don't know which RAM region contains kernel data,
26960 +                        *  so we try it repeatedly and let the resource manager
26961 +                        *  test it.
26962 +                        */
26963 +                       request_resource(res, &code_resource);
26964 +                       request_resource(res, &data_resource);
26965 +#ifdef CONFIG_KEXEC
26966 +                       request_resource(res, &crashk_res);
26967 +#endif
26968 +               }
26969 +       }
26970 +}
26971 +#endif /* !CONFIG_XEN */
26972 +
26973 +void __init e820_print_map(char *who)
26974 +{
26975 +       int i;
26976 +
26977 +       for (i = 0; i < e820.nr_map; i++) {
26978 +               printk(" %s: %016Lx - %016Lx ", who,
26979 +                       (unsigned long long) e820.map[i].addr,
26980 +                       (unsigned long long) (e820.map[i].addr + e820.map[i].size));
26981 +               switch (e820.map[i].type) {
26982 +               case E820_RAM:  printk("(usable)\n");
26983 +                               break;
26984 +               case E820_RESERVED:
26985 +                               printk("(reserved)\n");
26986 +                               break;
26987 +               case E820_ACPI:
26988 +                               printk("(ACPI data)\n");
26989 +                               break;
26990 +               case E820_NVS:
26991 +                               printk("(ACPI NVS)\n");
26992 +                               break;
26993 +               default:        printk("type %u\n", e820.map[i].type);
26994 +                               break;
26995 +               }
26996 +       }
26997 +}
26998 +
26999 +#ifndef CONFIG_XEN
27000 +/*
27001 + * Sanitize the BIOS e820 map.
27002 + *
27003 + * Some e820 responses include overlapping entries.  The following 
27004 + * replaces the original e820 map with a new one, removing overlaps.
27005 + *
27006 + */
27007 +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
27008 +{
27009 +       struct change_member {
27010 +               struct e820entry *pbios; /* pointer to original bios entry */
27011 +               unsigned long long addr; /* address for this change point */
27012 +       };
27013 +       static struct change_member change_point_list[2*E820MAX] __initdata;
27014 +       static struct change_member *change_point[2*E820MAX] __initdata;
27015 +       static struct e820entry *overlap_list[E820MAX] __initdata;
27016 +       static struct e820entry new_bios[E820MAX] __initdata;
27017 +       struct change_member *change_tmp;
27018 +       unsigned long current_type, last_type;
27019 +       unsigned long long last_addr;
27020 +       int chgidx, still_changing;
27021 +       int overlap_entries;
27022 +       int new_bios_entry;
27023 +       int old_nr, new_nr, chg_nr;
27024 +       int i;
27025 +
27026 +       /*
27027 +               Visually we're performing the following (1,2,3,4 = memory types)...
27028 +
27029 +               Sample memory map (w/overlaps):
27030 +                  ____22__________________
27031 +                  ______________________4_
27032 +                  ____1111________________
27033 +                  _44_____________________
27034 +                  11111111________________
27035 +                  ____________________33__
27036 +                  ___________44___________
27037 +                  __________33333_________
27038 +                  ______________22________
27039 +                  ___________________2222_
27040 +                  _________111111111______
27041 +                  _____________________11_
27042 +                  _________________4______
27043 +
27044 +               Sanitized equivalent (no overlap):
27045 +                  1_______________________
27046 +                  _44_____________________
27047 +                  ___1____________________
27048 +                  ____22__________________
27049 +                  ______11________________
27050 +                  _________1______________
27051 +                  __________3_____________
27052 +                  ___________44___________
27053 +                  _____________33_________
27054 +                  _______________2________
27055 +                  ________________1_______
27056 +                  _________________4______
27057 +                  ___________________2____
27058 +                  ____________________33__
27059 +                  ______________________4_
27060 +       */
27061 +
27062 +       /* if there's only one memory region, don't bother */
27063 +       if (*pnr_map < 2)
27064 +               return -1;
27065 +
27066 +       old_nr = *pnr_map;
27067 +
27068 +       /* bail out if we find any unreasonable addresses in bios map */
27069 +       for (i=0; i<old_nr; i++)
27070 +               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
27071 +                       return -1;
27072 +
27073 +       /* create pointers for initial change-point information (for sorting) */
27074 +       for (i=0; i < 2*old_nr; i++)
27075 +               change_point[i] = &change_point_list[i];
27076 +
27077 +       /* record all known change-points (starting and ending addresses),
27078 +          omitting those that are for empty memory regions */
27079 +       chgidx = 0;
27080 +       for (i=0; i < old_nr; i++)      {
27081 +               if (biosmap[i].size != 0) {
27082 +                       change_point[chgidx]->addr = biosmap[i].addr;
27083 +                       change_point[chgidx++]->pbios = &biosmap[i];
27084 +                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
27085 +                       change_point[chgidx++]->pbios = &biosmap[i];
27086 +               }
27087 +       }
27088 +       chg_nr = chgidx;
27089 +
27090 +       /* sort change-point list by memory addresses (low -> high) */
27091 +       still_changing = 1;
27092 +       while (still_changing)  {
27093 +               still_changing = 0;
27094 +               for (i=1; i < chg_nr; i++)  {
27095 +                       /* if <current_addr> > <last_addr>, swap */
27096 +                       /* or, if current=<start_addr> & last=<end_addr>, swap */
27097 +                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
27098 +                               ((change_point[i]->addr == change_point[i-1]->addr) &&
27099 +                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
27100 +                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
27101 +                          )
27102 +                       {
27103 +                               change_tmp = change_point[i];
27104 +                               change_point[i] = change_point[i-1];
27105 +                               change_point[i-1] = change_tmp;
27106 +                               still_changing=1;
27107 +                       }
27108 +               }
27109 +       }
27110 +
27111 +       /* create a new bios memory map, removing overlaps */
27112 +       overlap_entries=0;       /* number of entries in the overlap table */
27113 +       new_bios_entry=0;        /* index for creating new bios map entries */
27114 +       last_type = 0;           /* start with undefined memory type */
27115 +       last_addr = 0;           /* start with 0 as last starting address */
27116 +       /* loop through change-points, determining effect on the new bios map */
27117 +       for (chgidx=0; chgidx < chg_nr; chgidx++)
27118 +       {
27119 +               /* keep track of all overlapping bios entries */
27120 +               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
27121 +               {
27122 +                       /* add map entry to overlap list (> 1 entry implies an overlap) */
27123 +                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
27124 +               }
27125 +               else
27126 +               {
27127 +                       /* remove entry from list (order independent, so swap with last) */
27128 +                       for (i=0; i<overlap_entries; i++)
27129 +                       {
27130 +                               if (overlap_list[i] == change_point[chgidx]->pbios)
27131 +                                       overlap_list[i] = overlap_list[overlap_entries-1];
27132 +                       }
27133 +                       overlap_entries--;
27134 +               }
27135 +               /* if there are overlapping entries, decide which "type" to use */
27136 +               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
27137 +               current_type = 0;
27138 +               for (i=0; i<overlap_entries; i++)
27139 +                       if (overlap_list[i]->type > current_type)
27140 +                               current_type = overlap_list[i]->type;
27141 +               /* continue building up new bios map based on this information */
27142 +               if (current_type != last_type)  {
27143 +                       if (last_type != 0)      {
27144 +                               new_bios[new_bios_entry].size =
27145 +                                       change_point[chgidx]->addr - last_addr;
27146 +                               /* move forward only if the new size was non-zero */
27147 +                               if (new_bios[new_bios_entry].size != 0)
27148 +                                       if (++new_bios_entry >= E820MAX)
27149 +                                               break;  /* no more space left for new bios entries */
27150 +                       }
27151 +                       if (current_type != 0)  {
27152 +                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
27153 +                               new_bios[new_bios_entry].type = current_type;
27154 +                               last_addr=change_point[chgidx]->addr;
27155 +                       }
27156 +                       last_type = current_type;
27157 +               }
27158 +       }
27159 +       new_nr = new_bios_entry;   /* retain count for new bios entries */
27160 +
27161 +       /* copy new bios mapping into original location */
27162 +       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
27163 +       *pnr_map = new_nr;
27164 +
27165 +       return 0;
27166 +}
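(Editor's aside, not part of the patch: a minimal worked example of the
change-point pass above.  Input, overlapping: [0, 0xA0000) type 1 and
[0x90000, 0x100000) type 2.  The sorted change points are 0 (start of
the type-1 entry), 0x90000 (start of type 2), 0xA0000 (end of type 1)
and 0x100000 (end of type 2).  Walking them, the highest type among the
currently-overlapping entries wins, so the output is the non-overlapping
pair [0, 0x90000) type 1 and [0x90000, 0x100000) type 2: the usable
region is trimmed, the reserved one kept whole.)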
27167 +
27168 +/*
27169 + * Copy the BIOS e820 map into a safe place.
27170 + *
27171 + * Sanity-check it while we're at it.
27172 + *
27173 + * If we're lucky and live on a modern system, the setup code
27174 + * will have given us a memory map that we can use to properly
27175 + * set up memory.  If we aren't, we'll fake a memory map.
27176 + *
27177 + * We check to see that the memory map contains at least 2 elements
27178 + * before we'll use it, because the detection code in setup.S may
27179 + * not be perfect and almost every PC known to man has two memory
27180 + * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
27181 + * thinkpad 560x, for example, does not cooperate with the memory
27182 + * detection code.)
27183 + */
27184 +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
27185 +{
27186 +       /* Only one memory region (or negative)? Ignore it */
27187 +       if (nr_map < 2)
27188 +               return -1;
27189 +
27190 +       do {
27191 +               unsigned long start = biosmap->addr;
27192 +               unsigned long size = biosmap->size;
27193 +               unsigned long end = start + size;
27194 +               unsigned long type = biosmap->type;
27195 +
27196 +               /* Overflow in 64 bits? Ignore the memory map. */
27197 +               if (start > end)
27198 +                       return -1;
27199 +
27200 +               /*
27201 +                * Some BIOSes claim RAM in the 640k - 1M region.
27202 +                * Not right. Fix it up.
27203 +                * 
27204 +                * This should be removed on Hammer, which is not supposed to
27205 +                * have non-e820-covered ISA mappings there, but I had some strange
27206 +                * problems, so it stays for now.  -AK
27207 +                */
27208 +               if (type == E820_RAM) {
27209 +                       if (start < 0x100000ULL && end > 0xA0000ULL) {
27210 +                               if (start < 0xA0000ULL)
27211 +                                       add_memory_region(start, 0xA0000ULL-start, type);
27212 +                               if (end <= 0x100000ULL)
27213 +                                       continue;
27214 +                               start = 0x100000ULL;
27215 +                               size = end - start;
27216 +                       }
27217 +               }
27218 +
27219 +               add_memory_region(start, size, type);
27220 +       } while (biosmap++,--nr_map);
27221 +       return 0;
27222 +}
27223 +
27224 +void __init setup_memory_region(void)
27225 +{
27226 +       char *who = "BIOS-e820";
27227 +
27228 +       /*
27229 +        * Try to copy the BIOS-supplied E820-map.
27230 +        *
27231 +        * Otherwise fake a memory map; one section from 0k->640k,
27232 +        * the next section from 1mb->appropriate_mem_k
27233 +        */
27234 +       sanitize_e820_map(E820_MAP, &E820_MAP_NR);
27235 +       if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
27236 +               unsigned long mem_size;
27237 +
27238 +               /* compare results from other methods and take the greater */
27239 +               if (ALT_MEM_K < EXT_MEM_K) {
27240 +                       mem_size = EXT_MEM_K;
27241 +                       who = "BIOS-88";
27242 +               } else {
27243 +                       mem_size = ALT_MEM_K;
27244 +                       who = "BIOS-e801";
27245 +               }
27246 +
27247 +               e820.nr_map = 0;
27248 +               add_memory_region(0, LOWMEMSIZE(), E820_RAM);
27249 +               add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
27250 +       }
27251 +       printk(KERN_INFO "BIOS-provided physical RAM map:\n");
27252 +       e820_print_map(who);
27253 +}
27254 +
27255 +#else  /* CONFIG_XEN */
27256 +
27257 +extern unsigned long xen_override_max_pfn;
27258 +extern union xen_start_info_union xen_start_info_union;
27259 +
27260 +unsigned long __init e820_end_of_ram(void)
27261 +{
27262 +       unsigned long max_end_pfn;
27263 +
27264 +       if (xen_override_max_pfn == 0) {
27265 +               max_end_pfn = xen_start_info->nr_pages;
27266 +               /* Default 8MB slack (to balance backend allocations). */
27267 +               max_end_pfn += 8 << (20 - PAGE_SHIFT);
27268 +       } else if (xen_override_max_pfn > xen_start_info->nr_pages) {
27269 +               max_end_pfn = xen_override_max_pfn;
27270 +       } else {
27271 +               max_end_pfn = xen_start_info->nr_pages;
27272 +       }
27273 +
27274 +       return max_end_pfn;
27275 +}
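
A note on the slack arithmetic above: "8 << (20 - PAGE_SHIFT)" is simply 8MB expressed in pages. A quick sketch, assuming the usual x86 PAGE_SHIFT of 12 (4KB pages):

    #include <stdio.h>
    #define PAGE_SHIFT 12

    int main(void)
    {
            unsigned long slack = 8 << (20 - PAGE_SHIFT);   /* 8 << 8 */
            /* 2048 pages * 4KB = 8MB of headroom for backend allocations */
            printf("%lu pages = %lu MB\n", slack, slack >> (20 - PAGE_SHIFT));
            return 0;
    }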
27276 +
27277 +unsigned long __init
27278 +e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
27279 +{
27280 +       return 0;
27281 +}
27282 +
27283 +void __init e820_reserve_resources(void) 
27284 +{
27285 +       dom0_op_t op;
27286 +       struct dom0_memory_map_entry *map;
27287 +       unsigned long gapstart, gapsize, round, last;
27288 +       int i, found = 0;
27289 +
27290 +       if (!(xen_start_info->flags & SIF_INITDOMAIN))
27291 +               return;
27292 +
27293 +       map = alloc_bootmem_low_pages(PAGE_SIZE);
27294 +       op.cmd = DOM0_PHYSICAL_MEMORY_MAP;
27295 +       op.u.physical_memory_map.memory_map = map;
27296 +       op.u.physical_memory_map.max_map_entries =
27297 +               PAGE_SIZE / sizeof(struct dom0_memory_map_entry);
27298 +       BUG_ON(HYPERVISOR_dom0_op(&op));
27299 +
27300 +       last = 0x100000000ULL;
27301 +       gapstart = 0x10000000;
27302 +       gapsize = 0x400000;
27303 +
27304 +       for (i = op.u.physical_memory_map.nr_map_entries - 1; i >= 0; i--) {
27305 +               struct resource *res;
27306 +
27307 +               if ((last > map[i].end) && ((last - map[i].end) > gapsize)) {
27308 +                       gapsize = last - map[i].end;
27309 +                       gapstart = map[i].end;
27310 +                       found = 1;
27311 +               }
27312 +               if (map[i].start < last)
27313 +                       last = map[i].start;
27314 +
27315 +               if (map[i].end > 0x100000000ULL)
27316 +                       continue;
27317 +               res = alloc_bootmem_low(sizeof(struct resource));
27318 +               res->name = map[i].is_ram ? "System RAM" : "reserved";
27319 +               res->start = map[i].start;
27320 +               res->end = map[i].end - 1;
27321 +               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
27322 +               request_resource(&iomem_resource, res);
27323 +       }
27324 +
27325 +       free_bootmem(__pa(map), PAGE_SIZE);
27326 +
27327 +       if (!found) {
27328 +               gapstart = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
27329 +               gapstart = (gapstart << PAGE_SHIFT) + 1024*1024;
27330 +               printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
27331 +                      KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
27332 +       }
27333 +
27334 +       /*
27335 +        * See how much we want to round up: start off with
27336 +        * rounding to the next 1MB area.
27337 +        */
27338 +       round = 0x100000;
27339 +       while ((gapsize >> 4) > round)
27340 +               round += round;
27341 +       /* Fun with two's complement */
27342 +       pci_mem_start = (gapstart + round) & -round;
27343 +
27344 +       printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
27345 +               pci_mem_start, gapstart, gapsize);
27346 +}
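
The "Fun with two's complement" step works because "round" is kept a power of two (the doubling loop scales it to roughly 1/16th of the gap), so -round is a mask with the low bits clear and "(gapstart + round) & -round" rounds the start up past at least one full "round" boundary. A standalone sketch with made-up values:

    #include <stdio.h>

    int main(void)
    {
            unsigned long gapstart = 0xfec00001UL;  /* illustrative */
            unsigned long round = 0x100000UL;       /* 1MB, a power of two */
            unsigned long start = (gapstart + round) & -round;
            printf("%#lx -> %#lx\n", gapstart, start);  /* -> 0xfed00000 */
            return 0;
    }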
27347 +
27348 +#endif
27349 +
27350 +void __init parse_memopt(char *p, char **from) 
27351 +{ 
27352 +       end_user_pfn = memparse(p, from);
27353 +       end_user_pfn >>= PAGE_SHIFT;    
27354 +       xen_override_max_pfn = (unsigned long) end_user_pfn;
27355 +} 
27356 +
27357 +void __init parse_memmapopt(char *p, char **from)
27358 +{
27359 +       unsigned long long start_at, mem_size;
27360 +
27361 +       mem_size = memparse(p, from);
27362 +       p = *from;
27363 +       if (*p == '@') {
27364 +               start_at = memparse(p+1, from);
27365 +               add_memory_region(start_at, mem_size, E820_RAM);
27366 +       } else if (*p == '#') {
27367 +               start_at = memparse(p+1, from);
27368 +               add_memory_region(start_at, mem_size, E820_ACPI);
27369 +       } else if (*p == '$') {
27370 +               start_at = memparse(p+1, from);
27371 +               add_memory_region(start_at, mem_size, E820_RESERVED);
27372 +       } else {
27373 +               end_user_pfn = (mem_size >> PAGE_SHIFT);
27374 +       }
27375 +       p = *from;
27376 +}
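
For reference, the four branches above correspond to the following memmap= command-line forms (sizes and addresses are illustrative):

    memmap=64M@16M        ->  add_memory_region(16M, 64M, E820_RAM)
    memmap=4M#0xe0000000  ->  add_memory_region(0xe0000000, 4M, E820_ACPI)
    memmap=1M$0xfee00000  ->  add_memory_region(0xfee00000, 1M, E820_RESERVED)
    memmap=512M           ->  end_user_pfn = 512M >> PAGE_SHIFT (a cap only)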
27377 +
27378 +/*
27379 + * Search for the biggest gap in the low 32 bits of the e820
27380 + * memory space.  We pass this space to PCI, which assigns MMIO
27381 + * resources in it for hotplug or unconfigured devices.
27382 + * Hopefully the BIOS left enough space for this.
27383 + */
27384 +__init void e820_setup_gap(void)
27385 +{
27386 +#ifndef CONFIG_XEN
27387 +       unsigned long gapstart, gapsize;
27388 +       unsigned long last;
27389 +       int i;
27390 +       int found = 0;
27391 +
27392 +       last = 0x100000000ull;
27393 +       gapstart = 0x10000000;
27394 +       gapsize = 0x400000;
27395 +       i = e820.nr_map;
27396 +       while (--i >= 0) {
27397 +               unsigned long long start = e820.map[i].addr;
27398 +               unsigned long long end = start + e820.map[i].size;
27399 +
27400 +               /*
27401 +                * Since "last" is at most 4GB, we know we'll
27402 +                * fit in 32 bits if this condition is true
27403 +                */
27404 +               if (last > end) {
27405 +                       unsigned long gap = last - end;
27406 +
27407 +                       if (gap > gapsize) {
27408 +                               gapsize = gap;
27409 +                               gapstart = end;
27410 +                               found = 1;
27411 +                       }
27412 +               }
27413 +               if (start < last)
27414 +                       last = start;
27415 +       }
27416 +
27417 +       if (!found) {
27418 +               gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
27419 +               printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
27420 +                      KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
27421 +       }
27422 +
27423 +       /*
27424 +        * Start allocating dynamic PCI memory a bit into the gap,
27425 +        * aligned up to the nearest megabyte.
27426 +        *
27427 +        * Question: should we try to pad it up a bit (do something
27428 +        * like " + (gapsize >> 3)" in there too?). We now have the
27429 +        * technology.
27430 +        */
27431 +       pci_mem_start = (gapstart + 0xfffff) & ~0xfffff;
27432 +
27433 +       printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
27434 +               pci_mem_start, gapstart, gapsize);
27435 +#endif
27436 +}
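
The scan above relies on the map already being sanitized (sorted and non-overlapping): walking entries from highest to lowest and comparing each entry's end with the lowest start seen so far ("last") visits every hole below 4GB exactly once. A self-contained sketch with a toy two-entry map:

    #include <stdio.h>

    struct ent { unsigned long long addr, size; };

    int main(void)
    {
            struct ent map[2] = {                /* ascending, no overlap */
                    { 0x0,      0xA0000 },
                    { 0x100000, 0x3ff00000ULL }, /* ends at 1GB */
            };
            unsigned long long last = 0x100000000ULL;
            unsigned long long gapstart = 0, gapsize = 0;
            int i = 2;

            while (--i >= 0) {
                    unsigned long long end = map[i].addr + map[i].size;
                    if (last > end && last - end > gapsize) {
                            gapsize = last - end;
                            gapstart = end;
                    }
                    if (map[i].addr < last)
                            last = map[i].addr;
            }
            /* the 3GB hole from 1GB up to 4GB wins */
            printf("gap at %#llx, size %#llx\n", gapstart, gapsize);
            return 0;
    }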
27437 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/early_printk-xen.c linux-2.6.16/arch/x86_64/kernel/early_printk-xen.c
27438 --- linux-2.6.16.orig/arch/x86_64/kernel/early_printk-xen.c     1970-01-01 01:00:00.000000000 +0100
27439 +++ linux-2.6.16/arch/x86_64/kernel/early_printk-xen.c  2006-06-26 09:51:32.000000000 +0200
27440 @@ -0,0 +1,306 @@
27441 +#include <linux/config.h>
27442 +#include <linux/console.h>
27443 +#include <linux/kernel.h>
27444 +#include <linux/init.h>
27445 +#include <linux/string.h>
27446 +#include <linux/tty.h>
27447 +#include <asm/io.h>
27448 +#include <asm/processor.h>
27449 +#include <asm/fcntl.h>
27450 +
27451 +/* Simple VGA output */
27452 +
27453 +#ifdef __i386__
27454 +#include <asm/setup.h>
27455 +#define VGABASE                (__ISA_IO_base + 0xb8000)
27456 +#else
27457 +#include <asm/bootsetup.h>
27458 +#define VGABASE                ((void __iomem *)0xffffffff800b8000UL)
27459 +#endif
27460 +
27461 +#define MAX_YPOS       max_ypos
27462 +#define MAX_XPOS       max_xpos
27463 +
27464 +static int max_ypos = 25, max_xpos = 80;
27465 +
27466 +#ifndef CONFIG_XEN
27467 +static int current_ypos = 1, current_xpos = 0; 
27468 +
27469 +static void early_vga_write(struct console *con, const char *str, unsigned n)
27470 +{
27471 +       char c;
27472 +       int  i, k, j;
27473 +
27474 +       while ((c = *str++) != '\0' && n-- > 0) {
27475 +               if (current_ypos >= MAX_YPOS) {
27476 +                       /* scroll 1 line up */
27477 +                       for (k = 1, j = 0; k < MAX_YPOS; k++, j++) {
27478 +                               for (i = 0; i < MAX_XPOS; i++) {
27479 +                                       writew(readw(VGABASE + 2*(MAX_XPOS*k + i)),
27480 +                                              VGABASE + 2*(MAX_XPOS*j + i));
27481 +                               }
27482 +                       }
27483 +                       for (i = 0; i < MAX_XPOS; i++)
27484 +                               writew(0x720, VGABASE + 2*(MAX_XPOS*j + i));
27485 +                       current_ypos = MAX_YPOS-1;
27486 +               }
27487 +               if (c == '\n') {
27488 +                       current_xpos = 0;
27489 +                       current_ypos++;
27490 +               } else if (c != '\r')  {
27491 +                       writew(((0x7 << 8) | (unsigned short) c),
27492 +                              VGABASE + 2*(MAX_XPOS*current_ypos +
27493 +                                               current_xpos++));
27494 +                       if (current_xpos >= MAX_XPOS) {
27495 +                               current_xpos = 0;
27496 +                               current_ypos++;
27497 +                       }
27498 +               }
27499 +       }
27500 +}
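
The writew() arithmetic above follows from the VGA text-mode layout: one 16-bit cell per character, glyph in the low byte and attribute in the high byte (0x07 is grey-on-black, hence the 0x720 blank and the (0x7 << 8) | c cell). A quick check of the offset math:

    #include <stdio.h>

    int main(void)
    {
            int cols = 80, row = 24, col = 79;   /* bottom-right cell */
            printf("offset %#x\n", 2 * (cols * row + col));  /* 0xf9e */
            return 0;
    }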
27501 +
27502 +static struct console early_vga_console = {
27503 +       .name =         "earlyvga",
27504 +       .write =        early_vga_write,
27505 +       .flags =        CON_PRINTBUFFER,
27506 +       .index =        -1,
27507 +};
27508 +
27509 +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ 
27510 +
27511 +static int early_serial_base = 0x3f8;  /* ttyS0 */
27512 +
27513 +#define XMTRDY          0x20
27514 +
27515 +#define DLAB           0x80
27516 +
27517 +#define TXR             0       /*  Transmit register (WRITE) */
27518 +#define RXR             0       /*  Receive register  (READ)  */
27519 +#define IER             1       /*  Interrupt Enable          */
27520 +#define IIR             2       /*  Interrupt ID              */
27521 +#define FCR             2       /*  FIFO control              */
27522 +#define LCR             3       /*  Line control              */
27523 +#define MCR             4       /*  Modem control             */
27524 +#define LSR             5       /*  Line Status               */
27525 +#define MSR             6       /*  Modem Status              */
27526 +#define DLL             0       /*  Divisor Latch Low         */
27527 +#define DLH             1       /*  Divisor latch High        */
27528 +
27529 +static int early_serial_putc(unsigned char ch) 
27530 +{ 
27531 +       unsigned timeout = 0xffff; 
27532 +       while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) 
27533 +               cpu_relax();
27534 +       outb(ch, early_serial_base + TXR);
27535 +       return timeout ? 0 : -1;
27536 +} 
27537 +
27538 +static void early_serial_write(struct console *con, const char *s, unsigned n)
27539 +{
27540 +       while (*s && n-- > 0) { 
27541 +               early_serial_putc(*s); 
27542 +               if (*s == '\n') 
27543 +                       early_serial_putc('\r'); 
27544 +               s++; 
27545 +       } 
27546 +} 
27547 +
27548 +#define DEFAULT_BAUD 9600
27549 +
27550 +static __init void early_serial_init(char *s)
27551 +{
27552 +       unsigned char c; 
27553 +       unsigned divisor;
27554 +       unsigned baud = DEFAULT_BAUD;
27555 +       char *e;
27556 +
27557 +       if (*s == ',')
27558 +               ++s;
27559 +
27560 +       if (*s) {
27561 +               unsigned port; 
27562 +               if (!strncmp(s,"0x",2)) {
27563 +                       early_serial_base = simple_strtoul(s, &e, 16);
27564 +               } else {
27565 +                       static int bases[] = { 0x3f8, 0x2f8 };
27566 +
27567 +                       if (!strncmp(s,"ttyS",4))
27568 +                               s += 4;
27569 +                       port = simple_strtoul(s, &e, 10);
27570 +                       if (port > 1 || s == e)
27571 +                               port = 0;
27572 +                       early_serial_base = bases[port];
27573 +               }
27574 +               s += strcspn(s, ",");
27575 +               if (*s == ',')
27576 +                       s++;
27577 +       }
27578 +
27579 +       outb(0x3, early_serial_base + LCR);     /* 8n1 */
27580 +       outb(0, early_serial_base + IER);       /* no interrupt */
27581 +       outb(0, early_serial_base + FCR);       /* no fifo */
27582 +       outb(0x3, early_serial_base + MCR);     /* DTR + RTS */
27583 +
27584 +       if (*s) {
27585 +               baud = simple_strtoul(s, &e, 0); 
27586 +               if (baud == 0 || s == e) 
27587 +                       baud = DEFAULT_BAUD;
27588 +       } 
27589 +       
27590 +       divisor = 115200 / baud; 
27591 +       c = inb(early_serial_base + LCR); 
27592 +       outb(c | DLAB, early_serial_base + LCR); 
27593 +       outb(divisor & 0xff, early_serial_base + DLL); 
27594 +       outb((divisor >> 8) & 0xff, early_serial_base + DLH); 
27595 +       outb(c & ~DLAB, early_serial_base + LCR);
27596 +}
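
The divisor latch values written above come straight from the UART's base rate: 16550-compatible parts run at 115200 baud with a divisor of 1, so the latch value is 115200 divided by the requested rate (DLAB must be set in LCR to reach DLL/DLH, which the code does). A quick sketch:

    #include <stdio.h>

    int main(void)
    {
            unsigned baud = 9600;               /* DEFAULT_BAUD */
            unsigned divisor = 115200 / baud;   /* = 12 */
            printf("DLL=%u DLH=%u\n", divisor & 0xff, (divisor >> 8) & 0xff);
            return 0;
    }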
27597 +
27598 +#else /* CONFIG_XEN */
27599 +
27600 +#undef SCREEN_INFO
27601 +#define SCREEN_INFO screen_info
27602 +extern struct screen_info screen_info;
27603 +
27604 +static void
27605 +early_serial_write(struct console *con, const char *s, unsigned count)
27606 +{
27607 +       int n;
27608 +
27609 +       while (count > 0) {
27610 +               n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
27611 +               if (n <= 0)
27612 +                       break;
27613 +               count -= n;
27614 +               s += n;
27615 +       }
27616 +} 
27617 +
27618 +static __init void early_serial_init(char *s)
27619 +{
27620 +}
27621 +
27622 +/*
27623 + * No early VGA console on Xen, as we do not have convenient ISA-space
27624 + * mappings. Someone should fix this for domain 0. For now, use fake serial.
27625 + */
27626 +#define early_vga_console early_serial_console
27627 +
27628 +#endif
27629 +
27630 +static struct console early_serial_console = {
27631 +       .name =         "earlyser",
27632 +       .write =        early_serial_write,
27633 +       .flags =        CON_PRINTBUFFER,
27634 +       .index =        -1,
27635 +};
27636 +
27637 +/* Console interface to a host file on AMD's SimNow! */
27638 +
27639 +static int simnow_fd;
27640 +
27641 +enum {
27642 +       MAGIC1 = 0xBACCD00A,
27643 +       MAGIC2 = 0xCA110000,
27644 +       XOPEN = 5,
27645 +       XWRITE = 4,
27646 +};
27647 +
27648 +static noinline long simnow(long cmd, long a, long b, long c)
27649 +{
27650 +       long ret;
27651 +       asm volatile("cpuid" :
27652 +                    "=a" (ret) :
27653 +                    "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
27654 +       return ret;
27655 +}
27656 +
27657 +void __init simnow_init(char *str)
27658 +{
27659 +       char *fn = "klog";
27660 +       if (*str == '=')
27661 +               fn = ++str;
27662 +       /* error ignored */
27663 +       simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
27664 +}
27665 +
27666 +static void simnow_write(struct console *con, const char *s, unsigned n)
27667 +{
27668 +       simnow(XWRITE, simnow_fd, (unsigned long)s, n);
27669 +}
27670 +
27671 +static struct console simnow_console = {
27672 +       .name =         "simnow",
27673 +       .write =        simnow_write,
27674 +       .flags =        CON_PRINTBUFFER,
27675 +       .index =        -1,
27676 +};
27677 +
27678 +/* Direct interface for emergencies */
27679 +struct console *early_console = &early_vga_console;
27680 +static int early_console_initialized = 0;
27681 +
27682 +void early_printk(const char *fmt, ...)
27683 +{ 
27684 +       char buf[512]; 
27685 +       int n; 
27686 +       va_list ap;
27687 +
27688 +       va_start(ap,fmt); 
27689 +       n = vscnprintf(buf,512,fmt,ap);
27690 +       early_console->write(early_console,buf,n);
27691 +       va_end(ap); 
27692 +} 
27693 +
27694 +static int __initdata keep_early;
27695 +
27696 +int __init setup_early_printk(char *opt) 
27697 +{  
27698 +       char *space;
27699 +       char buf[256]; 
27700 +
27701 +       if (early_console_initialized)
27702 +               return -1;
27703 +
27704 +       strlcpy(buf,opt,sizeof(buf)); 
27705 +       space = strchr(buf, ' '); 
27706 +       if (space)
27707 +               *space = 0; 
27708 +
27709 +       if (strstr(buf,"keep"))
27710 +               keep_early = 1; 
27711 +
27712 +       if (!strncmp(buf, "serial", 6)) { 
27713 +               early_serial_init(buf + 6);
27714 +               early_console = &early_serial_console;
27715 +       } else if (!strncmp(buf, "ttyS", 4)) { 
27716 +               early_serial_init(buf);
27717 +               early_console = &early_serial_console;          
27718 +       } else if (!strncmp(buf, "vga", 3)
27719 +                  && SCREEN_INFO.orig_video_isVGA == 1) {
27720 +               max_xpos = SCREEN_INFO.orig_video_cols;
27721 +               max_ypos = SCREEN_INFO.orig_video_lines;
27722 +               early_console = &early_vga_console; 
27723 +       } else if (!strncmp(buf, "simnow", 6)) {
27724 +               simnow_init(buf + 6);
27725 +               early_console = &simnow_console;
27726 +               keep_early = 1;
27727 +       }
27728 +       early_console_initialized = 1;
27729 +       register_console(early_console);       
27730 +       return 0;
27731 +}
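
Given the parsing above, these are the forms of earlyprintk= the handler accepts (values illustrative):

    earlyprintk=serial,ttyS0,115200       explicit port and baud
    earlyprintk=ttyS1                     second port, default 9600 baud
    earlyprintk=vga                       only when orig_video_isVGA is set
    earlyprintk=serial,ttyS0,115200,keep  "keep" leaves the console registered
    earlyprintk=simnow                    SimNow host-file console (implies keep)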
27732 +
27733 +void __init disable_early_printk(void)
27734 +{ 
27735 +       if (!early_console_initialized || !early_console)
27736 +               return;
27737 +       if (!keep_early) {
27738 +               printk("disabling early console\n");
27739 +               unregister_console(early_console);
27740 +               early_console_initialized = 0;
27741 +       } else { 
27742 +               printk("keeping early console\n");
27743 +       }
27744 +} 
27745 +
27746 +__setup("earlyprintk=", setup_early_printk);
27747 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/entry-xen.S linux-2.6.16/arch/x86_64/kernel/entry-xen.S
27748 --- linux-2.6.16.orig/arch/x86_64/kernel/entry-xen.S    1970-01-01 01:00:00.000000000 +0100
27749 +++ linux-2.6.16/arch/x86_64/kernel/entry-xen.S 2006-06-26 09:51:32.000000000 +0200
27750 @@ -0,0 +1,1149 @@
27751 +/*
27752 + *  linux/arch/x86_64/entry.S
27753 + *
27754 + *  Copyright (C) 1991, 1992  Linus Torvalds
27755 + *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
27756 + *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
27757 + * 
27758 + *  $Id$
27759 + *
27760 + *  Jun Nakajima <jun.nakajima@intel.com>
27761 + *  Asit Mallick <asit.k.mallick@intel.com>
27762 + *      Modified for Xen
27763 + */
27764 +
27765 +/*
27766 + * entry.S contains the system-call and fault low-level handling routines.
27767 + *
27768 + * NOTE: This code handles signal-recognition, which happens every time
27769 + * after an interrupt and after each system call.
27770 + * 
27771 + * Normal syscalls and interrupts don't save a full stack frame; this is
27772 + * only done for syscall tracing, signals, or fork/exec et al.
27773 + * 
27774 + * A note on terminology:       
27775 + * - top of stack: Architecture defined interrupt frame from SS to RIP 
27776 + * at the top of the kernel process stack.     
27777 + * - partial stack frame: partially saved registers up to R11.
27778 + * - full stack frame: Like partial stack frame, but with all registers saved.
27779 + *     
27780 + * TODO:        
27781 + * - schedule it carefully for the final hardware.
27782 + */
27783 +
27784 +#define ASSEMBLY 1
27785 +#include <linux/config.h>
27786 +#ifdef CONFIG_DEBUG_INFO
27787 +#undef CONFIG_DEBUG_INFO
27788 +#endif
27789 +#include <linux/linkage.h>
27790 +#include <asm/segment.h>
27791 +#include <asm/smp.h>
27792 +#include <asm/cache.h>
27793 +#include <asm/errno.h>
27794 +#include <asm/dwarf2.h>
27795 +#include <asm/calling.h>
27796 +#include <asm/asm-offsets.h>
27797 +#include <asm/msr.h>
27798 +#include <asm/unistd.h>
27799 +#include <asm/thread_info.h>
27800 +#include <asm/hw_irq.h>
27801 +#include <asm/page.h>
27802 +#include <asm/errno.h>
27803 +#include <xen/interface/arch-x86_64.h>
27804 +#include <xen/interface/features.h>
27805 +
27806 +#include "irq_vectors.h"
27807 +
27808 +#include "xen_entry.S"
27809 +       
27810 +       .code64
27811 +
27812 +#ifndef CONFIG_PREEMPT
27813 +#define retint_kernel retint_restore_args
27814 +#endif 
27815 +
27816 +NMI_MASK = 0x80000000
27817 +       
27818 +/*
27819 + * C code is not supposed to know about undefined top of stack. Every time 
27820 + * a C function with a pt_regs argument is called from the SYSCALL-based
27821 + * fast path FIXUP_TOP_OF_STACK is needed.
27822 + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
27823 + * manipulation.
27824 + */            
27825 +               
27826 +       /* %rsp:at FRAMEEND */ 
27827 +       .macro FIXUP_TOP_OF_STACK tmp
27828 +       movq    $__USER_CS,CS(%rsp)
27829 +       movq    $-1,RCX(%rsp)
27830 +       .endm
27831 +
27832 +       .macro RESTORE_TOP_OF_STACK tmp,offset=0
27833 +       .endm
27834 +
27835 +       .macro FAKE_STACK_FRAME child_rip
27836 +       /* push in order ss, rsp, eflags, cs, rip */
27837 +       xorl %eax, %eax
27838 +       pushq %rax /* ss */
27839 +       CFI_ADJUST_CFA_OFFSET   8
27840 +       /*CFI_REL_OFFSET        ss,0*/
27841 +       pushq %rax /* rsp */
27842 +       CFI_ADJUST_CFA_OFFSET   8
27843 +       CFI_REL_OFFSET  rsp,0
27844 +       pushq $(1<<9) /* eflags - interrupts on */
27845 +       CFI_ADJUST_CFA_OFFSET   8
27846 +       /*CFI_REL_OFFSET        rflags,0*/
27847 +       pushq $__KERNEL_CS /* cs */
27848 +       CFI_ADJUST_CFA_OFFSET   8
27849 +       /*CFI_REL_OFFSET        cs,0*/
27850 +       pushq \child_rip /* rip */
27851 +       CFI_ADJUST_CFA_OFFSET   8
27852 +       CFI_REL_OFFSET  rip,0
27853 +       pushq   %rax /* orig rax */
27854 +       CFI_ADJUST_CFA_OFFSET   8
27855 +       .endm
27856 +
27857 +       .macro UNFAKE_STACK_FRAME
27858 +       addq $8*6, %rsp
27859 +       CFI_ADJUST_CFA_OFFSET   -(6*8)
27860 +       .endm
27861 +
27862 +       .macro  CFI_DEFAULT_STACK start=1
27863 +       .if \start
27864 +       CFI_STARTPROC   simple
27865 +       CFI_DEF_CFA     rsp,SS+8
27866 +       .else
27867 +       CFI_DEF_CFA_OFFSET SS+8
27868 +       .endif
27869 +       CFI_REL_OFFSET  r15,R15
27870 +       CFI_REL_OFFSET  r14,R14
27871 +       CFI_REL_OFFSET  r13,R13
27872 +       CFI_REL_OFFSET  r12,R12
27873 +       CFI_REL_OFFSET  rbp,RBP
27874 +       CFI_REL_OFFSET  rbx,RBX
27875 +       CFI_REL_OFFSET  r11,R11
27876 +       CFI_REL_OFFSET  r10,R10
27877 +       CFI_REL_OFFSET  r9,R9
27878 +       CFI_REL_OFFSET  r8,R8
27879 +       CFI_REL_OFFSET  rax,RAX
27880 +       CFI_REL_OFFSET  rcx,RCX
27881 +       CFI_REL_OFFSET  rdx,RDX
27882 +       CFI_REL_OFFSET  rsi,RSI
27883 +       CFI_REL_OFFSET  rdi,RDI
27884 +       CFI_REL_OFFSET  rip,RIP
27885 +       /*CFI_REL_OFFSET        cs,CS*/
27886 +       /*CFI_REL_OFFSET        rflags,EFLAGS*/
27887 +       CFI_REL_OFFSET  rsp,RSP
27888 +       /*CFI_REL_OFFSET        ss,SS*/
27889 +       .endm
27890 +
27891 +        /*
27892 +         * Must be consistent with the definition in arch-x86_64.h:    
27893 +         *     struct iret_context {
27894 +         *        u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
27895 +         *     };
27896 +         * #define VGCF_IN_SYSCALL (1<<8) 
27897 +         */
27898 +       .macro HYPERVISOR_IRET flag
27899 +       testb $3,1*8(%rsp)
27900 +       jnz   2f
27901 +       testl $NMI_MASK,2*8(%rsp)
27902 +       jnz   2f
27903 +
27904 +       testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
27905 +       jnz   1f
27906 +
27907 +       /* Direct iret to kernel space. Correct CS and SS. */
27908 +       orb   $3,1*8(%rsp)
27909 +       orb   $3,4*8(%rsp)
27910 +1:     iretq
27911 +
27912 +2:     /* Slow iret via hypervisor. */
27913 +       andl  $~NMI_MASK, 16(%rsp)
27914 +       pushq $\flag
27915 +       jmp  hypercall_page + (__HYPERVISOR_iret * 32)
27916 +       .endm
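
As a reading aid, the decision tree of HYPERVISOR_IRET can be rendered in C (a paraphrase of the assembly above, not kernel source). After RESTORE_ARGS the stack holds the five-word hardware frame rip/cs/rflags/rsp/ss, which is what the offsets 1*8, 2*8 and 4*8 index:

    #include <stdio.h>

    #define NMI_MASK 0x80000000UL

    struct iret_frame { unsigned long rip, cs, rflags, rsp, ss; };

    /* nonzero when the slow path (label 2: above) must be taken */
    static int needs_hypervisor_iret(const struct iret_frame *f)
    {
            if (f->cs & 3)                  /* returning to ring 3 */
                    return 1;
            if (f->rflags & NMI_MASK)       /* NMIs still masked */
                    return 1;
            /* otherwise a direct iretq is safe; without the
             * supervisor_mode_kernel feature, CS and SS first get
             * their RPL forced to 3, as the kernel runs in ring 3 */
            return 0;
    }

    int main(void)
    {
            struct iret_frame f = { 0, 0x10, 0x200, 0, 0x18 };
            printf("%d\n", needs_hypervisor_iret(&f));   /* prints 0 */
            return 0;
    }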
27917 +
27918 +        .macro SWITCH_TO_KERNEL ssoff,adjust=0
27919 +       jc  1f
27920 +       orb  $1,\ssoff-\adjust+4(%rsp)
27921 +1:
27922 +        .endm
27923 +
27924 +/*
27925 + * A newly forked process directly context switches into this.
27926 + */    
27927 +/* rdi:        prev */ 
27928 +ENTRY(ret_from_fork)
27929 +       CFI_DEFAULT_STACK
27930 +       call schedule_tail
27931 +       GET_THREAD_INFO(%rcx)
27932 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
27933 +       jnz rff_trace
27934 +rff_action:    
27935 +       RESTORE_REST
27936 +       testl $3,CS-ARGOFFSET(%rsp)     # from kernel_thread?
27937 +       je   int_ret_from_sys_call
27938 +       testl $_TIF_IA32,threadinfo_flags(%rcx)
27939 +       jnz  int_ret_from_sys_call
27940 +       RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
27941 +       jmp ret_from_sys_call
27942 +rff_trace:
27943 +       movq %rsp,%rdi
27944 +       call syscall_trace_leave
27945 +       GET_THREAD_INFO(%rcx)   
27946 +       jmp rff_action
27947 +       CFI_ENDPROC
27948 +
27949 +/*
27950 + * System call entry. Up to 6 arguments in registers are supported.
27951 + *
27952 + * SYSCALL does not save anything on the stack and does not change the
27953 + * stack pointer.
27954 + */
27955 +               
27956 +/*
27957 + * Register setup:     
27958 + * rax  system call number
27959 + * rdi  arg0
27960 + * rcx  return address for syscall/sysret, C arg3 
27961 + * rsi  arg1
27962 + * rdx  arg2   
27963 + * r10  arg3   (--> moved to rcx for C)
27964 + * r8   arg4
27965 + * r9   arg5
27966 + * r11  eflags for syscall/sysret, temporary for C
27967 + * r12-r15,rbp,rbx saved by C code, not touched.               
27968 + * 
27969 + * Interrupts are off on entry.
27970 + * Only called from user space.
27971 + *
27972 + * XXX if we had a free scratch register we could save the RSP into the stack frame
27973 + *      and report it properly in ps. Unfortunately we don't have one.
27974 + */                                    
27975 +
27976 +ENTRY(system_call)
27977 +       CFI_STARTPROC   simple
27978 +       CFI_DEF_CFA     rsp,0
27979 +       CFI_REGISTER    rip,rcx
27980 +       /*CFI_REGISTER  rflags,r11*/
27981 +       SAVE_ARGS -8,0
27982 +       movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
27983 +        XEN_UNBLOCK_EVENTS(%r11)        
27984 +       GET_THREAD_INFO(%rcx)
27985 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
27986 +       CFI_REMEMBER_STATE
27987 +       jnz tracesys
27988 +       cmpq $__NR_syscall_max,%rax
27989 +       ja badsys
27990 +       movq %r10,%rcx
27991 +       call *sys_call_table(,%rax,8)  # XXX:    rip relative
27992 +       movq %rax,RAX-ARGOFFSET(%rsp)
27993 +/*
27994 + * Syscall return path ending with SYSRET (fast path)
27995 + * Has incomplete stack frame and undefined top of stack. 
27996 + */            
27997 +       .globl ret_from_sys_call
27998 +ret_from_sys_call:
27999 +       movl $_TIF_ALLWORK_MASK,%edi
28000 +       /* edi: flagmask */
28001 +sysret_check:          
28002 +       GET_THREAD_INFO(%rcx)
28003 +        XEN_BLOCK_EVENTS(%rsi)        
28004 +       movl threadinfo_flags(%rcx),%edx
28005 +       andl %edi,%edx
28006 +       CFI_REMEMBER_STATE
28007 +       jnz  sysret_careful 
28008 +        XEN_UNBLOCK_EVENTS(%rsi)                
28009 +       CFI_REGISTER    rip,rcx
28010 +       RESTORE_ARGS 0,8,0
28011 +       /*CFI_REGISTER  rflags,r11*/
28012 +        HYPERVISOR_IRET VGCF_IN_SYSCALL
28013 +
28014 +       /* Handle reschedules */
28015 +       /* edx: work, edi: workmask */  
28016 +sysret_careful:
28017 +       CFI_RESTORE_STATE
28018 +       bt $TIF_NEED_RESCHED,%edx
28019 +       jnc sysret_signal
28020 +        XEN_BLOCK_EVENTS(%rsi)        
28021 +       pushq %rdi
28022 +       CFI_ADJUST_CFA_OFFSET 8
28023 +       call schedule
28024 +       popq  %rdi
28025 +       CFI_ADJUST_CFA_OFFSET -8
28026 +       jmp sysret_check
28027 +
28028 +       /* Handle a signal */ 
28029 +sysret_signal:
28030 +/*     sti */
28031 +        XEN_UNBLOCK_EVENTS(%rsi)        
28032 +       testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
28033 +       jz    1f
28034 +
28035 +       /* Really a signal */
28036 +       /* edx: work flags (arg3) */
28037 +       leaq do_notify_resume(%rip),%rax
28038 +       leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
28039 +       xorl %esi,%esi # oldset -> arg2
28040 +       call ptregscall_common
28041 +1:     movl $_TIF_NEED_RESCHED,%edi
28042 +       jmp sysret_check
28043 +       
28044 +badsys:
28045 +       movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
28046 +       jmp ret_from_sys_call
28047 +
28048 +       /* Do syscall tracing */
28049 +tracesys:                       
28050 +       CFI_RESTORE_STATE
28051 +       SAVE_REST
28052 +       movq $-ENOSYS,RAX(%rsp)
28053 +       FIXUP_TOP_OF_STACK %rdi
28054 +       movq %rsp,%rdi
28055 +       call syscall_trace_enter
28056 +       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
28057 +       RESTORE_REST
28058 +       cmpq $__NR_syscall_max,%rax
28059 +       ja  1f
28060 +       movq %r10,%rcx  /* fixup for C */
28061 +       call *sys_call_table(,%rax,8)
28062 +       movq %rax,RAX-ARGOFFSET(%rsp)
28063 +1:     SAVE_REST
28064 +       movq %rsp,%rdi
28065 +       call syscall_trace_leave
28066 +       RESTORE_TOP_OF_STACK %rbx
28067 +       RESTORE_REST
28068 +       jmp ret_from_sys_call
28069 +       CFI_ENDPROC
28070 +               
28071 +/* 
28072 + * Syscall return path ending with IRET.
28073 + * Has correct top of stack, but partial stack frame.
28074 + */    
28075 +ENTRY(int_ret_from_sys_call)
28076 +       CFI_STARTPROC   simple
28077 +       CFI_DEF_CFA     rsp,SS+8-ARGOFFSET
28078 +       /*CFI_REL_OFFSET        ss,SS-ARGOFFSET*/
28079 +       CFI_REL_OFFSET  rsp,RSP-ARGOFFSET
28080 +       /*CFI_REL_OFFSET        rflags,EFLAGS-ARGOFFSET*/
28081 +       /*CFI_REL_OFFSET        cs,CS-ARGOFFSET*/
28082 +       CFI_REL_OFFSET  rip,RIP-ARGOFFSET
28083 +       CFI_REL_OFFSET  rdx,RDX-ARGOFFSET
28084 +       CFI_REL_OFFSET  rcx,RCX-ARGOFFSET
28085 +       CFI_REL_OFFSET  rax,RAX-ARGOFFSET
28086 +       CFI_REL_OFFSET  rdi,RDI-ARGOFFSET
28087 +       CFI_REL_OFFSET  rsi,RSI-ARGOFFSET
28088 +       CFI_REL_OFFSET  r8,R8-ARGOFFSET
28089 +       CFI_REL_OFFSET  r9,R9-ARGOFFSET
28090 +       CFI_REL_OFFSET  r10,R10-ARGOFFSET
28091 +       CFI_REL_OFFSET  r11,R11-ARGOFFSET
28092 +        XEN_BLOCK_EVENTS(%rsi)
28093 +       testb $3,CS-ARGOFFSET(%rsp)
28094 +        jnz 1f
28095 +        /* Need to set the proper %ss (not NULL) for ring 3 iretq */
28096 +        movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
28097 +        jmp retint_restore_args   # return from ring-3 kernel
28098 +1:              
28099 +       movl $_TIF_ALLWORK_MASK,%edi
28100 +       /* edi: mask to check */
28101 +int_with_check:
28102 +       GET_THREAD_INFO(%rcx)
28103 +       movl threadinfo_flags(%rcx),%edx
28104 +       andl %edi,%edx
28105 +       jnz   int_careful
28106 +       andl    $~TS_COMPAT,threadinfo_status(%rcx)
28107 +       jmp   retint_restore_args
28108 +
28109 +       /* Either reschedule or signal or syscall exit tracking needed. */
28110 +       /* First do a reschedule test. */
28111 +       /* edx: work, edi: workmask */
28112 +int_careful:
28113 +       bt $TIF_NEED_RESCHED,%edx
28114 +       jnc  int_very_careful
28115 +/*     sti */
28116 +        XEN_UNBLOCK_EVENTS(%rsi)
28117 +       pushq %rdi
28118 +       CFI_ADJUST_CFA_OFFSET 8
28119 +       call schedule
28120 +       popq %rdi
28121 +       CFI_ADJUST_CFA_OFFSET -8
28122 +       cli
28123 +       jmp int_with_check
28124 +
28125 +       /* handle signals and tracing -- both require a full stack frame */
28126 +int_very_careful:
28127 +/*     sti */
28128 +        XEN_UNBLOCK_EVENTS(%rsi)
28129 +       SAVE_REST
28130 +       /* Check for syscall exit trace */      
28131 +       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
28132 +       jz int_signal
28133 +       pushq %rdi
28134 +       CFI_ADJUST_CFA_OFFSET 8
28135 +       leaq 8(%rsp),%rdi       # &ptregs -> arg1       
28136 +       call syscall_trace_leave
28137 +       popq %rdi
28138 +       CFI_ADJUST_CFA_OFFSET -8
28139 +       andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
28140 +       cli
28141 +       jmp int_restore_rest
28142 +       
28143 +int_signal:
28144 +       testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
28145 +       jz 1f
28146 +       movq %rsp,%rdi          # &ptregs -> arg1
28147 +       xorl %esi,%esi          # oldset -> arg2
28148 +       call do_notify_resume
28149 +1:     movl $_TIF_NEED_RESCHED,%edi    
28150 +int_restore_rest:
28151 +       RESTORE_REST
28152 +       cli
28153 +       jmp int_with_check
28154 +       CFI_ENDPROC
28155 +               
28156 +/* 
28157 + * Certain special system calls that need to save a complete full stack frame.
28158 + */                                                            
28159 +       
28160 +       .macro PTREGSCALL label,func,arg
28161 +       .globl \label
28162 +\label:
28163 +       leaq    \func(%rip),%rax
28164 +       leaq    -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
28165 +       jmp     ptregscall_common
28166 +       .endm
28167 +
28168 +       CFI_STARTPROC
28169 +
28170 +       PTREGSCALL stub_clone, sys_clone, %r8
28171 +       PTREGSCALL stub_fork, sys_fork, %rdi
28172 +       PTREGSCALL stub_vfork, sys_vfork, %rdi
28173 +       PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
28174 +       PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
28175 +       PTREGSCALL stub_iopl, sys_iopl, %rsi
28176 +
28177 +ENTRY(ptregscall_common)
28178 +       popq %r11
28179 +       CFI_ADJUST_CFA_OFFSET -8
28180 +       CFI_REGISTER rip, r11
28181 +       SAVE_REST
28182 +       movq %r11, %r15
28183 +       CFI_REGISTER rip, r15
28184 +       FIXUP_TOP_OF_STACK %r11
28185 +       call *%rax
28186 +       RESTORE_TOP_OF_STACK %r11
28187 +       movq %r15, %r11
28188 +       CFI_REGISTER rip, r11
28189 +       RESTORE_REST
28190 +       pushq %r11
28191 +       CFI_ADJUST_CFA_OFFSET 8
28192 +       CFI_REL_OFFSET rip, 0
28193 +       ret
28194 +       CFI_ENDPROC
28195 +       
28196 +ENTRY(stub_execve)
28197 +       CFI_STARTPROC
28198 +       popq %r11
28199 +       CFI_ADJUST_CFA_OFFSET -8
28200 +       CFI_REGISTER rip, r11
28201 +       SAVE_REST
28202 +       movq %r11, %r15
28203 +       CFI_REGISTER rip, r15
28204 +       FIXUP_TOP_OF_STACK %r11
28205 +       call sys_execve
28206 +       GET_THREAD_INFO(%rcx)
28207 +       bt $TIF_IA32,threadinfo_flags(%rcx)
28208 +       CFI_REMEMBER_STATE
28209 +       jc exec_32bit
28210 +       RESTORE_TOP_OF_STACK %r11
28211 +       movq %r15, %r11
28212 +       CFI_REGISTER rip, r11
28213 +       RESTORE_REST
28214 +       pushq %r11
28215 +       CFI_ADJUST_CFA_OFFSET 8
28216 +       CFI_REL_OFFSET rip, 0
28217 +       ret
28218 +
28219 +exec_32bit:
28220 +       CFI_RESTORE_STATE
28221 +       movq %rax,RAX(%rsp)
28222 +       RESTORE_REST
28223 +       jmp int_ret_from_sys_call
28224 +       CFI_ENDPROC
28225 +       
28226 +/*
28227 + * sigreturn is special because it needs to restore all registers on return.
28228 + * This cannot be done with SYSRET, so use the IRET return path instead.
28229 + */                
28230 +ENTRY(stub_rt_sigreturn)
28231 +       CFI_STARTPROC
28232 +       addq $8, %rsp
28233 +       CFI_ADJUST_CFA_OFFSET   -8
28234 +       SAVE_REST
28235 +       movq %rsp,%rdi
28236 +       FIXUP_TOP_OF_STACK %r11
28237 +       call sys_rt_sigreturn
28238 +       movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
28239 +       RESTORE_REST
28240 +       jmp int_ret_from_sys_call
28241 +       CFI_ENDPROC
28242 +
28243 +/*
28244 + * initial frame state for interrupts and exceptions
28245 + */
28246 +       .macro _frame ref
28247 +       CFI_STARTPROC simple
28248 +       CFI_DEF_CFA rsp,SS+8-\ref
28249 +       /*CFI_REL_OFFSET ss,SS-\ref*/
28250 +       CFI_REL_OFFSET rsp,RSP-\ref
28251 +       /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
28252 +       /*CFI_REL_OFFSET cs,CS-\ref*/
28253 +       CFI_REL_OFFSET rip,RIP-\ref
28254 +       .endm
28255 +
28256 +/* initial frame state for interrupts (and exceptions without error code) */
28257 +#define INTR_FRAME _frame RIP
28258 +/* initial frame state for exceptions with error code (and interrupts with
28259 +   vector already pushed) */
28260 +#define XCPT_FRAME _frame ORIG_RAX
28261 +
28262 +/* 
28263 + * Interrupt exit.
28264 + *
28265 + */ 
28266 +
28267 +retint_check:
28268 +       movl threadinfo_flags(%rcx),%edx
28269 +       andl %edi,%edx
28270 +       CFI_REMEMBER_STATE
28271 +       jnz  retint_careful
28272 +retint_restore_args:
28273 +       movl EFLAGS-REST_SKIP(%rsp), %eax
28274 +       shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
28275 +       XEN_GET_VCPU_INFO(%rsi)
28276 +       andb evtchn_upcall_mask(%rsi),%al
28277 +       andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
28278 +       jnz restore_all_enable_events   #        != 0 => enable event delivery
28279 +       XEN_PUT_VCPU_INFO(%rsi)
28280 +               
28281 +       RESTORE_ARGS 0,8,0
28282 +       HYPERVISOR_IRET 0
28283 +       
28284 +       /* edi: workmask, edx: work */
28285 +retint_careful:
28286 +       CFI_RESTORE_STATE
28287 +       bt    $TIF_NEED_RESCHED,%edx
28288 +       jnc   retint_signal
28289 +       XEN_UNBLOCK_EVENTS(%rsi)
28290 +/*     sti */        
28291 +       pushq %rdi
28292 +       CFI_ADJUST_CFA_OFFSET   8
28293 +       call  schedule
28294 +       popq %rdi               
28295 +       CFI_ADJUST_CFA_OFFSET   -8
28296 +       XEN_BLOCK_EVENTS(%rsi)          
28297 +       GET_THREAD_INFO(%rcx)
28298 +/*     cli */
28299 +       jmp retint_check
28300 +       
28301 +retint_signal:
28302 +       testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
28303 +       jz    retint_restore_args
28304 +        XEN_UNBLOCK_EVENTS(%rsi)
28305 +       SAVE_REST
28306 +       movq $-1,ORIG_RAX(%rsp)                         
28307 +       xorl %esi,%esi          # oldset
28308 +       movq %rsp,%rdi          # &pt_regs
28309 +       call do_notify_resume
28310 +       RESTORE_REST
28311 +        XEN_BLOCK_EVENTS(%rsi)         
28312 +       movl $_TIF_NEED_RESCHED,%edi
28313 +       GET_THREAD_INFO(%rcx)
28314 +       jmp retint_check
28315 +
28316 +#ifdef CONFIG_PREEMPT
28317 +       /* Returning to kernel space. Check if we need preemption */
28318 +       /* rcx:  threadinfo. interrupts off. */
28319 +       .p2align
28320 +retint_kernel: 
28321 +       cmpl $0,threadinfo_preempt_count(%rcx)
28322 +       jnz  retint_restore_args
28323 +       bt  $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
28324 +       jnc  retint_restore_args
28325 +       bt   $9,EFLAGS-ARGOFFSET(%rsp)  /* interrupts off? */
28326 +       jnc  retint_restore_args
28327 +       call preempt_schedule_irq
28328 +       jmp retint_kernel       /* check again */
28329 +#endif 
28330 +       CFI_ENDPROC
28331 +       
28332 +/*
28333 + * APIC interrupts.
28334 + */            
28335 +       .macro apicinterrupt num,func
28336 +       INTR_FRAME
28337 +       pushq $~(\num)
28338 +       CFI_ADJUST_CFA_OFFSET 8
28339 +       interrupt \func
28340 +       jmp error_entry
28341 +       CFI_ENDPROC
28342 +       .endm
28343 +
28344 +#ifndef CONFIG_XEN
28345 +ENTRY(thermal_interrupt)
28346 +       apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
28347 +
28348 +ENTRY(threshold_interrupt)
28349 +       apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
28350 +
28351 +#ifdef CONFIG_SMP      
28352 +ENTRY(reschedule_interrupt)
28353 +       apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
28354 +
28355 +       .macro INVALIDATE_ENTRY num
28356 +ENTRY(invalidate_interrupt\num)
28357 +       apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt 
28358 +       .endm
28359 +
28360 +       INVALIDATE_ENTRY 0
28361 +       INVALIDATE_ENTRY 1
28362 +       INVALIDATE_ENTRY 2
28363 +       INVALIDATE_ENTRY 3
28364 +       INVALIDATE_ENTRY 4
28365 +       INVALIDATE_ENTRY 5
28366 +       INVALIDATE_ENTRY 6
28367 +       INVALIDATE_ENTRY 7
28368 +
28369 +ENTRY(call_function_interrupt)
28370 +       apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
28371 +#endif
28372 +
28373 +#ifdef CONFIG_X86_LOCAL_APIC   
28374 +ENTRY(apic_timer_interrupt)
28375 +       apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
28376 +
28377 +ENTRY(error_interrupt)
28378 +       apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
28379 +
28380 +ENTRY(spurious_interrupt)
28381 +       apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
28382 +#endif
28383 +#endif /* !CONFIG_XEN */
28384 +                               
28385 +/*
28386 + * Exception entry points.
28387 + */            
28388 +       .macro zeroentry sym
28389 +       INTR_FRAME
28390 +        movq (%rsp),%rcx
28391 +        movq 8(%rsp),%r11
28392 +        addq $0x10,%rsp /* skip rcx and r11 */
28393 +       pushq $0        /* push error code/oldrax */ 
28394 +       CFI_ADJUST_CFA_OFFSET 8
28395 +       pushq %rax      /* push real oldrax to the rdi slot */ 
28396 +       CFI_ADJUST_CFA_OFFSET 8
28397 +       leaq  \sym(%rip),%rax
28398 +       jmp error_entry
28399 +       CFI_ENDPROC
28400 +       .endm   
28401 +
28402 +       .macro errorentry sym
28403 +       XCPT_FRAME
28404 +        movq (%rsp),%rcx
28405 +        movq 8(%rsp),%r11
28406 +        addq $0x10,%rsp /* rsp points to the error code */
28407 +       pushq %rax
28408 +       CFI_ADJUST_CFA_OFFSET 8
28409 +       leaq  \sym(%rip),%rax
28410 +       jmp error_entry
28411 +       CFI_ENDPROC
28412 +       .endm
28413 +
28414 +#if 0 /* not XEN */
28415 +       /* error code is on the stack already */
28416 +       /* handle NMI like exceptions that can happen everywhere */
28417 +       .macro paranoidentry sym, ist=0
28418 +        movq (%rsp),%rcx
28419 +        movq 8(%rsp),%r11
28420 +        addq $0x10,%rsp /* skip rcx and r11 */        
28421 +       SAVE_ALL
28422 +       cld
28423 +#if 0 /* not XEN */
28424 +       movl $1,%ebx
28425 +       movl  $MSR_GS_BASE,%ecx
28426 +       rdmsr
28427 +       testl %edx,%edx
28428 +       js    1f
28429 +       swapgs
28430 +       xorl  %ebx,%ebx
28431 +1:
28432 +#endif
28433 +       .if \ist
28434 +       movq    %gs:pda_data_offset, %rbp
28435 +       .endif
28436 +       movq %rsp,%rdi
28437 +       movq ORIG_RAX(%rsp),%rsi
28438 +       movq $-1,ORIG_RAX(%rsp)
28439 +       .if \ist
28440 +       subq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
28441 +       .endif
28442 +       call \sym
28443 +       .if \ist
28444 +       addq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
28445 +       .endif
28446 +/*     cli */
28447 +       XEN_BLOCK_EVENTS(%rsi)          
28448 +       .endm
28449 +#endif
28450 +       
28451 +/*
28452 + * Exception entry point. This expects an error code/orig_rax on the stack
28453 + * and the exception handler in %rax.  
28454 + */                                            
28455 +ENTRY(error_entry)
28456 +       _frame RDI
28457 +       /* rdi slot contains rax, oldrax contains error code */
28458 +       cld     
28459 +       subq  $14*8,%rsp
28460 +       CFI_ADJUST_CFA_OFFSET   (14*8)
28461 +       movq %rsi,13*8(%rsp)
28462 +       CFI_REL_OFFSET  rsi,RSI
28463 +       movq 14*8(%rsp),%rsi    /* load rax from rdi slot */
28464 +       movq %rdx,12*8(%rsp)
28465 +       CFI_REL_OFFSET  rdx,RDX
28466 +       movq %rcx,11*8(%rsp)
28467 +       CFI_REL_OFFSET  rcx,RCX
28468 +       movq %rsi,10*8(%rsp)    /* store rax */ 
28469 +       CFI_REL_OFFSET  rax,RAX
28470 +       movq %r8, 9*8(%rsp)
28471 +       CFI_REL_OFFSET  r8,R8
28472 +       movq %r9, 8*8(%rsp)
28473 +       CFI_REL_OFFSET  r9,R9
28474 +       movq %r10,7*8(%rsp)
28475 +       CFI_REL_OFFSET  r10,R10
28476 +       movq %r11,6*8(%rsp)
28477 +       CFI_REL_OFFSET  r11,R11
28478 +       movq %rbx,5*8(%rsp) 
28479 +       CFI_REL_OFFSET  rbx,RBX
28480 +       movq %rbp,4*8(%rsp) 
28481 +       CFI_REL_OFFSET  rbp,RBP
28482 +       movq %r12,3*8(%rsp) 
28483 +       CFI_REL_OFFSET  r12,R12
28484 +       movq %r13,2*8(%rsp) 
28485 +       CFI_REL_OFFSET  r13,R13
28486 +       movq %r14,1*8(%rsp) 
28487 +       CFI_REL_OFFSET  r14,R14
28488 +       movq %r15,(%rsp) 
28489 +       CFI_REL_OFFSET  r15,R15
28490 +#if 0        
28491 +       cmpl $__KERNEL_CS,CS(%rsp)
28492 +       je  error_kernelspace
28493 +#endif        
28494 +error_call_handler:
28495 +       movq %rdi, RDI(%rsp)            
28496 +       movq %rsp,%rdi
28497 +       movq ORIG_RAX(%rsp),%rsi        # get error code 
28498 +       movq $-1,ORIG_RAX(%rsp)
28499 +       call *%rax
28500 +error_exit:            
28501 +       RESTORE_REST
28502 +/*     cli */
28503 +       XEN_BLOCK_EVENTS(%rsi)          
28504 +       GET_THREAD_INFO(%rcx)   
28505 +       testb $3,CS-ARGOFFSET(%rsp)
28506 +       jz retint_kernel
28507 +       movl  threadinfo_flags(%rcx),%edx
28508 +       movl  $_TIF_WORK_MASK,%edi      
28509 +       andl  %edi,%edx
28510 +       jnz   retint_careful
28511 +       jmp   retint_restore_args
28512 +
28513 +error_kernelspace:
28514 +         /*
28515 +         * We need to re-write the logic here because we don't do iretq
28516 +         * to return to user mode. It's still possible that we get trap/fault
28517 +         * in the kernel (when accessing buffers pointed to by system calls, 
28518 +         * for example).
28519 +         *
28520 +         */           
28521 +#if 0
28522 +       incl %ebx
28523 +       /* There are two places in the kernel that can potentially fault with
28524 +          usergs. Handle them here. The exception handlers after
28525 +          iret run with kernel gs again, so don't set the user space flag.
28526 +          B-stepping K8s sometimes report a truncated RIP for IRET
28527 +          exceptions returning to compat mode. Check for these here too. */
28528 +       leaq iret_label(%rip),%rbp
28529 +       cmpq %rbp,RIP(%rsp) 
28530 +       je   error_swapgs
28531 +       movl %ebp,%ebp  /* zero extend */
28532 +       cmpq %rbp,RIP(%rsp) 
28533 +       je   error_swapgs
28534 +       cmpq $gs_change,RIP(%rsp)
28535 +        je   error_swapgs
28536 +       jmp  error_sti
28537 +#endif        
28538 +       
28539 +ENTRY(hypervisor_callback)
28540 +       zeroentry do_hypervisor_callback
28541 +        
28542 +/*
28543 + * Copied from arch/xen/i386/kernel/entry.S
28544 + */               
28545 +# A note on the "critical region" in our callback handler.
28546 +# We want to avoid stacking callback handlers due to events occurring
28547 +# during handling of the last event. To do this, we keep events disabled
28548 +# until we've done all processing. HOWEVER, we must enable events before
28549 +# popping the stack frame (can't be done atomically) and so it would still
28550 +# be possible to get enough handler activations to overflow the stack.
28551 +# Although unlikely, bugs of that kind are hard to track down, so we'd
28552 +# like to avoid the possibility.
28553 +# So, on entry to the handler we detect whether we interrupted an
28554 +# existing activation in its critical region -- if so, we pop the current
28555 +# activation and restart the handler using the previous one.
28556 +ENTRY(do_hypervisor_callback)   # do_hypervisor_callback(struct pt_regs *)
28557 +# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
28558 +# see the correct pointer to the pt_regs
28559 +       movq %rdi, %rsp            # we don't return, adjust the stack frame
28560 +11:    movq %gs:pda_irqstackptr,%rax
28561 +       incl %gs:pda_irqcount
28562 +       cmovzq %rax,%rsp
28563 +       pushq %rdi
28564 +       call evtchn_do_upcall
28565 +       popq %rsp
28566 +       decl %gs:pda_irqcount
28567 +       jmp  error_exit
28568 +
28569 +#ifdef CONFIG_X86_LOCAL_APIC
28570 +KPROBE_ENTRY(nmi)
28571 +       zeroentry do_nmi_callback
28572 +ENTRY(do_nmi_callback)
28573 +        addq $8, %rsp
28574 +        call do_nmi
28575 +        orl  $NMI_MASK,EFLAGS(%rsp)
28576 +        RESTORE_REST
28577 +        XEN_BLOCK_EVENTS(%rsi)
28578 +        GET_THREAD_INFO(%rcx)
28579 +        jmp  retint_restore_args
28580 +       .previous .text
28581 +#endif
28582 +
28583 +        ALIGN
28584 +restore_all_enable_events:  
28585 +       XEN_UNBLOCK_EVENTS(%rsi)        # %rsi is already set up...
28586 +
28587 +scrit: /**** START OF CRITICAL REGION ****/
28588 +       XEN_TEST_PENDING(%rsi)
28589 +       jnz  14f                        # process more events if necessary...
28590 +       XEN_PUT_VCPU_INFO(%rsi)
28591 +        RESTORE_ARGS 0,8,0
28592 +        HYPERVISOR_IRET 0
28593 +        
28594 +14:    XEN_LOCKED_BLOCK_EVENTS(%rsi)
28595 +       XEN_PUT_VCPU_INFO(%rsi)
28596 +       SAVE_REST
28597 +        movq %rsp,%rdi                  # set the argument again
28598 +       jmp  11b
28599 +ecrit:  /**** END OF CRITICAL REGION ****/
28600 +# At this point, unlike on x86-32, we don't do the fixup, both to simplify
28601 +# the code and because the stack frame is more complex on x86-64.
28602 +# When the kernel is interrupted in the critical section, the kernel 
28603 +# will do IRET in that case, and everything will be restored at that point, 
28604 +# i.e. it just resumes from the next instruction interrupted with the same context. 
28605 +
28606 +# Hypervisor uses this for application faults while it executes.
28607 +# We get here for two reasons:
28608 +#  1. Fault while reloading DS, ES, FS or GS
28609 +#  2. Fault while executing IRET
28610 +# Category 1 we do not need to fix up as Xen has already reloaded all segment
28611 +# registers that could be reloaded and zeroed the others.
28612 +# Category 2 we fix up by killing the current process. We cannot use the
28613 +# normal Linux return path in this case because if we use the IRET hypercall
28614 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
28615 +# We distinguish between categories by comparing each saved segment register
28616 +# with its current contents: any discrepancy means we are in category 1.
28617 +ENTRY(failsafe_callback)
28618 +       movw %ds,%cx
28619 +       cmpw %cx,0x10(%rsp)
28620 +       jne 1f
28621 +       movw %es,%cx
28622 +       cmpw %cx,0x18(%rsp)
28623 +       jne 1f
28624 +       movw %fs,%cx
28625 +       cmpw %cx,0x20(%rsp)
28626 +       jne 1f
28627 +       movw %gs,%cx
28628 +       cmpw %cx,0x28(%rsp)
28629 +       jne 1f
28630 +       /* All segments match their saved values => Category 2 (Bad IRET). */
28631 +       movq (%rsp),%rcx
28632 +       movq 8(%rsp),%r11
28633 +       addq $0x30,%rsp
28634 +       movq $-9999,%rdi        /* better code? */
28635 +       jmp do_exit                     
28636 +1:     /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
28637 +       movq (%rsp),%rcx
28638 +       movq 8(%rsp),%r11
28639 +       addq $0x30,%rsp
28640 +       pushq $0
28641 +       SAVE_ALL
28642 +       jmp error_exit
28643 +#if 0        
28644 +        .section __ex_table,"a"
28645 +        .align 8
28646 +        .quad gs_change,bad_gs
28647 +        .previous
28648 +        .section .fixup,"ax"
28649 +       /* running with kernelgs */
28650 +bad_gs: 
28651 +/*     swapgs          */      /* switch back to user gs */
28652 +       xorl %eax,%eax
28653 +        movl %eax,%gs
28654 +        jmp  2b
28655 +        .previous       
28656 +#endif
28657 +       
28658 +/*
28659 + * Create a kernel thread.
28660 + *
28661 + * C extern interface:
28662 + *     extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
28663 + *
28664 + * asm input arguments:
28665 + *     rdi: fn, rsi: arg, rdx: flags
28666 + */
28667 +ENTRY(kernel_thread)
28668 +       CFI_STARTPROC
28669 +       FAKE_STACK_FRAME $child_rip
28670 +       SAVE_ALL
28671 +
28672 +       # rdi: flags, rsi: usp, rdx: will be &pt_regs
28673 +       movq %rdx,%rdi
28674 +       orq  kernel_thread_flags(%rip),%rdi
28675 +       movq $-1, %rsi
28676 +       movq %rsp, %rdx
28677 +
28678 +       xorl %r8d,%r8d
28679 +       xorl %r9d,%r9d
28680 +       
28681 +       # clone now
28682 +       call do_fork
28683 +       movq %rax,RAX(%rsp)
28684 +       xorl %edi,%edi
28685 +
28686 +       /*
28687 +        * It isn't worth checking for a reschedule here, so internally to
28688 +        * the x86_64 port you can rely on kernel_thread() not rescheduling
28689 +        * the child before returning; this avoids the need for hacks, for
28690 +        * example to fork off the per-CPU idle tasks.
28691 +         * [Hopefully no generic code relies on the reschedule -AK]    
28692 +        */
28693 +       RESTORE_ALL
28694 +       UNFAKE_STACK_FRAME
28695 +       ret
28696 +       CFI_ENDPROC
28697 +
28698 +       
28699 +child_rip:
28700 +       /*
28701 +        * Here we are in the child and the registers are set as they were
28702 +        * at kernel_thread() invocation in the parent.
28703 +        */
28704 +       movq %rdi, %rax
28705 +       movq %rsi, %rdi
28706 +       call *%rax
28707 +       # exit
28708 +       xorl %edi, %edi
28709 +       call do_exit
28710 +
28711 +/*
28712 + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
28713 + *
28714 + * C extern interface:
28715 + *      extern long execve(char *name, char **argv, char **envp)
28716 + *
28717 + * asm input arguments:
28718 + *     rdi: name, rsi: argv, rdx: envp
28719 + *
28720 + * We want to fallback into:
28721 + *     extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
28722 + *
28723 + * do_sys_execve asm fallback arguments:
28724 + *     rdi: name, rsi: argv, rdx: envp, fake frame on the stack
28725 + */
28726 +ENTRY(execve)
28727 +       CFI_STARTPROC
28728 +       FAKE_STACK_FRAME $0
28729 +       SAVE_ALL        
28730 +       call sys_execve
28731 +       movq %rax, RAX(%rsp)    
28732 +       RESTORE_REST
28733 +       testq %rax,%rax
28734 +       jne 1f
28735 +        jmp int_ret_from_sys_call
28736 +1:      RESTORE_ARGS
28737 +       UNFAKE_STACK_FRAME
28738 +       ret
28739 +       CFI_ENDPROC
28740 +
28741 +KPROBE_ENTRY(page_fault)
28742 +       errorentry do_page_fault
28743 +       .previous .text
28744 +
28745 +ENTRY(coprocessor_error)
28746 +       zeroentry do_coprocessor_error
28747 +
28748 +ENTRY(simd_coprocessor_error)
28749 +       zeroentry do_simd_coprocessor_error     
28750 +
28751 +ENTRY(device_not_available)
28752 +       zeroentry math_state_restore
28753 +
28754 +       /* runs on exception stack */
28755 +KPROBE_ENTRY(debug)
28756 +       INTR_FRAME
28757 +/*     pushq $0
28758 +       CFI_ADJUST_CFA_OFFSET 8 */
28759 +       zeroentry do_debug
28760 +/*     jmp paranoid_exit */
28761 +       CFI_ENDPROC
28762 +       .previous .text
28763 +
28764 +#if 0
28765 +       /* runs on exception stack */   
28766 +KPROBE_ENTRY(nmi)
28767 +       INTR_FRAME
28768 +       pushq $-1
28769 +       CFI_ADJUST_CFA_OFFSET 8
28770 +       paranoidentry do_nmi
28771 +       /*
28772 +        * "Paranoid" exit path from exception stack.
28773 +        * Paranoid because this is used by NMIs and cannot take
28774 +        * any kernel state for granted.
28775 +        * We don't do kernel preemption checks here, because only
28776 +        * NMI should be common and it does not enable IRQs and
28777 +        * cannot get reschedule ticks.
28778 +        */
28779 +       /* ebx: no swapgs flag */
28780 +paranoid_exit:
28781 +       testl %ebx,%ebx                         /* swapgs needed? */
28782 +       jnz paranoid_restore
28783 +       testl $3,CS(%rsp)
28784 +       jnz   paranoid_userspace
28785 +paranoid_swapgs:       
28786 +       swapgs
28787 +paranoid_restore:      
28788 +       RESTORE_ALL 8
28789 +       iretq
28790 +paranoid_userspace:    
28791 +       GET_THREAD_INFO(%rcx)
28792 +       movl threadinfo_flags(%rcx),%ebx
28793 +       andl $_TIF_WORK_MASK,%ebx
28794 +       jz paranoid_swapgs
28795 +       movq %rsp,%rdi                  /* &pt_regs */
28796 +       call sync_regs
28797 +       movq %rax,%rsp                  /* switch stack for scheduling */
28798 +       testl $_TIF_NEED_RESCHED,%ebx
28799 +       jnz paranoid_schedule
28800 +       movl %ebx,%edx                  /* arg3: thread flags */
28801 +       sti
28802 +       xorl %esi,%esi                  /* arg2: oldset */
28803 +       movq %rsp,%rdi                  /* arg1: &pt_regs */
28804 +       call do_notify_resume
28805 +       cli
28806 +       jmp paranoid_userspace
28807 +paranoid_schedule:
28808 +       sti
28809 +       call schedule
28810 +       cli
28811 +       jmp paranoid_userspace
28812 +       CFI_ENDPROC
28813 +       .previous .text
28814 +#endif        
28815 +
28816 +KPROBE_ENTRY(int3)
28817 +       INTR_FRAME
28818 +/*     pushq $0
28819 +       CFI_ADJUST_CFA_OFFSET 8 */
28820 +       zeroentry do_int3
28821 +/*     jmp paranoid_exit */
28822 +       CFI_ENDPROC
28823 +       .previous .text
28824 +
28825 +ENTRY(overflow)
28826 +       zeroentry do_overflow
28827 +
28828 +ENTRY(bounds)
28829 +       zeroentry do_bounds
28830 +
28831 +ENTRY(invalid_op)
28832 +       zeroentry do_invalid_op 
28833 +
28834 +ENTRY(coprocessor_segment_overrun)
28835 +       zeroentry do_coprocessor_segment_overrun
28836 +
28837 +ENTRY(reserved)
28838 +       zeroentry do_reserved
28839 +
28840 +#if 0
28841 +       /* runs on exception stack */
28842 +ENTRY(double_fault)
28843 +       XCPT_FRAME
28844 +       paranoidentry do_double_fault
28845 +       jmp paranoid_exit
28846 +       CFI_ENDPROC
28847 +#endif
28848 +
28849 +ENTRY(invalid_TSS)
28850 +       errorentry do_invalid_TSS
28851 +
28852 +ENTRY(segment_not_present)
28853 +       errorentry do_segment_not_present
28854 +
28855 +       /* runs on exception stack */
28856 +ENTRY(stack_segment)
28857 +       XCPT_FRAME
28858 +       errorentry do_stack_segment
28859 +       CFI_ENDPROC
28860 +
28861 +KPROBE_ENTRY(general_protection)
28862 +       errorentry do_general_protection
28863 +       .previous .text
28864 +
28865 +ENTRY(alignment_check)
28866 +       errorentry do_alignment_check
28867 +
28868 +ENTRY(divide_error)
28869 +       zeroentry do_divide_error
28870 +
28871 +ENTRY(spurious_interrupt_bug)
28872 +       zeroentry do_spurious_interrupt_bug
28873 +
28874 +#ifdef CONFIG_X86_MCE
28875 +       /* runs on exception stack */
28876 +ENTRY(machine_check)
28877 +       INTR_FRAME
28878 +       pushq $0
28879 +       CFI_ADJUST_CFA_OFFSET 8 
28880 +       paranoidentry do_machine_check
28881 +       jmp paranoid_exit
28882 +       CFI_ENDPROC
28883 +#endif
28884 +
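+/*
+ * Run __do_softirq on the per-cpu hard irq stack.  incl sets ZF when
+ * pda_irqcount goes from -1 to 0, i.e. when we were not already on the
+ * irq stack, so cmove only switches %rsp in that case; the old %rsp is
+ * saved on the new stack and restored afterwards.
+ */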
28885 +ENTRY(call_softirq)
28886 +       CFI_STARTPROC
28887 +       movq %gs:pda_irqstackptr,%rax
28888 +       movq %rsp,%rdx
28889 +       CFI_DEF_CFA_REGISTER    rdx
28890 +       incl %gs:pda_irqcount
28891 +       cmove %rax,%rsp
28892 +       pushq %rdx
28893 +       /*todo CFI_DEF_CFA_EXPRESSION ...*/
28894 +       call __do_softirq
28895 +       popq %rsp
28896 +       CFI_DEF_CFA_REGISTER    rsp
28897 +       decl %gs:pda_irqcount
28898 +       ret
28899 +       CFI_ENDPROC
28900 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/genapic-xen.c linux-2.6.16/arch/x86_64/kernel/genapic-xen.c
28901 --- linux-2.6.16.orig/arch/x86_64/kernel/genapic-xen.c  1970-01-01 01:00:00.000000000 +0100
28902 +++ linux-2.6.16/arch/x86_64/kernel/genapic-xen.c       2006-06-26 09:51:32.000000000 +0200
28903 @@ -0,0 +1,144 @@
28904 +/*
28905 + * Copyright 2004 James Cleverdon, IBM.
28906 + * Subject to the GNU Public License, v.2
28907 + *
28908 + * Generic APIC sub-arch probe layer.
28909 + *
28910 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
28911 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
28912 + * James Cleverdon.
28913 + */
28914 +#include <linux/config.h>
28915 +#include <linux/threads.h>
28916 +#include <linux/cpumask.h>
28917 +#include <linux/string.h>
28918 +#include <linux/kernel.h>
28919 +#include <linux/ctype.h>
28920 +#include <linux/init.h>
28921 +#include <linux/module.h>
28922 +
28923 +#include <asm/smp.h>
28924 +#include <asm/ipi.h>
28925 +
28926 +#if defined(CONFIG_ACPI)
28927 +#include <acpi/acpi_bus.h>
28928 +#endif
28929 +
28930 +/* which logical CPU number maps to which CPU (physical APIC ID) */
28931 +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
28932 +EXPORT_SYMBOL(x86_cpu_to_apicid);
28933 +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
28934 +
28935 +extern struct genapic apic_cluster;
28936 +extern struct genapic apic_flat;
28937 +extern struct genapic apic_physflat;
28938 +
28939 +#ifndef CONFIG_XEN
28940 +struct genapic *genapic = &apic_flat;
28941 +#else
28942 +extern struct genapic apic_xen;
28943 +struct genapic *genapic = &apic_xen;
28944 +#endif
28945 +
28946 +
28947 +/*
28948 + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
28949 + */
28950 +void __init clustered_apic_check(void)
28951 +{
28952 +#ifndef CONFIG_XEN
28953 +       long i;
28954 +       u8 clusters, max_cluster;
28955 +       u8 id;
28956 +       u8 cluster_cnt[NUM_APIC_CLUSTERS];
28957 +       int max_apic = 0;
28958 +
28959 +#if defined(CONFIG_ACPI)
28960 +       /*
28961 +        * Some x86_64 machines use physical APIC mode regardless of how many
28962 +        * procs/clusters are present (x86_64 ES7000 is an example).
28963 +        */
28964 +       if (acpi_fadt.revision > FADT2_REVISION_ID)
28965 +               if (acpi_fadt.force_apic_physical_destination_mode) {
28966 +                       genapic = &apic_cluster;
28967 +                       goto print;
28968 +               }
28969 +#endif
28970 +
28971 +       memset(cluster_cnt, 0, sizeof(cluster_cnt));
28972 +       for (i = 0; i < NR_CPUS; i++) {
28973 +               id = bios_cpu_apicid[i];
28974 +               if (id == BAD_APICID)
28975 +                       continue;
28976 +               if (id > max_apic)
28977 +                       max_apic = id;
28978 +               cluster_cnt[APIC_CLUSTERID(id)]++;
28979 +       }
28980 +
28981 +       /* Don't use clustered mode on AMD platforms. */
28982 +       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
28983 +               genapic = &apic_physflat;
28984 +#ifndef CONFIG_HOTPLUG_CPU
28985 +               /* In the CPU hotplug case we cannot use broadcast mode
28986 +                  because that opens a race when a CPU is removed.
28987 +                  Stay at physflat mode in this case.
28988 +                  It is bad to do this unconditionally though. Once
28989 +                  we have ACPI platform support for CPU hotplug
28990 +                  we should detect hotplug capability from ACPI tables and
28991 +                  only do this when really needed. -AK */
28992 +               if (max_apic <= 8)
28993 +                       genapic = &apic_flat;
28994 +#endif
28995 +               goto print;
28996 +       }
28997 +
28998 +       clusters = 0;
28999 +       max_cluster = 0;
29000 +
29001 +       for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
29002 +               if (cluster_cnt[i] > 0) {
29003 +                       ++clusters;
29004 +                       if (cluster_cnt[i] > max_cluster)
29005 +                               max_cluster = cluster_cnt[i];
29006 +               }
29007 +       }
29008 +
29009 +       /*
29010 +        * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
29011 +        * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
29012 +        * else physical mode.
29013 +        * (We don't use lowest priority delivery + HW APIC IRQ steering, so
29014 +        * can ignore the clustered logical case and go straight to physical.)
29015 +        */
29016 +       if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
29017 +#ifdef CONFIG_HOTPLUG_CPU
29018 +               /* Don't use APIC shortcuts in CPU hotplug to avoid races */
29019 +               genapic = &apic_physflat;
29020 +#else
29021 +               genapic = &apic_flat;
29022 +#endif
29023 +       } else
29024 +               genapic = &apic_cluster;
29025 +
29026 +print:
29027 +#else
29028 +       /* hardcode to xen apic functions */
29029 +       genapic = &apic_xen;
29030 +#endif
29031 +       printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
29032 +}
29033 +
29034 +/* Same for both flat and clustered. */
29035 +
29036 +#ifdef CONFIG_XEN
29037 +extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
29038 +#endif
29039 +
29040 +void send_IPI_self(int vector)
29041 +{
29042 +#ifndef CONFIG_XEN
29043 +       __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
29044 +#else
29045 +       xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
29046 +#endif
29047 +}
29048 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/genapic_xen.c linux-2.6.16/arch/x86_64/kernel/genapic_xen.c
29049 --- linux-2.6.16.orig/arch/x86_64/kernel/genapic_xen.c  1970-01-01 01:00:00.000000000 +0100
29050 +++ linux-2.6.16/arch/x86_64/kernel/genapic_xen.c       2006-06-26 09:51:32.000000000 +0200
29051 @@ -0,0 +1,162 @@
29052 +/*
29053 + * Copyright 2004 James Cleverdon, IBM.
29054 + * Subject to the GNU Public License, v.2
29055 + *
29056 + * Xen APIC subarch code.  Maximum 8 CPUs, logical delivery.
29057 + *
29058 + * Hacked for x86-64 by James Cleverdon from i386 architecture code by
29059 + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
29060 + * James Cleverdon.
29061 + *
29062 + * Hacked to pieces for Xen by Chris Wright.
29063 + */
29064 +#include <linux/config.h>
29065 +#include <linux/threads.h>
29066 +#include <linux/cpumask.h>
29067 +#include <linux/string.h>
29068 +#include <linux/kernel.h>
29069 +#include <linux/ctype.h>
29070 +#include <linux/init.h>
29071 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
29072 +#include <asm/smp.h>
29073 +#include <asm/ipi.h>
29074 +#else
29075 +#include <asm/apic.h>
29076 +#include <asm/apicdef.h>
29077 +#include <asm/genapic.h>
29078 +#endif
29079 +#include <xen/evtchn.h>
29080 +
29081 +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
29082 +
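+/*
+ * Under Xen, IPIs are delivered over per-cpu event channels rather than
+ * through the local APIC: look up the irq bound to this (cpu, vector)
+ * pair and notify it.
+ */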
29083 +static inline void __send_IPI_one(unsigned int cpu, int vector)
29084 +{
29085 +       int irq = per_cpu(ipi_to_irq, cpu)[vector];
29086 +       BUG_ON(irq < 0);
29087 +       notify_remote_via_irq(irq);
29088 +}
29089 +
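+/*
+ * Emulate the hardware APIC shortcut destinations (SELF, ALLBUT,
+ * ALLINC) by notifying each target cpu's event-channel irq in turn.
+ */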
29090 +void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
29091 +{
29092 +       int cpu;
29093 +
29094 +       switch (shortcut) {
29095 +       case APIC_DEST_SELF:
29096 +               __send_IPI_one(smp_processor_id(), vector);
29097 +               break;
29098 +       case APIC_DEST_ALLBUT:
29099 +               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
29100 +                       if (cpu == smp_processor_id())
29101 +                               continue;
29102 +                       if (cpu_isset(cpu, cpu_online_map)) {
29103 +                               __send_IPI_one(cpu, vector);
29104 +                       }
29105 +               }
29106 +               break;
29107 +       case APIC_DEST_ALLINC:
29108 +               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
29109 +                       if (cpu_isset(cpu, cpu_online_map)) {
29110 +                               __send_IPI_one(cpu, vector);
29111 +                       }
29112 +               }
29113 +               break;
29114 +       default:
29115 +               printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
29116 +                      vector);
29117 +               break;
29118 +       }
29119 +}
29120 +
29121 +static cpumask_t xen_target_cpus(void)
29122 +{
29123 +       return cpu_online_map;
29124 +}
29125 +
29126 +/*
29127 + * Set up the logical destination ID.
29128 + * Does nothing; not called at the moment.
29129 + */
29130 +static void xen_init_apic_ldr(void)
29131 +{
29132 +       Dprintk("%s\n", __FUNCTION__);
29133 +       return;
29134 +}
29135 +
29136 +static void xen_send_IPI_allbutself(int vector)
29137 +{
29138 +       /*
29139 +        * If there are no other CPUs in the system, broadcasting would
29140 +        * trigger an APIC send error, so we have to avoid sending
29141 +        * IPIs in this case.
29142 +        */
29143 +       Dprintk("%s\n", __FUNCTION__);
29144 +       if (num_online_cpus() > 1)
29145 +               xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
29146 +}
29147 +
29148 +static void xen_send_IPI_all(int vector)
29149 +{
29150 +       Dprintk("%s\n", __FUNCTION__);
29151 +       xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
29152 +}
29153 +
29154 +static void xen_send_IPI_mask(cpumask_t cpumask, int vector)
29155 +{
29156 +       unsigned long mask = cpus_addr(cpumask)[0];
29157 +       unsigned int cpu;
29158 +       unsigned long flags;
29159 +
29160 +       Dprintk("%s\n", __FUNCTION__);
29161 +       local_irq_save(flags);
29162 +       WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
29163 +
29164 +       for (cpu = 0; cpu < NR_CPUS; ++cpu) {
29165 +               if (cpu_isset(cpu, cpumask)) {
29166 +                       __send_IPI_one(cpu, vector);
29167 +               }
29168 +       }
29169 +       local_irq_restore(flags);
29170 +}
29171 +
29172 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
29173 +static int xen_apic_id_registered(void)
29174 +{
29175 +       /* better be set */
29176 +       Dprintk("%s\n", __FUNCTION__);
29177 +       return physid_isset(smp_processor_id(), phys_cpu_present_map);
29178 +}
29179 +#endif
29180 +
29181 +static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask)
29182 +{
29183 +       Dprintk("%s\n", __FUNCTION__);
29184 +       return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
29185 +}
29186 +
29187 +static unsigned int phys_pkg_id(int index_msb)
29188 +{
29189 +       u32 ebx;
29190 +
29191 +       Dprintk("%s\n", __FUNCTION__);
29192 +       ebx = cpuid_ebx(1);
29193 +       return ((ebx >> 24) & 0xFF) >> index_msb;
29194 +}
29195 +
29196 +struct genapic apic_xen =  {
29197 +       .name = "xen",
29198 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
29199 +       .int_delivery_mode = dest_LowestPrio,
29200 +#endif
29201 +       .int_dest_mode = (APIC_DEST_LOGICAL != 0),
29202 +       .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
29203 +       .target_cpus = xen_target_cpus,
29204 +#ifdef CONFIG_XEN_PRIVILEGED_GUEST
29205 +       .apic_id_registered = xen_apic_id_registered,
29206 +#endif
29207 +       .init_apic_ldr = xen_init_apic_ldr,
29208 +       .send_IPI_all = xen_send_IPI_all,
29209 +       .send_IPI_allbutself = xen_send_IPI_allbutself,
29210 +       .send_IPI_mask = xen_send_IPI_mask,
29211 +       .cpu_mask_to_apicid = xen_cpu_mask_to_apicid,
29212 +       .phys_pkg_id = phys_pkg_id,
29213 +};
29214 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/head-xen.S linux-2.6.16/arch/x86_64/kernel/head-xen.S
29215 --- linux-2.6.16.orig/arch/x86_64/kernel/head-xen.S     1970-01-01 01:00:00.000000000 +0100
29216 +++ linux-2.6.16/arch/x86_64/kernel/head-xen.S  2006-06-26 09:51:32.000000000 +0200
29217 @@ -0,0 +1,156 @@
29218 +/*
29219 + *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
29220 + *
29221 + *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
29222 + *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
29223 + *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
29224 + *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
29225 + *
29226 + *  $Id$
29227 + *
29228 + *  Jun Nakajima <jun.nakajima@intel.com>
29229 + *    Modified for Xen                                
29230 + */
29231 +
29232 +
29233 +#include <linux/linkage.h>
29234 +#include <linux/threads.h>
29235 +#include <linux/init.h>
29236 +#include <asm/desc.h>
29237 +#include <asm/segment.h>
29238 +#include <asm/page.h>
29239 +#include <asm/msr.h>
29240 +#include <asm/cache.h>
29241 +
29242 +       .text
29243 +       .code64
29244 +       .globl startup_64
29245 +startup_64:
29246 +ENTRY(_start)
29247 +       movq $(init_thread_union+THREAD_SIZE-8),%rsp
29248 +       /* zero EFLAGS after setting rsp */
29249 +       pushq $0
29250 +       popfq
29251 +
29252 +       /* rsi is pointer to startup info structure.
29253 +          pass it to C */
29254 +       movq %rsi,%rdi
29255 +       jmp x86_64_start_kernel
29256 +
29257 +ENTRY(stext)
29258 +ENTRY(_stext)
29259 +
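+/*
+ * NEXT_PAGE(name): advance the running $page counter, align to the next
+ * 4K page boundary, define phys_<name> as that page's physical address,
+ * and emit an ENTRY label for it.
+ */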
29260 +       $page = 0
29261 +#define NEXT_PAGE(name) \
29262 +       $page = $page + 1; \
29263 +       .org $page * 0x1000; \
29264 +       phys_/**/name = $page * 0x1000 + __PHYSICAL_START; \
29265 +ENTRY(name)
29266 +
29267 +NEXT_PAGE(init_level4_pgt)
29268 +       /* This gets initialized in x86_64_start_kernel */
29269 +       .fill   512,8,0
29270 +
29271 +        /*
29272 +         * We update two pgd entries to make kernel and user pgd consistent
29273 +         * at pgd_populate(). It can be used for kernel modules. So we place 
29274 +         * this page here for those cases to avoid memory corruption.
29275 +         * We also use this page to establish the initiali mapping for
29276 +         * vsyscall area.
29277 +         */
29278 +NEXT_PAGE(init_level4_user_pgt)
29279 +       .fill   512,8,0
29280 +
29281 +NEXT_PAGE(level3_kernel_pgt)
29282 +       .fill   512,8,0
29283 +
29284 +        /*
29285 +         * This is used for vsyscall area mapping as we have a different
29286 +         * level4 page table for user.
29287 +         */
29288 +NEXT_PAGE(level3_user_pgt)
29289 +        .fill  512,8,0
29290 +
29291 +NEXT_PAGE(level2_kernel_pgt)
29292 +       .fill   512,8,0
29293 +
29294 +NEXT_PAGE(empty_zero_page)
29295 +       .skip PAGE_SIZE
29296 +
29297 +NEXT_PAGE(hypercall_page)
29298 +       .fill   512,8,0
29299 +
29300 +#undef NEXT_PAGE
29301 +
29302 +       .data
29303 +
29304 +       .align 16
29305 +       .globl cpu_gdt_descr
29306 +cpu_gdt_descr:
29307 +       .word   gdt_end-cpu_gdt_table
29308 +gdt:
29309 +       .quad   cpu_gdt_table
29310 +#ifdef CONFIG_SMP
29311 +       .rept   NR_CPUS-1
29312 +       .word   0
29313 +       .quad   0
29314 +       .endr
29315 +#endif
29316 +
29317 +/* We need valid kernel segments for data and code in long mode too;
29318 + * IRET will check the segment types.  kkeil 2000/10/28
29319 + * Also, sysret mandates a special GDT layout.
29320 + */
29321 +                               
29322 +       .section .data.page_aligned, "aw"
29323 +       .align PAGE_SIZE
29324 +
29325 +/* The TLS descriptors are currently at a different place compared to i386.
29326 +   Hopefully nobody expects them at a fixed place (Wine?) */
29327 +
29328 +ENTRY(cpu_gdt_table)
29329 +       .quad   0x0000000000000000      /* NULL descriptor */
29330 +       .quad   0x0                     /* unused */
29331 +       .quad   0x00af9a000000ffff      /* __KERNEL_CS */
29332 +       .quad   0x00cf92000000ffff      /* __KERNEL_DS */
29333 +       .quad   0x00cffa000000ffff      /* __USER32_CS */
29334 +       .quad   0x00cff2000000ffff      /* __USER_DS, __USER32_DS  */
29335 +       .quad   0x00affa000000ffff      /* __USER_CS */
29336 +       .quad   0x00cf9a000000ffff      /* __KERNEL32_CS */
29337 +       .quad   0,0                     /* TSS */
29338 +       .quad   0,0                     /* LDT */
29339 +       .quad   0,0,0                   /* three TLS descriptors */
29340 +       .quad   0                       /* unused */
29341 +gdt_end:
29342 +       /* asm/segment.h:GDT_ENTRIES must match this */
29343 +       /* This should be a multiple of the cache line size */
29344 +       /* GDTs of other CPUs are now dynamically allocated */
29345 +
29346 +       /* zero the remaining page */
29347 +       .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
29348 +
29349 +/*
29350 + * __xen_guest information
29351 + */
29352 +.macro utoh value
29353 + .if (\value) < 0 || (\value) >= 0x10
29354 +       utoh (((\value)>>4)&0x0fffffffffffffff)
29355 + .endif
29356 + .if ((\value) & 0xf) < 10
29357 +  .byte '0' + ((\value) & 0xf)
29358 + .else
29359 +  .byte 'A' + ((\value) & 0xf) - 10
29360 + .endif
29361 +.endm
29362 +
29363 +.section __xen_guest
29364 +       .ascii  "GUEST_OS=linux,GUEST_VER=2.6"
29365 +       .ascii  ",XEN_VER=xen-3.0"
29366 +       .ascii  ",VIRT_BASE=0x"; utoh __START_KERNEL_map
29367 +       .ascii  ",HYPERCALL_PAGE=0x"; utoh (phys_hypercall_page >> PAGE_SHIFT)
29368 +       .ascii  ",FEATURES=writable_page_tables"
29369 +       .ascii           "|writable_descriptor_tables"
29370 +       .ascii           "|auto_translated_physmap"
29371 +       .ascii           "|supervisor_mode_kernel"
29372 +       .ascii  ",LOADER=generic"
29373 +       .byte   0
29374 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/head64-xen.c linux-2.6.16/arch/x86_64/kernel/head64-xen.c
29375 --- linux-2.6.16.orig/arch/x86_64/kernel/head64-xen.c   1970-01-01 01:00:00.000000000 +0100
29376 +++ linux-2.6.16/arch/x86_64/kernel/head64-xen.c        2006-06-26 09:51:32.000000000 +0200
29377 @@ -0,0 +1,140 @@
29378 +/*
29379 + *  linux/arch/x86_64/kernel/head64.c -- prepare to run common code
29380 + *
29381 + *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
29382 + *
29383 + *  $Id$
29384 + *
29385 + *  Jun Nakajima <jun.nakajima@intel.com>
29386 + *     Modified for Xen.
29387 + */
29388 +
29389 +#include <linux/init.h>
29390 +#include <linux/linkage.h>
29391 +#include <linux/types.h>
29392 +#include <linux/kernel.h>
29393 +#include <linux/string.h>
29394 +#include <linux/percpu.h>
29395 +
29396 +#include <asm/processor.h>
29397 +#include <asm/proto.h>
29398 +#include <asm/smp.h>
29399 +#include <asm/bootsetup.h>
29400 +#include <asm/setup.h>
29401 +#include <asm/desc.h>
29402 +#include <asm/pgtable.h>
29403 +#include <asm/sections.h>
29404 +
29405 +unsigned long start_pfn;
29406 +
29407 +/* Don't add a printk in here: printk relies on the PDA, which is not
29408 +   initialized yet. */
29409 +#if 0
29410 +static void __init clear_bss(void)
29411 +{
29412 +       memset(__bss_start, 0,
29413 +              (unsigned long) __bss_stop - (unsigned long) __bss_start);
29414 +}
29415 +#endif
29416 +
29417 +#define NEW_CL_POINTER         0x228   /* Relative to real mode data */
29418 +#define OLD_CL_MAGIC_ADDR      0x90020
29419 +#define OLD_CL_MAGIC            0xA33F
29420 +#define OLD_CL_BASE_ADDR        0x90000
29421 +#define OLD_CL_OFFSET           0x90022
29422 +
29423 +extern char saved_command_line[];
29424 +
29425 +static void __init copy_bootdata(char *real_mode_data)
29426 +{
29427 +#ifndef CONFIG_XEN
29428 +       int new_data;
29429 +       char * command_line;
29430 +
29431 +       memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
29432 +       new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
29433 +       if (!new_data) {
29434 +               if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
29435 +                       printk("bootloader so old that it does not support a command line?!\n");
29436 +                       return;
29437 +               }
29438 +               new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
29439 +               printk("old bootloader convention, maybe loadlin?\n");
29440 +       }
29441 +       command_line = (char *) ((u64)(new_data));
29442 +       memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
29443 +#else
29444 +       int max_cmdline;
29445 +       
29446 +       if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
29447 +               max_cmdline = COMMAND_LINE_SIZE;
29448 +       memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
29449 +       saved_command_line[max_cmdline-1] = '\0';
29450 +#endif
29451 +       printk("Bootdata ok (command line is %s)\n", saved_command_line);
29452 +}
29453 +
29454 +static void __init setup_boot_cpu_data(void)
29455 +{
29456 +       unsigned int dummy, eax;
29457 +
29458 +       /* get vendor info */
29459 +       cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
29460 +             (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
29461 +             (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
29462 +             (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
29463 +
29464 +       /* get cpu type */
29465 +       cpuid(1, &eax, &dummy, &dummy,
29466 +               (unsigned int *) &boot_cpu_data.x86_capability);
29467 +       boot_cpu_data.x86 = (eax >> 8) & 0xf;
29468 +       boot_cpu_data.x86_model = (eax >> 4) & 0xf;
29469 +       boot_cpu_data.x86_mask = eax & 0xf;
29470 +}
29471 +
29472 +void __init x86_64_start_kernel(char * real_mode_data)
29473 +{
29474 +       char *s;
29475 +       int i;
29476 +
29477 +       xen_start_info = (struct start_info *)real_mode_data;
29478 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
29479 +               phys_to_machine_mapping =
29480 +                       (unsigned long *)xen_start_info->mfn_list;
29481 +               start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
29482 +                       xen_start_info->nr_pt_frames;
29483 +       }
29484 +
29485 +#if 0
29486 +       for (i = 0; i < 256; i++)
29487 +               set_intr_gate(i, early_idt_handler);
29488 +       asm volatile("lidt %0" :: "m" (idt_descr));
29489 +#endif
29490 +
29491 +       for (i = 0; i < NR_CPUS; i++)
29492 +               cpu_pda(i) = &boot_cpu_pda[i];
29493 +
29494 +       pda_init(0);
29495 +       copy_bootdata(real_mode_data);
29496 +#ifdef CONFIG_SMP
29497 +       cpu_set(0, cpu_online_map);
29498 +#endif
29499 +       s = strstr(saved_command_line, "earlyprintk=");
29500 +       if (s != NULL)
29501 +               setup_early_printk(strchr(s, '=') + 1);
29502 +#ifdef CONFIG_NUMA
29503 +       s = strstr(saved_command_line, "numa=");
29504 +       if (s != NULL)
29505 +               numa_setup(s+5);
29506 +#endif
29507 +#ifdef CONFIG_X86_IO_APIC
29508 +       if (strstr(saved_command_line, "disableapic"))
29509 +               disable_apic = 1;
29510 +#endif
29511 +       /* You need an early console to see the panic below */
29512 +       if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
29513 +               panic("Kernel too big for kernel mapping\n");
29514 +
29515 +       setup_boot_cpu_data();
29516 +       start_kernel();
29517 +}
29518 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/init_task.c linux-2.6.16/arch/x86_64/kernel/init_task.c
29519 --- linux-2.6.16.orig/arch/x86_64/kernel/init_task.c    2006-03-20 06:53:29.000000000 +0100
29520 +++ linux-2.6.16/arch/x86_64/kernel/init_task.c 2006-06-26 09:51:32.000000000 +0200
29521 @@ -37,6 +37,8 @@
29522  struct task_struct init_task = INIT_TASK(init_task);
29523  
29524  EXPORT_SYMBOL(init_task);
29525 +
29526 +#ifndef CONFIG_X86_NO_TSS
29527  /*
29528   * per-CPU TSS segments. Threads are completely 'soft' on Linux,
29529   * no more per-task TSS's. The TSS size is kept cacheline-aligned
29530 @@ -45,5 +47,6 @@
29531   * on exact cacheline boundaries, to eliminate cacheline ping-pong.
29532   */ 
29533  DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
29534 +#endif
29535  
29536  #define ALIGN_TO_4K __attribute__((section(".data.init_task")))
29537 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/io_apic-xen.c linux-2.6.16/arch/x86_64/kernel/io_apic-xen.c
29538 --- linux-2.6.16.orig/arch/x86_64/kernel/io_apic-xen.c  1970-01-01 01:00:00.000000000 +0100
29539 +++ linux-2.6.16/arch/x86_64/kernel/io_apic-xen.c       2006-06-26 09:51:32.000000000 +0200
29540 @@ -0,0 +1,2231 @@
29541 +/*
29542 + *     Intel IO-APIC support for multi-Pentium hosts.
29543 + *
29544 + *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
29545 + *
29546 + *     Many thanks to Stig Venaas for trying out countless experimental
29547 + *     patches and reporting/debugging problems patiently!
29548 + *
29549 + *     (c) 1999, Multiple IO-APIC support, developed by
29550 + *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
29551 + *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
29552 + *     further tested and cleaned up by Zach Brown <zab@redhat.com>
29553 + *     and Ingo Molnar <mingo@redhat.com>
29554 + *
29555 + *     Fixes
29556 + *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
29557 + *                                     thanks to Eric Gilmore
29558 + *                                     and Rolf G. Tews
29559 + *                                     for testing these extensively
29560 + *     Paul Diefenbaugh        :       Added full ACPI support
29561 + */
29562 +
29563 +#include <linux/mm.h>
29564 +#include <linux/interrupt.h>
29565 +#include <linux/init.h>
29566 +#include <linux/delay.h>
29567 +#include <linux/sched.h>
29568 +#include <linux/config.h>
29569 +#include <linux/smp_lock.h>
29570 +#include <linux/mc146818rtc.h>
29571 +#include <linux/acpi.h>
29572 +#include <linux/sysdev.h>
29573 +#ifdef CONFIG_ACPI
29574 +#include <acpi/acpi_bus.h>
29575 +#endif
29576 +
29577 +#include <asm/io.h>
29578 +#include <asm/smp.h>
29579 +#include <asm/desc.h>
29580 +#include <asm/proto.h>
29581 +#include <asm/mach_apic.h>
29582 +#include <asm/acpi.h>
29583 +#include <asm/dma.h>
29584 +
29585 +#define __apicdebuginit  __init
29586 +
29587 +int sis_apic_bug; /* not actually supported, dummy for compile */
29588 +
29589 +static int no_timer_check;
29590 +
29591 +int disable_timer_pin_1 __initdata;
29592 +
29593 +#ifndef CONFIG_XEN
29594 +int timer_over_8254 __initdata = 1;
29595 +
29596 +/* Where, if anywhere, the i8259 is connected in external int mode */
29597 +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
29598 +#endif
29599 +
29600 +static DEFINE_SPINLOCK(ioapic_lock);
29601 +
29602 +/*
29603 + * # of IRQ routing registers
29604 + */
29605 +int nr_ioapic_registers[MAX_IO_APICS];
29606 +
29607 +/*
29608 + * Rough estimate of how many shared IRQs there are; can
29609 + * be changed anytime.
29610 + */
29611 +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
29612 +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
29613 +
29614 +/*
29615 + * This is performance-critical; we want to do it in O(1)
29616 + *
29617 + * the indexing order of this array favors 1:1 mappings
29618 + * between pins and IRQs.
29619 + */
29620 +
29621 +static struct irq_pin_list {
29622 +       short apic, pin, next;
29623 +} irq_2_pin[PIN_MAP_SIZE];
29624 +
29625 +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
29626 +#ifdef CONFIG_PCI_MSI
29627 +#define vector_to_irq(vector)  \
29628 +       (platform_legacy_irq(vector) ? vector : vector_irq[vector])
29629 +#else
29630 +#define vector_to_irq(vector)  (vector)
29631 +#endif
29632 +
29633 +#ifdef CONFIG_XEN
29634 +
29635 +#include <xen/interface/xen.h>
29636 +#include <xen/interface/physdev.h>
29637 +
29638 +/* Fake i8259 */
29639 +#define make_8259A_irq(_irq)     (io_apic_irqs &= ~(1UL<<(_irq)))
29640 +#define disable_8259A_irq(_irq)  ((void)0)
29641 +#define i8259A_irq_pending(_irq) (0)
29642 +
29643 +unsigned long io_apic_irqs;
29644 +
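+/*
+ * IO-APIC registers are not mapped directly under Xen; accesses are
+ * proxied through PHYSDEVOP_APIC_READ/WRITE hypercalls.
+ */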
29645 +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
29646 +{
29647 +       physdev_op_t op;
29648 +       int ret;
29649 +
29650 +       op.cmd = PHYSDEVOP_APIC_READ;
29651 +       op.u.apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
29652 +       op.u.apic_op.reg = reg;
29653 +       ret = HYPERVISOR_physdev_op(&op);
29654 +       if (ret)
29655 +               return ret;
29656 +       return op.u.apic_op.value;
29657 +}
29658 +
29659 +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
29660 +{
29661 +       physdev_op_t op;
29662 +
29663 +       op.cmd = PHYSDEVOP_APIC_WRITE;
29664 +       op.u.apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
29665 +       op.u.apic_op.reg = reg;
29666 +       op.u.apic_op.value = value;
29667 +       HYPERVISOR_physdev_op(&op);
29668 +}
29669 +
29670 +#define io_apic_read(a,r)    xen_io_apic_read(a,r)
29671 +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
29672 +
29673 +#define clear_IO_APIC() ((void)0)
29674 +
29675 +#else
29676 +
29677 +#ifdef CONFIG_SMP
29678 +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
29679 +{
29680 +       unsigned long flags;
29681 +       unsigned int dest;
29682 +       cpumask_t tmp;
29683 +
29684 +       cpus_and(tmp, mask, cpu_online_map);
29685 +       if (cpus_empty(tmp))
29686 +               tmp = TARGET_CPUS;
29687 +
29688 +       cpus_and(mask, tmp, CPU_MASK_ALL);
29689 +
29690 +       dest = cpu_mask_to_apicid(mask);
29691 +
29692 +       /*
29693 +        * Only the high 8 bits are valid.
29694 +        */
29695 +       dest = SET_APIC_LOGICAL_ID(dest);
29696 +
29697 +       spin_lock_irqsave(&ioapic_lock, flags);
29698 +       __DO_ACTION(1, = dest, )
29699 +       set_irq_info(irq, mask);
29700 +       spin_unlock_irqrestore(&ioapic_lock, flags);
29701 +}
29702 +#endif
29703 +
29704 +#endif /* !CONFIG_XEN */
29705 +
29706 +/*
29707 + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
29708 + * shared ISA-space IRQs, so we have to support them. We are super
29709 + * fast in the common case, and fast for shared ISA-space IRQs.
29710 + */
29711 +static void add_pin_to_irq(unsigned int irq, int apic, int pin)
29712 +{
29713 +       static int first_free_entry = NR_IRQS;
29714 +       struct irq_pin_list *entry = irq_2_pin + irq;
29715 +
29716 +       BUG_ON(irq >= NR_IRQS);
29717 +       while (entry->next)
29718 +               entry = irq_2_pin + entry->next;
29719 +
29720 +       if (entry->pin != -1) {
29721 +               entry->next = first_free_entry;
29722 +               entry = irq_2_pin + entry->next;
29723 +               if (++first_free_entry >= PIN_MAP_SIZE)
29724 +                       panic("io_apic.c: ran out of irq_2_pin entries!");
29725 +       }
29726 +       entry->apic = apic;
29727 +       entry->pin = pin;
29728 +}
29729 +
29730 +#ifndef CONFIG_XEN
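+/*
+ * __DO_ACTION(R, ACTION, FINAL): for each (apic, pin) on the irq_2_pin
+ * chain of `irq', read routing register 0x10 + R + pin*2, apply ACTION
+ * to it, write it back via io_apic_modify, then run FINAL once.
+ */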
29731 +#define __DO_ACTION(R, ACTION, FINAL)                                  \
29732 +                                                                       \
29733 +{                                                                      \
29734 +       int pin;                                                        \
29735 +       struct irq_pin_list *entry = irq_2_pin + irq;                   \
29736 +                                                                       \
29737 +       BUG_ON(irq >= NR_IRQS);                                         \
29738 +       for (;;) {                                                      \
29739 +               unsigned int reg;                                       \
29740 +               pin = entry->pin;                                       \
29741 +               if (pin == -1)                                          \
29742 +                       break;                                          \
29743 +               reg = io_apic_read(entry->apic, 0x10 + R + pin*2);      \
29744 +               reg ACTION;                                             \
29745 +               io_apic_modify(entry->apic, reg);                       \
29746 +               if (!entry->next)                                       \
29747 +                       break;                                          \
29748 +               entry = irq_2_pin + entry->next;                        \
29749 +       }                                                               \
29750 +       FINAL;                                                          \
29751 +}
29752 +
29753 +#define DO_ACTION(name,R,ACTION, FINAL)                                        \
29754 +                                                                       \
29755 +       static void name##_IO_APIC_irq (unsigned int irq)               \
29756 +       __DO_ACTION(R, ACTION, FINAL)
29757 +
29758 +DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
29759 +                                               /* mask = 1 */
29760 +DO_ACTION( __unmask,           0, &= 0xfffeffff, )
29761 +                                               /* mask = 0 */
29762 +
29763 +static void mask_IO_APIC_irq (unsigned int irq)
29764 +{
29765 +       unsigned long flags;
29766 +
29767 +       spin_lock_irqsave(&ioapic_lock, flags);
29768 +       __mask_IO_APIC_irq(irq);
29769 +       spin_unlock_irqrestore(&ioapic_lock, flags);
29770 +}
29771 +
29772 +static void unmask_IO_APIC_irq (unsigned int irq)
29773 +{
29774 +       unsigned long flags;
29775 +
29776 +       spin_lock_irqsave(&ioapic_lock, flags);
29777 +       __unmask_IO_APIC_irq(irq);
29778 +       spin_unlock_irqrestore(&ioapic_lock, flags);
29779 +}
29780 +
29781 +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
29782 +{
29783 +       struct IO_APIC_route_entry entry;
29784 +       unsigned long flags;
29785 +
29786 +       /* Check delivery_mode to be sure we're not clearing an SMI pin */
29787 +       spin_lock_irqsave(&ioapic_lock, flags);
29788 +       *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
29789 +       *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
29790 +       spin_unlock_irqrestore(&ioapic_lock, flags);
29791 +       if (entry.delivery_mode == dest_SMI)
29792 +               return;
29793 +       /*
29794 +        * Disable it in the IO-APIC irq-routing table:
29795 +        */
29796 +       memset(&entry, 0, sizeof(entry));
29797 +       entry.mask = 1;
29798 +       spin_lock_irqsave(&ioapic_lock, flags);
29799 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
29800 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
29801 +       spin_unlock_irqrestore(&ioapic_lock, flags);
29802 +}
29803 +
29804 +static void clear_IO_APIC (void)
29805 +{
29806 +       int apic, pin;
29807 +
29808 +       for (apic = 0; apic < nr_ioapics; apic++)
29809 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
29810 +                       clear_IO_APIC_pin(apic, pin);
29811 +}
29812 +
29813 +#endif /* !CONFIG_XEN */
29814 +
29815 +static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
29816 +
29817 +/*
29818 + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
29819 + * specific CPU-side IRQs.
29820 + */
29821 +
29822 +#define MAX_PIRQS 8
29823 +static int pirq_entries [MAX_PIRQS];
29824 +static int pirqs_enabled;
29825 +int skip_ioapic_setup;
29826 +int ioapic_force;
29827 +
29828 +/* dummy parsing: see setup.c */
29829 +
29830 +static int __init disable_ioapic_setup(char *str)
29831 +{
29832 +       skip_ioapic_setup = 1;
29833 +       return 1;
29834 +}
29835 +
29836 +static int __init enable_ioapic_setup(char *str)
29837 +{
29838 +       ioapic_force = 1;
29839 +       skip_ioapic_setup = 0;
29840 +       return 1;
29841 +}
29842 +
29843 +__setup("noapic", disable_ioapic_setup);
29844 +__setup("apic", enable_ioapic_setup);
29845 +
29846 +#ifndef CONFIG_XEN
29847 +static int __init setup_disable_8254_timer(char *s)
29848 +{
29849 +       timer_over_8254 = -1;
29850 +       return 1;
29851 +}
29852 +static int __init setup_enable_8254_timer(char *s)
29853 +{
29854 +       timer_over_8254 = 2;
29855 +       return 1;
29856 +}
29857 +
29858 +__setup("disable_8254_timer", setup_disable_8254_timer);
29859 +__setup("enable_8254_timer", setup_enable_8254_timer);
29860 +#endif /* !CONFIG_XEN */
29861 +
29862 +#include <asm/pci-direct.h>
29863 +#include <linux/pci_ids.h>
29864 +#include <linux/pci.h>
29865 +
29866 +/* Temporary hack. Nvidia and VIA boards currently only work with the
29867 +   IO-APIC off. Check for an Nvidia or VIA PCI bridge and turn it off.
29868 +   Use the PCI direct infrastructure because this runs before the PCI subsystem.
29869 +
29870 +   Can be overridden with "apic".
29871 +
29872 +   And another hack to disable the IOMMU on VIA chipsets.
29873 +
29874 +   ... and others. Really should move this somewhere else.
29875 +
29876 +   Kludge-O-Rama. */
29877 +void __init check_ioapic(void) 
29878 +{ 
29879 +       int num,slot,func; 
29880 +       /* Poor man's PCI discovery */
29881 +       for (num = 0; num < 32; num++) { 
29882 +               for (slot = 0; slot < 32; slot++) { 
29883 +                       for (func = 0; func < 8; func++) { 
29884 +                               u32 class;
29885 +                               u32 vendor;
29886 +                               u8 type;
29887 +                               class = read_pci_config(num,slot,func,
29888 +                                                       PCI_CLASS_REVISION);
29889 +                               if (class == 0xffffffff)
29890 +                                       break; 
29891 +
29892 +                               if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
29893 +                                       continue; 
29894 +
29895 +                               vendor = read_pci_config(num, slot, func, 
29896 +                                                        PCI_VENDOR_ID);
29897 +                               vendor &= 0xffff;
29898 +                               switch (vendor) { 
29899 +                               case PCI_VENDOR_ID_VIA:
29900 +#ifdef CONFIG_GART_IOMMU
29901 +                                       if ((end_pfn > MAX_DMA32_PFN ||
29902 +                                            force_iommu) &&
29903 +                                           !iommu_aperture_allowed) {
29904 +                                               printk(KERN_INFO
29905 +    "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n");
29906 +                                               iommu_aperture_disabled = 1;
29907 +                                       }
29908 +#endif
29909 +                                       return;
29910 +                               case PCI_VENDOR_ID_NVIDIA:
29911 +#ifdef CONFIG_ACPI
29912 +                                       /* All timer overrides on Nvidia
29913 +                                          seem to be wrong. Skip them. */
29914 +                                       acpi_skip_timer_override = 1;
29915 +                                       printk(KERN_INFO 
29916 +            "Nvidia board detected. Ignoring ACPI timer override.\n");
29917 +#endif
29918 +                                       /* RED-PEN skip them on mptables too? */
29919 +                                       return;
29920 +                               case PCI_VENDOR_ID_ATI:
29921 +
29922 +                               /* This should actually be the default, but
29923 +                                  for 2.6.16 let's do it only for ATI, where
29924 +                                  it's really needed. */
29925 +#ifndef CONFIG_XEN
29926 +                                       if (timer_over_8254 == 1) {     
29927 +                                               timer_over_8254 = 0;    
29928 +                                       printk(KERN_INFO
29929 +               "ATI board detected. Disabling timer routing over 8254.\n");
29930 +                                       }       
29931 +#endif
29932 +                                       return;
29933 +                               } 
29934 +
29935 +
29936 +                               /* No multi-function device? */
29937 +                               type = read_pci_config_byte(num,slot,func,
29938 +                                                           PCI_HEADER_TYPE);
29939 +                               if (!(type & 0x80))
29940 +                                       break;
29941 +                       } 
29942 +               }
29943 +       }
29944 +} 
29945 +
29946 +static int __init ioapic_pirq_setup(char *str)
29947 +{
29948 +       int i, max;
29949 +       int ints[MAX_PIRQS+1];
29950 +
29951 +       get_options(str, ARRAY_SIZE(ints), ints);
29952 +
29953 +       for (i = 0; i < MAX_PIRQS; i++)
29954 +               pirq_entries[i] = -1;
29955 +
29956 +       pirqs_enabled = 1;
29957 +       apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
29958 +       max = MAX_PIRQS;
29959 +       if (ints[0] < MAX_PIRQS)
29960 +               max = ints[0];
29961 +
29962 +       for (i = 0; i < max; i++) {
29963 +               apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
29964 +               /*
29965 +                * PIRQs are mapped upside down, usually.
29966 +                */
29967 +               pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
29968 +       }
29969 +       return 1;
29970 +}
29971 +
29972 +__setup("pirq=", ioapic_pirq_setup);
29973 +
29974 +/*
29975 + * Find the IRQ entry number of a certain pin.
29976 + */
29977 +static int find_irq_entry(int apic, int pin, int type)
29978 +{
29979 +       int i;
29980 +
29981 +       for (i = 0; i < mp_irq_entries; i++)
29982 +               if (mp_irqs[i].mpc_irqtype == type &&
29983 +                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
29984 +                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
29985 +                   mp_irqs[i].mpc_dstirq == pin)
29986 +                       return i;
29987 +
29988 +       return -1;
29989 +}
29990 +
29991 +#ifndef CONFIG_XEN
29992 +/*
29993 + * Find the pin to which IRQ[irq] (ISA) is connected
29994 + */
29995 +static int __init find_isa_irq_pin(int irq, int type)
29996 +{
29997 +       int i;
29998 +
29999 +       for (i = 0; i < mp_irq_entries; i++) {
30000 +               int lbus = mp_irqs[i].mpc_srcbus;
30001 +
30002 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
30003 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
30004 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
30005 +                   (mp_irqs[i].mpc_irqtype == type) &&
30006 +                   (mp_irqs[i].mpc_srcbusirq == irq))
30007 +
30008 +                       return mp_irqs[i].mpc_dstirq;
30009 +       }
30010 +       return -1;
30011 +}
30012 +
30013 +static int __init find_isa_irq_apic(int irq, int type)
30014 +{
30015 +       int i;
30016 +
30017 +       for (i = 0; i < mp_irq_entries; i++) {
30018 +               int lbus = mp_irqs[i].mpc_srcbus;
30019 +
30020 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
30021 +                    mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
30022 +                    mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
30023 +                   (mp_irqs[i].mpc_irqtype == type) &&
30024 +                   (mp_irqs[i].mpc_srcbusirq == irq))
30025 +                       break;
30026 +       }
30027 +       if (i < mp_irq_entries) {
30028 +               int apic;
30029 +               for(apic = 0; apic < nr_ioapics; apic++) {
30030 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
30031 +                               return apic;
30032 +               }
30033 +       }
30034 +
30035 +       return -1;
30036 +}
30037 +#endif
30038 +
30039 +/*
30040 + * Find a specific PCI IRQ entry.
30041 + * Not an __init, possibly needed by modules
30042 + */
30043 +static int pin_2_irq(int idx, int apic, int pin);
30044 +
30045 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
30046 +{
30047 +       int apic, i, best_guess = -1;
30048 +
30049 +       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
30050 +               bus, slot, pin);
30051 +       if (mp_bus_id_to_pci_bus[bus] == -1) {
30052 +               apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
30053 +               return -1;
30054 +       }
30055 +       for (i = 0; i < mp_irq_entries; i++) {
30056 +               int lbus = mp_irqs[i].mpc_srcbus;
30057 +
30058 +               for (apic = 0; apic < nr_ioapics; apic++)
30059 +                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
30060 +                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
30061 +                               break;
30062 +
30063 +               if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
30064 +                   !mp_irqs[i].mpc_irqtype &&
30065 +                   (bus == lbus) &&
30066 +                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
30067 +                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
30068 +
30069 +                       if (!(apic || IO_APIC_IRQ(irq)))
30070 +                               continue;
30071 +
30072 +                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
30073 +                               return irq;
30074 +                       /*
30075 +                        * Use the first all-but-pin matching entry as a
30076 +                        * best-guess fuzzy result for broken mptables.
30077 +                        */
30078 +                       if (best_guess < 0)
30079 +                               best_guess = irq;
30080 +               }
30081 +       }
30082 +       BUG_ON(best_guess >= NR_IRQS);
30083 +       return best_guess;
30084 +}
30085 +
30086 +/*
30087 + * EISA Edge/Level control register, ELCR
30088 + */
30089 +static int EISA_ELCR(unsigned int irq)
30090 +{
30091 +       if (irq < 16) {
30092 +               unsigned int port = 0x4d0 + (irq >> 3);
30093 +               return (inb(port) >> (irq & 7)) & 1;
30094 +       }
30095 +       apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
30096 +       return 0;
30097 +}
30098 +
30099 +/* EISA interrupts are always polarity zero and can be edge or level
30100 + * trigger depending on the ELCR value.  If an interrupt is listed as
30101 + * EISA conforming in the MP table, that means its trigger type must
30102 + * be read in from the ELCR */
30103 +
30104 +#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
30105 +#define default_EISA_polarity(idx)     (0)
30106 +
30107 +/* ISA interrupts are always polarity zero edge triggered,
30108 + * when listed as conforming in the MP table. */
30109 +
30110 +#define default_ISA_trigger(idx)       (0)
30111 +#define default_ISA_polarity(idx)      (0)
30112 +
30113 +/* PCI interrupts are always polarity one level triggered,
30114 + * when listed as conforming in the MP table. */
30115 +
30116 +#define default_PCI_trigger(idx)       (1)
30117 +#define default_PCI_polarity(idx)      (1)
30118 +
30119 +/* MCA interrupts are always polarity zero level triggered,
30120 + * when listed as conforming in the MP table. */
30121 +
30122 +#define default_MCA_trigger(idx)       (1)
30123 +#define default_MCA_polarity(idx)      (0)
30124 +
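+/*
+ * MP-table mpc_irqflag encoding: bits 1:0 = polarity, bits 3:2 =
+ * trigger mode; in both fields 0 means "conforms to bus type".
+ */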
30125 +static int __init MPBIOS_polarity(int idx)
30126 +{
30127 +       int bus = mp_irqs[idx].mpc_srcbus;
30128 +       int polarity;
30129 +
30130 +       /*
30131 +        * Determine IRQ line polarity (high active or low active):
30132 +        */
30133 +       switch (mp_irqs[idx].mpc_irqflag & 3)
30134 +       {
30135 +               case 0: /* conforms, ie. bus-type dependent polarity */
30136 +               {
30137 +                       switch (mp_bus_id_to_type[bus])
30138 +                       {
30139 +                               case MP_BUS_ISA: /* ISA pin */
30140 +                               {
30141 +                                       polarity = default_ISA_polarity(idx);
30142 +                                       break;
30143 +                               }
30144 +                               case MP_BUS_EISA: /* EISA pin */
30145 +                               {
30146 +                                       polarity = default_EISA_polarity(idx);
30147 +                                       break;
30148 +                               }
30149 +                               case MP_BUS_PCI: /* PCI pin */
30150 +                               {
30151 +                                       polarity = default_PCI_polarity(idx);
30152 +                                       break;
30153 +                               }
30154 +                               case MP_BUS_MCA: /* MCA pin */
30155 +                               {
30156 +                                       polarity = default_MCA_polarity(idx);
30157 +                                       break;
30158 +                               }
30159 +                               default:
30160 +                               {
30161 +                                       printk(KERN_WARNING "broken BIOS!!\n");
30162 +                                       polarity = 1;
30163 +                                       break;
30164 +                               }
30165 +                       }
30166 +                       break;
30167 +               }
30168 +               case 1: /* high active */
30169 +               {
30170 +                       polarity = 0;
30171 +                       break;
30172 +               }
30173 +               case 2: /* reserved */
30174 +               {
30175 +                       printk(KERN_WARNING "broken BIOS!!\n");
30176 +                       polarity = 1;
30177 +                       break;
30178 +               }
30179 +               case 3: /* low active */
30180 +               {
30181 +                       polarity = 1;
30182 +                       break;
30183 +               }
30184 +               default: /* invalid */
30185 +               {
30186 +                       printk(KERN_WARNING "broken BIOS!!\n");
30187 +                       polarity = 1;
30188 +                       break;
30189 +               }
30190 +       }
30191 +       return polarity;
30192 +}
30193 +
30194 +static int MPBIOS_trigger(int idx)
30195 +{
30196 +       int bus = mp_irqs[idx].mpc_srcbus;
30197 +       int trigger;
30198 +
30199 +       /*
30200 +        * Determine IRQ trigger mode (edge or level sensitive):
30201 +        */
30202 +       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
30203 +       {
30204 +               case 0: /* conforms, ie. bus-type dependent */
30205 +               {
30206 +                       switch (mp_bus_id_to_type[bus])
30207 +                       {
30208 +                               case MP_BUS_ISA: /* ISA pin */
30209 +                               {
30210 +                                       trigger = default_ISA_trigger(idx);
30211 +                                       break;
30212 +                               }
30213 +                               case MP_BUS_EISA: /* EISA pin */
30214 +                               {
30215 +                                       trigger = default_EISA_trigger(idx);
30216 +                                       break;
30217 +                               }
30218 +                               case MP_BUS_PCI: /* PCI pin */
30219 +                               {
30220 +                                       trigger = default_PCI_trigger(idx);
30221 +                                       break;
30222 +                               }
30223 +                               case MP_BUS_MCA: /* MCA pin */
30224 +                               {
30225 +                                       trigger = default_MCA_trigger(idx);
30226 +                                       break;
30227 +                               }
30228 +                               default:
30229 +                               {
30230 +                                       printk(KERN_WARNING "broken BIOS!!\n");
30231 +                                       trigger = 1;
30232 +                                       break;
30233 +                               }
30234 +                       }
30235 +                       break;
30236 +               }
30237 +               case 1: /* edge */
30238 +               {
30239 +                       trigger = 0;
30240 +                       break;
30241 +               }
30242 +               case 2: /* reserved */
30243 +               {
30244 +                       printk(KERN_WARNING "broken BIOS!!\n");
30245 +                       trigger = 1;
30246 +                       break;
30247 +               }
30248 +               case 3: /* level */
30249 +               {
30250 +                       trigger = 1;
30251 +                       break;
30252 +               }
30253 +               default: /* invalid */
30254 +               {
30255 +                       printk(KERN_WARNING "broken BIOS!!\n");
30256 +                       trigger = 0;
30257 +                       break;
30258 +               }
30259 +       }
30260 +       return trigger;
30261 +}
30262 +
30263 +static inline int irq_polarity(int idx)
30264 +{
30265 +       return MPBIOS_polarity(idx);
30266 +}
30267 +
30268 +static inline int irq_trigger(int idx)
30269 +{
30270 +       return MPBIOS_trigger(idx);
30271 +}
30272 +
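MPBIOS_polarity() and MPBIOS_trigger() above decode two adjacent two-bit fields of mpc_irqflag: bits 0-1 select the polarity and bits 2-3 the trigger mode, with 0 meaning "conforms to the bus" and 2 reserved. A minimal userspace sketch of the same decoding (the sample flag values are made up for the demo):

#include <stdio.h>

/* Illustrative only: decode the MP-table irqflag fields the same way
 * MPBIOS_polarity()/MPBIOS_trigger() above do. */
static const char *polarity_str(unsigned irqflag)
{
        switch (irqflag & 3) {
        case 0:  return "conforms to bus";
        case 1:  return "active high";
        case 3:  return "active low";
        default: return "reserved/broken BIOS";
        }
}

static const char *trigger_str(unsigned irqflag)
{
        switch ((irqflag >> 2) & 3) {
        case 0:  return "conforms to bus";
        case 1:  return "edge";
        case 3:  return "level";
        default: return "reserved/broken BIOS";
        }
}

int main(void)
{
        unsigned flags[] = { 0x0, 0x5, 0xf };   /* sample irqflag values */
        for (int i = 0; i < 3; i++)
                printf("irqflag %#x: %s, %s\n", flags[i],
                       polarity_str(flags[i]), trigger_str(flags[i]));
        return 0;
}
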
30273 +static int next_irq = 16;
30274 +
30275 +/*
30276 + * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
30277 + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
30278 + * from ACPI, which can reach 800 in large boxen.
30279 + *
30280 + * Compact the sparse GSI space into a sequential IRQ series and reuse
30281 + * vectors if possible.
30282 + */
30283 +int gsi_irq_sharing(int gsi)
30284 +{
30285 +       int i, tries, vector;
30286 +
30287 +       BUG_ON(gsi >= NR_IRQ_VECTORS);
30288 +
30289 +       if (platform_legacy_irq(gsi))
30290 +               return gsi;
30291 +
30292 +       if (gsi_2_irq[gsi] != 0xFF)
30293 +               return (int)gsi_2_irq[gsi];
30294 +
30295 +       tries = NR_IRQS;
30296 +  try_again:
30297 +       vector = assign_irq_vector(gsi);
30298 +
30299 +       /*
30300 +        * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
30301 +        * use of vector and if found, return that IRQ.  However, we never want
30302 +        * to share legacy IRQs, which usually have a different trigger mode
30303 +        * than PCI.
30304 +        */
30305 +       for (i = 0; i < NR_IRQS; i++)
30306 +               if (IO_APIC_VECTOR(i) == vector)
30307 +                       break;
30308 +       if (platform_legacy_irq(i)) {
30309 +               if (--tries >= 0) {
30310 +                       IO_APIC_VECTOR(i) = 0;
30311 +                       goto try_again;
30312 +               }
30313 +               panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
30314 +       }
30315 +       if (i < NR_IRQS) {
30316 +               gsi_2_irq[gsi] = i;
30317 +               printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
30318 +                               gsi, vector, i);
30319 +               return i;
30320 +       }
30321 +
30322 +       i = next_irq++;
30323 +       BUG_ON(i >= NR_IRQS);
30324 +       gsi_2_irq[gsi] = i;
30325 +       IO_APIC_VECTOR(i) = vector;
30326 +       printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
30327 +                       gsi, vector, i);
30328 +       return i;
30329 +}
30330 +
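The table-plus-allocator scheme in gsi_irq_sharing() is easier to see stripped of the vector-sharing scan. A minimal userspace sketch of just the compaction step, with made-up sizes (DEMO_NR_GSIS; the 16 legacy IRQs and the 0xFF "unassigned" marker follow the code above):

#include <stdio.h>
#include <string.h>

#define DEMO_NR_GSIS 64
static unsigned char gsi_2_irq[DEMO_NR_GSIS];
static int next_irq = 16;

static int demo_gsi_to_irq(int gsi)
{
        if (gsi < 16)                   /* legacy IRQs map 1:1 */
                return gsi;
        if (gsi_2_irq[gsi] != 0xFF)     /* already compacted */
                return gsi_2_irq[gsi];
        gsi_2_irq[gsi] = next_irq;      /* hand out the next free IRQ */
        return next_irq++;
}

int main(void)
{
        memset(gsi_2_irq, 0xFF, sizeof(gsi_2_irq));
        printf("GSI 40 -> IRQ %d\n", demo_gsi_to_irq(40));
        printf("GSI 41 -> IRQ %d\n", demo_gsi_to_irq(41));
        printf("GSI 40 -> IRQ %d (cached)\n", demo_gsi_to_irq(40));
        printf("GSI  9 -> IRQ %d (legacy)\n", demo_gsi_to_irq(9));
        return 0;
}
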
30331 +static int pin_2_irq(int idx, int apic, int pin)
30332 +{
30333 +       int irq, i;
30334 +       int bus = mp_irqs[idx].mpc_srcbus;
30335 +
30336 +       /*
30337 +        * Debugging check: we are in big trouble if this message pops up!
30338 +        */
30339 +       if (mp_irqs[idx].mpc_dstirq != pin)
30340 +               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
30341 +
30342 +       switch (mp_bus_id_to_type[bus])
30343 +       {
30344 +               case MP_BUS_ISA: /* ISA pin */
30345 +               case MP_BUS_EISA:
30346 +               case MP_BUS_MCA:
30347 +               {
30348 +                       irq = mp_irqs[idx].mpc_srcbusirq;
30349 +                       break;
30350 +               }
30351 +               case MP_BUS_PCI: /* PCI pin */
30352 +               {
30353 +                       /*
30354 +                        * PCI IRQs are mapped in order
30355 +                        */
30356 +                       i = irq = 0;
30357 +                       while (i < apic)
30358 +                               irq += nr_ioapic_registers[i++];
30359 +                       irq += pin;
30360 +                       irq = gsi_irq_sharing(irq);
30361 +                       break;
30362 +               }
30363 +               default:
30364 +               {
30365 +                       printk(KERN_ERR "unknown bus type %d.\n", bus);
30366 +                       irq = 0;
30367 +                       break;
30368 +               }
30369 +       }
30370 +       BUG_ON(irq >= NR_IRQS);
30371 +
30372 +       /*
30373 +        * PCI IRQ command line redirection. Yes, limits are hardcoded.
30374 +        */
30375 +       if ((pin >= 16) && (pin <= 23)) {
30376 +               if (pirq_entries[pin-16] != -1) {
30377 +                       if (!pirq_entries[pin-16]) {
30378 +                               apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
30379 +                       } else {
30380 +                               irq = pirq_entries[pin-16];
30381 +                               apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
30382 +                                               pin-16, irq);
30383 +                       }
30384 +               }
30385 +       }
30386 +       BUG_ON(irq >= NR_IRQS);
30387 +       return irq;
30388 +}
30389 +
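For the PCI case in pin_2_irq(), the GSI is simply the pin index offset by the pin counts of all lower-numbered I/O APICs. A small standalone sketch of that arithmetic (the register counts are invented for the demo):

#include <stdio.h>

/* Illustrative only: the PCI branch of pin_2_irq() numbers pins
 * sequentially across I/O APICs. */
int main(void)
{
        int nr_ioapic_registers[] = { 24, 24, 16 };  /* pins per IO-APIC */
        int apic = 2, pin = 5;                       /* query: APIC 2, pin 5 */
        int i = 0, irq = 0;

        while (i < apic)                /* sum pins of lower-numbered APICs */
                irq += nr_ioapic_registers[i++];
        irq += pin;
        printf("APIC %d pin %d -> GSI/IRQ %d\n", apic, pin, irq);  /* 53 */
        return 0;
}
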
30390 +static inline int IO_APIC_irq_trigger(int irq)
30391 +{
30392 +       int apic, idx, pin;
30393 +
30394 +       for (apic = 0; apic < nr_ioapics; apic++) {
30395 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
30396 +                       idx = find_irq_entry(apic,pin,mp_INT);
30397 +                       if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
30398 +                               return irq_trigger(idx);
30399 +               }
30400 +       }
30401 +       /*
30402 +        * nonexistent IRQs default to edge-triggered
30403 +        */
30404 +       return 0;
30405 +}
30406 +
30407 +/* irq_vector[] is indexed by the sum of all RTEs in all I/O APICs. */
30408 +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
30409 +
30410 +int assign_irq_vector(int irq)
30411 +{
30412 +       static int current_vector = FIRST_DEVICE_VECTOR;
30413 +       physdev_op_t op;
30414 +  
30415 +       BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
30416 +       if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
30417 +               return IO_APIC_VECTOR(irq);
30418 +
30419 +       op.cmd = PHYSDEVOP_ASSIGN_VECTOR;
30420 +       op.u.irq_op.irq = irq;
30421 +       if (HYPERVISOR_physdev_op(&op))
30422 +               return -ENOSPC;
30423 +       current_vector = op.u.irq_op.vector;
30424 +
30425 +       vector_irq[current_vector] = irq;
30426 +       if (irq != AUTO_ASSIGN)
30427 +               IO_APIC_VECTOR(irq) = current_vector;
30428 +
30429 +       return current_vector;
30430 +}
30431 +
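Note the caching contract in assign_irq_vector(): once an IRQ has a vector, repeated calls return it without going back to the hypervisor. A userspace mock of just that behaviour (the hypercall stub and the 0x31 base are invented for the demo; only the caching logic mirrors the code above):

#include <stdio.h>

#define DEMO_NR_IRQS 32
static int io_apic_vector[DEMO_NR_IRQS];        /* 0 == unassigned */

static int stub_hypervisor_assign_vector(int irq)
{
        static int next_vector = 0x31;          /* arbitrary demo base */
        printf("  (hypercall for IRQ %d)\n", irq);
        return next_vector++;
}

static int demo_assign_irq_vector(int irq)
{
        if (io_apic_vector[irq] > 0)            /* cached: no hypercall */
                return io_apic_vector[irq];
        io_apic_vector[irq] = stub_hypervisor_assign_vector(irq);
        return io_apic_vector[irq];
}

int main(void)
{
        printf("IRQ 20 -> vector 0x%02X\n", demo_assign_irq_vector(20));
        printf("IRQ 21 -> vector 0x%02X\n", demo_assign_irq_vector(21));
        printf("IRQ 20 -> vector 0x%02X (cached)\n", demo_assign_irq_vector(20));
        return 0;
}
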
30432 +extern void (*interrupt[NR_IRQS])(void);
30433 +#ifndef CONFIG_XEN
30434 +static struct hw_interrupt_type ioapic_level_type;
30435 +static struct hw_interrupt_type ioapic_edge_type;
30436 +
30437 +#define IOAPIC_AUTO    -1
30438 +#define IOAPIC_EDGE    0
30439 +#define IOAPIC_LEVEL   1
30440 +
30441 +static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
30442 +{
30443 +       if (use_pci_vector() && !platform_legacy_irq(irq)) {
30444 +               if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
30445 +                               trigger == IOAPIC_LEVEL)
30446 +                       irq_desc[vector].handler = &ioapic_level_type;
30447 +               else
30448 +                       irq_desc[vector].handler = &ioapic_edge_type;
30449 +               set_intr_gate(vector, interrupt[vector]);
30450 +       } else  {
30451 +               if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
30452 +                               trigger == IOAPIC_LEVEL)
30453 +                       irq_desc[irq].handler = &ioapic_level_type;
30454 +               else
30455 +                       irq_desc[irq].handler = &ioapic_edge_type;
30456 +               set_intr_gate(vector, interrupt[irq]);
30457 +       }
30458 +}
30459 +#else
30460 +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
30461 +#endif /* !CONFIG_XEN */
30462 +
30463 +static void __init setup_IO_APIC_irqs(void)
30464 +{
30465 +       struct IO_APIC_route_entry entry;
30466 +       int apic, pin, idx, irq, first_notcon = 1, vector;
30467 +       unsigned long flags;
30468 +
30469 +       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
30470 +
30471 +       for (apic = 0; apic < nr_ioapics; apic++) {
30472 +       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
30473 +
30474 +               /*
30475 +                * add it to the IO-APIC irq-routing table:
30476 +                */
30477 +               memset(&entry,0,sizeof(entry));
30478 +
30479 +               entry.delivery_mode = INT_DELIVERY_MODE;
30480 +               entry.dest_mode = INT_DEST_MODE;
30481 +               entry.mask = 0;                         /* enable IRQ */
30482 +               entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
30483 +
30484 +               idx = find_irq_entry(apic,pin,mp_INT);
30485 +               if (idx == -1) {
30486 +                       if (first_notcon) {
30487 +                               apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
30488 +                               first_notcon = 0;
30489 +                       } else
30490 +                               apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
30491 +                       continue;
30492 +               }
30493 +
30494 +               entry.trigger = irq_trigger(idx);
30495 +               entry.polarity = irq_polarity(idx);
30496 +
30497 +               if (irq_trigger(idx)) {
30498 +                       entry.trigger = 1;
30499 +                       entry.mask = 1;
30500 +                       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
30501 +               }
30502 +
30503 +               irq = pin_2_irq(idx, apic, pin);
30504 +               add_pin_to_irq(irq, apic, pin);
30505 +
30506 +               if (/* !apic && */ !IO_APIC_IRQ(irq))
30507 +                       continue;
30508 +
30509 +               if (IO_APIC_IRQ(irq)) {
30510 +                       vector = assign_irq_vector(irq);
30511 +                       entry.vector = vector;
30512 +
30513 +                       ioapic_register_intr(irq, vector, IOAPIC_AUTO);
30514 +                       if (!apic && (irq < 16))
30515 +                               disable_8259A_irq(irq);
30516 +               }
30517 +               spin_lock_irqsave(&ioapic_lock, flags);
30518 +               io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
30519 +               io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
30520 +               set_native_irq_info(irq, TARGET_CPUS);
30521 +               spin_unlock_irqrestore(&ioapic_lock, flags);
30522 +       }
30523 +       }
30524 +
30525 +       if (!first_notcon)
30526 +               apic_printk(APIC_VERBOSE," not connected.\n");
30527 +}
30528 +
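setup_IO_APIC_irqs() programs each redirection-table entry as two 32-bit words, high word (register 0x11 + 2*pin) first, so the entry only goes live once the low word carrying the mask bit and vector lands. A standalone sketch of the two halves, assuming the standard 82093AA field layout (vector in bits 0-7, mask in bit 16, logical destination in bits 56-63):

#include <stdio.h>

int main(void)
{
        unsigned int vector = 0x31, mask = 0, dest = 0x0f;
        unsigned int low  = (mask << 16) | vector;      /* mask + vector */
        unsigned int high = dest << 24;                 /* destination */

        /* High word first, so the entry only becomes active when the
         * low word (mask + vector) is written. */
        printf("io_apic_write(apic, 0x11 + 2*pin, %08x)\n", high);
        printf("io_apic_write(apic, 0x10 + 2*pin, %08x)\n", low);
        return 0;
}
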
30529 +#ifndef CONFIG_XEN
30530 +/*
30531 + * Set up the 8259A-master output pin as broadcast to all
30532 + * CPUs.
30533 + */
30534 +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
30535 +{
30536 +       struct IO_APIC_route_entry entry;
30537 +       unsigned long flags;
30538 +
30539 +       memset(&entry,0,sizeof(entry));
30540 +
30541 +       disable_8259A_irq(0);
30542 +
30543 +       /* mask LVT0 */
30544 +       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
30545 +
30546 +       /*
30547 +        * We use logical delivery to get the timer IRQ
30548 +        * to the first CPU.
30549 +        */
30550 +       entry.dest_mode = INT_DEST_MODE;
30551 +       entry.mask = 0;                                 /* unmask IRQ now */
30552 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
30553 +       entry.delivery_mode = INT_DELIVERY_MODE;
30554 +       entry.polarity = 0;
30555 +       entry.trigger = 0;
30556 +       entry.vector = vector;
30557 +
30558 +       /*
30559 +        * The timer IRQ doesn't have to know that behind the
30560 +        * scenes we have an 8259A master in AEOI mode ...
30561 +        */
30562 +       irq_desc[0].handler = &ioapic_edge_type;
30563 +
30564 +       /*
30565 +        * Add it to the IO-APIC irq-routing table:
30566 +        */
30567 +       spin_lock_irqsave(&ioapic_lock, flags);
30568 +       io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
30569 +       io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
30570 +       spin_unlock_irqrestore(&ioapic_lock, flags);
30571 +
30572 +       enable_8259A_irq(0);
30573 +}
30574 +
30575 +void __init UNEXPECTED_IO_APIC(void)
30576 +{
30577 +}
30578 +
30579 +void __apicdebuginit print_IO_APIC(void)
30580 +{
30581 +       int apic, i;
30582 +       union IO_APIC_reg_00 reg_00;
30583 +       union IO_APIC_reg_01 reg_01;
30584 +       union IO_APIC_reg_02 reg_02;
30585 +       unsigned long flags;
30586 +
30587 +       if (apic_verbosity == APIC_QUIET)
30588 +               return;
30589 +
30590 +       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
30591 +       for (i = 0; i < nr_ioapics; i++)
30592 +               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
30593 +                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
30594 +
30595 +       /*
30596 +        * We are a bit conservative about what we expect.  We have to
30597 +        * know about every hardware change ASAP.
30598 +        */
30599 +       printk(KERN_INFO "testing the IO APIC.......................\n");
30600 +
30601 +       for (apic = 0; apic < nr_ioapics; apic++) {
30602 +
30603 +       spin_lock_irqsave(&ioapic_lock, flags);
30604 +       reg_00.raw = io_apic_read(apic, 0);
30605 +       reg_01.raw = io_apic_read(apic, 1);
30606 +       if (reg_01.bits.version >= 0x10)
30607 +               reg_02.raw = io_apic_read(apic, 2);
30608 +       spin_unlock_irqrestore(&ioapic_lock, flags);
30609 +
30610 +       printk("\n");
30611 +       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
30612 +       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
30613 +       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
30614 +       if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
30615 +               UNEXPECTED_IO_APIC();
30616 +
30617 +       printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
30618 +       printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
30619 +       if (    (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
30620 +               (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
30621 +               (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
30622 +               (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
30623 +               (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
30624 +               (reg_01.bits.entries != 0x2E) &&
30625 +               (reg_01.bits.entries != 0x3F) &&
30626 +               (reg_01.bits.entries != 0x03) 
30627 +       )
30628 +               UNEXPECTED_IO_APIC();
30629 +
30630 +       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
30631 +       printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
30632 +       if (    (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
30633 +               (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
30634 +               (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
30635 +               (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
30636 +               (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
30637 +               (reg_01.bits.version != 0x20)    /* Intel P64H (82806 AA) */
30638 +       )
30639 +               UNEXPECTED_IO_APIC();
30640 +       if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
30641 +               UNEXPECTED_IO_APIC();
30642 +
30643 +       if (reg_01.bits.version >= 0x10) {
30644 +               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
30645 +               printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
30646 +               if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
30647 +                       UNEXPECTED_IO_APIC();
30648 +       }
30649 +
30650 +       printk(KERN_DEBUG ".... IRQ redirection table:\n");
30651 +
30652 +       printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
30653 +                         " Stat Dest Deli Vect:   \n");
30654 +
30655 +       for (i = 0; i <= reg_01.bits.entries; i++) {
30656 +               struct IO_APIC_route_entry entry;
30657 +
30658 +               spin_lock_irqsave(&ioapic_lock, flags);
30659 +               *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
30660 +               *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
30661 +               spin_unlock_irqrestore(&ioapic_lock, flags);
30662 +
30663 +               printk(KERN_DEBUG " %02x %03X %02X  ",
30664 +                       i,
30665 +                       entry.dest.logical.logical_dest,
30666 +                       entry.dest.physical.physical_dest
30667 +               );
30668 +
30669 +               printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
30670 +                       entry.mask,
30671 +                       entry.trigger,
30672 +                       entry.irr,
30673 +                       entry.polarity,
30674 +                       entry.delivery_status,
30675 +                       entry.dest_mode,
30676 +                       entry.delivery_mode,
30677 +                       entry.vector
30678 +               );
30679 +       }
30680 +       }
30681 +       if (use_pci_vector())
30682 +               printk(KERN_INFO "Using vector-based indexing\n");
30683 +       printk(KERN_DEBUG "IRQ to pin mappings:\n");
30684 +       for (i = 0; i < NR_IRQS; i++) {
30685 +               struct irq_pin_list *entry = irq_2_pin + i;
30686 +               if (entry->pin < 0)
30687 +                       continue;
30688 +               if (use_pci_vector() && !platform_legacy_irq(i))
30689 +                       printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
30690 +               else
30691 +                       printk(KERN_DEBUG "IRQ%d ", i);
30692 +               for (;;) {
30693 +                       printk("-> %d:%d", entry->apic, entry->pin);
30694 +                       if (!entry->next)
30695 +                               break;
30696 +                       entry = irq_2_pin + entry->next;
30697 +               }
30698 +               printk("\n");
30699 +       }
30700 +
30701 +       printk(KERN_INFO ".................................... done.\n");
30702 +
30703 +       return;
30704 +}
30705 +
30706 +#if 0
30707 +
30708 +static __apicdebuginit void print_APIC_bitfield (int base)
30709 +{
30710 +       unsigned int v;
30711 +       int i, j;
30712 +
30713 +       if (apic_verbosity == APIC_QUIET)
30714 +               return;
30715 +
30716 +       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
30717 +       for (i = 0; i < 8; i++) {
30718 +               v = apic_read(base + i*0x10);
30719 +               for (j = 0; j < 32; j++) {
30720 +                       if (v & (1<<j))
30721 +                               printk("1");
30722 +                       else
30723 +                               printk("0");
30724 +               }
30725 +               printk("\n");
30726 +       }
30727 +}
30728 +
30729 +void __apicdebuginit print_local_APIC(void * dummy)
30730 +{
30731 +       unsigned int v, ver, maxlvt;
30732 +
30733 +       if (apic_verbosity == APIC_QUIET)
30734 +               return;
30735 +
30736 +       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
30737 +               smp_processor_id(), hard_smp_processor_id());
30738 +       v = apic_read(APIC_ID);
30739 +       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
30740 +       v = apic_read(APIC_LVR);
30741 +       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
30742 +       ver = GET_APIC_VERSION(v);
30743 +       maxlvt = get_maxlvt();
30744 +
30745 +       v = apic_read(APIC_TASKPRI);
30746 +       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
30747 +
30748 +       v = apic_read(APIC_ARBPRI);
30749 +       printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
30750 +               v & APIC_ARBPRI_MASK);
30751 +       v = apic_read(APIC_PROCPRI);
30752 +       printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
30753 +
30754 +       v = apic_read(APIC_EOI);
30755 +       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
30756 +       v = apic_read(APIC_RRR);
30757 +       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
30758 +       v = apic_read(APIC_LDR);
30759 +       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
30760 +       v = apic_read(APIC_DFR);
30761 +       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
30762 +       v = apic_read(APIC_SPIV);
30763 +       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
30764 +
30765 +       printk(KERN_DEBUG "... APIC ISR field:\n");
30766 +       print_APIC_bitfield(APIC_ISR);
30767 +       printk(KERN_DEBUG "... APIC TMR field:\n");
30768 +       print_APIC_bitfield(APIC_TMR);
30769 +       printk(KERN_DEBUG "... APIC IRR field:\n");
30770 +       print_APIC_bitfield(APIC_IRR);
30771 +
30772 +       v = apic_read(APIC_ESR);
30773 +       printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
30774 +
30775 +       v = apic_read(APIC_ICR);
30776 +       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
30777 +       v = apic_read(APIC_ICR2);
30778 +       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
30779 +
30780 +       v = apic_read(APIC_LVTT);
30781 +       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
30782 +
30783 +       if (maxlvt > 3) {                       /* PC is LVT#4. */
30784 +               v = apic_read(APIC_LVTPC);
30785 +               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
30786 +       }
30787 +       v = apic_read(APIC_LVT0);
30788 +       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
30789 +       v = apic_read(APIC_LVT1);
30790 +       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
30791 +
30792 +       if (maxlvt > 2) {                       /* ERR is LVT#3. */
30793 +               v = apic_read(APIC_LVTERR);
30794 +               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
30795 +       }
30796 +
30797 +       v = apic_read(APIC_TMICT);
30798 +       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
30799 +       v = apic_read(APIC_TMCCT);
30800 +       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
30801 +       v = apic_read(APIC_TDCR);
30802 +       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
30803 +       printk("\n");
30804 +}
30805 +
30806 +void print_all_local_APICs (void)
30807 +{
30808 +       on_each_cpu(print_local_APIC, NULL, 1, 1);
30809 +}
30810 +
30811 +void __apicdebuginit print_PIC(void)
30812 +{
30813 +       unsigned int v;
30814 +       unsigned long flags;
30815 +
30816 +       if (apic_verbosity == APIC_QUIET)
30817 +               return;
30818 +
30819 +       printk(KERN_DEBUG "\nprinting PIC contents\n");
30820 +
30821 +       spin_lock_irqsave(&i8259A_lock, flags);
30822 +
30823 +       v = inb(0xa1) << 8 | inb(0x21);
30824 +       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
30825 +
30826 +       v = inb(0xa0) << 8 | inb(0x20);
30827 +       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
30828 +
30829 +       outb(0x0b,0xa0);
30830 +       outb(0x0b,0x20);
30831 +       v = inb(0xa0) << 8 | inb(0x20);
30832 +       outb(0x0a,0xa0);
30833 +       outb(0x0a,0x20);
30834 +
30835 +       spin_unlock_irqrestore(&i8259A_lock, flags);
30836 +
30837 +       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
30838 +
30839 +       v = inb(0x4d1) << 8 | inb(0x4d0);
30840 +       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
30841 +}
30842 +
30843 +#endif  /*  0  */
30844 +
30845 +#else
30846 +void __init print_IO_APIC(void) { }
30847 +#endif /* !CONFIG_XEN */
30848 +
30849 +static void __init enable_IO_APIC(void)
30850 +{
30851 +       union IO_APIC_reg_01 reg_01;
30852 +#ifndef CONFIG_XEN
30853 +       int i8259_apic, i8259_pin;
30854 +#endif
30855 +       int i, apic;
30856 +       unsigned long flags;
30857 +
30858 +       for (i = 0; i < PIN_MAP_SIZE; i++) {
30859 +               irq_2_pin[i].pin = -1;
30860 +               irq_2_pin[i].next = 0;
30861 +       }
30862 +       if (!pirqs_enabled)
30863 +               for (i = 0; i < MAX_PIRQS; i++)
30864 +                       pirq_entries[i] = -1;
30865 +
30866 +       /*
30867 +        * The number of IO-APIC IRQ registers (== #pins):
30868 +        */
30869 +       for (apic = 0; apic < nr_ioapics; apic++) {
30870 +               spin_lock_irqsave(&ioapic_lock, flags);
30871 +               reg_01.raw = io_apic_read(apic, 1);
30872 +               spin_unlock_irqrestore(&ioapic_lock, flags);
30873 +               nr_ioapic_registers[apic] = reg_01.bits.entries+1;
30874 +       }
30875 +#ifndef CONFIG_XEN
30876 +       for(apic = 0; apic < nr_ioapics; apic++) {
30877 +               int pin;
30878 +               /* See if any of the pins is in ExtINT mode */
30879 +               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
30880 +                       struct IO_APIC_route_entry entry;
30881 +                       spin_lock_irqsave(&ioapic_lock, flags);
30882 +                       *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
30883 +                       *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
30884 +                       spin_unlock_irqrestore(&ioapic_lock, flags);
30885 +
30886 +
30887 +                       /* If the interrupt line is enabled and in ExtINT mode,
30888 +                        * we have found the pin where the i8259 is connected.
30889 +                        */
30890 +                       if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
30891 +                               ioapic_i8259.apic = apic;
30892 +                               ioapic_i8259.pin  = pin;
30893 +                               goto found_i8259;
30894 +                       }
30895 +               }
30896 +       }
30897 + found_i8259:
30898 +       /* Look to see if the MP table has reported the ExtINT */
30899 +       i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
30900 +       i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
30901 +       /* Trust the MP table if nothing is set up in the hardware */
30902 +       if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
30903 +               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
30904 +               ioapic_i8259.pin  = i8259_pin;
30905 +               ioapic_i8259.apic = i8259_apic;
30906 +       }
30907 +       /* Complain if the MP table and the hardware disagree */
30908 +       if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
30909 +               (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
30910 +       {
30911 +               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
30912 +       }
30913 +#endif
30914 +
30915 +       /*
30916 +        * Do not trust the IO-APIC being empty at bootup
30917 +        */
30918 +       clear_IO_APIC();
30919 +}
30920 +
30921 +/*
30922 + * Not an __init, needed by the reboot code
30923 + */
30924 +void disable_IO_APIC(void)
30925 +{
30926 +       /*
30927 +        * Clear the IO-APIC before rebooting:
30928 +        */
30929 +       clear_IO_APIC();
30930 +
30931 +#ifndef CONFIG_XEN
30932 +       /*
30933 +        * If the i8259 is routed through an IOAPIC,
30934 +        * put that IOAPIC in virtual wire mode
30935 +        * so that legacy interrupts can be delivered.
30936 +        */
30937 +       if (ioapic_i8259.pin != -1) {
30938 +               struct IO_APIC_route_entry entry;
30939 +               unsigned long flags;
30940 +
30941 +               memset(&entry, 0, sizeof(entry));
30942 +               entry.mask            = 0; /* Enabled */
30943 +               entry.trigger         = 0; /* Edge */
30944 +               entry.irr             = 0;
30945 +               entry.polarity        = 0; /* High */
30946 +               entry.delivery_status = 0;
30947 +               entry.dest_mode       = 0; /* Physical */
30948 +               entry.delivery_mode   = dest_ExtINT; /* ExtInt */
30949 +               entry.vector          = 0;
30950 +               entry.dest.physical.physical_dest =
30951 +                                       GET_APIC_ID(apic_read(APIC_ID));
30952 +
30953 +               /*
30954 +                * Add it to the IO-APIC irq-routing table:
30955 +                */
30956 +               spin_lock_irqsave(&ioapic_lock, flags);
30957 +               io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
30958 +                       *(((int *)&entry)+1));
30959 +               io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
30960 +                       *(((int *)&entry)+0));
30961 +               spin_unlock_irqrestore(&ioapic_lock, flags);
30962 +       }
30963 +
30964 +       disconnect_bsp_APIC(ioapic_i8259.pin != -1);
30965 +#endif
30966 +}
30967 +
30968 +/*
30969 + * function to set the IO-APIC physical IDs based on the
30970 + * values stored in the MPC table.
30971 + *
30972 + * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
30973 + */
30974 +
30975 +#ifndef CONFIG_XEN
30976 +static void __init setup_ioapic_ids_from_mpc (void)
30977 +{
30978 +       union IO_APIC_reg_00 reg_00;
30979 +       int apic;
30980 +       int i;
30981 +       unsigned char old_id;
30982 +       unsigned long flags;
30983 +
30984 +       /*
30985 +        * Set the IOAPIC ID to the value stored in the MPC table.
30986 +        */
30987 +       for (apic = 0; apic < nr_ioapics; apic++) {
30988 +
30989 +               /* Read the register 0 value */
30990 +               spin_lock_irqsave(&ioapic_lock, flags);
30991 +               reg_00.raw = io_apic_read(apic, 0);
30992 +               spin_unlock_irqrestore(&ioapic_lock, flags);
30993 +               
30994 +               old_id = mp_ioapics[apic].mpc_apicid;
30995 +
30996 +
30997 +               printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
30998 +
30999 +
31000 +               /*
31001 +                * We need to adjust the IRQ routing table
31002 +                * if the ID changed.
31003 +                */
31004 +               if (old_id != mp_ioapics[apic].mpc_apicid)
31005 +                       for (i = 0; i < mp_irq_entries; i++)
31006 +                               if (mp_irqs[i].mpc_dstapic == old_id)
31007 +                                       mp_irqs[i].mpc_dstapic
31008 +                                               = mp_ioapics[apic].mpc_apicid;
31009 +
31010 +               /*
31011 +                * Read the right value from the MPC table and
31012 +                * write it into the ID register.
31013 +                */
31014 +               apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
31015 +                               mp_ioapics[apic].mpc_apicid);
31016 +
31017 +               reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
31018 +               spin_lock_irqsave(&ioapic_lock, flags);
31019 +               io_apic_write(apic, 0, reg_00.raw);
31020 +               spin_unlock_irqrestore(&ioapic_lock, flags);
31021 +
31022 +               /*
31023 +                * Sanity check
31024 +                */
31025 +               spin_lock_irqsave(&ioapic_lock, flags);
31026 +               reg_00.raw = io_apic_read(apic, 0);
31027 +               spin_unlock_irqrestore(&ioapic_lock, flags);
31028 +               if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
31029 +                       printk("could not set ID!\n");
31030 +               else
31031 +                       apic_printk(APIC_VERBOSE," ok.\n");
31032 +       }
31033 +}
31034 +#else
31035 +static void __init setup_ioapic_ids_from_mpc(void) { }
31036 +#endif
31037 +
31038 +/*
31039 + * There is a nasty bug in some older SMP boards: their mptable lies
31040 + * about the timer IRQ. We do the following to work around the situation:
31041 + *
31042 + *     - timer IRQ defaults to IO-APIC IRQ
31043 + *     - if this function detects that timer IRQs are defunct, then we fall
31044 + *       back to ISA timer IRQs
31045 + */
31046 +#ifndef CONFIG_XEN
31047 +static int __init timer_irq_works(void)
31048 +{
31049 +       unsigned long t1 = jiffies;
31050 +
31051 +       local_irq_enable();
31052 +       /* Let ten ticks pass... */
31053 +       mdelay((10 * 1000) / HZ);
31054 +
31055 +       /*
31056 +        * Expect a few ticks at least, to be sure some possible
31057 +        * glue logic does not lock up after the first one or
31058 +        * two ticks in a non-ExtINT mode.  Also the local APIC
31059 +        * might have cached one ExtINT interrupt.  Finally, at
31060 +        * least one tick may be lost due to delays.
31061 +        */
31062 +
31063 +       /* jiffies wrap? */
31064 +       if (jiffies - t1 > 4)
31065 +               return 1;
31066 +       return 0;
31067 +}
31068 +
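timer_irq_works() busy-waits for ten ticks' worth of wall time, (10 * 1000) / HZ milliseconds, then requires more than 4 elapsed jiffies, tolerating a cached ExtINT and a lost tick or two. A trivial sketch of that budget for a few common HZ values:

#include <stdio.h>

/* Illustrative only: the timing budget timer_irq_works() uses. */
int main(void)
{
        int hzs[] = { 100, 250, 1000 };
        for (int i = 0; i < 3; i++) {
                int hz = hzs[i];
                int delay_ms = (10 * 1000) / hz;  /* mdelay() argument */
                printf("HZ=%4d: wait %3d ms (10 ticks), "
                       "pass if > 4 jiffies elapsed\n", hz, delay_ms);
        }
        return 0;
}
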
31069 +/*
31070 + * In the SMP+IOAPIC case it might happen that there is an unspecified
31071 + * number of unhandled pending IRQ events. These cases are very rare,
31072 + * so we 'resend' these IRQs via IPIs to the same CPU. It's much
31073 + * better to do it this way, as then we do not have to be aware of
31074 + * 'pending' interrupts in the IRQ path, except at this point.
31075 + */
31076 +/*
31077 + * Edge triggered needs to resend any interrupt
31078 + * that was delayed but this is now handled in the device
31079 + * independent code.
31080 + */
31081 +
31082 +/*
31083 + * Starting up an edge-triggered IO-APIC interrupt is
31084 + * nasty - we need to make sure that we get the edge.
31085 + * If it is already asserted for some reason, we need to
31086 + * return 1 to indicate that it was pending.
31087 + *
31088 + * This is not complete - we should be able to fake
31089 + * an edge even if it isn't on the 8259A...
31090 + */
31091 +
31092 +static unsigned int startup_edge_ioapic_irq(unsigned int irq)
31093 +{
31094 +       int was_pending = 0;
31095 +       unsigned long flags;
31096 +
31097 +       spin_lock_irqsave(&ioapic_lock, flags);
31098 +       if (irq < 16) {
31099 +               disable_8259A_irq(irq);
31100 +               if (i8259A_irq_pending(irq))
31101 +                       was_pending = 1;
31102 +       }
31103 +       __unmask_IO_APIC_irq(irq);
31104 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31105 +
31106 +       return was_pending;
31107 +}
31108 +
31109 +/*
31110 + * Once we have recorded IRQ_PENDING already, we can mask the
31111 + * interrupt for real. This prevents IRQ storms from unhandled
31112 + * devices.
31113 + */
31114 +static void ack_edge_ioapic_irq(unsigned int irq)
31115 +{
31116 +       move_irq(irq);
31117 +       if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
31118 +                                       == (IRQ_PENDING | IRQ_DISABLED))
31119 +               mask_IO_APIC_irq(irq);
31120 +       ack_APIC_irq();
31121 +}
31122 +
31123 +/*
31124 + * Level triggered interrupts can just be masked,
31125 + * and shutting down and starting up the interrupt
31126 + * is the same as enabling and disabling them -- except
31127 + * with a startup need to return a "was pending" value.
31128 + *
31129 + * Level triggered interrupts are special because we
31130 + * do not touch any IO-APIC register while handling
31131 + * them. We ack the APIC in the end-IRQ handler, not
31132 + * in the start-IRQ-handler. Protection against reentrance
31133 + * from the same interrupt is still provided, both by the
31134 + * generic IRQ layer and by the fact that an unacked local
31135 + * APIC does not accept IRQs.
31136 + */
31137 +static unsigned int startup_level_ioapic_irq (unsigned int irq)
31138 +{
31139 +       unmask_IO_APIC_irq(irq);
31140 +
31141 +       return 0; /* don't check for pending */
31142 +}
31143 +
31144 +static void end_level_ioapic_irq (unsigned int irq)
31145 +{
31146 +       move_irq(irq);
31147 +       ack_APIC_irq();
31148 +}
31149 +
31150 +#ifdef CONFIG_PCI_MSI
31151 +static unsigned int startup_edge_ioapic_vector(unsigned int vector)
31152 +{
31153 +       int irq = vector_to_irq(vector);
31154 +
31155 +       return startup_edge_ioapic_irq(irq);
31156 +}
31157 +
31158 +static void ack_edge_ioapic_vector(unsigned int vector)
31159 +{
31160 +       int irq = vector_to_irq(vector);
31161 +
31162 +       move_native_irq(vector);
31163 +       ack_edge_ioapic_irq(irq);
31164 +}
31165 +
31166 +static unsigned int startup_level_ioapic_vector (unsigned int vector)
31167 +{
31168 +       int irq = vector_to_irq(vector);
31169 +
31170 +       return startup_level_ioapic_irq (irq);
31171 +}
31172 +
31173 +static void end_level_ioapic_vector (unsigned int vector)
31174 +{
31175 +       int irq = vector_to_irq(vector);
31176 +
31177 +       move_native_irq(vector);
31178 +       end_level_ioapic_irq(irq);
31179 +}
31180 +
31181 +static void mask_IO_APIC_vector (unsigned int vector)
31182 +{
31183 +       int irq = vector_to_irq(vector);
31184 +
31185 +       mask_IO_APIC_irq(irq);
31186 +}
31187 +
31188 +static void unmask_IO_APIC_vector (unsigned int vector)
31189 +{
31190 +       int irq = vector_to_irq(vector);
31191 +
31192 +       unmask_IO_APIC_irq(irq);
31193 +}
31194 +
31195 +#ifdef CONFIG_SMP
31196 +static void set_ioapic_affinity_vector (unsigned int vector,
31197 +                                       cpumask_t cpu_mask)
31198 +{
31199 +       int irq = vector_to_irq(vector);
31200 +
31201 +       set_native_irq_info(vector, cpu_mask);
31202 +       set_ioapic_affinity_irq(irq, cpu_mask);
31203 +}
31204 +#endif // CONFIG_SMP
31205 +#endif // CONFIG_PCI_MSI
31206 +
31207 +/*
31208 + * Level and edge triggered IO-APIC interrupts need different handling,
31209 + * so we use two separate IRQ descriptors. Edge triggered IRQs can be
31210 + * handled with the level-triggered descriptor, but that one has slightly
31211 + * more overhead. Level-triggered interrupts cannot be handled with the
31212 + * edge-triggered handler, without risking IRQ storms and other ugly
31213 + * races.
31214 + */
31215 +
31216 +static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
31217 +       .typename = "IO-APIC-edge",
31218 +       .startup        = startup_edge_ioapic,
31219 +       .shutdown       = shutdown_edge_ioapic,
31220 +       .enable         = enable_edge_ioapic,
31221 +       .disable        = disable_edge_ioapic,
31222 +       .ack            = ack_edge_ioapic,
31223 +       .end            = end_edge_ioapic,
31224 +#ifdef CONFIG_SMP
31225 +       .set_affinity = set_ioapic_affinity,
31226 +#endif
31227 +};
31228 +
31229 +static struct hw_interrupt_type ioapic_level_type __read_mostly = {
31230 +       .typename = "IO-APIC-level",
31231 +       .startup        = startup_level_ioapic,
31232 +       .shutdown       = shutdown_level_ioapic,
31233 +       .enable         = enable_level_ioapic,
31234 +       .disable        = disable_level_ioapic,
31235 +       .ack            = mask_and_ack_level_ioapic,
31236 +       .end            = end_level_ioapic,
31237 +#ifdef CONFIG_SMP
31238 +       .set_affinity = set_ioapic_affinity,
31239 +#endif
31240 +};
31241 +#endif /* !CONFIG_XEN */
31242 +
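hw_interrupt_type is a table of function pointers that the generic IRQ layer calls at each stage of handling, so assigning &ioapic_edge_type or &ioapic_level_type to irq_desc[].handler (as ioapic_register_intr() does above) swaps the whole flow policy per IRQ. A minimal userspace sketch of that dispatch pattern (all names invented for the demo):

#include <stdio.h>

struct demo_irq_type {
        const char *typename;
        void (*ack)(unsigned int irq);
        void (*end)(unsigned int irq);
};

static void ack_edge(unsigned int irq)  { printf("IRQ %u: ack at start (edge)\n", irq); }
static void end_edge(unsigned int irq)  { (void)irq; /* nothing to do */ }
static void ack_level(unsigned int irq) { (void)irq; /* deferred to ->end */ }
static void end_level(unsigned int irq) { printf("IRQ %u: ack at end (level)\n", irq); }

static struct demo_irq_type edge_type  = { "demo-edge",  ack_edge,  end_edge  };
static struct demo_irq_type level_type = { "demo-level", ack_level, end_level };

/* The generic layer calls through the descriptor without knowing
 * whether the line is edge- or level-triggered. */
static void handle(struct demo_irq_type *type, unsigned int irq)
{
        type->ack(irq);
        /* ... the device handler would run here ... */
        type->end(irq);
}

int main(void)
{
        handle(&edge_type, 4);          /* e.g. a serial port */
        handle(&level_type, 11);        /* e.g. a PCI device */
        return 0;
}
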
31243 +static inline void init_IO_APIC_traps(void)
31244 +{
31245 +       int irq;
31246 +
31247 +       /*
31248 +        * NOTE! The local APIC isn't very good at handling
31249 +        * multiple interrupts at the same interrupt level.
31250 +        * As the interrupt level is determined by taking the
31251 +        * vector number and shifting that right by 4, we
31252 +        * want to spread these out a bit so that they don't
31253 +        * all fall in the same interrupt level.
31254 +        *
31255 +        * Also, we've got to be careful not to trash gate
31256 +        * 0x80, because int 0x80 is hm, kind of importantish. ;)
31257 +        */
31258 +       for (irq = 0; irq < NR_IRQS ; irq++) {
31259 +               int tmp = irq;
31260 +               if (use_pci_vector()) {
31261 +                       if (!platform_legacy_irq(tmp))
31262 +                               if ((tmp = vector_to_irq(tmp)) == -1)
31263 +                                       continue;
31264 +               }
31265 +               if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
31266 +                       /*
31267 +                        * Hmm.. We don't have an entry for this,
31268 +                        * so default to an old-fashioned 8259
31269 +                        * interrupt if we can..
31270 +                        */
31271 +                       if (irq < 16)
31272 +                               make_8259A_irq(irq);
31273 +#ifndef CONFIG_XEN
31274 +                       else
31275 +                               /* Strange. Oh, well.. */
31276 +                               irq_desc[irq].handler = &no_irq_type;
31277 +#endif
31278 +               }
31279 +       }
31280 +}
31281 +
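The "interrupt level" the comment in init_IO_APIC_traps() refers to is the vector's high nibble (vector >> 4): vectors sharing it compete at one local-APIC priority level, which is why densely packed allocation is undesirable. A trivial sketch (the vector values are arbitrary examples):

#include <stdio.h>

int main(void)
{
        unsigned int vectors[] = { 0x31, 0x39, 0x41, 0x80 };
        for (int i = 0; i < 4; i++)
                printf("vector 0x%02X -> priority level %u\n",
                       vectors[i], vectors[i] >> 4);
        /* 0x80 shares a level with gate 0x80 (int 0x80), which is why
         * the code above is careful not to trash it. */
        return 0;
}
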
31282 +#ifndef CONFIG_XEN
31283 +static void enable_lapic_irq (unsigned int irq)
31284 +{
31285 +       unsigned long v;
31286 +
31287 +       v = apic_read(APIC_LVT0);
31288 +       apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
31289 +}
31290 +
31291 +static void disable_lapic_irq (unsigned int irq)
31292 +{
31293 +       unsigned long v;
31294 +
31295 +       v = apic_read(APIC_LVT0);
31296 +       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
31297 +}
31298 +
31299 +static void ack_lapic_irq (unsigned int irq)
31300 +{
31301 +       ack_APIC_irq();
31302 +}
31303 +
31304 +static void end_lapic_irq (unsigned int i) { /* nothing */ }
31305 +
31306 +static struct hw_interrupt_type lapic_irq_type __read_mostly = {
31307 +       .typename = "local-APIC-edge",
31308 +       .startup = NULL, /* startup_irq() not used for IRQ0 */
31309 +       .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
31310 +       .enable = enable_lapic_irq,
31311 +       .disable = disable_lapic_irq,
31312 +       .ack = ack_lapic_irq,
31313 +       .end = end_lapic_irq,
31314 +};
31315 +
31316 +static void setup_nmi (void)
31317 +{
31318 +       /*
31319 +        * Dirty trick to enable the NMI watchdog ...
31320 +        * We put the 8259A master into AEOI mode and
31321 +        * unmask LVT0 as NMI on all local APICs.
31322 +        *
31323 +        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
31324 +        * is from Maciej W. Rozycki - so we do not have to EOI from
31325 +        * the NMI handler or the timer interrupt.
31326 +        */ 
31327 +       printk(KERN_INFO "activating NMI Watchdog ...");
31328 +
31329 +       enable_NMI_through_LVT0(NULL);
31330 +
31331 +       printk(" done.\n");
31332 +}
31333 +
31334 +/*
31335 + * This looks a bit hackish but it's about the only way of sending
31336 + * a few INTA cycles to 8259As and any associated glue logic.  ICR does
31337 + * not support the ExtINT mode, unfortunately.  We need to send these
31338 + * cycles as some i82489DX-based boards have glue logic that keeps the
31339 + * 8259A interrupt line asserted until INTA.  --macro
31340 + */
31341 +static inline void unlock_ExtINT_logic(void)
31342 +{
31343 +       int apic, pin, i;
31344 +       struct IO_APIC_route_entry entry0, entry1;
31345 +       unsigned char save_control, save_freq_select;
31346 +       unsigned long flags;
31347 +
31348 +       pin  = find_isa_irq_pin(8, mp_INT);
31349 +       apic = find_isa_irq_apic(8, mp_INT);
31350 +       if (pin == -1)
31351 +               return;
31352 +
31353 +       spin_lock_irqsave(&ioapic_lock, flags);
31354 +       *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
31355 +       *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
31356 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31357 +       clear_IO_APIC_pin(apic, pin);
31358 +
31359 +       memset(&entry1, 0, sizeof(entry1));
31360 +
31361 +       entry1.dest_mode = 0;                   /* physical delivery */
31362 +       entry1.mask = 0;                        /* unmask IRQ now */
31363 +       entry1.dest.physical.physical_dest = hard_smp_processor_id();
31364 +       entry1.delivery_mode = dest_ExtINT;
31365 +       entry1.polarity = entry0.polarity;
31366 +       entry1.trigger = 0;
31367 +       entry1.vector = 0;
31368 +
31369 +       spin_lock_irqsave(&ioapic_lock, flags);
31370 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
31371 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
31372 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31373 +
31374 +       save_control = CMOS_READ(RTC_CONTROL);
31375 +       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
31376 +       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
31377 +                  RTC_FREQ_SELECT);
31378 +       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
31379 +
31380 +       i = 100;
31381 +       while (i-- > 0) {
31382 +               mdelay(10);
31383 +               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
31384 +                       i -= 10;
31385 +       }
31386 +
31387 +       CMOS_WRITE(save_control, RTC_CONTROL);
31388 +       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
31389 +       clear_IO_APIC_pin(apic, pin);
31390 +
31391 +       spin_lock_irqsave(&ioapic_lock, flags);
31392 +       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
31393 +       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
31394 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31395 +}
31396 +
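The RTC is the interrupt source unlock_ExtINT_logic() uses to generate INTA cycles: rate select 6 in RTC_FREQ_SELECT programs the MC146818 periodic interrupt, whose frequency (assuming the usual MC146818 formula) is 32768 >> (rate - 1), i.e. 1024 Hz, so the polling loop above sees RTC_PF roughly every millisecond. A one-liner sketch of that formula:

#include <stdio.h>

/* Illustrative only: MC146818 RTC periodic-interrupt frequency per
 * rate-select value; unlock_ExtINT_logic() above uses rate 6. */
int main(void)
{
        for (unsigned rate = 3; rate <= 6; rate++)
                printf("rate select %u -> %d Hz\n",
                       rate, 32768 >> (rate - 1));
        /* rate 6 -> 1024 Hz: one INTA-producing tick roughly every ms */
        return 0;
}
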
31397 +/*
31398 + * This code may look a bit paranoid, but it's supposed to cooperate with
31399 + * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
31400 + * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
31401 + * fanatically on his truly buggy board.
31402 + *
31403 + * FIXME: really need to revamp this for modern platforms only.
31404 + */
31405 +static inline void check_timer(void)
31406 +{
31407 +       int apic1, pin1, apic2, pin2;
31408 +       int vector;
31409 +
31410 +       /*
31411 +        * get/set the timer IRQ vector:
31412 +        */
31413 +       disable_8259A_irq(0);
31414 +       vector = assign_irq_vector(0);
31415 +       set_intr_gate(vector, interrupt[0]);
31416 +
31417 +       /*
31418 +        * Subtle: code in do_timer_interrupt() expects an AEOI
31419 +        * mode for the 8259A whenever interrupts are routed
31420 +        * through I/O APICs.  Also IRQ0 has to be enabled in
31421 +        * the 8259A which implies the virtual wire has to be
31422 +        * disabled in the local APIC.
31423 +        */
31424 +       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
31425 +       init_8259A(1);
31426 +       if (timer_over_8254 > 0)
31427 +               enable_8259A_irq(0);
31428 +
31429 +       pin1  = find_isa_irq_pin(0, mp_INT);
31430 +       apic1 = find_isa_irq_apic(0, mp_INT);
31431 +       pin2  = ioapic_i8259.pin;
31432 +       apic2 = ioapic_i8259.apic;
31433 +
31434 +       apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
31435 +               vector, apic1, pin1, apic2, pin2);
31436 +
31437 +       if (pin1 != -1) {
31438 +               /*
31439 +                * Ok, does IRQ0 through the IOAPIC work?
31440 +                */
31441 +               unmask_IO_APIC_irq(0);
31442 +               if (!no_timer_check && timer_irq_works()) {
31443 +                       nmi_watchdog_default();
31444 +                       if (nmi_watchdog == NMI_IO_APIC) {
31445 +                               disable_8259A_irq(0);
31446 +                               setup_nmi();
31447 +                               enable_8259A_irq(0);
31448 +                       }
31449 +                       if (disable_timer_pin_1 > 0)
31450 +                               clear_IO_APIC_pin(0, pin1);
31451 +                       return;
31452 +               }
31453 +               clear_IO_APIC_pin(apic1, pin1);
31454 +               apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
31455 +                               "connected to IO-APIC\n");
31456 +       }
31457 +
31458 +       apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
31459 +                               "through the 8259A ... ");
31460 +       if (pin2 != -1) {
31461 +               apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
31462 +                       apic2, pin2);
31463 +               /*
31464 +                * legacy devices should be connected to IO APIC #0
31465 +                */
31466 +               setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
31467 +               if (timer_irq_works()) {
31468 +                       printk("works.\n");
31469 +                       nmi_watchdog_default();
31470 +                       if (nmi_watchdog == NMI_IO_APIC) {
31471 +                               setup_nmi();
31472 +                       }
31473 +                       return;
31474 +               }
31475 +               /*
31476 +                * Cleanup, just in case ...
31477 +                */
31478 +               clear_IO_APIC_pin(apic2, pin2);
31479 +       }
31480 +       printk(" failed.\n");
31481 +
31482 +       if (nmi_watchdog == NMI_IO_APIC) {
31483 +               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
31484 +               nmi_watchdog = 0;
31485 +       }
31486 +
31487 +       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
31488 +
31489 +       disable_8259A_irq(0);
31490 +       irq_desc[0].handler = &lapic_irq_type;
31491 +       apic_write(APIC_LVT0, APIC_DM_FIXED | vector);  /* Fixed mode */
31492 +       enable_8259A_irq(0);
31493 +
31494 +       if (timer_irq_works()) {
31495 +               apic_printk(APIC_QUIET, " works.\n");
31496 +               return;
31497 +       }
31498 +       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
31499 +       apic_printk(APIC_VERBOSE," failed.\n");
31500 +
31501 +       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
31502 +
31503 +       init_8259A(0);
31504 +       make_8259A_irq(0);
31505 +       apic_write(APIC_LVT0, APIC_DM_EXTINT);
31506 +
31507 +       unlock_ExtINT_logic();
31508 +
31509 +       if (timer_irq_works()) {
31510 +               apic_printk(APIC_VERBOSE," works.\n");
31511 +               return;
31512 +       }
31513 +       apic_printk(APIC_VERBOSE," failed :(.\n");
31514 +       panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
31515 +}
31516 +#else
31517 +#define check_timer() ((void)0)
31518 +#endif /* !CONFIG_XEN */
31519 +
31520 +static int __init notimercheck(char *s)
31521 +{
31522 +       no_timer_check = 1;
31523 +       return 1;
31524 +}
31525 +__setup("no_timer_check", notimercheck);
31526 +
31527 +/*
31528 + *
31529 + * IRQs that are handled by the PIC in the MPS IOAPIC case.
31530 + * - IRQ2 is the cascade IRQ, and cannot be an IO-APIC IRQ.
31531 + *   Linux doesn't really care, as it's not actually used
31532 + *   for any interrupt handling anyway.
31533 + */
31534 +#define PIC_IRQS       (1<<2)
31535 +
31536 +void __init setup_IO_APIC(void)
31537 +{
31538 +       enable_IO_APIC();
31539 +
31540 +       if (acpi_ioapic)
31541 +               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
31542 +       else
31543 +               io_apic_irqs = ~PIC_IRQS;
31544 +
31545 +       apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
31546 +
31547 +       /*
31548 +        * Set up the IO-APIC IRQ routing table.
31549 +        */
31550 +       if (!acpi_ioapic)
31551 +               setup_ioapic_ids_from_mpc();
31552 +#ifndef CONFIG_XEN
31553 +       sync_Arb_IDs();
31554 +#endif /* !CONFIG_XEN */
31555 +       setup_IO_APIC_irqs();
31556 +       init_IO_APIC_traps();
31557 +       check_timer();
31558 +       if (!acpi_ioapic)
31559 +               print_IO_APIC();
31560 +}
31561 +
31562 +struct sysfs_ioapic_data {
31563 +       struct sys_device dev;
31564 +       struct IO_APIC_route_entry entry[0];
31565 +};
31566 +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
31567 +
31568 +static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
31569 +{
31570 +       struct IO_APIC_route_entry *entry;
31571 +       struct sysfs_ioapic_data *data;
31572 +       unsigned long flags;
31573 +       int i;
31574 +
31575 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
31576 +       entry = data->entry;
31577 +       spin_lock_irqsave(&ioapic_lock, flags);
31578 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
31579 +               *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
31580 +               *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
31581 +       }
31582 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31583 +
31584 +       return 0;
31585 +}
31586 +
31587 +static int ioapic_resume(struct sys_device *dev)
31588 +{
31589 +       struct IO_APIC_route_entry *entry;
31590 +       struct sysfs_ioapic_data *data;
31591 +       unsigned long flags;
31592 +       union IO_APIC_reg_00 reg_00;
31593 +       int i;
31594 +
31595 +       data = container_of(dev, struct sysfs_ioapic_data, dev);
31596 +       entry = data->entry;
31597 +
31598 +       spin_lock_irqsave(&ioapic_lock, flags);
31599 +       reg_00.raw = io_apic_read(dev->id, 0);
31600 +       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
31601 +               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
31602 +               io_apic_write(dev->id, 0, reg_00.raw);
31603 +       }
31604 +       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
31605 +               io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
31606 +               io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
31607 +       }
31608 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31609 +
31610 +       return 0;
31611 +}
31612 +
31613 +static struct sysdev_class ioapic_sysdev_class = {
31614 +       set_kset_name("ioapic"),
31615 +       .suspend = ioapic_suspend,
31616 +       .resume = ioapic_resume,
31617 +};
31618 +
31619 +static int __init ioapic_init_sysfs(void)
31620 +{
31621 +       struct sys_device * dev;
31622 +       int i, size, error = 0;
31623 +
31624 +       error = sysdev_class_register(&ioapic_sysdev_class);
31625 +       if (error)
31626 +               return error;
31627 +
31628 +       for (i = 0; i < nr_ioapics; i++ ) {
31629 +               size = sizeof(struct sys_device) + nr_ioapic_registers[i]
31630 +                       * sizeof(struct IO_APIC_route_entry);
31631 +               mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
31632 +               if (!mp_ioapic_data[i]) {
31633 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
31634 +                       continue;
31635 +               }
31636 +               memset(mp_ioapic_data[i], 0, size);
31637 +               dev = &mp_ioapic_data[i]->dev;
31638 +               dev->id = i;
31639 +               dev->cls = &ioapic_sysdev_class;
31640 +               error = sysdev_register(dev);
31641 +               if (error) {
31642 +                       kfree(mp_ioapic_data[i]);
31643 +                       mp_ioapic_data[i] = NULL;
31644 +                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
31645 +                       continue;
31646 +               }
31647 +       }
31648 +
31649 +       return 0;
31650 +}
31651 +
31652 +device_initcall(ioapic_init_sysfs);
31653 +
31654 +/* --------------------------------------------------------------------------
31655 +                          ACPI-based IOAPIC Configuration
31656 +   -------------------------------------------------------------------------- */
31657 +
31658 +#ifdef CONFIG_ACPI
31659 +
31660 +#define IO_APIC_MAX_ID         0xFE
31661 +
31662 +int __init io_apic_get_version (int ioapic)
31663 +{
31664 +       union IO_APIC_reg_01    reg_01;
31665 +       unsigned long flags;
31666 +
31667 +       spin_lock_irqsave(&ioapic_lock, flags);
31668 +       reg_01.raw = io_apic_read(ioapic, 1);
31669 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31670 +
31671 +       return reg_01.bits.version;
31672 +}
31673 +
31674 +
31675 +int __init io_apic_get_redir_entries (int ioapic)
31676 +{
31677 +       union IO_APIC_reg_01    reg_01;
31678 +       unsigned long flags;
31679 +
31680 +       spin_lock_irqsave(&ioapic_lock, flags);
31681 +       reg_01.raw = io_apic_read(ioapic, 1);
31682 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31683 +
31684 +       return reg_01.bits.entries;
31685 +}
31686 +
31687 +
31688 +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
31689 +{
31690 +       struct IO_APIC_route_entry entry;
31691 +       unsigned long flags;
31692 +
31693 +       if (!IO_APIC_IRQ(irq)) {
31694 +               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
31695 +                       ioapic);
31696 +               return -EINVAL;
31697 +       }
31698 +
31699 +       /*
31700 +        * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
31701 +        * Note that we mask (disable) IRQs now -- these get enabled when the
31702 +        * corresponding device driver registers for this IRQ.
31703 +        */
31704 +
31705 +       memset(&entry,0,sizeof(entry));
31706 +
31707 +       entry.delivery_mode = INT_DELIVERY_MODE;
31708 +       entry.dest_mode = INT_DEST_MODE;
31709 +       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
31710 +       entry.trigger = edge_level;
31711 +       entry.polarity = active_high_low;
31712 +       entry.mask = 1;                                  /* Disabled (masked) */
31713 +
31714 +       irq = gsi_irq_sharing(irq);
31715 +       /*
31716 +        * IRQs < 16 are already in the irq_2_pin[] map
31717 +        */
31718 +       if (irq >= 16)
31719 +               add_pin_to_irq(irq, ioapic, pin);
31720 +
31721 +       entry.vector = assign_irq_vector(irq);
31722 +
31723 +       apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
31724 +               "IRQ %d Mode:%i Active:%i)\n", ioapic, 
31725 +              mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
31726 +              edge_level, active_high_low);
31727 +
31728 +       ioapic_register_intr(irq, entry.vector, edge_level);
31729 +
31730 +       if (!ioapic && (irq < 16))
31731 +               disable_8259A_irq(irq);
31732 +
31733 +       spin_lock_irqsave(&ioapic_lock, flags);
31734 +       io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
31735 +       io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
31736 +       set_native_irq_info(use_pci_vector() ?  entry.vector : irq, TARGET_CPUS);
31737 +       spin_unlock_irqrestore(&ioapic_lock, flags);
31738 +
31739 +       return 0;
31740 +}
31741 +
31742 +#endif /* CONFIG_ACPI */
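
io_apic_set_pci_routing() above deliberately installs every PCI route with the
mask bit set; the entry only goes live once a driver's request_irq() path
unmasks it. A cut-down sketch of that "install disabled" idea (the bitfield
layout is illustrative, not the kernel's struct IO_APIC_route_entry):

#include <stdio.h>
#include <string.h>

/* Illustrative subset of a redirection entry, enough to show that
 * freshly programmed PCI routes start out masked. */
struct route {
	unsigned int vector   : 8;	/* CPU vector to raise    */
	unsigned int trigger  : 1;	/* 0 = edge, 1 = level    */
	unsigned int polarity : 1;	/* 0 = high, 1 = low      */
	unsigned int mask     : 1;	/* 1 = interrupt disabled */
};

int main(void)
{
	struct route entry;

	memset(&entry, 0, sizeof(entry));
	entry.vector   = 0x31;	/* made-up vector */
	entry.trigger  = 1;	/* level, typical for PCI */
	entry.polarity = 1;	/* active low */
	entry.mask     = 1;	/* masked until a driver registers the IRQ */
	printf("installed masked: %u\n", (unsigned int)entry.mask);
	return 0;
}
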
31743 +
31744 +
31745 +#ifndef CONFIG_XEN
31746 +/*
31747 + * This function is currently only a helper for the i386 smp boot process, where
31748 + * we need to reprogram the ioredtbls to cater for the cpus which have come online,
31749 + * so the mask in all cases should simply be TARGET_CPUS.
31750 + */
31751 +#ifdef CONFIG_SMP
31752 +void __init setup_ioapic_dest(void)
31753 +{
31754 +       int pin, ioapic, irq, irq_entry;
31755 +
31756 +       if (skip_ioapic_setup == 1)
31757 +               return;
31758 +
31759 +       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
31760 +               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
31761 +                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
31762 +                       if (irq_entry == -1)
31763 +                               continue;
31764 +                       irq = pin_2_irq(irq_entry, ioapic, pin);
31765 +                       set_ioapic_affinity_irq(irq, TARGET_CPUS);
31766 +               }
31767 +
31768 +       }
31769 +}
31770 +#endif
31771 +#endif /* !CONFIG_XEN */
31772 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/ioport-xen.c linux-2.6.16/arch/x86_64/kernel/ioport-xen.c
31773 --- linux-2.6.16.orig/arch/x86_64/kernel/ioport-xen.c   1970-01-01 01:00:00.000000000 +0100
31774 +++ linux-2.6.16/arch/x86_64/kernel/ioport-xen.c        2006-06-26 09:51:32.000000000 +0200
31775 @@ -0,0 +1,58 @@
31776 +/*
31777 + *     linux/arch/x86_64/kernel/ioport.c
31778 + *
31779 + * This contains the io-permission bitmap code - written by obz, with changes
31780 + * by Linus.
31781 + */
31782 +
31783 +#include <linux/sched.h>
31784 +#include <linux/kernel.h>
31785 +#include <linux/capability.h>
31786 +#include <linux/errno.h>
31787 +#include <linux/types.h>
31788 +#include <linux/ioport.h>
31789 +#include <linux/mm.h>
31790 +#include <linux/smp.h>
31791 +#include <linux/smp_lock.h>
31792 +#include <linux/stddef.h>
31793 +#include <linux/slab.h>
31794 +#include <linux/thread_info.h>
31795 +#include <xen/interface/physdev.h>
31796 +
31797 +/*
31798 + * sys_iopl has to be used when you want to access the IO ports
31799 + * beyond the 0x3ff range: to get the full 65536 ports bitmapped
31800 + * you'd need 8kB of bitmaps/process, which is a bit excessive.
31801 + *
31802 + */
31803 +
31804 +asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs)
31805 +{
31806 +        unsigned int old_iopl = current->thread.iopl;
31807 +        physdev_op_t op;
31808 +
31809 +       if (new_iopl > 3)
31810 +               return -EINVAL;
31811 +
31812 +       /* Need "raw I/O" privileges for direct port access. */
31813 +       if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO))
31814 +               return -EPERM;
31815 +
31816 +       /* Change our version of the privilege levels. */
31817 +       current->thread.iopl = new_iopl;
31818 +
31819 +       /* Force the change at ring 0. */
31820 +       op.cmd             = PHYSDEVOP_SET_IOPL;
31821 +       op.u.set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl;
31822 +       HYPERVISOR_physdev_op(&op);
31823 +
31824 +       return 0;
31825 +}
31826 +
31827 +/*
31828 + * this changes the io permissions bitmap in the current task.
31829 + */
31830 +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
31831 +{
31832 +  return turn_on ? sys_iopl(3, NULL) : 0;
31833 +}
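
From userspace nothing changes relative to bare metal: the PHYSDEVOP_SET_IOPL
hypercall above is invisible to the caller. A minimal consumer (Linux/x86 with
glibc's <sys/io.h>; needs root or CAP_SYS_RAWIO):

#include <stdio.h>
#include <sys/io.h>	/* iopl(), outb() */

int main(void)
{
	if (iopl(3)) {		/* request full I/O port access */
		perror("iopl");
		return 1;
	}
	outb(0x00, 0x80);	/* harmless write to the POST debug port */
	return 0;
}
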
31834 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/irq-xen.c linux-2.6.16/arch/x86_64/kernel/irq-xen.c
31835 --- linux-2.6.16.orig/arch/x86_64/kernel/irq-xen.c      1970-01-01 01:00:00.000000000 +0100
31836 +++ linux-2.6.16/arch/x86_64/kernel/irq-xen.c   2006-06-26 09:51:32.000000000 +0200
31837 @@ -0,0 +1,168 @@
31838 +/*
31839 + *     linux/arch/x86_64/kernel/irq.c
31840 + *
31841 + *     Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
31842 + *
31843 + * This file contains the lowest level x86_64-specific interrupt
31844 + * entry and irq statistics code. All the remaining irq logic is
31845 + * done by the generic kernel/irq/ code and in the
31846 + * x86_64-specific irq controller code. (e.g. i8259.c and
31847 + * io_apic.c.)
31848 + */
31849 +
31850 +#include <linux/kernel_stat.h>
31851 +#include <linux/interrupt.h>
31852 +#include <linux/seq_file.h>
31853 +#include <linux/module.h>
31854 +#include <linux/delay.h>
31855 +#include <asm/uaccess.h>
31856 +#include <asm/io_apic.h>
31857 +#include <asm/idle.h>
31858 +
31859 +atomic_t irq_err_count;
31860 +#ifdef CONFIG_X86_IO_APIC
31861 +#ifdef APIC_MISMATCH_DEBUG
31862 +atomic_t irq_mis_count;
31863 +#endif
31864 +#endif
31865 +
31866 +/*
31867 + * Generic, controller-independent functions:
31868 + */
31869 +
31870 +int show_interrupts(struct seq_file *p, void *v)
31871 +{
31872 +       int i = *(loff_t *) v, j;
31873 +       struct irqaction * action;
31874 +       unsigned long flags;
31875 +
31876 +       if (i == 0) {
31877 +               seq_printf(p, "           ");
31878 +               for (j=0; j<NR_CPUS; j++)
31879 +                       if (cpu_online(j))
31880 +                               seq_printf(p, "CPU%d       ",j);
31881 +               seq_putc(p, '\n');
31882 +       }
31883 +
31884 +       if (i < NR_IRQS) {
31885 +               spin_lock_irqsave(&irq_desc[i].lock, flags);
31886 +               action = irq_desc[i].action;
31887 +               if (!action) 
31888 +                       goto skip;
31889 +               seq_printf(p, "%3d: ",i);
31890 +#ifndef CONFIG_SMP
31891 +               seq_printf(p, "%10u ", kstat_irqs(i));
31892 +#else
31893 +               for (j=0; j<NR_CPUS; j++)
31894 +                       if (cpu_online(j))
31895 +                               seq_printf(p, "%10u ",
31896 +                                          kstat_cpu(j).irqs[i]);
31897 +#endif
31898 +               seq_printf(p, " %14s", irq_desc[i].handler->typename);
31899 +
31900 +               seq_printf(p, "  %s", action->name);
31901 +               for (action=action->next; action; action = action->next)
31902 +                       seq_printf(p, ", %s", action->name);
31903 +               seq_putc(p, '\n');
31904 +skip:
31905 +               spin_unlock_irqrestore(&irq_desc[i].lock, flags);
31906 +       } else if (i == NR_IRQS) {
31907 +               seq_printf(p, "NMI: ");
31908 +               for (j = 0; j < NR_CPUS; j++)
31909 +                       if (cpu_online(j))
31910 +                               seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
31911 +               seq_putc(p, '\n');
31912 +#ifdef CONFIG_X86_LOCAL_APIC
31913 +               seq_printf(p, "LOC: ");
31914 +               for (j = 0; j < NR_CPUS; j++)
31915 +                       if (cpu_online(j))
31916 +                               seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
31917 +               seq_putc(p, '\n');
31918 +#endif
31919 +               seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
31920 +#ifdef CONFIG_X86_IO_APIC
31921 +#ifdef APIC_MISMATCH_DEBUG
31922 +               seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
31923 +#endif
31924 +#endif
31925 +       }
31926 +       return 0;
31927 +}
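
show_interrupts() is the backend of /proc/interrupts. Pieced together from the
format strings above, a two-CPU guest produces output shaped roughly like the
following (the counts are made up, and the handler/action columns depend on
the interrupt chip in use; Xen guests typically report event-channel types
such as Phys-irq or Dynamic-irq here):

           CPU0       CPU1
  1:       4358          0        Phys-irq  i8042
NMI:          0          0
LOC:          0          0
ERR:          0
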
31928 +
31929 +/*
31930 + * do_IRQ handles all normal device IRQ's (the special
31931 + * SMP cross-CPU interrupts have their own specific
31932 + * handlers).
31933 + */
31934 +asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
31935 +{      
31936 +       /* high bit used in ret_from_ code  */
31937 +       unsigned irq = ~regs->orig_rax;
31938 +
31939 +       exit_idle();
31940 +       irq_enter();
31941 +
31942 +       __do_IRQ(irq, regs);
31943 +       irq_exit();
31944 +
31945 +       return 1;
31946 +}
31947 +
31948 +#ifdef CONFIG_HOTPLUG_CPU
31949 +void fixup_irqs(cpumask_t map)
31950 +{
31951 +       unsigned int irq;
31952 +       static int warned;
31953 +
31954 +       for (irq = 0; irq < NR_IRQS; irq++) {
31955 +               cpumask_t mask;
31956 +               if (irq == 2)
31957 +                       continue;
31958 +
31959 +               cpus_and(mask, irq_affinity[irq], map);
31960 +               if (any_online_cpu(mask) == NR_CPUS) {
31961 +                       printk("Breaking affinity for irq %i\n", irq);
31962 +                       mask = map;
31963 +               }
31964 +               if (irq_desc[irq].handler->set_affinity)
31965 +                       irq_desc[irq].handler->set_affinity(irq, mask);
31966 +               else if (irq_desc[irq].action && !(warned++))
31967 +                       printk("Cannot set affinity for irq %i\n", irq);
31968 +       }
31969 +
31970 +       /* That doesn't seem sufficient.  Give it 1ms. */
31971 +       local_irq_enable();
31972 +       mdelay(1);
31973 +       local_irq_disable();
31974 +}
31975 +#endif
31976 +
31977 +extern void call_softirq(void);
31978 +
31979 +asmlinkage void do_softirq(void)
31980 +{
31981 +       __u32 pending;
31982 +       unsigned long flags;
31983 +
31984 +       if (in_interrupt())
31985 +               return;
31986 +
31987 +       local_irq_save(flags);
31988 +       pending = local_softirq_pending();
31989 +       /* Switch to interrupt stack */
31990 +       if (pending)
31991 +               call_softirq();
31992 +       local_irq_restore(flags);
31993 +}
31994 +EXPORT_SYMBOL(do_softirq);
31995 +
31996 +#ifndef CONFIG_X86_LOCAL_APIC
31997 +/*
31998 + * 'what should we do if we get a hw irq event on an illegal vector'.
31999 + * Each architecture has to answer this itself.
32000 + */
32001 +void ack_bad_irq(unsigned int irq)
32002 +{
32003 +        printk("unexpected IRQ trap at vector %02x\n", irq);
32004 +}
32005 +#endif
32006 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/ldt-xen.c linux-2.6.16/arch/x86_64/kernel/ldt-xen.c
32007 --- linux-2.6.16.orig/arch/x86_64/kernel/ldt-xen.c      1970-01-01 01:00:00.000000000 +0100
32008 +++ linux-2.6.16/arch/x86_64/kernel/ldt-xen.c   2006-06-26 09:51:32.000000000 +0200
32009 @@ -0,0 +1,282 @@
32010 +/*
32011 + * linux/arch/x86_64/kernel/ldt.c
32012 + *
32013 + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
32014 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
32015 + * Copyright (C) 2002 Andi Kleen
32016 + * 
32017 + * This handles calls from both 32bit and 64bit mode.
32018 + */
32019 +
32020 +#include <linux/errno.h>
32021 +#include <linux/sched.h>
32022 +#include <linux/string.h>
32023 +#include <linux/mm.h>
32024 +#include <linux/smp.h>
32025 +#include <linux/smp_lock.h>
32026 +#include <linux/vmalloc.h>
32027 +#include <linux/slab.h>
32028 +
32029 +#include <asm/uaccess.h>
32030 +#include <asm/system.h>
32031 +#include <asm/ldt.h>
32032 +#include <asm/desc.h>
32033 +#include <asm/proto.h>
32034 +#include <asm/pgalloc.h>
32035 +
32036 +#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
32037 +static void flush_ldt(void *null)
32038 +{
32039 +       if (current->active_mm)
32040 +               load_LDT(&current->active_mm->context);
32041 +}
32042 +#endif
32043 +
32044 +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
32045 +{
32046 +       void *oldldt;
32047 +       void *newldt;
32048 +       unsigned oldsize;
32049 +
32050 +       if (mincount <= (unsigned)pc->size)
32051 +               return 0;
32052 +       oldsize = pc->size;
32053 +       mincount = (mincount+511)&(~511);
32054 +       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
32055 +               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
32056 +       else
32057 +               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
32058 +
32059 +       if (!newldt)
32060 +               return -ENOMEM;
32061 +
32062 +       if (oldsize)
32063 +               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
32064 +       oldldt = pc->ldt;
32065 +       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
32066 +       wmb();
32067 +       pc->ldt = newldt;
32068 +       wmb();
32069 +       pc->size = mincount;
32070 +       wmb();
32071 +       if (reload) {
32072 +#ifdef CONFIG_SMP
32073 +               cpumask_t mask;
32074 +
32075 +               preempt_disable();
32076 +#endif
32077 +               make_pages_readonly(
32078 +                       pc->ldt,
32079 +                       (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
32080 +                       XENFEAT_writable_descriptor_tables);
32081 +               load_LDT(pc);
32082 +#ifdef CONFIG_SMP
32083 +               mask = cpumask_of_cpu(smp_processor_id());
32084 +               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
32085 +                       smp_call_function(flush_ldt, NULL, 1, 1);
32086 +               preempt_enable();
32087 +#endif
32088 +       }
32089 +       if (oldsize) {
32090 +               make_pages_writable(
32091 +                       oldldt,
32092 +                       (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
32093 +                       XENFEAT_writable_descriptor_tables);
32094 +               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
32095 +                       vfree(oldldt);
32096 +               else
32097 +                       kfree(oldldt);
32098 +       }
32099 +       return 0;
32100 +}
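
The wmb() calls above order the resize so that a concurrent reader can never
observe the enlarged size paired with the old, smaller table: the new table is
filled first, then the pointer is published, then the size. The same publish
ordering in a userspace miniature (a GCC release fence stands in for wmb();
copying the old entries and tearing down the old table are elided):

#include <stdlib.h>

struct ldt_ctx {
	int *ldt;		/* table of entries        */
	unsigned int size;	/* number of valid entries */
};

/* Grow-and-publish, mirroring alloc_ldt(): fill the new table,
 * fence, publish the pointer, fence, then publish the size. */
static int grow(struct ldt_ctx *ctx, unsigned int newsize)
{
	int *newldt = calloc(newsize, sizeof(*newldt));

	if (!newldt)
		return -1;
	__atomic_thread_fence(__ATOMIC_RELEASE);	/* wmb() stand-in */
	ctx->ldt = newldt;
	__atomic_thread_fence(__ATOMIC_RELEASE);
	ctx->size = newsize;
	return 0;
}

int main(void)
{
	struct ldt_ctx ctx = { NULL, 0 };

	return grow(&ctx, 512) ? 1 : 0;
}
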
32101 +
32102 +static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
32103 +{
32104 +       int err = alloc_ldt(new, old->size, 0);
32105 +       if (err < 0)
32106 +               return err;
32107 +       memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
32108 +       make_pages_readonly(
32109 +               new->ldt,
32110 +               (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
32111 +               XENFEAT_writable_descriptor_tables);
32112 +       return 0;
32113 +}
32114 +
32115 +/*
32116 + * we do not have to muck with descriptors here, that is
32117 + * done in switch_mm() as needed.
32118 + */
32119 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
32120 +{
32121 +       struct mm_struct * old_mm;
32122 +       int retval = 0;
32123 +
32124 +       memset(&mm->context, 0, sizeof(mm->context));
32125 +       init_MUTEX(&mm->context.sem);
32126 +       old_mm = current->mm;
32127 +       if (old_mm && old_mm->context.size > 0) {
32128 +               down(&old_mm->context.sem);
32129 +               retval = copy_ldt(&mm->context, &old_mm->context);
32130 +               up(&old_mm->context.sem);
32131 +       }
32132 +       if (retval == 0) {
32133 +               spin_lock(&mm_unpinned_lock);
32134 +               list_add(&mm->context.unpinned, &mm_unpinned);
32135 +               spin_unlock(&mm_unpinned_lock);
32136 +       }
32137 +       return retval;
32138 +}
32139 +
32140 +/*
32141 + * 
32142 + * Don't touch the LDT register - we're already in the next thread.
32143 + */
32144 +void destroy_context(struct mm_struct *mm)
32145 +{
32146 +       if (mm->context.size) {
32147 +               if (mm == current->active_mm)
32148 +                       clear_LDT();
32149 +               make_pages_writable(
32150 +                       mm->context.ldt,
32151 +                       (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
32152 +                       XENFEAT_writable_descriptor_tables);
32153 +               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
32154 +                       vfree(mm->context.ldt);
32155 +               else
32156 +                       kfree(mm->context.ldt);
32157 +               mm->context.size = 0;
32158 +       }
32159 +       if (!mm->context.pinned) {
32160 +               spin_lock(&mm_unpinned_lock);
32161 +               list_del(&mm->context.unpinned);
32162 +               spin_unlock(&mm_unpinned_lock);
32163 +       }
32164 +}
32165 +
32166 +static int read_ldt(void __user * ptr, unsigned long bytecount)
32167 +{
32168 +       int err;
32169 +       unsigned long size;
32170 +       struct mm_struct * mm = current->mm;
32171 +
32172 +       if (!mm->context.size)
32173 +               return 0;
32174 +       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
32175 +               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
32176 +
32177 +       down(&mm->context.sem);
32178 +       size = mm->context.size*LDT_ENTRY_SIZE;
32179 +       if (size > bytecount)
32180 +               size = bytecount;
32181 +
32182 +       err = 0;
32183 +       if (copy_to_user(ptr, mm->context.ldt, size))
32184 +               err = -EFAULT;
32185 +       up(&mm->context.sem);
32186 +       if (err < 0)
32187 +               goto error_return;
32188 +       if (size != bytecount) {
32189 +               /* zero-fill the rest */
32190 +               if (clear_user(ptr+size, bytecount-size) != 0) {
32191 +                       err = -EFAULT;
32192 +                       goto error_return;
32193 +               }
32194 +       }
32195 +       return bytecount;
32196 +error_return:
32197 +       return err;
32198 +}
32199 +
32200 +static int read_default_ldt(void __user * ptr, unsigned long bytecount)
32201 +{
32202 +       /* Arbitrary number */ 
32203 +       /* x86-64 default LDT is all zeros */
32204 +       if (bytecount > 128) 
32205 +               bytecount = 128;        
32206 +       if (clear_user(ptr, bytecount))
32207 +               return -EFAULT;
32208 +       return bytecount; 
32209 +}
32210 +
32211 +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
32212 +{
32213 +       struct task_struct *me = current;
32214 +       struct mm_struct * mm = me->mm;
32215 +       __u32 entry_1, entry_2, *lp;
32216 +       unsigned long mach_lp;
32217 +       int error;
32218 +       struct user_desc ldt_info;
32219 +
32220 +       error = -EINVAL;
32221 +
32222 +       if (bytecount != sizeof(ldt_info))
32223 +               goto out;
32224 +       error = -EFAULT;        
32225 +       if (copy_from_user(&ldt_info, ptr, bytecount))
32226 +               goto out;
32227 +
32228 +       error = -EINVAL;
32229 +       if (ldt_info.entry_number >= LDT_ENTRIES)
32230 +               goto out;
32231 +       if (ldt_info.contents == 3) {
32232 +               if (oldmode)
32233 +                       goto out;
32234 +               if (ldt_info.seg_not_present == 0)
32235 +                       goto out;
32236 +       }
32237 +
32238 +       down(&mm->context.sem);
32239 +       if (ldt_info.entry_number >= (unsigned)mm->context.size) {
32240 +               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
32241 +               if (error < 0)
32242 +                       goto out_unlock;
32243 +       }
32244 +
32245 +       lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
32246 +       mach_lp = arbitrary_virt_to_machine(lp);
32247 +
32248 +       /* Allow LDTs to be cleared by the user. */
32249 +       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
32250 +               if (oldmode || LDT_empty(&ldt_info)) {
32251 +                       entry_1 = 0;
32252 +                       entry_2 = 0;
32253 +                       goto install;
32254 +               }
32255 +       }
32256 +
32257 +       entry_1 = LDT_entry_a(&ldt_info);
32258 +       entry_2 = LDT_entry_b(&ldt_info);
32259 +       if (oldmode)
32260 +               entry_2 &= ~(1 << 20);
32261 +
32262 +       /* Install the new entry ...  */
32263 +install:
32264 +       error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32)));
32265 +
32266 +out_unlock:
32267 +       up(&mm->context.sem);
32268 +out:
32269 +       return error;
32270 +}
32271 +
32272 +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
32273 +{
32274 +       int ret = -ENOSYS;
32275 +
32276 +       switch (func) {
32277 +       case 0:
32278 +               ret = read_ldt(ptr, bytecount);
32279 +               break;
32280 +       case 1:
32281 +               ret = write_ldt(ptr, bytecount, 1);
32282 +               break;
32283 +       case 2:
32284 +               ret = read_default_ldt(ptr, bytecount);
32285 +               break;
32286 +       case 0x11:
32287 +               ret = write_ldt(ptr, bytecount, 0);
32288 +               break;
32289 +       }
32290 +       return ret;
32291 +}
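
The dispatch above matches the historical modify_ldt(2) ABI: func 0 reads the
LDT, 1 writes in legacy mode, 2 reads the all-zero default, and 0x11 writes
with user_desc semantics. A minimal userspace exercise of the write/read pair
(Linux-only; installs an empty descriptor in slot 0):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>	/* struct user_desc */

int main(void)
{
	struct user_desc d;
	unsigned char buf[128];
	long r;

	memset(&d, 0, sizeof(d));
	d.entry_number = 0;
	d.seg_not_present = 1;	/* base/limit 0 + not-present = empty slot */

	r = syscall(SYS_modify_ldt, 0x11, &d, sizeof(d));	/* write, new mode */
	printf("write: %ld\n", r);

	r = syscall(SYS_modify_ldt, 0, buf, sizeof(buf));	/* read back */
	printf("read: %ld bytes\n", r);
	return 0;
}
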
32292 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/mpparse-xen.c linux-2.6.16/arch/x86_64/kernel/mpparse-xen.c
32293 --- linux-2.6.16.orig/arch/x86_64/kernel/mpparse-xen.c  1970-01-01 01:00:00.000000000 +0100
32294 +++ linux-2.6.16/arch/x86_64/kernel/mpparse-xen.c       2006-06-26 09:51:32.000000000 +0200
32295 @@ -0,0 +1,1005 @@
32296 +/*
32297 + *     Intel Multiprocessor Specification 1.1 and 1.4
32298 + *     compliant MP-table parsing routines.
32299 + *
32300 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
32301 + *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
32302 + *
32303 + *     Fixes
32304 + *             Erich Boleyn    :       MP v1.4 and additional changes.
32305 + *             Alan Cox        :       Added EBDA scanning
32306 + *             Ingo Molnar     :       various cleanups and rewrites
32307 + *             Maciej W. Rozycki:      Bits for default MP configurations
32308 + *             Paul Diefenbaugh:       Added full ACPI support
32309 + */
32310 +
32311 +#include <linux/mm.h>
32312 +#include <linux/init.h>
32313 +#include <linux/delay.h>
32314 +#include <linux/config.h>
32315 +#include <linux/bootmem.h>
32316 +#include <linux/smp_lock.h>
32317 +#include <linux/kernel_stat.h>
32318 +#include <linux/mc146818rtc.h>
32319 +#include <linux/acpi.h>
32320 +#include <linux/module.h>
32321 +
32322 +#include <asm/smp.h>
32323 +#include <asm/mtrr.h>
32324 +#include <asm/mpspec.h>
32325 +#include <asm/pgalloc.h>
32326 +#include <asm/io_apic.h>
32327 +#include <asm/proto.h>
32328 +#include <asm/acpi.h>
32329 +
32330 +/* Have we found an MP table */
32331 +int smp_found_config;
32332 +unsigned int __initdata maxcpus = NR_CPUS;
32333 +
32334 +int acpi_found_madt;
32335 +
32336 +/*
32337 + * Various Linux-internal data structures created from the
32338 + * MP-table.
32339 + */
32340 +unsigned char apic_version [MAX_APICS];
32341 +unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
32342 +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
32343 +
32344 +static int mp_current_pci_id = 0;
32345 +/* I/O APIC entries */
32346 +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
32347 +
32348 +/* # of MP IRQ source entries */
32349 +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
32350 +
32351 +/* MP IRQ source entries */
32352 +int mp_irq_entries;
32353 +
32354 +int nr_ioapics;
32355 +int pic_mode;
32356 +unsigned long mp_lapic_addr = 0;
32357 +
32358 +
32359 +
32360 +/* Processor that is doing the boot up */
32361 +unsigned int boot_cpu_id = -1U;
32362 +/* Internal processor count */
32363 +unsigned int num_processors __initdata = 0;
32364 +
32365 +unsigned disabled_cpus __initdata;
32366 +
32367 +/* Bitmask of physically existing CPUs */
32368 +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
32369 +
32370 +/* ACPI MADT entry parsing functions */
32371 +#ifdef CONFIG_ACPI
32372 +extern struct acpi_boot_flags acpi_boot;
32373 +#ifdef CONFIG_X86_LOCAL_APIC
32374 +extern int acpi_parse_lapic (acpi_table_entry_header *header);
32375 +extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
32376 +extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
32377 +#endif /*CONFIG_X86_LOCAL_APIC*/
32378 +#ifdef CONFIG_X86_IO_APIC
32379 +extern int acpi_parse_ioapic (acpi_table_entry_header *header);
32380 +#endif /*CONFIG_X86_IO_APIC*/
32381 +#endif /*CONFIG_ACPI*/
32382 +
32383 +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
32384 +
32385 +
32386 +/*
32387 + * Intel MP BIOS table parsing routines:
32388 + */
32389 +
32390 +/*
32391 + * Checksum an MP configuration block.
32392 + */
32393 +
32394 +static int __init mpf_checksum(unsigned char *mp, int len)
32395 +{
32396 +       int sum = 0;
32397 +
32398 +       while (len--)
32399 +               sum += *mp++;
32400 +
32401 +       return sum & 0xFF;
32402 +}
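
As elsewhere in the MP spec, a block is valid when every byte of it, including
the stored checksum byte, sums to zero mod 256, which is why mpf_checksum()
just returns the low byte of the running sum. A standalone illustration (the
block contents are made up; only the arithmetic matters, and the fix-up byte
is parked in the last slot rather than at the spec's checksum offset):

#include <stdio.h>

static int mpf_checksum(const unsigned char *mp, int len)
{
	int sum = 0;

	while (len--)
		sum += *mp++;
	return sum & 0xFF;	/* 0 means the block checks out */
}

int main(void)
{
	unsigned char blk[16] = { '_', 'M', 'P', '_' };	/* rest zero */
	int i, sum = 0;

	for (i = 0; i < 15; i++)
		sum += blk[i];
	blk[15] = (unsigned char)(0x100 - (sum & 0xFF));	/* fix-up byte */
	printf("checksum: %s\n", mpf_checksum(blk, 16) ? "bad" : "ok");
	return 0;
}
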
32403 +
32404 +#ifndef CONFIG_XEN
32405 +static void __init MP_processor_info (struct mpc_config_processor *m)
32406 +{
32407 +       int cpu;
32408 +       unsigned char ver;
32409 +       static int found_bsp=0;
32410 +
32411 +       if (!(m->mpc_cpuflag & CPU_ENABLED)) {
32412 +               disabled_cpus++;
32413 +               return;
32414 +       }
32415 +
32416 +       printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
32417 +               m->mpc_apicid,
32418 +              (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
32419 +              (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
32420 +               m->mpc_apicver);
32421 +
32422 +       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
32423 +               Dprintk("    Bootup CPU\n");
32424 +               boot_cpu_id = m->mpc_apicid;
32425 +       }
32426 +       if (num_processors >= NR_CPUS) {
32427 +               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
32428 +                       " Processor ignored.\n", NR_CPUS);
32429 +               return;
32430 +       }
32431 +
32432 +       cpu = num_processors++;
32433 +       
32434 +#if MAX_APICS < 255    
32435 +       if ((int)m->mpc_apicid > MAX_APICS) {
32436 +               printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
32437 +                       m->mpc_apicid, MAX_APICS);
32438 +               return;
32439 +       }
32440 +#endif
32441 +       ver = m->mpc_apicver;
32442 +
32443 +       physid_set(m->mpc_apicid, phys_cpu_present_map);
32444 +       /*
32445 +        * Validate version
32446 +        */
32447 +       if (ver == 0x0) {
32448 +               printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
32449 +               ver = 0x10;
32450 +       }
32451 +       apic_version[m->mpc_apicid] = ver;
32452 +       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
32453 +               /*
32454 +                * bios_cpu_apicid is required to have processors listed
32455 + * in the same order as logical cpu numbers. Hence the first
32456 +                * entry is BSP, and so on.
32457 +                */
32458 +               cpu = 0;
32459 +
32460 +               bios_cpu_apicid[0] = m->mpc_apicid;
32461 +               x86_cpu_to_apicid[0] = m->mpc_apicid;
32462 +               found_bsp = 1;
32463 +       } else
32464 +               cpu = num_processors - found_bsp;
32465 +       bios_cpu_apicid[cpu] = m->mpc_apicid;
32466 +       x86_cpu_to_apicid[cpu] = m->mpc_apicid;
32467 +
32468 +       cpu_set(cpu, cpu_possible_map);
32469 +       cpu_set(cpu, cpu_present_map);
32470 +}
32471 +#else
32472 +void __init MP_processor_info (struct mpc_config_processor *m)
32473 +{
32474 +       num_processors++;
32475 +}
32476 +#endif /* CONFIG_XEN */
32477 +
32478 +static void __init MP_bus_info (struct mpc_config_bus *m)
32479 +{
32480 +       char str[7];
32481 +
32482 +       memcpy(str, m->mpc_bustype, 6);
32483 +       str[6] = 0;
32484 +       Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
32485 +
32486 +       if (strncmp(str, "ISA", 3) == 0) {
32487 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
32488 +       } else if (strncmp(str, "EISA", 4) == 0) {
32489 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
32490 +       } else if (strncmp(str, "PCI", 3) == 0) {
32491 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
32492 +               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
32493 +               mp_current_pci_id++;
32494 +       } else if (strncmp(str, "MCA", 3) == 0) {
32495 +               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
32496 +       } else {
32497 +               printk(KERN_ERR "Unknown bustype %s\n", str);
32498 +       }
32499 +}
32500 +
32501 +static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
32502 +{
32503 +       if (!(m->mpc_flags & MPC_APIC_USABLE))
32504 +               return;
32505 +
32506 +       printk("I/O APIC #%d Version %d at 0x%X.\n",
32507 +               m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
32508 +       if (nr_ioapics >= MAX_IO_APICS) {
32509 +               printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
32510 +                       MAX_IO_APICS, nr_ioapics);
32511 +               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
32512 +       }
32513 +       if (!m->mpc_apicaddr) {
32514 +               printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
32515 +                       " found in MP table, skipping!\n");
32516 +               return;
32517 +       }
32518 +       mp_ioapics[nr_ioapics] = *m;
32519 +       nr_ioapics++;
32520 +}
32521 +
32522 +static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
32523 +{
32524 +       mp_irqs [mp_irq_entries] = *m;
32525 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
32526 +               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
32527 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
32528 +                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
32529 +                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
32530 +       if (++mp_irq_entries >= MAX_IRQ_SOURCES)
32531 +               panic("Max # of irq sources exceeded!!\n");
32532 +}
32533 +
32534 +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
32535 +{
32536 +       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
32537 +               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
32538 +                       m->mpc_irqtype, m->mpc_irqflag & 3,
32539 +                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
32540 +                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
32541 +       /*
32542 +        * Well it seems all SMP boards in existence
32543 +        * use ExtINT/LVT1 == LINT0 and
32544 +        * NMI/LVT2 == LINT1 - the following check
32545 +        * will show us if this assumption is false.
32546 +        * Until then we do not have to add baggage.
32547 +        */
32548 +       if ((m->mpc_irqtype == mp_ExtINT) &&
32549 +               (m->mpc_destapiclint != 0))
32550 +                       BUG();
32551 +       if ((m->mpc_irqtype == mp_NMI) &&
32552 +               (m->mpc_destapiclint != 1))
32553 +                       BUG();
32554 +}
32555 +
32556 +/*
32557 + * Read/parse the MPC
32558 + */
32559 +
32560 +static int __init smp_read_mpc(struct mp_config_table *mpc)
32561 +{
32562 +       char str[16];
32563 +       int count=sizeof(*mpc);
32564 +       unsigned char *mpt=((unsigned char *)mpc)+count;
32565 +
32566 +       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
32567 +               printk("SMP mptable: bad signature [%c%c%c%c]!\n",
32568 +                       mpc->mpc_signature[0],
32569 +                       mpc->mpc_signature[1],
32570 +                       mpc->mpc_signature[2],
32571 +                       mpc->mpc_signature[3]);
32572 +               return 0;
32573 +       }
32574 +       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
32575 +               printk("SMP mptable: checksum error!\n");
32576 +               return 0;
32577 +       }
32578 +       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
32579 +               printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
32580 +                       mpc->mpc_spec);
32581 +               return 0;
32582 +       }
32583 +       if (!mpc->mpc_lapic) {
32584 +               printk(KERN_ERR "SMP mptable: null local APIC address!\n");
32585 +               return 0;
32586 +       }
32587 +       memcpy(str,mpc->mpc_oem,8);
32588 +       str[8]=0;
32589 +       printk(KERN_INFO "OEM ID: %s ",str);
32590 +
32591 +       memcpy(str,mpc->mpc_productid,12);
32592 +       str[12]=0;
32593 +       printk("Product ID: %s ",str);
32594 +
32595 +       printk("APIC at: 0x%X\n",mpc->mpc_lapic);
32596 +
32597 +       /* save the local APIC address, it might be non-default */
32598 +       if (!acpi_lapic)
32599 +               mp_lapic_addr = mpc->mpc_lapic;
32600 +
32601 +       /*
32602 +        *      Now process the configuration blocks.
32603 +        */
32604 +       while (count < mpc->mpc_length) {
32605 +               switch(*mpt) {
32606 +                       case MP_PROCESSOR:
32607 +                       {
32608 +                               struct mpc_config_processor *m=
32609 +                                       (struct mpc_config_processor *)mpt;
32610 +                               if (!acpi_lapic)
32611 +                                       MP_processor_info(m);
32612 +                               mpt += sizeof(*m);
32613 +                               count += sizeof(*m);
32614 +                               break;
32615 +                       }
32616 +                       case MP_BUS:
32617 +                       {
32618 +                               struct mpc_config_bus *m=
32619 +                                       (struct mpc_config_bus *)mpt;
32620 +                               MP_bus_info(m);
32621 +                               mpt += sizeof(*m);
32622 +                               count += sizeof(*m);
32623 +                               break;
32624 +                       }
32625 +                       case MP_IOAPIC:
32626 +                       {
32627 +                               struct mpc_config_ioapic *m=
32628 +                                       (struct mpc_config_ioapic *)mpt;
32629 +                               MP_ioapic_info(m);
32630 +                               mpt+=sizeof(*m);
32631 +                               count+=sizeof(*m);
32632 +                               break;
32633 +                       }
32634 +                       case MP_INTSRC:
32635 +                       {
32636 +                               struct mpc_config_intsrc *m=
32637 +                                       (struct mpc_config_intsrc *)mpt;
32638 +
32639 +                               MP_intsrc_info(m);
32640 +                               mpt+=sizeof(*m);
32641 +                               count+=sizeof(*m);
32642 +                               break;
32643 +                       }
32644 +                       case MP_LINTSRC:
32645 +                       {
32646 +                               struct mpc_config_lintsrc *m=
32647 +                                       (struct mpc_config_lintsrc *)mpt;
32648 +                               MP_lintsrc_info(m);
32649 +                               mpt+=sizeof(*m);
32650 +                               count+=sizeof(*m);
32651 +                               break;
32652 +                       }
32653 +               }
32654 +       }
32655 +       clustered_apic_check();
32656 +       if (!num_processors)
32657 +               printk(KERN_ERR "SMP mptable: no processors registered!\n");
32658 +       return num_processors;
32659 +}
32660 +
32661 +static int __init ELCR_trigger(unsigned int irq)
32662 +{
32663 +       unsigned int port;
32664 +
32665 +       port = 0x4d0 + (irq >> 3);
32666 +       return (inb(port) >> (irq & 7)) & 1;
32667 +}
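
The ELCR is a pair of I/O ports, 0x4d0 for ISA IRQs 0-7 and 0x4d1 for IRQs
8-15, one bit per IRQ, where a set bit means level-triggered. The index
arithmetic above, worked through standalone:

#include <stdio.h>

/* Which ELCR port and bit describe a given ISA IRQ. */
static void elcr_index(unsigned int irq, unsigned int *port, unsigned int *bit)
{
	*port = 0x4d0 + (irq >> 3);
	*bit  = irq & 7;
}

int main(void)
{
	unsigned int port, bit;

	elcr_index(11, &port, &bit);
	printf("IRQ 11 -> port 0x%x, bit %u\n", port, bit);	/* 0x4d1, bit 3 */
	return 0;
}
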
32668 +
32669 +static void __init construct_default_ioirq_mptable(int mpc_default_type)
32670 +{
32671 +       struct mpc_config_intsrc intsrc;
32672 +       int i;
32673 +       int ELCR_fallback = 0;
32674 +
32675 +       intsrc.mpc_type = MP_INTSRC;
32676 +       intsrc.mpc_irqflag = 0;                 /* conforming */
32677 +       intsrc.mpc_srcbus = 0;
32678 +       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
32679 +
32680 +       intsrc.mpc_irqtype = mp_INT;
32681 +
32682 +       /*
32683 +        *  If true, we have an ISA/PCI system with no IRQ entries
32684 +        *  in the MP table. To prevent the PCI interrupts from being set up
32685 +        *  incorrectly, we try to use the ELCR. The sanity check to see if
32686 +        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
32687 +        *  never be level sensitive, so we simply see if the ELCR agrees.
32688 +        *  If it does, we assume it's valid.
32689 +        */
32690 +       if (mpc_default_type == 5) {
32691 +               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
32692 +
32693 +               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
32694 +                       printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
32695 +               else {
32696 +                       printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
32697 +                       ELCR_fallback = 1;
32698 +               }
32699 +       }
32700 +
32701 +       for (i = 0; i < 16; i++) {
32702 +               switch (mpc_default_type) {
32703 +               case 2:
32704 +                       if (i == 0 || i == 13)
32705 +                               continue;       /* IRQ0 & IRQ13 not connected */
32706 +                       /* fall through */
32707 +               default:
32708 +                       if (i == 2)
32709 +                               continue;       /* IRQ2 is never connected */
32710 +               }
32711 +
32712 +               if (ELCR_fallback) {
32713 +                       /*
32714 +                        *  If the ELCR indicates a level-sensitive interrupt, we
32715 +                        *  copy that information over to the MP table in the
32716 +                        *  irqflag field (level sensitive, active high polarity).
32717 +                        */
32718 +                       if (ELCR_trigger(i))
32719 +                               intsrc.mpc_irqflag = 13;
32720 +                       else
32721 +                               intsrc.mpc_irqflag = 0;
32722 +               }
32723 +
32724 +               intsrc.mpc_srcbusirq = i;
32725 +               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
32726 +               MP_intsrc_info(&intsrc);
32727 +       }
32728 +
32729 +       intsrc.mpc_irqtype = mp_ExtINT;
32730 +       intsrc.mpc_srcbusirq = 0;
32731 +       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
32732 +       MP_intsrc_info(&intsrc);
32733 +}
32734 +
32735 +static inline void __init construct_default_ISA_mptable(int mpc_default_type)
32736 +{
32737 +       struct mpc_config_processor processor;
32738 +       struct mpc_config_bus bus;
32739 +       struct mpc_config_ioapic ioapic;
32740 +       struct mpc_config_lintsrc lintsrc;
32741 +       int linttypes[2] = { mp_ExtINT, mp_NMI };
32742 +       int i;
32743 +
32744 +       /*
32745 +        * local APIC has default address
32746 +        */
32747 +       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
32748 +
32749 +       /*
32750 +        * 2 CPUs, numbered 0 & 1.
32751 +        */
32752 +       processor.mpc_type = MP_PROCESSOR;
32753 +       /* Either an integrated APIC or a discrete 82489DX. */
32754 +       processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
32755 +       processor.mpc_cpuflag = CPU_ENABLED;
32756 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
32757 +                                  (boot_cpu_data.x86_model << 4) |
32758 +                                  boot_cpu_data.x86_mask;
32759 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
32760 +       processor.mpc_reserved[0] = 0;
32761 +       processor.mpc_reserved[1] = 0;
32762 +       for (i = 0; i < 2; i++) {
32763 +               processor.mpc_apicid = i;
32764 +               MP_processor_info(&processor);
32765 +       }
32766 +
32767 +       bus.mpc_type = MP_BUS;
32768 +       bus.mpc_busid = 0;
32769 +       switch (mpc_default_type) {
32770 +               default:
32771 +                       printk(KERN_ERR "???\nUnknown standard configuration %d\n",
32772 +                               mpc_default_type);
32773 +                       /* fall through */
32774 +               case 1:
32775 +               case 5:
32776 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
32777 +                       break;
32778 +               case 2:
32779 +               case 6:
32780 +               case 3:
32781 +                       memcpy(bus.mpc_bustype, "EISA  ", 6);
32782 +                       break;
32783 +               case 4:
32784 +               case 7:
32785 +                       memcpy(bus.mpc_bustype, "MCA   ", 6);
32786 +       }
32787 +       MP_bus_info(&bus);
32788 +       if (mpc_default_type > 4) {
32789 +               bus.mpc_busid = 1;
32790 +               memcpy(bus.mpc_bustype, "PCI   ", 6);
32791 +               MP_bus_info(&bus);
32792 +       }
32793 +
32794 +       ioapic.mpc_type = MP_IOAPIC;
32795 +       ioapic.mpc_apicid = 2;
32796 +       ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
32797 +       ioapic.mpc_flags = MPC_APIC_USABLE;
32798 +       ioapic.mpc_apicaddr = 0xFEC00000;
32799 +       MP_ioapic_info(&ioapic);
32800 +
32801 +       /*
32802 +        * We set up most of the low 16 IO-APIC pins according to MPS rules.
32803 +        */
32804 +       construct_default_ioirq_mptable(mpc_default_type);
32805 +
32806 +       lintsrc.mpc_type = MP_LINTSRC;
32807 +       lintsrc.mpc_irqflag = 0;                /* conforming */
32808 +       lintsrc.mpc_srcbusid = 0;
32809 +       lintsrc.mpc_srcbusirq = 0;
32810 +       lintsrc.mpc_destapic = MP_APIC_ALL;
32811 +       for (i = 0; i < 2; i++) {
32812 +               lintsrc.mpc_irqtype = linttypes[i];
32813 +               lintsrc.mpc_destapiclint = i;
32814 +               MP_lintsrc_info(&lintsrc);
32815 +       }
32816 +}
32817 +
32818 +static struct intel_mp_floating *mpf_found;
32819 +
32820 +/*
32821 + * Scan the memory blocks for an SMP configuration block.
32822 + */
32823 +void __init get_smp_config (void)
32824 +{
32825 +       struct intel_mp_floating *mpf = mpf_found;
32826 +
32827 +       /*
32828 +        * ACPI supports both logical (e.g. Hyper-Threading) and physical 
32829 +        * processors, where MPS only supports physical.
32830 +        */
32831 +       if (acpi_lapic && acpi_ioapic) {
32832 +               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
32833 +               return;
32834 +       }
32835 +       else if (acpi_lapic)
32836 +               printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
32837 +
32838 +       printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
32839 +       if (mpf->mpf_feature2 & (1<<7)) {
32840 +               printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
32841 +               pic_mode = 1;
32842 +       } else {
32843 +               printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
32844 +               pic_mode = 0;
32845 +       }
32846 +
32847 +       /*
32848 +        * Now see if we need to read further.
32849 +        */
32850 +       if (mpf->mpf_feature1 != 0) {
32851 +
32852 +               printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
32853 +               construct_default_ISA_mptable(mpf->mpf_feature1);
32854 +
32855 +       } else if (mpf->mpf_physptr) {
32856 +
32857 +               /*
32858 +                * Read the physical hardware table.  Anything here will
32859 +                * override the defaults.
32860 +                */
32861 +               if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
32862 +                       smp_found_config = 0;
32863 +                       printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
32864 +                       printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
32865 +                       return;
32866 +               }
32867 +               /*
32868 +                * If there are no explicit MP IRQ entries, then we are
32869 +                * broken.  We set up most of the low 16 IO-APIC pins to
32870 +                * ISA defaults and hope it will work.
32871 +                */
32872 +               if (!mp_irq_entries) {
32873 +                       struct mpc_config_bus bus;
32874 +
32875 +                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
32876 +
32877 +                       bus.mpc_type = MP_BUS;
32878 +                       bus.mpc_busid = 0;
32879 +                       memcpy(bus.mpc_bustype, "ISA   ", 6);
32880 +                       MP_bus_info(&bus);
32881 +
32882 +                       construct_default_ioirq_mptable(0);
32883 +               }
32884 +
32885 +       } else
32886 +               BUG();
32887 +
32888 +       printk(KERN_INFO "Processors: %d\n", num_processors);
32889 +       /*
32890 +        * Only use the first configuration found.
32891 +        */
32892 +}
32893 +
32894 +static int __init smp_scan_config (unsigned long base, unsigned long length)
32895 +{
32896 +       extern void __bad_mpf_size(void); 
32897 +       unsigned int *bp = isa_bus_to_virt(base);
32898 +       struct intel_mp_floating *mpf;
32899 +
32900 +       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
32901 +       if (sizeof(*mpf) != 16)
32902 +               __bad_mpf_size();
32903 +
32904 +       while (length > 0) {
32905 +               mpf = (struct intel_mp_floating *)bp;
32906 +               if ((*bp == SMP_MAGIC_IDENT) &&
32907 +                       (mpf->mpf_length == 1) &&
32908 +                       !mpf_checksum((unsigned char *)bp, 16) &&
32909 +                       ((mpf->mpf_specification == 1)
32910 +                               || (mpf->mpf_specification == 4)) ) {
32911 +
32912 +                       smp_found_config = 1;
32913 +                       mpf_found = mpf;
32914 +                       return 1;
32915 +               }
32916 +               bp += 4;
32917 +               length -= 16;
32918 +       }
32919 +       return 0;
32920 +}
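
Two details worth spelling out: bp is an unsigned int pointer, so bp += 4
advances 16 bytes per iteration, matching the spec's 16-byte alignment of the
floating pointer structure; and the extern __bad_mpf_size() is a link-time
assertion, since the call is only emitted if sizeof(*mpf) != 16.
SMP_MAGIC_IDENT itself is just the four bytes "_MP_" read as one
little-endian word. A standalone sketch of the same scan:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* "_MP_" packed the way SMP_MAGIC_IDENT packs it. */
#define MP_MAGIC (((uint32_t)'_' << 24) | ('P' << 16) | ('M' << 8) | '_')

/* Scan a region in 16-byte steps for the floating pointer signature. */
static long mpf_scan(const uint8_t *base, size_t len)
{
	size_t off;

	for (off = 0; off + 16 <= len; off += 16) {
		uint32_t w;

		memcpy(&w, base + off, sizeof(w));	/* native read; LE on x86 */
		if (w == MP_MAGIC)
			return (long)off;
	}
	return -1;
}

int main(void)
{
	uint8_t area[64] = { 0 };

	memcpy(area + 32, "_MP_", 4);	/* plant a signature */
	printf("found at offset %ld\n", mpf_scan(area, sizeof(area)));
	return 0;
}
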
32921 +
32922 +void __init find_intel_smp (void)
32923 +{
32924 +       unsigned int address;
32925 +
32926 +       /*
32927 +        * FIXME: Linux assumes you have 640K of base ram..
32928 +        * this continues the error...
32929 +        *
32930 +        * 1) Scan the bottom 1K for a signature
32931 +        * 2) Scan the top 1K of base RAM
32932 +        * 3) Scan the 64K of bios
32933 +        */
32934 +       if (smp_scan_config(0x0,0x400) ||
32935 +               smp_scan_config(639*0x400,0x400) ||
32936 +                       smp_scan_config(0xF0000,0x10000))
32937 +               return;
32938 +       /*
32939 +        * If it is an SMP machine we should know now, unless the
32940 +        * configuration is in an EISA/MCA bus machine with an
32941 +        * extended bios data area.
32942 +        *
32943 +        * there is a real-mode segmented pointer pointing to the
32944 +        * 4K EBDA area at 0x40E, calculate and scan it here.
32945 +        *
32946 +        * NOTE! There are Linux loaders that will corrupt the EBDA
32947 +        * area, and as such this kind of SMP config may be less
32948 +        * trustworthy, simply because the SMP table may have been
32949 +        * stomped on during early boot. These loaders are buggy and
32950 +        * should be fixed.
32951 +        */
32952 +
32953 +       address = *(unsigned short *)phys_to_virt(0x40E);
32954 +       address <<= 4;
32955 +       if (smp_scan_config(address, 0x1000))
32956 +               return;
32957 +
32958 +       /* If we have come this far, we did not find an MP table  */
32959 +       printk(KERN_INFO "No mptable found.\n");
32960 +}
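
The EBDA probe at the end is plain real-mode arithmetic: the 16-bit segment
word the BIOS leaves at physical 0x40E, shifted left four bits, is the EBDA
base. Worked through with a made-up value:

#include <stdio.h>

int main(void)
{
	unsigned short seg = 0x9FC0;	/* illustrative word at 0x40E */
	unsigned long ebda = (unsigned long)seg << 4;

	printf("EBDA at 0x%lx\n", ebda);	/* 0x9fc00, just under 640K */
	return 0;
}
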
32961 +
32962 +/*
32963 + * - Intel MP Configuration Table
32964 + */
32965 +void __init find_smp_config (void)
32966 +{
32967 +#ifdef CONFIG_X86_LOCAL_APIC
32968 +       find_intel_smp();
32969 +#endif
32970 +}
32971 +
32972 +
32973 +/* --------------------------------------------------------------------------
32974 +                            ACPI-based MP Configuration
32975 +   -------------------------------------------------------------------------- */
32976 +
32977 +#ifdef CONFIG_ACPI
32978 +
32979 +void __init mp_register_lapic_address (
32980 +       u64                     address)
32981 +{
32982 +#ifndef CONFIG_XEN
32983 +       mp_lapic_addr = (unsigned long) address;
32984 +
32985 +       set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
32986 +
32987 +       if (boot_cpu_id == -1U)
32988 +               boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
32989 +
32990 +       Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
32991 +#endif
32992 +}
32993 +
32994 +
32995 +void __init mp_register_lapic (
32996 +       u8                      id, 
32997 +       u8                      enabled)
32998 +{
32999 +       struct mpc_config_processor processor;
33000 +       int                     boot_cpu = 0;
33001 +       
33002 +       if (id >= MAX_APICS) {
33003 +               printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
33004 +                       id, MAX_APICS);
33005 +               return;
33006 +       }
33007 +
33008 +       if (id == boot_cpu_physical_apicid)
33009 +               boot_cpu = 1;
33010 +
33011 +#ifndef CONFIG_XEN
33012 +       processor.mpc_type = MP_PROCESSOR;
33013 +       processor.mpc_apicid = id;
33014 +       processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
33015 +       processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
33016 +       processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
33017 +       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | 
33018 +               (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
33019 +       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
33020 +       processor.mpc_reserved[0] = 0;
33021 +       processor.mpc_reserved[1] = 0;
33022 +#endif
33023 +
33024 +       MP_processor_info(&processor);
33025 +}
33026 +
33027 +#ifdef CONFIG_X86_IO_APIC
33028 +
33029 +#define MP_ISA_BUS             0
33030 +#define MP_MAX_IOAPIC_PIN      127
33031 +
33032 +static struct mp_ioapic_routing {
33033 +       int                     apic_id;
33034 +       int                     gsi_start;
33035 +       int                     gsi_end;
33036 +       u32                     pin_programmed[4];
33037 +} mp_ioapic_routing[MAX_IO_APICS];
33038 +
33039 +
33040 +static int mp_find_ioapic (
33041 +       int                     gsi)
33042 +{
33043 +       int                     i = 0;
33044 +
33045 +       /* Find the IOAPIC that manages this GSI. */
33046 +       for (i = 0; i < nr_ioapics; i++) {
33047 +               if ((gsi >= mp_ioapic_routing[i].gsi_start)
33048 +                       && (gsi <= mp_ioapic_routing[i].gsi_end))
33049 +                       return i;
33050 +       }
33051 +
33052 +       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
33053 +
33054 +       return -1;
33055 +}
33056 +       
33057 +
33058 +void __init mp_register_ioapic (
33059 +       u8                      id, 
33060 +       u32                     address,
33061 +       u32                     gsi_base)
33062 +{
33063 +       int                     idx = 0;
33064 +
33065 +       if (nr_ioapics >= MAX_IO_APICS) {
33066 +               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
33067 +                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
33068 +               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
33069 +       }
33070 +       if (!address) {
33071 +               printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
33072 +                       " found in MADT table, skipping!\n");
33073 +               return;
33074 +       }
33075 +
33076 +       idx = nr_ioapics++;
33077 +
33078 +       mp_ioapics[idx].mpc_type = MP_IOAPIC;
33079 +       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
33080 +       mp_ioapics[idx].mpc_apicaddr = address;
33081 +
33082 +#ifndef CONFIG_XEN
33083 +       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
33084 +#endif
33085 +       mp_ioapics[idx].mpc_apicid = id;
33086 +       mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
33087 +       
33088 +       /* 
33089 +        * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
33090 +        * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
33091 +        */
33092 +       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
33093 +       mp_ioapic_routing[idx].gsi_start = gsi_base;
33094 +       mp_ioapic_routing[idx].gsi_end = gsi_base + 
33095 +               io_apic_get_redir_entries(idx);
33096 +
33097 +       printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
33098 +               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
33099 +               mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
33100 +               mp_ioapic_routing[idx].gsi_start,
33101 +               mp_ioapic_routing[idx].gsi_end);
33102 +
33103 +       return;
33104 +}
33105 +
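The routing table filled in above is what makes every later GSI lookup cheap: each IOAPIC owns the inclusive window [gsi_start, gsi_end] (gsi_end is gsi_base plus the highest redirection-entry index, hence inclusive). A minimal standalone sketch of the resulting gsi -> (apic, pin) translation; the struct and values below are illustrative, not taken from the patch:

    #include <stdio.h>

    struct routing { int apic_id; int gsi_start; int gsi_end; };

    /* Return the index of the IOAPIC whose GSI window contains 'gsi',
     * or -1 when no registered IOAPIC covers it. */
    static int find_ioapic(const struct routing *r, int n, int gsi)
    {
            for (int i = 0; i < n; i++)
                    if (gsi >= r[i].gsi_start && gsi <= r[i].gsi_end)
                            return i;
            return -1;
    }

    int main(void)
    {
            /* Two hypothetical 24-pin IOAPICs at GSI bases 0 and 24. */
            struct routing r[] = { { 2, 0, 23 }, { 3, 24, 47 } };
            int gsi = 30, idx = find_ioapic(r, 2, gsi);

            if (idx >= 0)
                    printf("GSI %d -> IOAPIC id %d, pin %d\n",
                           gsi, r[idx].apic_id, gsi - r[idx].gsi_start);
            return 0;
    }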
33106 +
33107 +void __init mp_override_legacy_irq (
33108 +       u8                      bus_irq,
33109 +       u8                      polarity, 
33110 +       u8                      trigger, 
33111 +       u32                     gsi)
33112 +{
33113 +       struct mpc_config_intsrc intsrc;
33114 +       int                     ioapic = -1;
33115 +       int                     pin = -1;
33116 +
33117 +       /* 
33118 +        * Convert 'gsi' to 'ioapic.pin'.
33119 +        */
33120 +       ioapic = mp_find_ioapic(gsi);
33121 +       if (ioapic < 0)
33122 +               return;
33123 +       pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
33124 +
33125 +       /*
33126 +        * TBD: This check is for faulty timer entries, where the override
33127 +        *      erroneously sets the trigger to level, resulting in a HUGE 
33128 +        *      increase of timer interrupts!
33129 +        */
33130 +       if ((bus_irq == 0) && (trigger == 3))
33131 +               trigger = 1;
33132 +
33133 +       intsrc.mpc_type = MP_INTSRC;
33134 +       intsrc.mpc_irqtype = mp_INT;
33135 +       intsrc.mpc_irqflag = (trigger << 2) | polarity;
33136 +       intsrc.mpc_srcbus = MP_ISA_BUS;
33137 +       intsrc.mpc_srcbusirq = bus_irq;                                /* IRQ */
33138 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;        /* APIC ID */
33139 +       intsrc.mpc_dstirq = pin;                                    /* INTIN# */
33140 +
33141 +       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", 
33142 +               intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
33143 +               (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
33144 +               intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
33145 +
33146 +       mp_irqs[mp_irq_entries] = intsrc;
33147 +       if (++mp_irq_entries == MAX_IRQ_SOURCES)
33148 +               panic("Max # of irq sources exceeded!\n");
33149 +
33150 +       return;
33151 +}
33152 +
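The flag word built as (trigger << 2) | polarity follows the MP-table INTSRC convention this code relies on: polarity lives in bits 0-1 and trigger mode in bits 2-3, with 1 meaning edge/active-high and 3 meaning level/active-low, which is also why the timer fix-up above rewrites trigger 3 back to 1. A small worked example under those assumptions:

    #include <stdio.h>

    int main(void)
    {
            unsigned trigger = 3, polarity = 3;            /* level, active-low */
            unsigned irqflag = (trigger << 2) | polarity;

            printf("irqflag  = 0x%x\n", irqflag);          /* 0xf */
            printf("polarity = %u\n", irqflag & 3);        /* 3, as Dprintk decodes */
            printf("trigger  = %u\n", (irqflag >> 2) & 3); /* 3 */
            return 0;
    }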
33153 +
33154 +void __init mp_config_acpi_legacy_irqs (void)
33155 +{
33156 +       struct mpc_config_intsrc intsrc;
33157 +       int                     i = 0;
33158 +       int                     ioapic = -1;
33159 +
33160 +       /* 
33161 +        * Fabricate the legacy ISA bus (bus #0, MP_ISA_BUS).
33162 +        */
33163 +       mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
33164 +       Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
33165 +
33166 +       /* 
33167 +        * Locate the IOAPIC that manages the ISA IRQs (0-15). 
33168 +        */
33169 +       ioapic = mp_find_ioapic(0);
33170 +       if (ioapic < 0)
33171 +               return;
33172 +
33173 +       intsrc.mpc_type = MP_INTSRC;
33174 +       intsrc.mpc_irqflag = 0;                                 /* Conforming */
33175 +       intsrc.mpc_srcbus = MP_ISA_BUS;
33176 +       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
33177 +
33178 +       /* 
33179 +        * Use the default configuration for the IRQs 0-15.  Unless
33180 +        * overridden by (MADT) interrupt source override entries.
33181 +        */
33182 +       for (i = 0; i < 16; i++) {
33183 +               int idx;
33184 +
33185 +               for (idx = 0; idx < mp_irq_entries; idx++) {
33186 +                       struct mpc_config_intsrc *irq = mp_irqs + idx;
33187 +
33188 +                       /* Do we already have a mapping for this ISA IRQ? */
33189 +                       if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
33190 +                               break;
33191 +
33192 +                       /* Do we already have a mapping for this IOAPIC pin? */
33193 +                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
33194 +                               (irq->mpc_dstirq == i))
33195 +                               break;
33196 +               }
33197 +
33198 +               if (idx != mp_irq_entries) {
33199 +                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
33200 +                       continue;                       /* IRQ already used */
33201 +               }
33202 +
33203 +               intsrc.mpc_irqtype = mp_INT;
33204 +               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
33205 +               intsrc.mpc_dstirq = i;
33206 +
33207 +               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
33208 +                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
33209 +                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
33210 +                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
33211 +                       intsrc.mpc_dstirq);
33212 +
33213 +               mp_irqs[mp_irq_entries] = intsrc;
33214 +               if (++mp_irq_entries == MAX_IRQ_SOURCES)
33215 +                       panic("Max # of irq sources exceeded!\n");
33216 +       }
33217 +
33218 +       return;
33219 +}
33220 +
33221 +#define MAX_GSI_NUM    4096
33222 +
33223 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
33224 +{
33225 +       int                     ioapic = -1;
33226 +       int                     ioapic_pin = 0;
33227 +       int                     idx, bit = 0;
33228 +       static int              pci_irq = 16;
33229 +       /*
33230 +        * Mapping between Global System Interrupts, which
33231 +        * represent all possible interrupts, to the IRQs
33232 +        * assigned to actual devices.
33233 +        */
33234 +       static int              gsi_to_irq[MAX_GSI_NUM];
33235 +
33236 +       if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
33237 +               return gsi;
33238 +
33239 +       /* Don't set up the ACPI SCI because it's already set up */
33240 +       if (acpi_fadt.sci_int == gsi)
33241 +               return gsi;
33242 +
33243 +       ioapic = mp_find_ioapic(gsi);
33244 +       if (ioapic < 0) {
33245 +               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
33246 +               return gsi;
33247 +       }
33248 +
33249 +       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
33250 +
33251 +       /* 
33252 +        * Avoid pin reprogramming.  PRTs typically include entries  
33253 +        * with redundant pin->gsi mappings (but unique PCI devices);
33254 +        * we only program the IOAPIC on the first.
33255 +        */
33256 +       bit = ioapic_pin % 32;
33257 +       idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
33258 +       if (idx > 3) {
33259 +               printk(KERN_ERR "Invalid reference to IOAPIC pin "
33260 +                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 
33261 +                       ioapic_pin);
33262 +               return gsi;
33263 +       }
33264 +       if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
33265 +               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
33266 +                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
33267 +               return gsi_to_irq[gsi];
33268 +       }
33269 +
33270 +       mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
33271 +
33272 +       if (triggering == ACPI_LEVEL_SENSITIVE) {
33273 +               /*
33274 +                * For PCI devices assign IRQs in order, avoiding gaps
33275 +                * due to unused I/O APIC pins.
33276 +                */
33277 +               int irq = gsi;
33278 +               if (gsi < MAX_GSI_NUM) {
33279 +                       if (gsi > 15)
33280 +                               gsi = pci_irq++;
33281 +                       /*
33282 +                        * Don't assign IRQ used by ACPI SCI
33283 +                        */
33284 +                       if (gsi == acpi_fadt.sci_int)
33285 +                               gsi = pci_irq++;
33286 +                       gsi_to_irq[irq] = gsi;
33287 +               } else {
33288 +                       printk(KERN_ERR "GSI %u is too high\n", gsi);
33289 +                       return gsi;
33290 +               }
33291 +       }
33292 +
33293 +       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
33294 +               triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
33295 +               polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
33296 +       return gsi;
33297 +}
33298 +
33299 +#endif /*CONFIG_X86_IO_APIC*/
33300 +#endif /*CONFIG_ACPI*/
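The pin_programmed[] test in mp_register_gsi above is an ordinary 128-bit bitmap split over four u32 words: word index pin / 32, bit pin % 32, which is also why pins above 127 trip the idx > 3 guard. A standalone sketch of that test-and-set; the pin value is made up:

    #include <stdio.h>

    int main(void)
    {
            unsigned pin_programmed[4] = { 0 };   /* 4 x 32 = 128 pins */
            int pin = 45;                         /* made-up pin       */
            int idx = pin / 32, bit = pin % 32;   /* -> word 1, bit 13 */

            if (idx > 3) {
                    printf("pin %d out of range\n", pin);
                    return 1;
            }
            if (pin_programmed[idx] & (1u << bit))
                    printf("pin %d already programmed, reuse mapping\n", pin);
            else
                    pin_programmed[idx] |= 1u << bit;   /* first sight: program it */
            return 0;
    }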
33301 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/pci-swiotlb-xen.c linux-2.6.16/arch/x86_64/kernel/pci-swiotlb-xen.c
33302 --- linux-2.6.16.orig/arch/x86_64/kernel/pci-swiotlb-xen.c      1970-01-01 01:00:00.000000000 +0100
33303 +++ linux-2.6.16/arch/x86_64/kernel/pci-swiotlb-xen.c   2006-06-26 09:51:32.000000000 +0200
33304 @@ -0,0 +1,54 @@
33305 +/* Glue code to lib/swiotlb.c */
33306 +
33307 +#include <linux/pci.h>
33308 +#include <linux/cache.h>
33309 +#include <linux/module.h>
33310 +#include <asm/dma-mapping.h>
33311 +#include <asm/proto.h>
33312 +#include <asm/swiotlb.h>
33313 +#include <asm/dma.h>
33314 +
33315 +#if 0
33316 +int swiotlb __read_mostly;
33317 +EXPORT_SYMBOL(swiotlb);
33318 +#endif
33319 +
33320 +struct dma_mapping_ops swiotlb_dma_ops = {
33321 +#if 0
33322 +       .mapping_error = swiotlb_dma_mapping_error,
33323 +       .alloc_coherent = swiotlb_alloc_coherent,
33324 +       .free_coherent = swiotlb_free_coherent,
33325 +       .map_single = swiotlb_map_single,
33326 +       .unmap_single = swiotlb_unmap_single,
33327 +       .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
33328 +       .sync_single_for_device = swiotlb_sync_single_for_device,
33329 +       .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
33330 +       .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
33331 +       .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
33332 +       .sync_sg_for_device = swiotlb_sync_sg_for_device,
33333 +       .map_sg = swiotlb_map_sg,
33334 +       .unmap_sg = swiotlb_unmap_sg,
33335 +       .dma_supported = NULL,
33336 +#endif
33337 +};
33338 +
33339 +void pci_swiotlb_init(void)
33340 +{
33341 +#if 0
33342 +       /* don't initialize swiotlb if iommu=off (no_iommu=1) */
33343 +       if (!iommu_aperture && !no_iommu &&
33344 +           (end_pfn > MAX_DMA32_PFN || force_iommu))
33345 +              swiotlb = 1;
33346 +       if (swiotlb) {
33347 +               printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
33348 +               swiotlb_init();
33349 +               dma_ops = &swiotlb_dma_ops;
33350 +       }
33351 +#else
33352 +       swiotlb_init();
33353 +       if (swiotlb) {
33354 +               printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
33355 +               dma_ops = &swiotlb_dma_ops;
33356 +       }
33357 +#endif
33358 +}
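The #if 0 blocks preserve the native policy for reference: on bare metal the bounce buffer is only worth enabling when RAM extends past the 32-bit DMA limit and no hardware IOMMU will cover it, while under Xen machine pages are neither contiguous nor guaranteed below 4GB, so swiotlb_init() runs unconditionally. A condensed sketch of the two policies; the names echo the disabled block and the values are illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    /* Native rule from the disabled block: bounce-buffer only if some
     * memory may be unreachable for 32-bit DMA and no IOMMU handles it. */
    static bool native_wants_swiotlb(bool iommu_aperture, bool no_iommu,
                                     unsigned long end_pfn,
                                     unsigned long max_dma32_pfn,
                                     bool force_iommu)
    {
            return !iommu_aperture && !no_iommu &&
                   (end_pfn > max_dma32_pfn || force_iommu);
    }

    int main(void)
    {
            /* 8GB of 4KB pages against the 4GB boundary (0x100000 pages). */
            printf("native would enable swiotlb: %d\n",
                   native_wants_swiotlb(false, false, 0x200000, 0x100000, false));
            printf("xen enables swiotlb: 1 (unconditional)\n");
            return 0;
    }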
33359 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/process-xen.c linux-2.6.16/arch/x86_64/kernel/process-xen.c
33360 --- linux-2.6.16.orig/arch/x86_64/kernel/process-xen.c  1970-01-01 01:00:00.000000000 +0100
33361 +++ linux-2.6.16/arch/x86_64/kernel/process-xen.c       2006-06-26 09:51:32.000000000 +0200
33362 @@ -0,0 +1,810 @@
33363 +/*
33364 + *  linux/arch/x86-64/kernel/process.c
33365 + *
33366 + *  Copyright (C) 1995  Linus Torvalds
33367 + *
33368 + *  Pentium III FXSR, SSE support
33369 + *     Gareth Hughes <gareth@valinux.com>, May 2000
33370 + * 
33371 + *  X86-64 port
33372 + *     Andi Kleen.
33373 + *
33374 + *     CPU hotplug support - ashok.raj@intel.com
33375 + *  $Id$
33376 + * 
33377 + *  Jun Nakajima <jun.nakajima@intel.com> 
33378 + *     Modified for Xen
33379 + */
33380 +
33381 +/*
33382 + * This file handles the architecture-dependent parts of process handling..
33383 + */
33384 +
33385 +#include <stdarg.h>
33386 +
33387 +#include <linux/cpu.h>
33388 +#include <linux/errno.h>
33389 +#include <linux/sched.h>
33390 +#include <linux/kernel.h>
33391 +#include <linux/mm.h>
33392 +#include <linux/elfcore.h>
33393 +#include <linux/smp.h>
33394 +#include <linux/slab.h>
33395 +#include <linux/user.h>
33396 +#include <linux/module.h>
33397 +#include <linux/a.out.h>
33398 +#include <linux/interrupt.h>
33399 +#include <linux/delay.h>
33400 +#include <linux/ptrace.h>
33401 +#include <linux/utsname.h>
33402 +#include <linux/random.h>
33403 +#include <linux/kprobes.h>
33404 +#include <linux/notifier.h>
33405 +
33406 +#include <asm/uaccess.h>
33407 +#include <asm/pgtable.h>
33408 +#include <asm/system.h>
33409 +#include <asm/io.h>
33410 +#include <asm/processor.h>
33411 +#include <asm/i387.h>
33412 +#include <asm/mmu_context.h>
33413 +#include <asm/pda.h>
33414 +#include <asm/prctl.h>
33415 +#include <asm/kdebug.h>
33416 +#include <xen/interface/dom0_ops.h>
33417 +#include <xen/interface/physdev.h>
33418 +#include <xen/interface/vcpu.h>
33419 +#include <asm/desc.h>
33420 +#include <asm/proto.h>
33421 +#include <asm/hardirq.h>
33422 +#include <asm/ia32.h>
33423 +#include <asm/idle.h>
33424 +
33425 +asmlinkage extern void ret_from_fork(void);
33426 +
33427 +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
33428 +
33429 +unsigned long boot_option_idle_override = 0;
33430 +EXPORT_SYMBOL(boot_option_idle_override);
33431 +
33432 +/*
33433 + * Powermanagement idle function, if any..
33434 + * Power management idle function, if any.
33435 +void (*pm_idle)(void);
33436 +static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
33437 +
33438 +static struct notifier_block *idle_notifier;
33439 +static DEFINE_SPINLOCK(idle_notifier_lock);
33440 +
33441 +void idle_notifier_register(struct notifier_block *n)
33442 +{
33443 +       unsigned long flags;
33444 +       spin_lock_irqsave(&idle_notifier_lock, flags);
33445 +       notifier_chain_register(&idle_notifier, n);
33446 +       spin_unlock_irqrestore(&idle_notifier_lock, flags);
33447 +}
33448 +EXPORT_SYMBOL_GPL(idle_notifier_register);
33449 +
33450 +void idle_notifier_unregister(struct notifier_block *n)
33451 +{
33452 +       unsigned long flags;
33453 +       spin_lock_irqsave(&idle_notifier_lock, flags);
33454 +       notifier_chain_unregister(&idle_notifier, n);
33455 +       spin_unlock_irqrestore(&idle_notifier_lock, flags);
33456 +}
33457 +EXPORT_SYMBOL(idle_notifier_unregister);
33458 +
33459 +enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
33460 +static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
33461 +
33462 +void enter_idle(void)
33463 +{
33464 +       __get_cpu_var(idle_state) = CPU_IDLE;
33465 +       notifier_call_chain(&idle_notifier, IDLE_START, NULL);
33466 +}
33467 +
33468 +static void __exit_idle(void)
33469 +{
33470 +       __get_cpu_var(idle_state) = CPU_NOT_IDLE;
33471 +       notifier_call_chain(&idle_notifier, IDLE_END, NULL);
33472 +}
33473 +
33474 +/* Called from interrupts to signify idle end */
33475 +void exit_idle(void)
33476 +{
33477 +       if (current->pid | read_pda(irqcount))
33478 +               return;
33479 +       __exit_idle();
33480 +}
33481 +
33482 +/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
33483 +extern void stop_hz_timer(void);
33484 +extern void start_hz_timer(void);
33485 +void xen_idle(void)
33486 +{
33487 +       local_irq_disable();
33488 +
33489 +       if (need_resched())
33490 +               local_irq_enable();
33491 +       else {
33492 +               clear_thread_flag(TIF_POLLING_NRFLAG);
33493 +               smp_mb__after_clear_bit();
33494 +               stop_hz_timer();
33495 +               /* Blocking includes an implicit local_irq_enable(). */
33496 +               HYPERVISOR_block();
33497 +               start_hz_timer();
33498 +               set_thread_flag(TIF_POLLING_NRFLAG);
33499 +       }
33500 +}
33501 +
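xen_idle() is the paravirtual counterpart of a hlt-based idle loop: rather than halting the CPU, it asks the hypervisor to deschedule the VCPU until the next event, re-checking need_resched() with interrupts masked so a wakeup cannot slip in between the check and the block. A minimal sketch of that pattern; every function below is a stub standing in for the real primitive:

    #include <stdio.h>

    static int need_resched_flag;                 /* stand-in for need_resched() */

    static void local_irq_disable(void) { /* stub: mask event delivery   */ }
    static void local_irq_enable(void)  { /* stub: unmask event delivery */ }

    /* Stand-in for HYPERVISOR_block(): yields the VCPU and, like the real
     * hypercall, implicitly re-enables event delivery on return. */
    static void hypervisor_block(void)  { puts("vcpu blocked until next event"); }

    static void idle_once(void)
    {
            local_irq_disable();
            if (need_resched_flag) {
                    local_irq_enable();           /* work pending: don't sleep */
                    return;
            }
            hypervisor_block();                   /* sleep; an event wakes us  */
    }

    int main(void) { idle_once(); return 0; }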
33502 +#ifdef CONFIG_HOTPLUG_CPU
33503 +static inline void play_dead(void)
33504 +{
33505 +       idle_task_exit();
33506 +       local_irq_disable();
33507 +       cpu_clear(smp_processor_id(), cpu_initialized);
33508 +       preempt_enable_no_resched();
33509 +       HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
33510 +       /* Same as drivers/xen/core/smpboot.c:cpu_bringup(). */
33511 +       cpu_init();
33512 +       touch_softlockup_watchdog();
33513 +       preempt_disable();
33514 +       local_irq_enable();
33515 +}
33516 +#else
33517 +static inline void play_dead(void)
33518 +{
33519 +       BUG();
33520 +}
33521 +#endif /* CONFIG_HOTPLUG_CPU */
33522 +
33523 +/*
33524 + * The idle thread. There's no useful work to be
33525 + * done, so just try to conserve power and have a
33526 + * low exit latency (ie sit in a loop waiting for
33527 + * somebody to say that they'd like to reschedule)
33528 + */
33529 +void cpu_idle (void)
33530 +{
33531 +       set_thread_flag(TIF_POLLING_NRFLAG);
33532 +
33533 +       /* endless idle loop with no priority at all */
33534 +       while (1) {
33535 +               while (!need_resched()) {
33536 +                       if (__get_cpu_var(cpu_idle_state))
33537 +                               __get_cpu_var(cpu_idle_state) = 0;
33538 +                       rmb();
33539 +                       
33540 +                       if (cpu_is_offline(smp_processor_id()))
33541 +                               play_dead();
33542 +                       enter_idle();
33543 +                       xen_idle();
33544 +                       __exit_idle();
33545 +               }
33546 +
33547 +               preempt_enable_no_resched();
33548 +               schedule();
33549 +               preempt_disable();
33550 +       }
33551 +}
33552 +
33553 +void cpu_idle_wait(void)
33554 +{
33555 +       unsigned int cpu, this_cpu = get_cpu();
33556 +       cpumask_t map;
33557 +
33558 +       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
33559 +       put_cpu();
33560 +
33561 +       cpus_clear(map);
33562 +       for_each_online_cpu(cpu) {
33563 +               per_cpu(cpu_idle_state, cpu) = 1;
33564 +               cpu_set(cpu, map);
33565 +       }
33566 +
33567 +       __get_cpu_var(cpu_idle_state) = 0;
33568 +
33569 +       wmb();
33570 +       do {
33571 +               ssleep(1);
33572 +               for_each_online_cpu(cpu) {
33573 +                       if (cpu_isset(cpu, map) &&
33574 +                                       !per_cpu(cpu_idle_state, cpu))
33575 +                               cpu_clear(cpu, map);
33576 +               }
33577 +               cpus_and(map, map, cpu_online_map);
33578 +       } while (!cpus_empty(map));
33579 +}
33580 +EXPORT_SYMBOL_GPL(cpu_idle_wait);
33581 +
33582 +/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
33583 +/* Always use xen_idle() instead. */
33584 +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) {}
33585 +
33586 +/* Prints also some state that isn't saved in the pt_regs */ 
33587 +void __show_regs(struct pt_regs * regs)
33588 +{
33589 +       unsigned long fs, gs, shadowgs;
33590 +       unsigned int fsindex,gsindex;
33591 +       unsigned int ds,cs,es; 
33592 +
33593 +       printk("\n");
33594 +       print_modules();
33595 +       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
33596 +               current->pid, current->comm, print_tainted(),
33597 +               system_utsname.release,
33598 +               (int)strcspn(system_utsname.version, " "),
33599 +               system_utsname.version);
33600 +       printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
33601 +       printk_address(regs->rip); 
33602 +       printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
33603 +               regs->eflags);
33604 +       printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
33605 +              regs->rax, regs->rbx, regs->rcx);
33606 +       printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
33607 +              regs->rdx, regs->rsi, regs->rdi); 
33608 +       printk("RBP: %016lx R08: %016lx R09: %016lx\n",
33609 +              regs->rbp, regs->r8, regs->r9); 
33610 +       printk("R10: %016lx R11: %016lx R12: %016lx\n",
33611 +              regs->r10, regs->r11, regs->r12); 
33612 +       printk("R13: %016lx R14: %016lx R15: %016lx\n",
33613 +              regs->r13, regs->r14, regs->r15); 
33614 +
33615 +       asm("mov %%ds,%0" : "=r" (ds)); 
33616 +       asm("mov %%cs,%0" : "=r" (cs)); 
33617 +       asm("mov %%es,%0" : "=r" (es)); 
33618 +       asm("mov %%fs,%0" : "=r" (fsindex));
33619 +       asm("mov %%gs,%0" : "=r" (gsindex));
33620 +
33621 +       rdmsrl(MSR_FS_BASE, fs);
33622 +       rdmsrl(MSR_GS_BASE, gs); 
33623 +       rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 
33624 +
33625 +       printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 
33626 +              fs,fsindex,gs,gsindex,shadowgs); 
33627 +       printk("CS:  %04x DS: %04x ES: %04x\n", cs, ds, es); 
33628 +
33629 +}
33630 +
33631 +void show_regs(struct pt_regs *regs)
33632 +{
33633 +       printk("CPU %d:", smp_processor_id());
33634 +       __show_regs(regs);
33635 +       show_trace(&regs->rsp);
33636 +}
33637 +
33638 +/*
33639 + * Free current thread data structures etc..
33640 + */
33641 +void exit_thread(void)
33642 +{
33643 +       struct task_struct *me = current;
33644 +       struct thread_struct *t = &me->thread;
33645 +
33646 +       /*
33647 +        * Remove function-return probe instances associated with this task
33648 +        * and put them back on the free list. Do not insert an exit probe for
33649 +        * this function, it will be disabled by kprobe_flush_task if you do.
33650 +        */
33651 +       kprobe_flush_task(me);
33652 +
33653 +       if (me->thread.io_bitmap_ptr) { 
33654 +#ifndef CONFIG_X86_NO_TSS
33655 +               struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
33656 +#endif
33657 +#ifdef CONFIG_XEN
33658 +               static physdev_op_t iobmp_op = {
33659 +                       .cmd = PHYSDEVOP_SET_IOBITMAP
33660 +               };
33661 +#endif
33662 +
33663 +               kfree(t->io_bitmap_ptr);
33664 +               t->io_bitmap_ptr = NULL;
33665 +               /*
33666 +                * Careful, clear this in the TSS too:
33667 +                */
33668 +#ifndef CONFIG_X86_NO_TSS
33669 +               memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
33670 +               put_cpu();
33671 +#endif
33672 +#ifdef CONFIG_XEN
33673 +               HYPERVISOR_physdev_op(&iobmp_op);
33674 +#endif
33675 +               t->io_bitmap_max = 0;
33676 +       }
33677 +}
33678 +
33679 +void load_gs_index(unsigned gs)
33680 +{
33681 +       HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs);
33682 +}
33683 +
33684 +void flush_thread(void)
33685 +{
33686 +       struct task_struct *tsk = current;
33687 +       struct thread_info *t = current_thread_info();
33688 +
33689 +       if (t->flags & _TIF_ABI_PENDING)
33690 +               t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
33691 +
33692 +       tsk->thread.debugreg0 = 0;
33693 +       tsk->thread.debugreg1 = 0;
33694 +       tsk->thread.debugreg2 = 0;
33695 +       tsk->thread.debugreg3 = 0;
33696 +       tsk->thread.debugreg6 = 0;
33697 +       tsk->thread.debugreg7 = 0;
33698 +       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
33699 +       /*
33700 +        * Forget coprocessor state..
33701 +        */
33702 +       clear_fpu(tsk);
33703 +       clear_used_math();
33704 +}
33705 +
33706 +void release_thread(struct task_struct *dead_task)
33707 +{
33708 +       if (dead_task->mm) {
33709 +               if (dead_task->mm->context.size) {
33710 +                       printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
33711 +                                       dead_task->comm,
33712 +                                       dead_task->mm->context.ldt,
33713 +                                       dead_task->mm->context.size);
33714 +                       BUG();
33715 +               }
33716 +       }
33717 +}
33718 +
33719 +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
33720 +{
33721 +       struct user_desc ud = { 
33722 +               .base_addr = addr,
33723 +               .limit = 0xfffff,
33724 +               .contents = (3 << 3), /* user */
33725 +               .seg_32bit = 1,
33726 +               .limit_in_pages = 1,
33727 +               .useable = 1,
33728 +       };
33729 +       struct n_desc_struct *desc = (void *)t->thread.tls_array;
33730 +       desc += tls;
33731 +       desc->a = LDT_entry_a(&ud); 
33732 +       desc->b = LDT_entry_b(&ud); 
33733 +}
33734 +
33735 +static inline u32 read_32bit_tls(struct task_struct *t, int tls)
33736 +{
33737 +       struct desc_struct *desc = (void *)t->thread.tls_array;
33738 +       desc += tls;
33739 +       return desc->base0 | 
33740 +               (((u32)desc->base1) << 16) | 
33741 +               (((u32)desc->base2) << 24);
33742 +}
33743 +
33744 +/*
33745 + * This gets called before we allocate a new thread and copy
33746 + * the current task into it.
33747 + */
33748 +void prepare_to_copy(struct task_struct *tsk)
33749 +{
33750 +       unlazy_fpu(tsk);
33751 +}
33752 +
33753 +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 
33754 +               unsigned long unused,
33755 +       struct task_struct * p, struct pt_regs * regs)
33756 +{
33757 +       int err;
33758 +       struct pt_regs * childregs;
33759 +       struct task_struct *me = current;
33760 +
33761 +       childregs = ((struct pt_regs *)
33762 +                       (THREAD_SIZE + task_stack_page(p))) - 1;
33763 +       *childregs = *regs;
33764 +
33765 +       childregs->rax = 0;
33766 +       childregs->rsp = rsp;
33767 +       if (rsp == ~0UL)
33768 +               childregs->rsp = (unsigned long)childregs;
33769 +
33770 +       p->thread.rsp = (unsigned long) childregs;
33771 +       p->thread.rsp0 = (unsigned long) (childregs+1);
33772 +       p->thread.userrsp = me->thread.userrsp; 
33773 +
33774 +       set_tsk_thread_flag(p, TIF_FORK);
33775 +
33776 +       p->thread.fs = me->thread.fs;
33777 +       p->thread.gs = me->thread.gs;
33778 +
33779 +       asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
33780 +       asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
33781 +       asm("mov %%es,%0" : "=m" (p->thread.es));
33782 +       asm("mov %%ds,%0" : "=m" (p->thread.ds));
33783 +
33784 +       if (unlikely(me->thread.io_bitmap_ptr != NULL)) { 
33785 +               p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
33786 +               if (!p->thread.io_bitmap_ptr) {
33787 +                       p->thread.io_bitmap_max = 0;
33788 +                       return -ENOMEM;
33789 +               }
33790 +               memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
33791 +                               IO_BITMAP_BYTES);
33792 +       } 
33793 +
33794 +       /*
33795 +        * Set a new TLS for the child thread?
33796 +        */
33797 +       if (clone_flags & CLONE_SETTLS) {
33798 +#ifdef CONFIG_IA32_EMULATION
33799 +               if (test_thread_flag(TIF_IA32))
33800 +                       err = ia32_child_tls(p, childregs); 
33801 +               else                    
33802 +#endif  
33803 +                       err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 
33804 +               if (err) 
33805 +                       goto out;
33806 +       }
33807 +        p->thread.iopl = current->thread.iopl;
33808 +
33809 +       err = 0;
33810 +out:
33811 +       if (err && p->thread.io_bitmap_ptr) {
33812 +               kfree(p->thread.io_bitmap_ptr);
33813 +               p->thread.io_bitmap_max = 0;
33814 +       }
33815 +       return err;
33816 +}
33817 +
33818 +static inline void __save_init_fpu( struct task_struct *tsk )
33819 +{
33820 +       asm volatile( "rex64 ; fxsave %0 ; fnclex"
33821 +                     : "=m" (tsk->thread.i387.fxsave));
33822 +       tsk->thread_info->status &= ~TS_USEDFPU;
33823 +}
33824 +
33825 +/*
33826 + *     switch_to(x,y) should switch tasks from x to y.
33827 + *
33828 + * This could still be optimized: 
33829 + * - fold all the options into a flag word and test it with a single test.
33830 + * - could test fs/gs bitsliced
33831 + *
33832 + * Kprobes not supported here. Set the probe on schedule instead.
33833 + */
33834 +__kprobes struct task_struct *
33835 +__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
33836 +{
33837 +       struct thread_struct *prev = &prev_p->thread,
33838 +                                *next = &next_p->thread;
33839 +       int cpu = smp_processor_id();  
33840 +#ifndef CONFIG_X86_NO_TSS
33841 +       struct tss_struct *tss = &per_cpu(init_tss, cpu);
33842 +#endif
33843 +       physdev_op_t iopl_op, iobmp_op;
33844 +       multicall_entry_t _mcl[8], *mcl = _mcl;
33845 +
33846 +       /*
33847 +        * This is basically '__unlazy_fpu', except that we queue a
33848 +        * multicall to indicate FPU task switch, rather than
33849 +        * synchronously trapping to Xen.
33850 +        */
33851 +       if (prev_p->thread_info->status & TS_USEDFPU) {
33852 +               __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
33853 +               mcl->op      = __HYPERVISOR_fpu_taskswitch;
33854 +               mcl->args[0] = 1;
33855 +               mcl++;
33856 +       }
33857 +
33858 +       /*
33859 +        * Reload esp0, LDT and the page table pointer:
33860 +        */
33861 +       mcl->op      = __HYPERVISOR_stack_switch;
33862 +       mcl->args[0] = __KERNEL_DS;
33863 +       mcl->args[1] = next->rsp0;
33864 +       mcl++;
33865 +
33866 +       /*
33867 +        * Load the per-thread Thread-Local Storage descriptor.
33868 +        * This is load_TLS(next, cpu) with multicalls.
33869 +        */
33870 +#define C(i) do {                                                      \
33871 +       if (unlikely(next->tls_array[i] != prev->tls_array[i])) {       \
33872 +               mcl->op      = __HYPERVISOR_update_descriptor;          \
33873 +               mcl->args[0] = virt_to_machine(                         \
33874 +                       &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]);          \
33875 +               mcl->args[1] = next->tls_array[i];                      \
33876 +               mcl++;                                                  \
33877 +       }                                                               \
33878 +} while (0)
33879 +       C(0); C(1); C(2);
33880 +#undef C
33881 +
33882 +       if (unlikely(prev->iopl != next->iopl)) {
33883 +               iopl_op.cmd             = PHYSDEVOP_SET_IOPL;
33884 +               iopl_op.u.set_iopl.iopl = (next->iopl == 0) ? 1 : next->iopl;
33885 +               mcl->op      = __HYPERVISOR_physdev_op;
33886 +               mcl->args[0] = (unsigned long)&iopl_op;
33887 +               mcl++;
33888 +       }
33889 +
33890 +       if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
33891 +               iobmp_op.cmd                     =
33892 +                       PHYSDEVOP_SET_IOBITMAP;
33893 +               iobmp_op.u.set_iobitmap.bitmap   =
33894 +                       (char *)next->io_bitmap_ptr;
33895 +               iobmp_op.u.set_iobitmap.nr_ports =
33896 +                       next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
33897 +               mcl->op      = __HYPERVISOR_physdev_op;
33898 +               mcl->args[0] = (unsigned long)&iobmp_op;
33899 +               mcl++;
33900 +       }
33901 +
33902 +       (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
33903 +       /* 
33904 +        * Switch DS and ES.
33905 +        * This won't pick up thread selector changes, but I guess that is ok.
33906 +        */
33907 +       if (unlikely(next->es))
33908 +               loadsegment(es, next->es); 
33909 +       
33910 +       if (unlikely(next->ds))
33911 +               loadsegment(ds, next->ds);
33912 +
33913 +       /* 
33914 +        * Switch FS and GS.
33915 +        */
33916 +       if (unlikely(next->fsindex))
33917 +               loadsegment(fs, next->fsindex);
33918 +
33919 +       if (next->fs)
33920 +               HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs); 
33921 +       
33922 +       if (unlikely(next->gsindex))
33923 +               load_gs_index(next->gsindex);
33924 +
33925 +       if (next->gs)
33926 +               HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs); 
33927 +
33928 +       /* 
33929 +        * Switch the PDA context.
33930 +        */
33931 +       prev->userrsp = read_pda(oldrsp); 
33932 +       write_pda(oldrsp, next->userrsp); 
33933 +       write_pda(pcurrent, next_p); 
33934 +       write_pda(kernelstack,
33935 +                 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
33936 +
33937 +       /*
33938 +        * Now maybe reload the debug registers
33939 +        */
33940 +       if (unlikely(next->debugreg7)) {
33941 +               set_debugreg(next->debugreg0, 0);
33942 +               set_debugreg(next->debugreg1, 1);
33943 +               set_debugreg(next->debugreg2, 2);
33944 +               set_debugreg(next->debugreg3, 3);
33945 +               /* no 4 and 5 */
33946 +               set_debugreg(next->debugreg6, 6);
33947 +               set_debugreg(next->debugreg7, 7);
33948 +       }
33949 +
33950 +       return prev_p;
33951 +}
33952 +
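The _mcl[8] array is the standard Xen batching idiom: each operation a native context switch would perform directly (FPU ownership, stack switch, up to three TLS descriptor updates, IOPL, IO bitmap) is queued as a multicall entry and flushed with one HYPERVISOR_multicall, so the switch costs a single ring transition instead of up to seven. A standalone sketch of queue-then-flush; the entry struct and flush function mimic, rather than reuse, the Xen interface:

    #include <stdio.h>

    struct mc_entry { unsigned op; unsigned long args[2]; };

    /* Stand-in for HYPERVISOR_multicall(): runs 'n' queued operations in
     * one (simulated) guest-to-hypervisor transition. */
    static void flush_multicalls(const struct mc_entry *mc, int n)
    {
            printf("1 hypercall carrying %d operations\n", n);
            for (int i = 0; i < n; i++)
                    printf("  op %u(%lu, %lu)\n",
                           mc[i].op, mc[i].args[0], mc[i].args[1]);
    }

    int main(void)
    {
            struct mc_entry _mcl[8], *mcl = _mcl;

            /* Queue instead of calling, same shape as the hunk above. */
            mcl->op = 1; mcl->args[0] = 1;    mcl->args[1] = 0;      mcl++;
            mcl->op = 2; mcl->args[0] = 0x10; mcl->args[1] = 0xbeef; mcl++;

            flush_multicalls(_mcl, (int)(mcl - _mcl));   /* one transition, two ops */
            return 0;
    }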
33953 +/*
33954 + * sys_execve() executes a new program.
33955 + */
33956 +asmlinkage 
33957 +long sys_execve(char __user *name, char __user * __user *argv,
33958 +               char __user * __user *envp, struct pt_regs regs)
33959 +{
33960 +       long error;
33961 +       char * filename;
33962 +
33963 +       filename = getname(name);
33964 +       error = PTR_ERR(filename);
33965 +       if (IS_ERR(filename)) 
33966 +               return error;
33967 +       error = do_execve(filename, argv, envp, &regs); 
33968 +       if (error == 0) {
33969 +               task_lock(current);
33970 +               current->ptrace &= ~PT_DTRACE;
33971 +               task_unlock(current);
33972 +       }
33973 +       putname(filename);
33974 +       return error;
33975 +}
33976 +
33977 +void set_personality_64bit(void)
33978 +{
33979 +       /* inherit personality from parent */
33980 +
33981 +       /* Make sure to be in 64bit mode */
33982 +       clear_thread_flag(TIF_IA32); 
33983 +
33984 +       /* TBD: overwrites user setup. Should have two bits.
33985 +          But 64bit processes have always behaved this way,
33986 +          so it's not too bad. The main problem is just that
33987 +          32bit children are affected again. */
33988 +       current->personality &= ~READ_IMPLIES_EXEC;
33989 +}
33990 +
33991 +asmlinkage long sys_fork(struct pt_regs *regs)
33992 +{
33993 +       return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
33994 +}
33995 +
33996 +asmlinkage long
33997 +sys_clone(unsigned long clone_flags, unsigned long newsp,
33998 +         void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
33999 +{
34000 +       if (!newsp)
34001 +               newsp = regs->rsp;
34002 +       return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
34003 +}
34004 +
34005 +/*
34006 + * This is trivial, and on the face of it looks like it
34007 + * could equally well be done in user mode.
34008 + *
34009 + * Not so, for quite unobvious reasons - register pressure.
34010 + * In user mode vfork() cannot have a stack frame, and if
34011 + * done by calling the "clone()" system call directly, you
34012 + * do not have enough call-clobbered registers to hold all
34013 + * the information you need.
34014 + */
34015 +asmlinkage long sys_vfork(struct pt_regs *regs)
34016 +{
34017 +       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
34018 +                   NULL, NULL);
34019 +}
34020 +
34021 +unsigned long get_wchan(struct task_struct *p)
34022 +{
34023 +       unsigned long stack;
34024 +       u64 fp,rip;
34025 +       int count = 0;
34026 +
34027 +       if (!p || p == current || p->state==TASK_RUNNING)
34028 +               return 0; 
34029 +       stack = (unsigned long)task_stack_page(p);
34030 +       if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
34031 +               return 0;
34032 +       fp = *(u64 *)(p->thread.rsp);
34033 +       do { 
34034 +               if (fp < (unsigned long)stack ||
34035 +                   fp > (unsigned long)stack+THREAD_SIZE)
34036 +                       return 0; 
34037 +               rip = *(u64 *)(fp+8); 
34038 +               if (!in_sched_functions(rip))
34039 +                       return rip; 
34040 +               fp = *(u64 *)fp; 
34041 +       } while (count++ < 16); 
34042 +       return 0;
34043 +}
34044 +
34045 +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
34046 +{ 
34047 +       int ret = 0; 
34048 +       int doit = task == current;
34049 +       int cpu;
34050 +
34051 +       switch (code) { 
34052 +       case ARCH_SET_GS:
34053 +               if (addr >= TASK_SIZE_OF(task))
34054 +                       return -EPERM; 
34055 +               cpu = get_cpu();
34056 +               /* handle small bases via the GDT because that's faster to 
34057 +                  switch. */
34058 +               if (addr <= 0xffffffff) {  
34059 +                       set_32bit_tls(task, GS_TLS, addr); 
34060 +                       if (doit) { 
34061 +                               load_TLS(&task->thread, cpu);
34062 +                               load_gs_index(GS_TLS_SEL); 
34063 +                       }
34064 +                       task->thread.gsindex = GS_TLS_SEL; 
34065 +                       task->thread.gs = 0;
34066 +               } else { 
34067 +                       task->thread.gsindex = 0;
34068 +                       task->thread.gs = addr;
34069 +                       if (doit) {
34070 +                               load_gs_index(0);
34071 +                               ret = HYPERVISOR_set_segment_base(
34072 +                                       SEGBASE_GS_USER, addr);
34073 +                       } 
34074 +               }
34075 +               put_cpu();
34076 +               break;
34077 +       case ARCH_SET_FS:
34078 +               /* Not strictly needed for fs, but do it for symmetry
34079 +                  with gs */
34080 +               if (addr >= TASK_SIZE_OF(task))
34081 +                       return -EPERM; 
34082 +               cpu = get_cpu();
34083 +               /* handle small bases via the GDT because that's faster to 
34084 +                  switch. */
34085 +               if (addr <= 0xffffffff) { 
34086 +                       set_32bit_tls(task, FS_TLS, addr);
34087 +                       if (doit) { 
34088 +                               load_TLS(&task->thread, cpu); 
34089 +                               asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
34090 +                       }
34091 +                       task->thread.fsindex = FS_TLS_SEL;
34092 +                       task->thread.fs = 0;
34093 +               } else { 
34094 +                       task->thread.fsindex = 0;
34095 +                       task->thread.fs = addr;
34096 +                       if (doit) {
34097 +                               /* set the selector to 0 to not confuse
34098 +                                  __switch_to */
34099 +                               asm volatile("movl %0,%%fs" :: "r" (0));
34100 +                                ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
34101 +                                                                 addr);
34102 +                       }
34103 +               }
34104 +               put_cpu();
34105 +               break;
34106 +       case ARCH_GET_FS: { 
34107 +               unsigned long base; 
34108 +               if (task->thread.fsindex == FS_TLS_SEL)
34109 +                       base = read_32bit_tls(task, FS_TLS);
34110 +               else if (doit)
34111 +                       rdmsrl(MSR_FS_BASE, base);
34112 +               else
34113 +                       base = task->thread.fs;
34114 +               ret = put_user(base, (unsigned long __user *)addr); 
34115 +               break; 
34116 +       }
34117 +       case ARCH_GET_GS: { 
34118 +               unsigned long base;
34119 +               if (task->thread.gsindex == GS_TLS_SEL)
34120 +                       base = read_32bit_tls(task, GS_TLS);
34121 +               else if (doit)
34122 +                       rdmsrl(MSR_KERNEL_GS_BASE, base);
34123 +               else
34124 +                       base = task->thread.gs;
34125 +               ret = put_user(base, (unsigned long __user *)addr); 
34126 +               break;
34127 +       }
34128 +
34129 +       default:
34130 +               ret = -EINVAL;
34131 +               break;
34132 +       } 
34133 +
34134 +       return ret;     
34135 +} 
34136 +
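The addr <= 0xffffffff split in do_arch_prctl() is a speed trade-off: a base that fits in 32 bits can live in a GDT TLS descriptor, which is cheap to reload on every context switch, while a full 64-bit base has to go through the segment-base path (a hypercall here, an MSR write natively). A sketch of just that routing decision; both set_* helpers are illustrative stubs:

    #include <stdio.h>

    static void set_base_via_gdt(unsigned long a)        /* fast: TLS slot    */
    { printf("GDT TLS descriptor, base=0x%lx\n", a); }

    static void set_base_via_hypercall(unsigned long a)  /* wide: 64-bit base */
    { printf("HYPERVISOR_set_segment_base, base=0x%lx\n", a); }

    static void set_fs_base(unsigned long addr)
    {
            if (addr <= 0xffffffffUL)
                    set_base_via_gdt(addr);        /* fits in a descriptor */
            else
                    set_base_via_hypercall(addr);  /* needs the wide path  */
    }

    int main(void)
    {
            set_fs_base(0x7fff0000UL);             /* small base: GDT route */
            set_fs_base(0x123456789abcUL);         /* large base: hypercall */
            return 0;
    }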
34137 +long sys_arch_prctl(int code, unsigned long addr)
34138 +{
34139 +       return do_arch_prctl(current, code, addr);
34140 +} 
34141 +
34142 +/* 
34143 + * Capture the user space registers if the task is not running (in user space)
34144 + */
34145 +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
34146 +{
34147 +       struct pt_regs *pp, ptregs;
34148 +
34149 +       pp = task_pt_regs(tsk);
34150 +
34151 +       ptregs = *pp; 
34152 +       ptregs.cs &= 0xffff;
34153 +       ptregs.ss &= 0xffff;
34154 +
34155 +       elf_core_copy_regs(regs, &ptregs);
34156 +
34157 +        boot_option_idle_override = 1;
34158 +       return 1;
34159 +}
34160 +
34161 +unsigned long arch_align_stack(unsigned long sp)
34162 +{
34163 +       if (randomize_va_space)
34164 +               sp -= get_random_int() % 8192;
34165 +       return sp & ~0xf;
34166 +}
34167 +
34168 +#ifndef CONFIG_SMP
34169 +void _restore_vcpu(void)
34170 +{
34171 +}
34172 +#endif
34173 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/setup-xen.c linux-2.6.16/arch/x86_64/kernel/setup-xen.c
34174 --- linux-2.6.16.orig/arch/x86_64/kernel/setup-xen.c    1970-01-01 01:00:00.000000000 +0100
34175 +++ linux-2.6.16/arch/x86_64/kernel/setup-xen.c 2006-06-26 09:51:32.000000000 +0200
34176 @@ -0,0 +1,1661 @@
34177 +/*
34178 + *  linux/arch/x86-64/kernel/setup.c
34179 + *
34180 + *  Copyright (C) 1995  Linus Torvalds
34181 + *
34182 + *  Nov 2001 Dave Jones <davej@suse.de>
34183 + *  Forked from i386 setup code.
34184 + *
34185 + *  $Id$
34186 + */
34187 +
34188 +/*
34189 + * This file handles the architecture-dependent parts of initialization
34190 + */
34191 +
34192 +#include <linux/errno.h>
34193 +#include <linux/sched.h>
34194 +#include <linux/kernel.h>
34195 +#include <linux/mm.h>
34196 +#include <linux/stddef.h>
34197 +#include <linux/unistd.h>
34198 +#include <linux/ptrace.h>
34199 +#include <linux/slab.h>
34200 +#include <linux/user.h>
34201 +#include <linux/a.out.h>
34202 +#include <linux/tty.h>
34203 +#include <linux/ioport.h>
34204 +#include <linux/delay.h>
34205 +#include <linux/config.h>
34206 +#include <linux/init.h>
34207 +#include <linux/initrd.h>
34208 +#include <linux/highmem.h>
34209 +#include <linux/bootmem.h>
34210 +#include <linux/module.h>
34211 +#include <asm/processor.h>
34212 +#include <linux/console.h>
34213 +#include <linux/seq_file.h>
34214 +#include <linux/crash_dump.h>
34215 +#include <linux/root_dev.h>
34216 +#include <linux/pci.h>
34217 +#include <linux/acpi.h>
34218 +#include <linux/kallsyms.h>
34219 +#include <linux/edd.h>
34220 +#include <linux/mmzone.h>
34221 +#include <linux/kexec.h>
34222 +#include <linux/cpufreq.h>
34223 +#include <linux/dmi.h>
34224 +#include <linux/dma-mapping.h>
34225 +
34226 +#include <asm/mtrr.h>
34227 +#include <asm/uaccess.h>
34228 +#include <asm/system.h>
34229 +#include <asm/io.h>
34230 +#include <asm/smp.h>
34231 +#include <asm/msr.h>
34232 +#include <asm/desc.h>
34233 +#include <video/edid.h>
34234 +#include <asm/e820.h>
34235 +#include <asm/dma.h>
34236 +#include <asm/mpspec.h>
34237 +#include <asm/mmu_context.h>
34238 +#include <asm/bootsetup.h>
34239 +#include <asm/proto.h>
34240 +#include <asm/setup.h>
34241 +#include <asm/mach_apic.h>
34242 +#include <asm/numa.h>
34243 +#include <asm/swiotlb.h>
34244 +#include <asm/sections.h>
34245 +#include <asm/gart-mapping.h>
34246 +#ifdef CONFIG_XEN
34247 +#include <linux/percpu.h>
34248 +#include <xen/interface/physdev.h>
34249 +#include "setup_arch_pre.h"
34250 +#include <asm/hypervisor.h>
34251 +#include <xen/interface/nmi.h>
34252 +#include <xen/features.h>
34253 +#define PFN_UP(x)       (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
34254 +#define PFN_PHYS(x)     ((x) << PAGE_SHIFT)
34255 +#define end_pfn_map end_pfn
34256 +#include <asm/mach-xen/setup_arch_post.h>
34257 +
34258 +extern unsigned long start_pfn;
34259 +extern struct edid_info edid_info;
34260 +
34261 +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
34262 +EXPORT_SYMBOL(HYPERVISOR_shared_info);
34263 +
34264 +extern char hypercall_page[PAGE_SIZE];
34265 +EXPORT_SYMBOL(hypercall_page);
34266 +
34267 +/* Allows setting of maximum possible memory size  */
34268 +unsigned long xen_override_max_pfn;
34269 +
34270 +static int xen_panic_event(struct notifier_block *, unsigned long, void *);
34271 +static struct notifier_block xen_panic_block = {
34272 +       xen_panic_event, NULL, 0 /* try to go last */
34273 +};
34274 +
34275 +unsigned long *phys_to_machine_mapping;
34276 +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512];
34277 +
34278 +EXPORT_SYMBOL(phys_to_machine_mapping);
34279 +
34280 +DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]);
34281 +DEFINE_PER_CPU(int, nr_multicall_ents);
34282 +
34283 +/* Raw start-of-day parameters from the hypervisor. */
34284 +start_info_t *xen_start_info;
34285 +EXPORT_SYMBOL(xen_start_info);
34286 +#endif
34287 +
34288 +/*
34289 + * Machine setup..
34290 + */
34291 +
34292 +struct cpuinfo_x86 boot_cpu_data __read_mostly;
34293 +
34294 +unsigned long mmu_cr4_features;
34295 +
34296 +int acpi_disabled;
34297 +EXPORT_SYMBOL(acpi_disabled);
34298 +#ifdef CONFIG_ACPI
34299 +extern int __initdata acpi_ht;
34300 +extern acpi_interrupt_flags    acpi_sci_flags;
34301 +int __initdata acpi_force = 0;
34302 +#endif
34303 +
34304 +int acpi_numa __initdata;
34305 +
34306 +/* Boot loader ID as an integer, for the benefit of proc_dointvec */
34307 +int bootloader_type;
34308 +
34309 +unsigned long saved_video_mode;
34310 +
34311 +/*
34312 + * Setup options
34313 + */
34314 +struct screen_info screen_info;
34315 +struct sys_desc_table_struct {
34316 +       unsigned short length;
34317 +       unsigned char table[0];
34318 +};
34319 +
34320 +struct edid_info edid_info;
34321 +struct e820map e820;
34322 +
34323 +extern int root_mountflags;
34324 +
34325 +char command_line[COMMAND_LINE_SIZE];
34326 +
34327 +struct resource standard_io_resources[] = {
34328 +       { .name = "dma1", .start = 0x00, .end = 0x1f,
34329 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34330 +       { .name = "pic1", .start = 0x20, .end = 0x21,
34331 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34332 +       { .name = "timer0", .start = 0x40, .end = 0x43,
34333 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34334 +       { .name = "timer1", .start = 0x50, .end = 0x53,
34335 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34336 +       { .name = "keyboard", .start = 0x60, .end = 0x6f,
34337 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34338 +       { .name = "dma page reg", .start = 0x80, .end = 0x8f,
34339 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34340 +       { .name = "pic2", .start = 0xa0, .end = 0xa1,
34341 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34342 +       { .name = "dma2", .start = 0xc0, .end = 0xdf,
34343 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
34344 +       { .name = "fpu", .start = 0xf0, .end = 0xff,
34345 +               .flags = IORESOURCE_BUSY | IORESOURCE_IO }
34346 +};
34347 +
34348 +#define STANDARD_IO_RESOURCES \
34349 +       (sizeof standard_io_resources / sizeof standard_io_resources[0])
34350 +
34351 +#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
34352 +
34353 +struct resource data_resource = {
34354 +       .name = "Kernel data",
34355 +       .start = 0,
34356 +       .end = 0,
34357 +       .flags = IORESOURCE_RAM,
34358 +};
34359 +struct resource code_resource = {
34360 +       .name = "Kernel code",
34361 +       .start = 0,
34362 +       .end = 0,
34363 +       .flags = IORESOURCE_RAM,
34364 +};
34365 +
34366 +#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM)
34367 +
34368 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
34369 +static struct resource system_rom_resource = {
34370 +       .name = "System ROM",
34371 +       .start = 0xf0000,
34372 +       .end = 0xfffff,
34373 +       .flags = IORESOURCE_ROM,
34374 +};
34375 +
34376 +static struct resource extension_rom_resource = {
34377 +       .name = "Extension ROM",
34378 +       .start = 0xe0000,
34379 +       .end = 0xeffff,
34380 +       .flags = IORESOURCE_ROM,
34381 +};
34382 +
34383 +static struct resource adapter_rom_resources[] = {
34384 +       { .name = "Adapter ROM", .start = 0xc8000, .end = 0,
34385 +               .flags = IORESOURCE_ROM },
34386 +       { .name = "Adapter ROM", .start = 0, .end = 0,
34387 +               .flags = IORESOURCE_ROM },
34388 +       { .name = "Adapter ROM", .start = 0, .end = 0,
34389 +               .flags = IORESOURCE_ROM },
34390 +       { .name = "Adapter ROM", .start = 0, .end = 0,
34391 +               .flags = IORESOURCE_ROM },
34392 +       { .name = "Adapter ROM", .start = 0, .end = 0,
34393 +               .flags = IORESOURCE_ROM },
34394 +       { .name = "Adapter ROM", .start = 0, .end = 0,
34395 +               .flags = IORESOURCE_ROM }
34396 +};
34397 +#endif
34398 +
34399 +#define ADAPTER_ROM_RESOURCES \
34400 +       (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
34401 +
34402 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
34403 +static struct resource video_rom_resource = {
34404 +       .name = "Video ROM",
34405 +       .start = 0xc0000,
34406 +       .end = 0xc7fff,
34407 +       .flags = IORESOURCE_ROM,
34408 +};
34409 +#endif
34410 +
34411 +static struct resource video_ram_resource = {
34412 +       .name = "Video RAM area",
34413 +       .start = 0xa0000,
34414 +       .end = 0xbffff,
34415 +       .flags = IORESOURCE_RAM,
34416 +};
34417 +
34418 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
34419 +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
34420 +
34421 +static int __init romchecksum(unsigned char *rom, unsigned long length)
34422 +{
34423 +       unsigned char *p, sum = 0;
34424 +
34425 +       for (p = rom; p < rom + length; p++)
34426 +               sum += *p;
34427 +       return sum == 0;
34428 +}
34429 +
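romchecksum() encodes the classic option-ROM validity rule: all bytes of the image must sum to zero modulo 256. A tiny worked example with a made-up four-byte image:

    #include <stdio.h>

    static int romchecksum(const unsigned char *rom, unsigned long length)
    {
            unsigned char sum = 0;
            for (unsigned long i = 0; i < length; i++)
                    sum += rom[i];
            return sum == 0;        /* valid iff bytes sum to 0 mod 256 */
    }

    int main(void)
    {
            /* 0x55 + 0xaa + 0x01 + 0x00 = 0x100 == 0 (mod 256): valid. */
            unsigned char rom[] = { 0x55, 0xaa, 0x01, 0x00 };
            printf("checksum ok: %d\n", romchecksum(rom, sizeof rom));
            return 0;
    }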
34430 +static void __init probe_roms(void)
34431 +{
34432 +       unsigned long start, length, upper;
34433 +       unsigned char *rom;
34434 +       int           i;
34435 +
34436 +       /* video rom */
34437 +       upper = adapter_rom_resources[0].start;
34438 +       for (start = video_rom_resource.start; start < upper; start += 2048) {
34439 +               rom = isa_bus_to_virt(start);
34440 +               if (!romsignature(rom))
34441 +                       continue;
34442 +
34443 +               video_rom_resource.start = start;
34444 +
34445 +               /* 0 < length <= 0x7f * 512, historically */
34446 +               length = rom[2] * 512;
34447 +
34448 +               /* if checksum okay, trust length byte */
34449 +               if (length && romchecksum(rom, length))
34450 +                       video_rom_resource.end = start + length - 1;
34451 +
34452 +               request_resource(&iomem_resource, &video_rom_resource);
34453 +               break;
34454 +       }
34455 +
34456 +       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
34457 +       if (start < upper)
34458 +               start = upper;
34459 +
34460 +       /* system rom */
34461 +       request_resource(&iomem_resource, &system_rom_resource);
34462 +       upper = system_rom_resource.start;
34463 +
34464 +       /* check for extension rom (ignore length byte!) */
34465 +       rom = isa_bus_to_virt(extension_rom_resource.start);
34466 +       if (romsignature(rom)) {
34467 +               length = extension_rom_resource.end - extension_rom_resource.start + 1;
34468 +               if (romchecksum(rom, length)) {
34469 +                       request_resource(&iomem_resource, &extension_rom_resource);
34470 +                       upper = extension_rom_resource.start;
34471 +               }
34472 +       }
34473 +
34474 +       /* check for adapter roms on 2k boundaries */
34475 +       for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
34476 +               rom = isa_bus_to_virt(start);
34477 +               if (!romsignature(rom))
34478 +                       continue;
34479 +
34480 +               /* 0 < length <= 0x7f * 512, historically */
34481 +               length = rom[2] * 512;
34482 +
34483 +               /* but accept any length that fits if checksum okay */
34484 +               if (!length || start + length > upper || !romchecksum(rom, length))
34485 +                       continue;
34486 +
34487 +               adapter_rom_resources[i].start = start;
34488 +               adapter_rom_resources[i].end = start + length - 1;
34489 +               request_resource(&iomem_resource, &adapter_rom_resources[i]);
34490 +
34491 +               start = adapter_rom_resources[i++].end & ~2047UL;
34492 +       }
34493 +}
34494 +#endif
34495 +
34496 +static __init void parse_cmdline_early (char ** cmdline_p)
34497 +{
34498 +       char c = ' ', *to = command_line, *from = COMMAND_LINE;
34499 +       int len = 0;
34500 +       int userdef = 0;
34501 +
34502 +       for (;;) {
34503 +               if (c != ' ') 
34504 +                       goto next_char; 
34505 +
34506 +#ifdef  CONFIG_SMP
34507 +               /*
34508 +                * If the BIOS enumerates physical processors before logical,
34509 +                * maxcpus=N at enumeration-time can be used to disable HT.
34510 +                */
34511 +               else if (!memcmp(from, "maxcpus=", 8)) {
34512 +                       extern unsigned int maxcpus;
34513 +
34514 +                       maxcpus = simple_strtoul(from + 8, NULL, 0);
34515 +               }
34516 +#endif
34517 +#ifdef CONFIG_ACPI
34518 +               /* "acpi=off" disables both ACPI table parsing and interpreter init */
34519 +               if (!memcmp(from, "acpi=off", 8))
34520 +                       disable_acpi();
34521 +
34522 +               if (!memcmp(from, "acpi=force", 10)) { 
34523 +                       /* add later when we do DMI horrors: */
34524 +                       acpi_force = 1;
34525 +                       acpi_disabled = 0;
34526 +               }
34527 +
34528 +               /* acpi=ht just means: do ACPI MADT parsing 
34529 +                  at bootup, but don't enable the full ACPI interpreter */
34530 +               if (!memcmp(from, "acpi=ht", 7)) { 
34531 +                       if (!acpi_force)
34532 +                               disable_acpi();
34533 +                       acpi_ht = 1; 
34534 +               }
34535 +                else if (!memcmp(from, "pci=noacpi", 10)) 
34536 +                       acpi_disable_pci();
34537 +               else if (!memcmp(from, "acpi=noirq", 10))
34538 +                       acpi_noirq_set();
34539 +
34540 +               else if (!memcmp(from, "acpi_sci=edge", 13))
34541 +                       acpi_sci_flags.trigger =  1;
34542 +               else if (!memcmp(from, "acpi_sci=level", 14))
34543 +                       acpi_sci_flags.trigger = 3;
34544 +               else if (!memcmp(from, "acpi_sci=high", 13))
34545 +                       acpi_sci_flags.polarity = 1;
34546 +               else if (!memcmp(from, "acpi_sci=low", 12))
34547 +                       acpi_sci_flags.polarity = 3;
34548 +
34549 +               /* acpi=strict disables out-of-spec workarounds */
34550 +               else if (!memcmp(from, "acpi=strict", 11)) {
34551 +                       acpi_strict = 1;
34552 +               }
34553 +#ifdef CONFIG_X86_IO_APIC
34554 +               else if (!memcmp(from, "acpi_skip_timer_override", 24))
34555 +                       acpi_skip_timer_override = 1;
34556 +#endif
34557 +#endif
34558 +
34559 +#ifndef CONFIG_XEN
34560 +               if (!memcmp(from, "nolapic", 7) ||
34561 +                   !memcmp(from, "disableapic", 11))
34562 +                       disable_apic = 1;
34563 +
34564 +               /* Don't confuse with noapictimer */
34565 +               if (!memcmp(from, "noapic", 6) &&
34566 +                       (from[6] == ' ' || from[6] == 0))
34567 +                       skip_ioapic_setup = 1;
34568 +
34569 +               /* Make sure to not confuse with apic= */
34570 +               if (!memcmp(from, "apic", 4) &&
34571 +                       (from[4] == ' ' || from[4] == 0)) {
34572 +                       skip_ioapic_setup = 0;
34573 +                       ioapic_force = 1;
34574 +               }
34575 +#endif
34576 +                       
34577 +               if (!memcmp(from, "mem=", 4))
34578 +                       parse_memopt(from+4, &from); 
34579 +
34580 +               if (!memcmp(from, "memmap=", 7)) {
34581 +                       /* the exactmap option is for user-defined memory */
34582 +                       if (!memcmp(from+7, "exactmap", 8)) {
34583 +#ifdef CONFIG_CRASH_DUMP
34584 +                               /* If we are doing a crash dump, we
34585 +                                * still need to know the real mem
34586 +                                * size before original memory map is
34587 +                                * reset.
34588 +                                */
34589 +                               saved_max_pfn = e820_end_of_ram();
34590 +#endif
34591 +                               from += 8+7;
34592 +                               end_pfn_map = 0;
34593 +                               e820.nr_map = 0;
34594 +                               userdef = 1;
34595 +                       }
34596 +                       else {
34597 +                               parse_memmapopt(from+7, &from);
34598 +                               userdef = 1;
34599 +                       }
34600 +               }
34601 +
34602 +#ifdef CONFIG_NUMA
34603 +               if (!memcmp(from, "numa=", 5))
34604 +                       numa_setup(from+5); 
34605 +#endif
34606 +
34607 +               if (!memcmp(from,"iommu=",6)) { 
34608 +                       iommu_setup(from+6); 
34609 +               }
34610 +
34611 +               if (!memcmp(from,"oops=panic", 10))
34612 +                       panic_on_oops = 1;
34613 +
34614 +               if (!memcmp(from, "noexec=", 7))
34615 +                       nonx_setup(from + 7);
34616 +
34617 +#ifdef CONFIG_KEXEC
34618 +               /* crashkernel=size@addr specifies the location to reserve for
34619 +                * a crash kernel.  By reserving this memory we guarantee
34620 +                * that Linux never sets it up as a DMA target.
34621 +                * Useful for holding code to do something appropriate
34622 +                * after a kernel panic.
34623 +                */
34624 +               else if (!memcmp(from, "crashkernel=", 12)) {
34625 +                       unsigned long size, base;
34626 +                       size = memparse(from+12, &from);
34627 +                       if (*from == '@') {
34628 +                               base = memparse(from+1, &from);
34629 +                               /* FIXME: Do I want a sanity check
34630 +                                * to validate the memory range?
34631 +                                */
34632 +                               crashk_res.start = base;
34633 +                               crashk_res.end   = base + size - 1;
34634 +                       }
34635 +               }
34636 +#endif
34637 +
34638 +#ifdef CONFIG_PROC_VMCORE
34639 +               /* elfcorehdr= specifies the location of elf core header
34640 +                * stored by the crashed kernel. This option will be passed
34641 +                * by kexec loader to the capture kernel.
34642 +                */
34643 +               else if(!memcmp(from, "elfcorehdr=", 11))
34644 +                       elfcorehdr_addr = memparse(from+11, &from);
34645 +#endif
34646 +
34647 +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
34648 +               else if (!memcmp(from, "additional_cpus=", 16))
34649 +                       setup_additional_cpus(from+16);
34650 +#endif
34651 +
34652 +       next_char:
34653 +               c = *(from++);
34654 +               if (!c)
34655 +                       break;
34656 +               if (COMMAND_LINE_SIZE <= ++len)
34657 +                       break;
34658 +               *(to++) = c;
34659 +       }
34660 +       if (userdef) {
34661 +               printk(KERN_INFO "user-defined physical RAM map:\n");
34662 +               e820_print_map("user");
34663 +       }
34664 +       *to = '\0';
34665 +       *cmdline_p = command_line;
34666 +}
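
The loop above implements a common early-boot pattern: options are matched only at word starts (the previous character was a space) while the line is simultaneously copied into a bounded buffer. A minimal userspace sketch of the same pattern, with a made-up command line and only two options handled:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define COMMAND_LINE_SIZE 256

int main(void)
{
        const char *from = "ro root=/dev/sda1 maxcpus=2 acpi=off";
        char command_line[COMMAND_LINE_SIZE];
        char c = ' ', *to = command_line;
        int len = 0;

        for (;;) {
                if (c == ' ') {         /* options match at word starts only */
                        if (!memcmp(from, "maxcpus=", 8))
                                printf("maxcpus -> %lu\n",
                                       strtoul(from + 8, NULL, 0));
                        else if (!memcmp(from, "acpi=off", 8))
                                printf("ACPI disabled\n");
                }
                c = *(from++);
                if (!c)
                        break;
                if (COMMAND_LINE_SIZE <= ++len)
                        break;
                *(to++) = c;            /* copy as we scan */
        }
        *to = '\0';
        printf("copied: %s\n", command_line);
        return 0;
}
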
34667 +
34668 +#ifndef CONFIG_NUMA
34669 +#ifdef CONFIG_XEN
34670 +static void __init
34671 +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
34672 +{
34673 +        unsigned long bootmap_size;
34674 +
34675 +        bootmap_size = init_bootmem(start_pfn, end_pfn);
34676 +        free_bootmem(0, xen_start_info->nr_pages << PAGE_SHIFT);   
34677 +        reserve_bootmem(HIGH_MEMORY,
34678 +                        (PFN_PHYS(start_pfn) + bootmap_size + PAGE_SIZE-1)
34679 +                        - HIGH_MEMORY);
34680 +}
34681 +#else
34682 +static void __init
34683 +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
34684 +{
34685 +       unsigned long bootmap_size, bootmap;
34686 +
34687 +       bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
34688 +       bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
34689 +       if (bootmap == -1L)
34690 +               panic("Cannot find bootmem map of size %ld\n",bootmap_size);
34691 +       bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
34692 +       e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
34693 +       reserve_bootmem(bootmap, bootmap_size);
34694 +} 
34695 +#endif /* !CONFIG_XEN */
34696 +#endif
34697 +
34698 +/* Use inline assembly to define this, because the nops are defined
34699 +   as inline assembly strings in the include files and we cannot
34700 +   easily get them into C strings. */
34701 +asm("\t.data\nk8nops: " 
34702 +    K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
34703 +    K8_NOP7 K8_NOP8); 
34704 +    
34705 +extern unsigned char k8nops[];
34706 +static unsigned char *k8_nops[ASM_NOP_MAX+1] = { 
34707 +     NULL,
34708 +     k8nops,
34709 +     k8nops + 1,
34710 +     k8nops + 1 + 2,
34711 +     k8nops + 1 + 2 + 3,
34712 +     k8nops + 1 + 2 + 3 + 4,
34713 +     k8nops + 1 + 2 + 3 + 4 + 5,
34714 +     k8nops + 1 + 2 + 3 + 4 + 5 + 6,
34715 +     k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
34716 +}; 
34717 +
34718 +extern char __vsyscall_0;
34719 +
34720 +/* Replace instructions with better alternatives for this CPU type.
34721 +
34722 +   This runs before SMP is initialized to avoid SMP problems with
34723 +   self-modifying code. This implies that asymmetric systems where
34724 +   APs have fewer capabilities than the boot processor are not handled.
34725 +   In that case boot with "noreplacement". */
34726 +void apply_alternatives(void *start, void *end) 
34727 +{ 
34728 +       struct alt_instr *a; 
34729 +       int diff, i, k;
34730 +       for (a = start; (void *)a < end; a++) { 
34731 +               u8 *instr;
34732 +
34733 +               if (!boot_cpu_has(a->cpuid))
34734 +                       continue;
34735 +
34736 +               BUG_ON(a->replacementlen > a->instrlen); 
34737 +               instr = a->instr;
34738 +               /* vsyscall code is not mapped yet. resolve it manually. */
34739 +               if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END)
34740 +                       instr -= VSYSCALL_START - (unsigned long)&__vsyscall_0;
34741 +               __inline_memcpy(instr, a->replacement, a->replacementlen);
34742 +               diff = a->instrlen - a->replacementlen; 
34743 +
34744 +               /* Pad the rest with nops */
34745 +               for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
34746 +                       k = diff;
34747 +                       if (k > ASM_NOP_MAX)
34748 +                               k = ASM_NOP_MAX;
34749 +                       __inline_memcpy(instr + i, k8_nops[k], k);
34750 +               } 
34751 +       }
34752 +} 
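
The padding loop at the end of apply_alternatives() relies on the layout of the k8_nops table above: k8nops is one concatenated byte string, so k8_nops[k] points at the k-byte nop sequence starting at offset 1+2+...+(k-1). A userspace sketch of the same arithmetic, with 0x90 standing in for the real K8 nop encodings:

#include <stdio.h>
#include <string.h>

#define ASM_NOP_MAX 8

static unsigned char k8nops[1 + 2 + 3 + 4 + 5 + 6 + 7 + 8];
static unsigned char *k8_nops[ASM_NOP_MAX + 1];

int main(void)
{
        unsigned char instr[16];
        int instrlen = 10, replacementlen = 3;
        int i, k, off, diff;

        memset(k8nops, 0x90, sizeof(k8nops));   /* fake nop bytes */
        for (k = 1, off = 0; k <= ASM_NOP_MAX; off += k, k++)
                k8_nops[k] = k8nops + off;      /* cumulative offsets */

        memset(instr, 0xcc, sizeof(instr));
        memset(instr, 0xf4, replacementlen);    /* "replacement" bytes */

        /* pad the leftover slot with nops, at most ASM_NOP_MAX at a time */
        diff = instrlen - replacementlen;
        for (i = replacementlen; diff > 0; diff -= k, i += k) {
                k = diff > ASM_NOP_MAX ? ASM_NOP_MAX : diff;
                memcpy(instr + i, k8_nops[k], k);
        }
        for (i = 0; i < instrlen; i++)
                printf("%02x ", instr[i]);
        printf("\n");
        return 0;
}
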
34753 +
34754 +static int no_replacement __initdata = 0; 
34755 +
34756 +void __init alternative_instructions(void)
34757 +{
34758 +       extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
34759 +       if (no_replacement) 
34760 +               return;
34761 +       apply_alternatives(__alt_instructions, __alt_instructions_end);
34762 +}
34763 +
34764 +static int __init noreplacement_setup(char *s)
34765 +{ 
34766 +     no_replacement = 1; 
34767 +     return 0; 
34768 +} 
34769 +
34770 +__setup("noreplacement", noreplacement_setup); 
34771 +
34772 +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
34773 +struct edd edd;
34774 +#ifdef CONFIG_EDD_MODULE
34775 +EXPORT_SYMBOL(edd);
34776 +#endif
34777 +/**
34778 + * copy_edd() - Copy the BIOS EDD information
34779 + *              from boot_params into a safe place.
34780 + *
34781 + */
34782 +static inline void copy_edd(void)
34783 +{
34784 +     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
34785 +     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
34786 +     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
34787 +     edd.edd_info_nr = EDD_NR;
34788 +}
34789 +#else
34790 +static inline void copy_edd(void)
34791 +{
34792 +}
34793 +#endif
34794 +
34795 +#ifndef CONFIG_XEN
34796 +#define EBDA_ADDR_POINTER 0x40E
34797 +static void __init reserve_ebda_region(void)
34798 +{
34799 +       unsigned int addr;
34800 +       /** 
34801 +        * there is a real-mode segmented pointer pointing to the 
34802 +        * 4K EBDA area at 0x40E
34803 +        */
34804 +       addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER);
34805 +       addr <<= 4;
34806 +       if (addr)
34807 +               reserve_bootmem_generic(addr, PAGE_SIZE);
34808 +}
34809 +#endif
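
reserve_ebda_region() decodes a classic real-mode pointer: the 16-bit word at physical address 0x40E is a segment value, so shifting it left by 4 gives the physical address of the 4K EBDA. A tiny sketch of the arithmetic; the segment value 0x9fc0 is a typical example, not taken from this patch:

#include <stdio.h>

int main(void)
{
        unsigned short seg = 0x9fc0;    /* what *(u16 *)0x40E might hold */
        unsigned int addr = (unsigned int)seg << 4;

        printf("EBDA at 0x%05x\n", addr);       /* 0x9fc00, just under 640K */
        return 0;
}
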
34810 +
34811 +void __init setup_arch(char **cmdline_p)
34812 +{
34813 +       unsigned long kernel_end;
34814 +
34815 +#ifdef CONFIG_XEN
34816 +       /* Register a call for panic conditions. */
34817 +       notifier_chain_register(&panic_notifier_list, &xen_panic_block);
34818 +
34819 +       ROOT_DEV = MKDEV(RAMDISK_MAJOR,0); 
34820 +       kernel_end = 0;         /* dummy */
34821 +       screen_info = SCREEN_INFO;
34822 +
34823 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
34824 +               /* This is drawn from a dump from vgacon:startup in
34825 +                * standard Linux. */
34826 +               screen_info.orig_video_mode = 3;
34827 +               screen_info.orig_video_isVGA = 1;
34828 +               screen_info.orig_video_lines = 25;
34829 +               screen_info.orig_video_cols = 80;
34830 +               screen_info.orig_video_ega_bx = 3;
34831 +               screen_info.orig_video_points = 16;
34832 +       } else
34833 +               screen_info.orig_video_isVGA = 0;
34834 +
34835 +       edid_info = EDID_INFO;
34836 +       saved_video_mode = SAVED_VIDEO_MODE;
34837 +       bootloader_type = LOADER_TYPE;
34838 +
34839 +#ifdef CONFIG_BLK_DEV_RAM
34840 +       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
34841 +       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
34842 +       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
34843 +
34844 +
34845 +#endif
34846 +
34847 +       setup_xen_features();
34848 +
34849 +       if (xen_feature(XENFEAT_auto_translated_physmap) &&
34850 +           xen_start_info->shared_info < xen_start_info->nr_pages) {
34851 +               HYPERVISOR_shared_info =
34852 +                       (shared_info_t *)__va(xen_start_info->shared_info);
34853 +               memset(empty_zero_page, 0, sizeof(empty_zero_page));
34854 +       }
34855 +
34856 +       HYPERVISOR_vm_assist(VMASST_CMD_enable,
34857 +                            VMASST_TYPE_writable_pagetables);
34858 +
34859 +       ARCH_SETUP
34860 +#else
34861 +       ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
34862 +       screen_info = SCREEN_INFO;
34863 +       edid_info = EDID_INFO;
34864 +       saved_video_mode = SAVED_VIDEO_MODE;
34865 +       bootloader_type = LOADER_TYPE;
34866 +
34867 +#ifdef CONFIG_BLK_DEV_RAM
34868 +       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
34869 +       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
34870 +       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
34871 +#endif
34872 +       setup_memory_region();
34873 +       copy_edd();
34874 +#endif /* !CONFIG_XEN */
34875 +
34876 +       if (!MOUNT_ROOT_RDONLY)
34877 +               root_mountflags &= ~MS_RDONLY;
34878 +       init_mm.start_code = (unsigned long) &_text;
34879 +       init_mm.end_code = (unsigned long) &_etext;
34880 +       init_mm.end_data = (unsigned long) &_edata;
34881 +#ifdef CONFIG_XEN
34882 +       init_mm.brk = start_pfn << PAGE_SHIFT;
34883 +#else
34884 +       init_mm.brk = (unsigned long) &_end;    
34885 +
34886 +       code_resource.start = virt_to_phys(&_text);
34887 +       code_resource.end = virt_to_phys(&_etext)-1;
34888 +       data_resource.start = virt_to_phys(&_etext);
34889 +       data_resource.end = virt_to_phys(&_edata)-1;
34890 +#endif
34891 +
34892 +       parse_cmdline_early(cmdline_p);
34893 +
34894 +       early_identify_cpu(&boot_cpu_data);
34895 +
34896 +       /*
34897 +        * partially used pages are not usable - thus
34898 +        * we are rounding upwards:
34899 +        */
34900 +       end_pfn = e820_end_of_ram();
34901 +
34902 +       check_efer();
34903 +
34904 +       init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
34905 +
34906 +#ifdef CONFIG_ACPI_NUMA
34907 +       /*
34908 +        * Parse SRAT to discover nodes.
34909 +        */
34910 +       acpi_numa_init();
34911 +#endif
34912 +
34913 +#ifdef CONFIG_NUMA
34914 +       numa_initmem_init(start_pfn, end_pfn); 
34915 +#else
34916 +       contig_initmem_init(start_pfn, end_pfn);
34917 +#endif
34918 +
34919 +#ifndef CONFIG_XEN
34920 +       /* Reserve direct mapping */
34921 +       reserve_bootmem_generic(table_start << PAGE_SHIFT, 
34922 +                               (table_end - table_start) << PAGE_SHIFT);
34923 +
34924 +       /* reserve kernel */
34925 +       kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
34926 +       reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
34927 +
34928 +       /*
34929 +        * reserve physical page 0 - it's a special BIOS page on many boxes,
34930 +        * enabling clean reboots, SMP operation, laptop functions.
34931 +        */
34932 +       reserve_bootmem_generic(0, PAGE_SIZE);
34933 +
34934 +       /* reserve ebda region */
34935 +       reserve_ebda_region();
34936 +#endif
34937 +
34938 +#ifdef CONFIG_SMP
34939 +       /*
34940 +        * But first pinch a few for the stack/trampoline stuff
34941 +        * FIXME: Don't need the extra page at 4K, but need to fix
34942 +        * trampoline before removing it. (see the GDT stuff)
34943 +        */
34944 +       reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
34945 +
34946 +       /* Reserve SMP trampoline */
34947 +       reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
34948 +#endif
34949 +
34950 +#ifdef CONFIG_ACPI_SLEEP
34951 +       /*
34952 +        * Reserve low memory region for sleep support.
34953 +        */
34954 +       acpi_reserve_bootmem();
34955 +#endif
34956 +#ifdef CONFIG_XEN
34957 +#ifdef CONFIG_BLK_DEV_INITRD
34958 +       if (xen_start_info->mod_start) {
34959 +               if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
34960 +                       /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/
34961 +                       initrd_start = INITRD_START + PAGE_OFFSET;
34962 +                       initrd_end = initrd_start+INITRD_SIZE;
34963 +                       initrd_below_start_ok = 1;
34964 +               } else {
34965 +                       printk(KERN_ERR "initrd extends beyond end of memory "
34966 +                               "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
34967 +                               (unsigned long)(INITRD_START + INITRD_SIZE),
34968 +                               (unsigned long)(end_pfn << PAGE_SHIFT));
34969 +                       initrd_start = 0;
34970 +               }
34971 +       }
34972 +#endif
34973 +#else  /* CONFIG_XEN */
34974 +#ifdef CONFIG_BLK_DEV_INITRD
34975 +       if (LOADER_TYPE && INITRD_START) {
34976 +               if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
34977 +                       reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
34978 +                       initrd_start =
34979 +                               INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
34980 +                       initrd_end = initrd_start+INITRD_SIZE;
34981 +               }
34982 +               else {
34983 +                       printk(KERN_ERR "initrd extends beyond end of memory "
34984 +                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
34985 +                           (unsigned long)(INITRD_START + INITRD_SIZE),
34986 +                           (unsigned long)(end_pfn << PAGE_SHIFT));
34987 +                       initrd_start = 0;
34988 +               }
34989 +       }
34990 +#endif
34991 +#endif /* !CONFIG_XEN */
34992 +#ifdef CONFIG_KEXEC
34993 +       if (crashk_res.start != crashk_res.end) {
34994 +               reserve_bootmem(crashk_res.start,
34995 +                       crashk_res.end - crashk_res.start + 1);
34996 +       }
34997 +#endif
34998 +
34999 +       paging_init();
35000 +#ifdef CONFIG_X86_LOCAL_APIC
35001 +       /*
35002 +        * Find and reserve possible boot-time SMP configuration:
35003 +        */
35004 +       find_smp_config();
35005 +#endif
35006 +#ifdef CONFIG_XEN
35007 +       {
35008 +               int i, j, k, fpp;
35009 +               unsigned long va;
35010 +
35011 +               /* 'Initial mapping' of initrd must be destroyed. */
35012 +               for (va = xen_start_info->mod_start;
35013 +                    va < (xen_start_info->mod_start+xen_start_info->mod_len);
35014 +                    va += PAGE_SIZE) {
35015 +                       HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
35016 +               }
35017 +
35018 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
35019 +                       /* Make sure we have a large enough P->M table. */
35020 +                       phys_to_machine_mapping = alloc_bootmem(
35021 +                               end_pfn * sizeof(unsigned long));
35022 +                       memset(phys_to_machine_mapping, ~0,
35023 +                              end_pfn * sizeof(unsigned long));
35024 +                       memcpy(phys_to_machine_mapping,
35025 +                              (unsigned long *)xen_start_info->mfn_list,
35026 +                              xen_start_info->nr_pages * sizeof(unsigned long));
35027 +                       free_bootmem(
35028 +                               __pa(xen_start_info->mfn_list),
35029 +                               PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
35030 +                                               sizeof(unsigned long))));
35031 +
35032 +                       /* Destroy the 'initial mapping' of the old p2m table. */
35033 +                       for (va = xen_start_info->mfn_list;
35034 +                            va < (xen_start_info->mfn_list +
35035 +                                  (xen_start_info->nr_pages*sizeof(unsigned long)));
35036 +                            va += PAGE_SIZE) {
35037 +                               HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
35038 +                       }
35039 +
35040 +                       /*
35041 +                        * Initialise the frame-list-list: the list of frames
35042 +                        * holding the frames that make up the p2m table.
35043 +                        * Used by save/restore.
35044 +                        */
35045 +                       pfn_to_mfn_frame_list_list = alloc_bootmem(PAGE_SIZE);
35046 +                       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
35047 +                               virt_to_mfn(pfn_to_mfn_frame_list_list);
35048 +
35049 +                       fpp = PAGE_SIZE/sizeof(unsigned long);
35050 +                       for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) {
35051 +                               if ((j % fpp) == 0) {
35052 +                                       k++;
35053 +                                       BUG_ON(k>=fpp);
35054 +                                       pfn_to_mfn_frame_list[k] =
35055 +                                               alloc_bootmem(PAGE_SIZE);
35056 +                                       pfn_to_mfn_frame_list_list[k] =
35057 +                                               virt_to_mfn(pfn_to_mfn_frame_list[k]);
35058 +                                       j=0;
35059 +                               }
35060 +                               pfn_to_mfn_frame_list[k][j] =
35061 +                                       virt_to_mfn(&phys_to_machine_mapping[i]);
35062 +                       }
35063 +                       HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
35064 +               }
35065 +
35066 +       }
35067 +
35068 +       if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
35070 +               acpi_disabled = 1;
35071 +#ifdef  CONFIG_ACPI
35072 +               acpi_ht = 0;
35073 +#endif
35074 +       }
35075 +#endif
35076 +
35077 +#ifndef CONFIG_XEN
35078 +       check_ioapic();
35079 +#endif
35080 +
35081 +       zap_low_mappings(0);
35082 +
35083 +#ifdef CONFIG_ACPI
35084 +       /*
35085 +        * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
35086 +        * Call this early for SRAT node setup.
35087 +        */
35088 +       acpi_boot_table_init();
35089 +
35090 +       /*
35091 +        * Read APIC and some other early information from ACPI tables.
35092 +        */
35093 +       acpi_boot_init();
35094 +#endif
35095 +
35096 +       init_cpu_to_node();
35097 +
35098 +#ifdef CONFIG_X86_LOCAL_APIC
35099 +       /*
35100 +        * get boot-time SMP configuration:
35101 +        */
35102 +       if (smp_found_config)
35103 +               get_smp_config();
35104 +#ifndef CONFIG_XEN
35105 +       init_apic_mappings();
35106 +#endif
35107 +#endif
35108 +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
35109 +       prefill_possible_map();
35110 +#endif
35111 +
35112 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN)
35113 +       /*
35114 +        * Request address space for all standard RAM and ROM resources
35115 +        * and also for regions reported as reserved by the e820.
35116 +        */
35117 +       probe_roms();
35118 +       e820_reserve_resources(); 
35119 +#endif
35120 +
35121 +       request_resource(&iomem_resource, &video_ram_resource);
35122 +
35123 +       {
35124 +       unsigned i;
35125 +       /* request I/O space for devices used on all i[345]86 PCs */
35126 +       for (i = 0; i < STANDARD_IO_RESOURCES; i++)
35127 +               request_resource(&ioport_resource, &standard_io_resources[i]);
35128 +       }
35129 +
35130 +       e820_setup_gap();
35131 +
35132 +#ifdef CONFIG_GART_IOMMU
35133 +       iommu_hole_init();
35134 +#endif
35135 +
35136 +#ifdef CONFIG_XEN
35137 +       {
35138 +               physdev_op_t op;
35139 +
35140 +               op.cmd             = PHYSDEVOP_SET_IOPL;
35141 +               op.u.set_iopl.iopl = 1;
35142 +               HYPERVISOR_physdev_op(&op);
35143 +
35144 +               if (xen_start_info->flags & SIF_INITDOMAIN) {
35145 +                       if (!(xen_start_info->flags & SIF_PRIVILEGED))
35146 +                               panic("Xen granted us console access "
35147 +                                     "but not privileged status");
35148 +                      
35149 +#ifdef CONFIG_VT
35150 +#if defined(CONFIG_VGA_CONSOLE)
35151 +                       conswitchp = &vga_con;
35152 +#elif defined(CONFIG_DUMMY_CONSOLE)
35153 +                       conswitchp = &dummy_con;
35154 +#endif
35155 +#endif
35156 +               } else {
35157 +                       extern int console_use_vt;
35158 +                       console_use_vt = 0;
35159 +               }
35160 +       }
35161 +#else  /* CONFIG_XEN */
35162 +
35163 +#ifdef CONFIG_VT
35164 +#if defined(CONFIG_VGA_CONSOLE)
35165 +       conswitchp = &vga_con;
35166 +#elif defined(CONFIG_DUMMY_CONSOLE)
35167 +       conswitchp = &dummy_con;
35168 +#endif
35169 +#endif
35170 +
35171 +#endif /* !CONFIG_XEN */
35172 +}
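
For reference, the CONFIG_XEN block in setup_arch() above builds a three-level structure for save/restore: pages of p2m entries, pages of frame numbers (pfn_to_mfn_frame_list[]) pointing at them, and a single frame-list-list page on top. A sketch of the capacity arithmetic, assuming a 64-bit build with 4K pages and 8-byte entries (fpp = 512); the numbers are derived, not measured:

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096;
        unsigned long fpp = page_size / sizeof(unsigned long);  /* 512 */
        unsigned long max_pages = fpp * fpp * fpp;              /* 3 levels */

        printf("entries per frame: %lu\n", fpp);
        printf("max pages covered: %lu (%lu GB)\n",
               max_pages, max_pages * page_size >> 30);
        return 0;
}
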
35173 +
35174 +#ifdef CONFIG_XEN
35175 +static int
35176 +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
35177 +{
35178 +       HYPERVISOR_shutdown(SHUTDOWN_crash);
35179 +       /* we're never actually going to get here... */
35180 +       return NOTIFY_DONE;
35181 +}
35182 +#endif /* CONFIG_XEN */
35183 +
35184 +
35185 +static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
35186 +{
35187 +       unsigned int *v;
35188 +
35189 +       if (c->extended_cpuid_level < 0x80000004)
35190 +               return 0;
35191 +
35192 +       v = (unsigned int *) c->x86_model_id;
35193 +       cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
35194 +       cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
35195 +       cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
35196 +       c->x86_model_id[48] = 0;
35197 +       return 1;
35198 +}
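
get_model_name() assembles the 48-byte processor brand string from extended CPUID leaves 0x80000002..0x80000004, 16 bytes per leaf. A userspace sketch of the same reads using GCC's <cpuid.h> (x86 only):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int v[12];     /* 48 bytes, as in x86_model_id */

        if (__get_cpuid_max(0x80000000, 0) < 0x80000004)
                return 1;       /* brand string not supported */
        __get_cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
        __get_cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
        __get_cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
        printf("%.48s\n", (char *)v);
        return 0;
}
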
35199 +
35200 +
35201 +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
35202 +{
35203 +       unsigned int n, dummy, eax, ebx, ecx, edx;
35204 +
35205 +       n = c->extended_cpuid_level;
35206 +
35207 +       if (n >= 0x80000005) {
35208 +               cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
35209 +               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
35210 +                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
35211 +               c->x86_cache_size=(ecx>>24)+(edx>>24);
35212 +               /* On K8 L1 TLB is inclusive, so don't count it */
35213 +               c->x86_tlbsize = 0;
35214 +       }
35215 +
35216 +       if (n >= 0x80000006) {
35217 +               cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
35218 +               ecx = cpuid_ecx(0x80000006);
35219 +               c->x86_cache_size = ecx >> 16;
35220 +               c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
35221 +
35222 +               printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
35223 +               c->x86_cache_size, ecx & 0xFF);
35224 +       }
35225 +
35226 +       if (n >= 0x80000007)
35227 +               cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); 
35228 +       if (n >= 0x80000008) {
35229 +               cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); 
35230 +               c->x86_virt_bits = (eax >> 8) & 0xff;
35231 +               c->x86_phys_bits = eax & 0xff;
35232 +       }
35233 +}
35234 +
35235 +#ifdef CONFIG_NUMA
35236 +static int nearby_node(int apicid)
35237 +{
35238 +       int i;
35239 +       for (i = apicid - 1; i >= 0; i--) {
35240 +               int node = apicid_to_node[i];
35241 +               if (node != NUMA_NO_NODE && node_online(node))
35242 +                       return node;
35243 +       }
35244 +       for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
35245 +               int node = apicid_to_node[i];
35246 +               if (node != NUMA_NO_NODE && node_online(node))
35247 +                       return node;
35248 +       }
35249 +       return first_node(node_online_map); /* Shouldn't happen */
35250 +}
35251 +#endif
35252 +
35253 +/*
35254 + * On an AMD dual-core setup, the lower bits of the APIC id distinguish the cores.
35255 + * Assumes the number of cores is a power of two.
35256 + */
35257 +static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
35258 +{
35259 +#ifdef CONFIG_SMP
35260 +       int cpu = smp_processor_id();
35261 +       unsigned bits;
35262 +#ifdef CONFIG_NUMA
35263 +       int node = 0;
35264 +       unsigned apicid = phys_proc_id[cpu];
35265 +#endif
35266 +
35267 +       bits = 0;
35268 +       while ((1 << bits) < c->x86_max_cores)
35269 +               bits++;
35270 +
35271 +       /* Low order bits define the core id (index of core in socket) */
35272 +       cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1);
35273 +       /* Convert the APIC ID into the socket ID */
35274 +       phys_proc_id[cpu] >>= bits;
35275 +
35276 +#ifdef CONFIG_NUMA
35277 +       node = phys_proc_id[cpu];
35278 +       if (apicid_to_node[apicid] != NUMA_NO_NODE)
35279 +               node = apicid_to_node[apicid];
35280 +       if (!node_online(node)) {
35281 +               /* Two possibilities here:
35282 +                  - The CPU is missing memory and no node was created.
35283 +                  In that case try picking one from a nearby CPU
35284 +                  - The APIC IDs differ from the HyperTransport node IDs
35285 +                  which the K8 northbridge parsing fills in.
35286 +                  Assume they are all increased by a constant offset,
35287 +                  but in the same order as the HT nodeids.
35288 +                  If that doesn't result in a usable node fall back to the
35289 +                  path for the previous case.  */
35290 +               int ht_nodeid = apicid - (phys_proc_id[0] << bits);
35291 +               if (ht_nodeid >= 0 &&
35292 +                   apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
35293 +                       node = apicid_to_node[ht_nodeid];
35294 +               /* Pick a nearby node */
35295 +               if (!node_online(node))
35296 +                       node = nearby_node(apicid);
35297 +       }
35298 +       numa_set_node(cpu, node);
35299 +
35300 +       printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
35301 +                       cpu, c->x86_max_cores, node, cpu_core_id[cpu]);
35302 +#endif
35303 +#endif
35304 +}
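
The core/socket split in amd_detect_cmp() is plain bit arithmetic: with a power-of-two core count, the low log2(cores) bits of the initial APIC id select the core and the remaining bits the socket. A sketch with made-up example values:

#include <stdio.h>

int main(void)
{
        unsigned max_cores = 2, apicid = 5, bits = 0;

        while ((1u << bits) < max_cores)        /* bits = log2(cores) */
                bits++;
        printf("apicid %u -> core %u, socket %u\n", apicid,
               apicid & ((1u << bits) - 1), apicid >> bits);
        return 0;
}
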
35305 +
35306 +static int __init init_amd(struct cpuinfo_x86 *c)
35307 +{
35308 +       int r;
35309 +       unsigned level;
35310 +
35311 +#ifdef CONFIG_SMP
35312 +       unsigned long value;
35313 +
35314 +       /*
35315 +        * Disable TLB flush filter by setting HWCR.FFDIS on K8
35316 +        * bit 6 of msr C001_0015
35317 +        *
35318 +        * Errata 63 for SH-B3 steppings
35319 +        * Errata 122 for all steppings (F+ have it disabled by default)
35320 +        */
35321 +       if (c->x86 == 15) {
35322 +               rdmsrl(MSR_K8_HWCR, value);
35323 +               value |= 1 << 6;
35324 +               wrmsrl(MSR_K8_HWCR, value);
35325 +       }
35326 +#endif
35327 +
35328 +       /* Bit 31 in normal CPUID is used for a nonstandard 3DNow! ID;
35329 +          3DNow! is identified by bit 31 in extended CPUID (1*32+31) anyway */
35330 +       clear_bit(0*32+31, &c->x86_capability);
35331 +       
35332 +       /* On C+ stepping K8 rep microcode works well for copy/memset */
35333 +       level = cpuid_eax(1);
35334 +       if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
35335 +               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
35336 +
35337 +       r = get_model_name(c);
35338 +       if (!r) { 
35339 +               switch (c->x86) { 
35340 +               case 15:
35341 +                       /* Should distinguish models here, but this is only
35342 +                          a fallback anyway. */
35343 +                       strcpy(c->x86_model_id, "Hammer");
35344 +                       break; 
35345 +               } 
35346 +       } 
35347 +       display_cacheinfo(c);
35348 +
35349 +       /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
35350 +       if (c->x86_power & (1<<8))
35351 +               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
35352 +
35353 +       if (c->extended_cpuid_level >= 0x80000008) {
35354 +               c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
35355 +               if (c->x86_max_cores & (c->x86_max_cores - 1))
35356 +                       c->x86_max_cores = 1;
35357 +
35358 +               amd_detect_cmp(c);
35359 +       }
35360 +
35361 +       return r;
35362 +}
35363 +
35364 +static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
35365 +{
35366 +#ifdef CONFIG_SMP
35367 +       u32     eax, ebx, ecx, edx;
35368 +       int     index_msb, core_bits;
35369 +       int     cpu = smp_processor_id();
35370 +
35371 +       cpuid(1, &eax, &ebx, &ecx, &edx);
35372 +
35373 +       c->apicid = phys_pkg_id(0);
35374 +
35375 +       if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
35376 +               return;
35377 +
35378 +       smp_num_siblings = (ebx & 0xff0000) >> 16;
35379 +
35380 +       if (smp_num_siblings == 1) {
35381 +               printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
35382 +       } else if (smp_num_siblings > 1 ) {
35383 +
35384 +               if (smp_num_siblings > NR_CPUS) {
35385 +                       printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings);
35386 +                       smp_num_siblings = 1;
35387 +                       return;
35388 +               }
35389 +
35390 +               index_msb = get_count_order(smp_num_siblings);
35391 +               phys_proc_id[cpu] = phys_pkg_id(index_msb);
35392 +
35393 +               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
35394 +                      phys_proc_id[cpu]);
35395 +
35396 +               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
35397 +
35398 +               index_msb = get_count_order(smp_num_siblings);
35399 +
35400 +               core_bits = get_count_order(c->x86_max_cores);
35401 +
35402 +               cpu_core_id[cpu] = phys_pkg_id(index_msb) &
35403 +                                              ((1 << core_bits) - 1);
35404 +
35405 +               if (c->x86_max_cores > 1)
35406 +                       printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
35407 +                              cpu_core_id[cpu]);
35408 +       }
35409 +#endif
35410 +}
35411 +
35412 +/*
35413 + * find out the number of processor cores on the die
35414 + */
35415 +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
35416 +{
35417 +       unsigned int eax;
35418 +
35419 +       if (c->cpuid_level < 4)
35420 +               return 1;
35421 +
35422 +       __asm__("cpuid"
35423 +               : "=a" (eax)
35424 +               : "0" (4), "c" (0)
35425 +               : "bx", "dx");
35426 +
35427 +       if (eax & 0x1f)
35428 +               return ((eax >> 26) + 1);
35429 +       else
35430 +               return 1;
35431 +}
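
intel_num_cpu_cores() issues CPUID leaf 4 with ECX=0 and reads "cores per package minus one" from EAX[31:26]. A userspace sketch using GCC's <cpuid.h>, falling back to 1 when the leaf is not implemented, as the kernel code does:

#include <stdio.h>
#include <cpuid.h>

static unsigned num_cpu_cores(void)
{
        unsigned eax, ebx, ecx, edx;

        if (__get_cpuid_max(0, 0) < 4)
                return 1;
        __cpuid_count(4, 0, eax, ebx, ecx, edx);
        (void)ebx; (void)ecx; (void)edx;
        /* EAX[4:0] is the cache type; 0 means the leaf is empty */
        return (eax & 0x1f) ? (eax >> 26) + 1 : 1;
}

int main(void)
{
        printf("cores per package: %u\n", num_cpu_cores());
        return 0;
}
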
35432 +
35433 +static void srat_detect_node(void)
35434 +{
35435 +#ifdef CONFIG_NUMA
35436 +       unsigned node;
35437 +       int cpu = smp_processor_id();
35438 +
35439 +       /* For now, don't do the funky fallback heuristics that the
35440 +          AMD version employs. */
35441 +       node = apicid_to_node[hard_smp_processor_id()];
35442 +       if (node == NUMA_NO_NODE)
35443 +               node = 0;
35444 +       numa_set_node(cpu, node);
35445 +
35446 +       if (acpi_numa > 0)
35447 +               printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node);
35448 +#endif
35449 +}
35450 +
35451 +static void __cpuinit init_intel(struct cpuinfo_x86 *c)
35452 +{
35453 +       /* Cache sizes */
35454 +       unsigned n;
35455 +
35456 +       init_intel_cacheinfo(c);
35457 +       n = c->extended_cpuid_level;
35458 +       if (n >= 0x80000008) {
35459 +               unsigned eax = cpuid_eax(0x80000008);
35460 +               c->x86_virt_bits = (eax >> 8) & 0xff;
35461 +               c->x86_phys_bits = eax & 0xff;
35462 +               /* CPUID workaround for Intel 0F34 CPU */
35463 +               if (c->x86_vendor == X86_VENDOR_INTEL &&
35464 +                   c->x86 == 0xF && c->x86_model == 0x3 &&
35465 +                   c->x86_mask == 0x4)
35466 +                       c->x86_phys_bits = 36;
35467 +       }
35468 +
35469 +       if (c->x86 == 15)
35470 +               c->x86_cache_alignment = c->x86_clflush_size * 2;
35471 +       if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
35472 +           (c->x86 == 0x6 && c->x86_model >= 0x0e))
35473 +               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
35474 +       set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
35475 +       c->x86_max_cores = intel_num_cpu_cores(c);
35476 +
35477 +       srat_detect_node();
35478 +}
35479 +
35480 +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
35481 +{
35482 +       char *v = c->x86_vendor_id;
35483 +
35484 +       if (!strcmp(v, "AuthenticAMD"))
35485 +               c->x86_vendor = X86_VENDOR_AMD;
35486 +       else if (!strcmp(v, "GenuineIntel"))
35487 +               c->x86_vendor = X86_VENDOR_INTEL;
35488 +       else
35489 +               c->x86_vendor = X86_VENDOR_UNKNOWN;
35490 +}
35491 +
35492 +struct cpu_model_info {
35493 +       int vendor;
35494 +       int family;
35495 +       char *model_names[16];
35496 +};
35497 +
35498 +/* Do some early cpuid on the boot CPU to get some parameters that are
35499 +   needed before check_bugs. Everything advanced is in identify_cpu
35500 +   below. */
35501 +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
35502 +{
35503 +       u32 tfms;
35504 +
35505 +       c->loops_per_jiffy = loops_per_jiffy;
35506 +       c->x86_cache_size = -1;
35507 +       c->x86_vendor = X86_VENDOR_UNKNOWN;
35508 +       c->x86_model = c->x86_mask = 0; /* So far unknown... */
35509 +       c->x86_vendor_id[0] = '\0'; /* Unset */
35510 +       c->x86_model_id[0] = '\0';  /* Unset */
35511 +       c->x86_clflush_size = 64;
35512 +       c->x86_cache_alignment = c->x86_clflush_size;
35513 +       c->x86_max_cores = 1;
35514 +       c->extended_cpuid_level = 0;
35515 +       memset(&c->x86_capability, 0, sizeof c->x86_capability);
35516 +
35517 +       /* Get vendor name */
35518 +       cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
35519 +             (unsigned int *)&c->x86_vendor_id[0],
35520 +             (unsigned int *)&c->x86_vendor_id[8],
35521 +             (unsigned int *)&c->x86_vendor_id[4]);
35522 +               
35523 +       get_cpu_vendor(c);
35524 +
35525 +       /* Initialize the standard set of capabilities */
35526 +       /* Note that the vendor-specific code below might override */
35527 +
35528 +       /* Intel-defined flags: level 0x00000001 */
35529 +       if (c->cpuid_level >= 0x00000001) {
35530 +               __u32 misc;
35531 +               cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
35532 +                     &c->x86_capability[0]);
35533 +               c->x86 = (tfms >> 8) & 0xf;
35534 +               c->x86_model = (tfms >> 4) & 0xf;
35535 +               c->x86_mask = tfms & 0xf;
35536 +               if (c->x86 == 0xf)
35537 +                       c->x86 += (tfms >> 20) & 0xff;
35538 +               if (c->x86 >= 0x6)
35539 +                       c->x86_model += ((tfms >> 16) & 0xF) << 4;
35540 +               if (c->x86_capability[0] & (1<<19)) 
35541 +                       c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
35542 +       } else {
35543 +               /* Have CPUID level 0 only - unheard of */
35544 +               c->x86 = 4;
35545 +       }
35546 +
35547 +#ifdef CONFIG_SMP
35548 +       phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
35549 +#endif
35550 +}
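
The family/model decoding in early_identify_cpu() composes the base and extended CPUID.1 EAX fields: the extended family is added only when the base family is 0xf, and the extended model is prepended for family >= 6. A sketch applied to a sample EAX value (0x00020f4a is made up for illustration):

#include <stdio.h>

int main(void)
{
        unsigned tfms = 0x00020f4a;     /* example CPUID.1 EAX */
        unsigned family = (tfms >> 8) & 0xf;
        unsigned model = (tfms >> 4) & 0xf;
        unsigned stepping = tfms & 0xf;

        if (family == 0xf)
                family += (tfms >> 20) & 0xff;  /* extended family */
        if (family >= 0x6)
                model += ((tfms >> 16) & 0xf) << 4;     /* extended model */
        printf("family %u model %u stepping %u\n", family, model, stepping);
        return 0;
}
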
35551 +
35552 +/*
35553 + * This does the hard work of actually picking apart the CPU stuff...
35554 + */
35555 +void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
35556 +{
35557 +       int i;
35558 +       u32 xlvl;
35559 +
35560 +       early_identify_cpu(c);
35561 +
35562 +       /* AMD-defined flags: level 0x80000001 */
35563 +       xlvl = cpuid_eax(0x80000000);
35564 +       c->extended_cpuid_level = xlvl;
35565 +       if ((xlvl & 0xffff0000) == 0x80000000) {
35566 +               if (xlvl >= 0x80000001) {
35567 +                       c->x86_capability[1] = cpuid_edx(0x80000001);
35568 +                       c->x86_capability[6] = cpuid_ecx(0x80000001);
35569 +               }
35570 +               if (xlvl >= 0x80000004)
35571 +                       get_model_name(c); /* Default name */
35572 +       }
35573 +
35574 +       /* Transmeta-defined flags: level 0x80860001 */
35575 +       xlvl = cpuid_eax(0x80860000);
35576 +       if ((xlvl & 0xffff0000) == 0x80860000) {
35577 +               /* Don't set x86_cpuid_level here for now, to avoid confusion. */
35578 +               if (xlvl >= 0x80860001)
35579 +                       c->x86_capability[2] = cpuid_edx(0x80860001);
35580 +       }
35581 +
35582 +       /*
35583 +        * Vendor-specific initialization.  In this section we
35584 +        * canonicalize the feature flags, meaning if there are
35585 +        * features a certain CPU supports which CPUID doesn't
35586 +        * tell us, CPUID claiming incorrect flags, or other bugs,
35587 +        * we handle them here.
35588 +        *
35589 +        * At the end of this section, c->x86_capability better
35590 +        * indicate the features this CPU genuinely supports!
35591 +        */
35592 +       switch (c->x86_vendor) {
35593 +       case X86_VENDOR_AMD:
35594 +               init_amd(c);
35595 +               break;
35596 +
35597 +       case X86_VENDOR_INTEL:
35598 +               init_intel(c);
35599 +               break;
35600 +
35601 +       case X86_VENDOR_UNKNOWN:
35602 +       default:
35603 +               display_cacheinfo(c);
35604 +               break;
35605 +       }
35606 +
35607 +       select_idle_routine(c);
35608 +       detect_ht(c); 
35609 +
35610 +       /*
35611 +        * On SMP, boot_cpu_data holds the common feature set between
35612 +        * all CPUs; so make sure that we indicate which features are
35613 +        * common between the CPUs.  The first time this routine gets
35614 +        * executed, c == &boot_cpu_data.
35615 +        */
35616 +       if (c != &boot_cpu_data) {
35617 +               /* AND the already accumulated flags with these */
35618 +               for (i = 0 ; i < NCAPINTS ; i++)
35619 +                       boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
35620 +       }
35621 +
35622 +#ifdef CONFIG_X86_MCE
35623 +       mcheck_init(c);
35624 +#endif
35625 +       if (c == &boot_cpu_data)
35626 +               mtrr_bp_init();
35627 +       else
35628 +               mtrr_ap_init();
35629 +#ifdef CONFIG_NUMA
35630 +       numa_add_cpu(smp_processor_id());
35631 +#endif
35632 +}
35633 +
35634 +
35635 +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
35636 +{
35637 +       if (c->x86_model_id[0])
35638 +               printk("%s", c->x86_model_id);
35639 +
35640 +       if (c->x86_mask || c->cpuid_level >= 0) 
35641 +               printk(" stepping %02x\n", c->x86_mask);
35642 +       else
35643 +               printk("\n");
35644 +}
35645 +
35646 +/*
35647 + *     Get CPU information for use by the procfs.
35648 + */
35649 +
35650 +static int show_cpuinfo(struct seq_file *m, void *v)
35651 +{
35652 +       struct cpuinfo_x86 *c = v;
35653 +
35654 +       /* 
35655 +        * These flag bits must match the definitions in <asm/cpufeature.h>.
35656 +        * NULL means this bit is undefined or reserved; either way it doesn't
35657 +        * have meaning as far as Linux is concerned.  Note that it's important
35658 +        * to realize there is a difference between this table and CPUID -- if
35659 +        * applications want to get the raw CPUID data, they should access
35660 +        * /dev/cpu/<cpu_nr>/cpuid instead.
35661 +        */
35662 +       static char *x86_cap_flags[] = {
35663 +               /* Intel-defined */
35664 +               "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
35665 +               "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
35666 +               "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
35667 +               "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL,
35668 +
35669 +               /* AMD-defined */
35670 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35671 +               NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
35672 +               NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
35673 +               NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow",
35674 +
35675 +               /* Transmeta-defined */
35676 +               "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
35677 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35678 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35679 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35680 +
35681 +               /* Other (Linux-defined) */
35682 +               "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
35683 +               "constant_tsc", NULL, NULL,
35684 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35685 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35686 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35687 +
35688 +               /* Intel-defined (#2) */
35689 +               "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", NULL, "est",
35690 +               "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
35691 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35692 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35693 +
35694 +               /* VIA/Cyrix/Centaur-defined */
35695 +               NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
35696 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35697 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35698 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35699 +
35700 +               /* AMD-defined (#2) */
35701 +               "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL,
35702 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35703 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35704 +               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35705 +       };
35706 +       static char *x86_power_flags[] = { 
35707 +               "ts",   /* temperature sensor */
35708 +               "fid",  /* frequency id control */
35709 +               "vid",  /* voltage id control */
35710 +               "ttp",  /* thermal trip */
35711 +               "tm",
35712 +               "stc",
35713 +               NULL,
35714 +               /* nothing */   /* constant_tsc - moved to flags */
35715 +       };
35716 +
35717 +
35718 +#ifdef CONFIG_SMP
35719 +       if (!cpu_online(c-cpu_data))
35720 +               return 0;
35721 +#endif
35722 +
35723 +       seq_printf(m,"processor\t: %u\n"
35724 +                    "vendor_id\t: %s\n"
35725 +                    "cpu family\t: %d\n"
35726 +                    "model\t\t: %d\n"
35727 +                    "model name\t: %s\n",
35728 +                    (unsigned)(c-cpu_data),
35729 +                    c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
35730 +                    c->x86,
35731 +                    (int)c->x86_model,
35732 +                    c->x86_model_id[0] ? c->x86_model_id : "unknown");
35733 +       
35734 +       if (c->x86_mask || c->cpuid_level >= 0)
35735 +               seq_printf(m, "stepping\t: %d\n", c->x86_mask);
35736 +       else
35737 +               seq_printf(m, "stepping\t: unknown\n");
35738 +       
35739 +       if (cpu_has(c,X86_FEATURE_TSC)) {
35740 +               unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
35741 +               if (!freq)
35742 +                       freq = cpu_khz;
35743 +               seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
35744 +                            freq / 1000, (freq % 1000));
35745 +       }
35746 +
35747 +       /* Cache size */
35748 +       if (c->x86_cache_size >= 0) 
35749 +               seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
35750 +       
35751 +#ifdef CONFIG_SMP
35752 +       if (smp_num_siblings * c->x86_max_cores > 1) {
35753 +               int cpu = c - cpu_data;
35754 +               seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]);
35755 +               seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
35756 +               seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]);
35757 +               seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
35758 +       }
35759 +#endif 
35760 +
35761 +       seq_printf(m,
35762 +               "fpu\t\t: yes\n"
35763 +               "fpu_exception\t: yes\n"
35764 +               "cpuid level\t: %d\n"
35765 +               "wp\t\t: yes\n"
35766 +               "flags\t\t:",
35767 +                  c->cpuid_level);
35768 +
35769 +       { 
35770 +               int i; 
35771 +               for ( i = 0 ; i < 32*NCAPINTS ; i++ )
35772 +                       if ( test_bit(i, &c->x86_capability) &&
35773 +                            x86_cap_flags[i] != NULL )
35774 +                               seq_printf(m, " %s", x86_cap_flags[i]);
35775 +       }
35776 +               
35777 +       seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
35778 +                  c->loops_per_jiffy/(500000/HZ),
35779 +                  (c->loops_per_jiffy/(5000/HZ)) % 100);
35780 +
35781 +       if (c->x86_tlbsize > 0) 
35782 +               seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
35783 +       seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
35784 +       seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
35785 +
35786 +       seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 
35787 +                  c->x86_phys_bits, c->x86_virt_bits);
35788 +
35789 +       seq_printf(m, "power management:");
35790 +       {
35791 +               unsigned i;
35792 +               for (i = 0; i < 32; i++) 
35793 +                       if (c->x86_power & (1 << i)) {
35794 +                               if (i < ARRAY_SIZE(x86_power_flags) &&
35795 +                                       x86_power_flags[i])
35796 +                                       seq_printf(m, "%s%s",
35797 +                                               x86_power_flags[i][0]?" ":"",
35798 +                                               x86_power_flags[i]);
35799 +                               else
35800 +                                       seq_printf(m, " [%d]", i);
35801 +                       }
35802 +       }
35803 +
35804 +       seq_printf(m, "\n\n");
35805 +
35806 +       return 0;
35807 +}
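
The bogomips line above is derived from loops_per_jiffy: the calibration loop body is two instructions and executes loops_per_jiffy times per 1/HZ second, so BogoMIPS = lpj * HZ * 2 / 10^6; the divisions by (500000/HZ) and (5000/HZ) print exactly that with two decimal places. A sketch with illustrative HZ and lpj values:

#include <stdio.h>

#define HZ 250

int main(void)
{
        unsigned long loops_per_jiffy = 4000000;        /* example value */

        /* 4000000 * 250 * 2 / 1e6 = 2000.00 BogoMIPS */
        printf("bogomips\t: %lu.%02lu\n",
               loops_per_jiffy / (500000 / HZ),
               (loops_per_jiffy / (5000 / HZ)) % 100);
        return 0;
}
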
35808 +
35809 +static void *c_start(struct seq_file *m, loff_t *pos)
35810 +{
35811 +       return *pos < NR_CPUS ? cpu_data + *pos : NULL;
35812 +}
35813 +
35814 +static void *c_next(struct seq_file *m, void *v, loff_t *pos)
35815 +{
35816 +       ++*pos;
35817 +       return c_start(m, pos);
35818 +}
35819 +
35820 +static void c_stop(struct seq_file *m, void *v)
35821 +{
35822 +}
35823 +
35824 +struct seq_operations cpuinfo_op = {
35825 +       .start =c_start,
35826 +       .next = c_next,
35827 +       .stop = c_stop,
35828 +       .show = show_cpuinfo,
35829 +};
35830 +
35831 +static int __init run_dmi_scan(void)
35832 +{
35833 +       dmi_scan_machine();
35834 +       return 0;
35835 +}
35836 +core_initcall(run_dmi_scan);
35837 +
35838 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/setup64-xen.c linux-2.6.16/arch/x86_64/kernel/setup64-xen.c
35839 --- linux-2.6.16.orig/arch/x86_64/kernel/setup64-xen.c  1970-01-01 01:00:00.000000000 +0100
35840 +++ linux-2.6.16/arch/x86_64/kernel/setup64-xen.c       2006-06-26 09:51:32.000000000 +0200
35841 @@ -0,0 +1,360 @@
35842 +/* 
35843 + * X86-64 specific CPU setup.
35844 + * Copyright (C) 1995  Linus Torvalds
35845 + * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
35846 + * See setup.c for older changelog.
35847 + * $Id$
35848 + *
35849 + * Jun Nakajima <jun.nakajima@intel.com> 
35850 + *   Modified for Xen
35851 + *
35852 + */ 
35853 +#include <linux/config.h>
35854 +#include <linux/init.h>
35855 +#include <linux/kernel.h>
35856 +#include <linux/sched.h>
35857 +#include <linux/string.h>
35858 +#include <linux/bootmem.h>
35859 +#include <linux/bitops.h>
35860 +#include <linux/module.h>
35861 +#include <asm/bootsetup.h>
35862 +#include <asm/pda.h>
35863 +#include <asm/pgtable.h>
35864 +#include <asm/processor.h>
35865 +#include <asm/desc.h>
35866 +#include <asm/atomic.h>
35867 +#include <asm/mmu_context.h>
35868 +#include <asm/smp.h>
35869 +#include <asm/i387.h>
35870 +#include <asm/percpu.h>
35871 +#include <asm/proto.h>
35872 +#include <asm/sections.h>
35873 +#ifdef CONFIG_XEN
35874 +#include <asm/hypervisor.h>
35875 +#endif
35876 +
35877 +char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
35878 +
35879 +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
35880 +
35881 +struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
35882 +struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
35883 +
35884 +#ifndef CONFIG_X86_NO_IDT
35885 +struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; 
35886 +#endif
35887 +
35888 +char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
35889 +
35890 +unsigned long __supported_pte_mask __read_mostly = ~0UL;
35891 +static int do_not_nx __cpuinitdata = 0;
35892 +
35893 +/* noexec=on|off
35894 +Control non-executable mappings for 64-bit processes.
35895 +
35896 +on     Enable (default)
35897 +off    Disable
35898 +*/ 
35899 +int __init nonx_setup(char *str)
35900 +{
35901 +       if (!strncmp(str, "on", 2)) {
35902 +                __supported_pte_mask |= _PAGE_NX; 
35903 +               do_not_nx = 0; 
35904 +       } else if (!strncmp(str, "off", 3)) {
35905 +               do_not_nx = 1;
35906 +               __supported_pte_mask &= ~_PAGE_NX;
35907 +        }
35908 +       return 0;
35909 +} 
35910 +__setup("noexec=", nonx_setup);        /* parsed early actually */
35911 +
35912 +int force_personality32 = READ_IMPLIES_EXEC;
35913 +
35914 +/* noexec32=on|off
35915 +Control non-executable heap for 32-bit processes.
35916 +To control the stack too, use noexec=off.
35917 +
35918 +on     PROT_READ does not imply PROT_EXEC for 32bit processes
35919 +off    PROT_READ implies PROT_EXEC (default)
35920 +*/
35921 +static int __init nonx32_setup(char *str)
35922 +{
35923 +       if (!strcmp(str, "on"))
35924 +               force_personality32 &= ~READ_IMPLIES_EXEC;
35925 +       else if (!strcmp(str, "off"))
35926 +               force_personality32 |= READ_IMPLIES_EXEC;
35927 +       return 0;
35928 +}
35929 +__setup("noexec32=", nonx32_setup);
35930 +
35931 +/*
35932 + * Great future plan:
35933 + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
35934 + * Always point %gs to its beginning
35935 + */
35936 +void __init setup_per_cpu_areas(void)
35937 +{ 
35938 +       int i;
35939 +       unsigned long size;
35940 +
35941 +#ifdef CONFIG_HOTPLUG_CPU
35942 +       prefill_possible_map();
35943 +#endif
35944 +
35945 +       /* Copy section for each CPU (we discard the original) */
35946 +       size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
35947 +#ifdef CONFIG_MODULES
35948 +       if (size < PERCPU_ENOUGH_ROOM)
35949 +               size = PERCPU_ENOUGH_ROOM;
35950 +#endif
35951 +
35952 +       for_each_cpu_mask (i, cpu_possible_map) {
35953 +               char *ptr;
35954 +
35955 +               if (!NODE_DATA(cpu_to_node(i))) {
35956 +                       printk("cpu with no node %d, num_online_nodes %d\n",
35957 +                              i, num_online_nodes());
35958 +                       ptr = alloc_bootmem(size);
35959 +               } else { 
35960 +                       ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
35961 +               }
35962 +               if (!ptr)
35963 +                       panic("Cannot allocate cpu data for CPU %d\n", i);
35964 +               cpu_pda(i)->data_offset = ptr - __per_cpu_start;
35965 +               memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
35966 +       }
35967 +} 
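The invariant established above is that cpu_pda(i)->data_offset turns any address inside the prototype per-CPU section into CPU i's private copy: the per-CPU address is the symbol's address plus the offset. A user-space sketch of the same relocation scheme, with a char array standing in for the linker-provided section and NUMA-aware placement ignored:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define NCPUS 4

/* Prototype "section"; each CPU gets a private copy of it. */
static char proto_section[64];
static int *counter = (int *)proto_section;	/* a "per-CPU" variable */

static intptr_t data_offset[NCPUS];

/* per_cpu(var, cpu): shift the prototype address by the CPU's offset. */
static int *per_cpu_ptr(int *var, int cpu)
{
	return (int *)((intptr_t)var + data_offset[cpu]);
}

int main(void)
{
	for (int i = 0; i < NCPUS; i++) {
		char *copy = malloc(sizeof(proto_section));
		memcpy(copy, proto_section, sizeof(proto_section));
		/* Same bookkeeping as setup_per_cpu_areas() above. */
		data_offset[i] = (intptr_t)copy - (intptr_t)proto_section;
	}

	for (int i = 0; i < NCPUS; i++)
		*per_cpu_ptr(counter, i) = i * 10;
	for (int i = 0; i < NCPUS; i++)
		printf("cpu%d counter = %d\n", i, *per_cpu_ptr(counter, i));
	return 0;
}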
35968 +
35969 +#ifdef CONFIG_XEN
35970 +static void switch_pt(void)
35971 +{
35972 +       xen_pt_switch(__pa(init_level4_pgt));
35973 +        xen_new_user_pt(__pa(init_level4_user_pgt));
35974 +}
35975 +
35976 +void __cpuinit cpu_gdt_init(struct desc_ptr *gdt_descr)
35977 +{
35978 +       unsigned long frames[16];
35979 +       unsigned long va;
35980 +       int f;
35981 +
35982 +       for (va = gdt_descr->address, f = 0;
35983 +            va < gdt_descr->address + gdt_descr->size;
35984 +            va += PAGE_SIZE, f++) {
35985 +               frames[f] = virt_to_mfn(va);
35986 +               make_page_readonly(
35987 +                       (void *)va, XENFEAT_writable_descriptor_tables);
35988 +       }
35989 +       if (HYPERVISOR_set_gdt(frames, gdt_descr->size /
35990 +                               sizeof (struct desc_struct)))
35991 +               BUG();
35992 +}
35993 +#else
35994 +static void switch_pt(void)
35995 +{
35996 +       asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
35997 +}
35998 +
35999 +void __init cpu_gdt_init(struct desc_ptr *gdt_descr)
36000 +{
36001 +       asm volatile("lgdt %0" :: "m" (*gdt_descr));
36002 +       asm volatile("lidt %0" :: "m" (idt_descr));
36003 +}
36004 +#endif
36005 +
36006 +void pda_init(int cpu)
36007 +{ 
36008 +       struct x8664_pda *pda = cpu_pda(cpu);
36009 +
36010 +       /* Set up data that may be needed in __get_free_pages early */
36011 +       asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
36012 +#ifndef CONFIG_XEN
36013 +       wrmsrl(MSR_GS_BASE, pda);
36014 +#else
36015 +       HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, (unsigned long)pda);
36016 +#endif
36017 +       pda->cpunumber = cpu; 
36018 +       pda->irqcount = -1;
36019 +       pda->kernelstack = 
36020 +               (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 
36021 +       pda->active_mm = &init_mm;
36022 +       pda->mmu_state = 0;
36023 +
36024 +       if (cpu == 0) {
36025 +#ifdef CONFIG_XEN
36026 +               xen_init_pt();
36027 +#endif
36028 +               /* others are initialized in smpboot.c */
36029 +               pda->pcurrent = &init_task;
36030 +               pda->irqstackptr = boot_cpu_stack; 
36031 +       } else {
36032 +               pda->irqstackptr = (char *)
36033 +                       __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
36034 +               if (!pda->irqstackptr)
36035 +                       panic("cannot allocate irqstack for cpu %d", cpu); 
36036 +       }
36037 +
36038 +       switch_pt();
36039 +
36040 +       pda->irqstackptr += IRQSTACKSIZE-64;
36041 +} 
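Once HYPERVISOR_set_segment_base() (or wrmsrl on bare metal) has pointed %gs at the PDA, every read_pda() is a %gs-relative load, so the same code transparently sees a different PDA on each CPU. A rough user-space analogue, with C11 thread-local storage standing in for the GS base register and threads standing in for CPUs (an approximation only; the real read_pda() is a single %gs-prefixed mov):

#include <stdio.h>
#include <threads.h>	/* C11 threads; available in glibc >= 2.28 */

struct pda { int cpunumber; int irqcount; };

/* Stand-in for the per-CPU GS base installed by pda_init(). */
static thread_local struct pda *gs_base;

#define read_pda(field) (gs_base->field)

static int cpu_thread(void *arg)
{
	struct pda self = { .cpunumber = *(int *)arg, .irqcount = -1 };

	gs_base = &self;	/* the pda_init() step */
	printf("this \"CPU\" is %d\n", read_pda(cpunumber));
	return 0;
}

int main(void)
{
	thrd_t t[2];
	int ids[2] = { 0, 1 };

	for (int i = 0; i < 2; i++)
		thrd_create(&t[i], cpu_thread, &ids[i]);
	for (int i = 0; i < 2; i++)
		thrd_join(t[i], NULL);
	return 0;
}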
36042 +
36043 +#ifndef CONFIG_X86_NO_TSS
36044 +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
36045 +__attribute__((section(".bss.page_aligned")));
36046 +#endif
36047 +
36048 +/* May not be marked __init: used by software suspend */
36049 +void syscall_init(void)
36050 +{
36051 +#ifndef CONFIG_XEN
36052 +       /* 
36053 +        * LSTAR and STAR live in a somewhat strange symbiosis.
36054 +        * They both write to the same internal register. STAR allows setting CS/DS,
36055 +        * but only a 32-bit target; LSTAR sets the 64-bit rip.
36056 +        */ 
36057 +       wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32); 
36058 +       wrmsrl(MSR_LSTAR, system_call); 
36059 +
36060 +       /* Flags to clear on syscall */
36061 +       wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 
36062 +#endif
36063 +#ifdef CONFIG_IA32_EMULATION                   
36064 +       syscall32_cpu_init ();
36065 +#endif
36066 +}
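The STAR packing above follows the AMD64-defined layout: bits 63..48 hold the selector base SYSRET uses to derive user CS/SS, bits 47..32 the base SYSCALL uses for kernel CS/SS. A small sketch of the arithmetic, assuming the conventional x86-64 Linux selector values (0x23 for __USER32_CS, 0x10 for __KERNEL_CS; treat both as illustrative):

#include <stdio.h>
#include <stdint.h>

/* Assumed selectors from the usual x86-64 Linux GDT layout. */
#define USER32_CS 0x23ULL
#define KERNEL_CS 0x10ULL

int main(void)
{
	/* Same packing as wrmsrl(MSR_STAR, ...) in syscall_init(). */
	uint64_t star = (USER32_CS << 48) | (KERNEL_CS << 32);

	printf("STAR            = %#018llx\n", (unsigned long long)star);
	printf("SYSRET  base CS = %#llx\n",
	       (unsigned long long)((star >> 48) & 0xffff));
	printf("SYSCALL base CS = %#llx\n",
	       (unsigned long long)((star >> 32) & 0xffff));
	return 0;
}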
36067 +
36068 +void __cpuinit check_efer(void)
36069 +{
36070 +       unsigned long efer;
36071 +
36072 +       rdmsrl(MSR_EFER, efer); 
36073 +       if (!(efer & EFER_NX) || do_not_nx) {
36074 +               __supported_pte_mask &= ~_PAGE_NX;
36075 +       }
36076 +}
36077 +
36078 +/*
36079 + * cpu_init() initializes state that is per-CPU. Some data is already
36080 + * initialized (naturally) in the bootstrap process, such as the GDT
36081 + * and IDT. We reload them nevertheless; this function acts as a
36082 + * 'CPU state barrier': nothing should get across.
36083 + * A lot of state is already set up in PDA init.
36084 + */
36085 +void __cpuinit cpu_init (void)
36086 +{
36087 +       int cpu = stack_smp_processor_id();
36088 +#ifndef CONFIG_X86_NO_TSS
36089 +       struct tss_struct *t = &per_cpu(init_tss, cpu);
36090 +       unsigned long v; 
36091 +       char *estacks = NULL; 
36092 +       unsigned i;
36093 +#endif
36094 +       struct task_struct *me;
36095 +
36096 +       /* CPU 0 is initialised in head64.c */
36097 +       if (cpu != 0) {
36098 +               pda_init(cpu);
36099 +               zap_low_mappings(cpu);
36100 +       }
36101 +#ifndef CONFIG_X86_NO_TSS
36102 +       else
36103 +               estacks = boot_exception_stacks; 
36104 +#endif
36105 +
36106 +       me = current;
36107 +
36108 +       if (cpu_test_and_set(cpu, cpu_initialized))
36109 +               panic("CPU#%d already initialized!\n", cpu);
36110 +
36111 +       printk("Initializing CPU#%d\n", cpu);
36112 +
36113 +       clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
36114 +
36115 +       /*
36116 +        * Initialize the per-CPU GDT with the boot GDT,
36117 +        * and set up the GDT descriptor:
36118 +        */
36119 +#ifndef CONFIG_XEN 
36120 +       if (cpu)
36121 +               memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
36122 +#endif
36123 +
36124 +       cpu_gdt_descr[cpu].size = GDT_SIZE;
36125 +       cpu_gdt_init(&cpu_gdt_descr[cpu]);
36126 +
36127 +       memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
36128 +       syscall_init();
36129 +
36130 +       wrmsrl(MSR_FS_BASE, 0);
36131 +       wrmsrl(MSR_KERNEL_GS_BASE, 0);
36132 +       barrier(); 
36133 +
36134 +       check_efer();
36135 +
36136 +#ifndef CONFIG_X86_NO_TSS
36137 +       /*
36138 +        * set up and load the per-CPU TSS
36139 +        */
36140 +       for (v = 0; v < N_EXCEPTION_STACKS; v++) {
36141 +               if (cpu) {
36142 +                       static const unsigned int order[N_EXCEPTION_STACKS] = {
36143 +                               [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
36144 +                               [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
36145 +                       };
36146 +
36147 +                       estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
36148 +                       if (!estacks)
36149 +                               panic("Cannot allocate exception stack %ld %d\n",
36150 +                                     v, cpu); 
36151 +               }
36152 +               switch (v + 1) {
36153 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
36154 +               case DEBUG_STACK:
36155 +                       cpu_pda(cpu)->debugstack = (unsigned long)estacks;
36156 +                       estacks += DEBUG_STKSZ;
36157 +                       break;
36158 +#endif
36159 +               default:
36160 +                       estacks += EXCEPTION_STKSZ;
36161 +                       break;
36162 +               }
36163 +               t->ist[v] = (unsigned long)estacks;
36164 +       }
36165 +
36166 +       t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
36167 +       /*
36168 +        * <= is required because the CPU will access up to
36169 +        * 8 bits beyond the end of the IO permission bitmap.
36170 +        */
36171 +       for (i = 0; i <= IO_BITMAP_LONGS; i++)
36172 +               t->io_bitmap[i] = ~0UL;
36173 +#endif
36174 +
36175 +       atomic_inc(&init_mm.mm_count);
36176 +       me->active_mm = &init_mm;
36177 +       if (me->mm)
36178 +               BUG();
36179 +       enter_lazy_tlb(&init_mm, me);
36180 +
36181 +#ifndef CONFIG_X86_NO_TSS
36182 +       set_tss_desc(cpu, t);
36183 +#endif
36184 +#ifndef CONFIG_XEN
36185 +       load_TR_desc();
36186 +#endif
36187 +       load_LDT(&init_mm.context);
36188 +
36189 +       /*
36190 +        * Clear all 6 debug registers:
36191 +        */
36192 +
36193 +       set_debug(0UL, 0);
36194 +       set_debug(0UL, 1);
36195 +       set_debug(0UL, 2);
36196 +       set_debug(0UL, 3);
36197 +       set_debug(0UL, 6);
36198 +       set_debug(0UL, 7);
36199 +
36200 +       fpu_init(); 
36201 +}
36202 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/smp-xen.c linux-2.6.16/arch/x86_64/kernel/smp-xen.c
36203 --- linux-2.6.16.orig/arch/x86_64/kernel/smp-xen.c      1970-01-01 01:00:00.000000000 +0100
36204 +++ linux-2.6.16/arch/x86_64/kernel/smp-xen.c   2006-06-26 09:51:32.000000000 +0200
36205 @@ -0,0 +1,596 @@
36206 +/*
36207 + *     Intel SMP support routines.
36208 + *
36209 + *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
36210 + *     (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
36211 + *      (c) 2002,2003 Andi Kleen, SuSE Labs.
36212 + *
36213 + *     This code is released under the GNU General Public License version 2 or
36214 + *     later.
36215 + */
36216 +
36217 +#include <linux/init.h>
36218 +
36219 +#include <linux/mm.h>
36220 +#include <linux/delay.h>
36221 +#include <linux/spinlock.h>
36222 +#include <linux/smp_lock.h>
36223 +#include <linux/smp.h>
36224 +#include <linux/kernel_stat.h>
36225 +#include <linux/mc146818rtc.h>
36226 +#include <linux/interrupt.h>
36227 +
36228 +#include <asm/mtrr.h>
36229 +#include <asm/pgalloc.h>
36230 +#include <asm/tlbflush.h>
36231 +#include <asm/mach_apic.h>
36232 +#include <asm/mmu_context.h>
36233 +#include <asm/proto.h>
36234 +#include <asm/apicdef.h>
36235 +#include <asm/idle.h>
36236 +#ifdef CONFIG_XEN
36237 +#include <xen/evtchn.h>
36238 +#endif
36239 +
36240 +#ifndef CONFIG_XEN
36241 +/*
36242 + *     Smarter SMP flushing macros. 
36243 + *             c/o Linus Torvalds.
36244 + *
36245 + *     These mean you can really definitely utterly forget about
36246 + *     writing to user space from interrupts. (It's not allowed anyway.)
36247 + *
36248 + *     Optimizations Manfred Spraul <manfred@colorfullife.com>
36249 + *
36250 + *     More scalable flush, from Andi Kleen
36251 + *
36252 + *     To avoid global state use 8 different call vectors.
36253 + *     Each CPU uses a specific vector to trigger flushes on other
36254 + *     CPUs. Depending on the received vector the target CPUs look into
36255 + *     the right per cpu variable for the flush data.
36256 + *
36257 + *     With more than 8 CPUs they are hashed to the 8 available
36258 + *     vectors. The limited global vector space forces us to this right now.
36259 + *     In future when interrupts are split into per CPU domains this could be
36260 + *     fixed, at the cost of triggering multiple IPIs in some cases.
36261 + */
36262 +
36263 +union smp_flush_state {
36264 +       struct {
36265 +               cpumask_t flush_cpumask;
36266 +               struct mm_struct *flush_mm;
36267 +               unsigned long flush_va;
36268 +#define FLUSH_ALL      -1ULL
36269 +               spinlock_t tlbstate_lock;
36270 +       };
36271 +       char pad[SMP_CACHE_BYTES];
36272 +} ____cacheline_aligned;
36273 +
36274 +/* State is put into the per CPU data section, but padded
36275 +   to a full cache line because other CPUs can access it and we don't
36276 +   want false sharing in the per cpu data segment. */
36277 +static DEFINE_PER_CPU(union smp_flush_state, flush_state);
36278 +#endif
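Two tricks are combined in that definition: each sender hashes onto one of the 8 vectors (flush_tlb_others() below uses smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS), and each vector's state is padded out to a cache line so concurrent senders never false-share. A stand-alone sketch of both ideas, assuming a 64-byte line for SMP_CACHE_BYTES (C11 for the anonymous struct):

#include <stdio.h>

#define NUM_VECTORS 8
#define CACHE_LINE  64	/* assumed SMP_CACHE_BYTES */

/* Padded per-vector slot, mirroring union smp_flush_state. */
union flush_slot {
	struct {
		unsigned long flush_va;
		int busy;
	};
	char pad[CACHE_LINE];
};

static union flush_slot flush_state[NUM_VECTORS];

int main(void)
{
	printf("slot size = %zu bytes\n", sizeof(flush_state[0]));

	/* With more CPUs than vectors, senders share slots by hashing. */
	for (int cpu = 0; cpu < 12; cpu++)
		printf("cpu %2d -> flush vector %d\n", cpu, cpu % NUM_VECTORS);
	return 0;
}

The padding trades a little memory for the guarantee that a sender spinning on one slot's flush_cpumask never bounces the cache line another sender is writing.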
36279 +
36280 +/*
36281 + * We cannot call mmdrop() because we are in interrupt context;
36282 + * instead we update mm->cpu_vm_mask.
36283 + */
36284 +static inline void leave_mm(unsigned long cpu)
36285 +{
36286 +       if (read_pda(mmu_state) == TLBSTATE_OK)
36287 +               BUG();
36288 +       clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask);
36289 +       load_cr3(swapper_pg_dir);
36290 +}
36291 +
36292 +#ifndef CONFIG_XEN
36293 +/*
36294 + *
36295 + * The flush IPI assumes that a thread switch happens in this order:
36296 + * [cpu0: the cpu that switches]
36297 + * 1) switch_mm() either 1a) or 1b)
36298 + * 1a) thread switch to a different mm
36299 + * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask);
36300 + *     Stop ipi delivery for the old mm. This is not synchronized with
36301 + *     the other cpus, but smp_invalidate_interrupt ignores flush ipis
36302 + *     for the wrong mm, and in the worst case we perform a superfluous
36303 + *     tlb flush.
36304 + * 1a2) set cpu mmu_state to TLBSTATE_OK
36305 + *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
36306 + *     was in lazy tlb mode.
36307 + * 1a3) update cpu active_mm
36308 + *     Now cpu0 accepts tlb flushes for the new mm.
36309 + * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask);
36310 + *     Now the other cpus will send tlb flush ipis.
36311 + * 1a5) change cr3.
36312 + * 1b) thread switch without mm change
36313 + *     cpu active_mm is correct, cpu0 already handles
36314 + *     flush ipis.
36315 + * 1b1) set cpu mmu_state to TLBSTATE_OK
36316 + * 1b2) test_and_set the cpu bit in cpu_vm_mask.
36317 + *     Atomically set the bit [other cpus will start sending flush ipis],
36318 + *     and test the bit.
36319 + * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
36320 + * 2) switch %%rsp, i.e. current
36321 + *
36322 + * The interrupt must handle 2 special cases:
36323 + * - cr3 is changed before %%rsp, i.e. it cannot use current->{active_,}mm.
36324 + * - the cpu performs speculative tlb reads, i.e. even if the cpu only
36325 + *   runs in kernel space, the cpu could load tlb entries for user space
36326 + *   pages.
36327 + *
36328 + * The good news is that cpu mmu_state is local to each cpu, so there are
36329 + * no write/read ordering problems.
36330 + */
36331 +
36332 +/*
36333 + * TLB flush IPI:
36334 + *
36335 + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
36336 + * 2) Leave the mm if we are in the lazy tlb mode.
36337 + *
36338 + * Interrupts are disabled.
36339 + */
36340 +
36341 +asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
36342 +{
36343 +       int cpu;
36344 +       int sender;
36345 +       union smp_flush_state *f;
36346 +
36347 +       cpu = smp_processor_id();
36348 +       /*
36349 +        * orig_rax contains the interrupt vector - 256.
36350 +        * Use that to determine where the sender put the data.
36351 +        */
36352 +       sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
36353 +       f = &per_cpu(flush_state, sender);
36354 +
36355 +       if (!cpu_isset(cpu, f->flush_cpumask))
36356 +               goto out;
36357 +               /* 
36358 +                * This was a BUG() but until someone can quote me the
36359 +                * line from the intel manual that guarantees an IPI to
36360 +                * multiple CPUs is retried _only_ on the erroring CPUs,
36361 +                * it's staying as a return.
36362 +                *
36363 +                * BUG();
36364 +                */
36365 +                
36366 +       if (f->flush_mm == read_pda(active_mm)) {
36367 +               if (read_pda(mmu_state) == TLBSTATE_OK) {
36368 +                       if (f->flush_va == FLUSH_ALL)
36369 +                               local_flush_tlb();
36370 +                       else
36371 +                               __flush_tlb_one(f->flush_va);
36372 +               } else
36373 +                       leave_mm(cpu);
36374 +       }
36375 +out:
36376 +       ack_APIC_irq();
36377 +       cpu_clear(cpu, f->flush_cpumask);
36378 +}
36379 +
36380 +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
36381 +                                               unsigned long va)
36382 +{
36383 +       int sender;
36384 +       union smp_flush_state *f;
36385 +
36386 +       /* Caller has disabled preemption */
36387 +       sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
36388 +       f = &per_cpu(flush_state, sender);
36389 +
36390 +       /* Could avoid this lock when
36391 +          num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
36392 +          probably not worth checking this for a cache-hot lock. */
36393 +       spin_lock(&f->tlbstate_lock);
36394 +
36395 +       f->flush_mm = mm;
36396 +       f->flush_va = va;
36397 +       cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
36398 +
36399 +       /*
36400 +        * We have to send the IPI only to
36401 +        * the affected CPUs.
36402 +        */
36403 +       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
36404 +
36405 +       while (!cpus_empty(f->flush_cpumask))
36406 +               cpu_relax();
36407 +
36408 +       f->flush_mm = NULL;
36409 +       f->flush_va = 0;
36410 +       spin_unlock(&f->tlbstate_lock);
36411 +}
36412 +
36413 +int __cpuinit init_smp_flush(void)
36414 +{
36415 +       int i;
36416 +       for_each_cpu_mask(i, cpu_possible_map) {
36417 +               spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
36418 +       }
36419 +       return 0;
36420 +}
36421 +
36422 +core_initcall(init_smp_flush);
36423 +       
36424 +void flush_tlb_current_task(void)
36425 +{
36426 +       struct mm_struct *mm = current->mm;
36427 +       cpumask_t cpu_mask;
36428 +
36429 +       preempt_disable();
36430 +       cpu_mask = mm->cpu_vm_mask;
36431 +       cpu_clear(smp_processor_id(), cpu_mask);
36432 +
36433 +       local_flush_tlb();
36434 +       if (!cpus_empty(cpu_mask))
36435 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
36436 +       preempt_enable();
36437 +}
36438 +
36439 +void flush_tlb_mm (struct mm_struct * mm)
36440 +{
36441 +       cpumask_t cpu_mask;
36442 +
36443 +       preempt_disable();
36444 +       cpu_mask = mm->cpu_vm_mask;
36445 +       cpu_clear(smp_processor_id(), cpu_mask);
36446 +
36447 +       if (current->active_mm == mm) {
36448 +               if (current->mm)
36449 +                       local_flush_tlb();
36450 +               else
36451 +                       leave_mm(smp_processor_id());
36452 +       }
36453 +       if (!cpus_empty(cpu_mask))
36454 +               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
36455 +
36456 +       preempt_enable();
36457 +}
36458 +
36459 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
36460 +{
36461 +       struct mm_struct *mm = vma->vm_mm;
36462 +       cpumask_t cpu_mask;
36463 +
36464 +       preempt_disable();
36465 +       cpu_mask = mm->cpu_vm_mask;
36466 +       cpu_clear(smp_processor_id(), cpu_mask);
36467 +
36468 +       if (current->active_mm == mm) {
36469 +               if(current->mm)
36470 +                       __flush_tlb_one(va);
36471 +                else
36472 +                       leave_mm(smp_processor_id());
36473 +       }
36474 +
36475 +       if (!cpus_empty(cpu_mask))
36476 +               flush_tlb_others(cpu_mask, mm, va);
36477 +
36478 +       preempt_enable();
36479 +}
36480 +
36481 +static void do_flush_tlb_all(void* info)
36482 +{
36483 +       unsigned long cpu = smp_processor_id();
36484 +
36485 +       __flush_tlb_all();
36486 +       if (read_pda(mmu_state) == TLBSTATE_LAZY)
36487 +               leave_mm(cpu);
36488 +}
36489 +
36490 +void flush_tlb_all(void)
36491 +{
36492 +       on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
36493 +}
36494 +#else
36495 +asmlinkage void smp_invalidate_interrupt (void)
36496 +{ return; }
36497 +void flush_tlb_current_task(void)
36498 +{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
36499 +void flush_tlb_mm (struct mm_struct * mm)
36500 +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
36501 +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
36502 +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
36503 +void flush_tlb_all(void)
36504 +{ xen_tlb_flush_all(); }
36505 +#endif /* Xen */
36506 +
36507 +/*
36508 + * This function sends a 'reschedule' IPI to another CPU.
36509 + * It goes straight through and wastes no time serializing
36510 + * anything. Worst case is that we lose a reschedule ...
36511 + */
36512 +
36513 +void smp_send_reschedule(int cpu)
36514 +{
36515 +       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
36516 +}
36517 +
36518 +/*
36519 + * Structure and data for smp_call_function(). This is designed to minimise
36520 + * static memory requirements. It also looks cleaner.
36521 + */
36522 +static DEFINE_SPINLOCK(call_lock);
36523 +
36524 +struct call_data_struct {
36525 +       void (*func) (void *info);
36526 +       void *info;
36527 +       atomic_t started;
36528 +       atomic_t finished;
36529 +       int wait;
36530 +};
36531 +
36532 +static struct call_data_struct * call_data;
36533 +
36534 +void lock_ipi_call_lock(void)
36535 +{
36536 +       spin_lock_irq(&call_lock);
36537 +}
36538 +
36539 +void unlock_ipi_call_lock(void)
36540 +{
36541 +       spin_unlock_irq(&call_lock);
36542 +}
36543 +
36544 +/*
36545 + * This function sends a 'generic call function' IPI to one other CPU
36546 + * in the system.
36547 + *
36548 + * cpu is a standard Linux logical CPU number.
36549 + */
36550 +static void
36551 +__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
36552 +                               int nonatomic, int wait)
36553 +{
36554 +       struct call_data_struct data;
36555 +       int cpus = 1;
36556 +
36557 +       data.func = func;
36558 +       data.info = info;
36559 +       atomic_set(&data.started, 0);
36560 +       data.wait = wait;
36561 +       if (wait)
36562 +               atomic_set(&data.finished, 0);
36563 +
36564 +       call_data = &data;
36565 +       wmb();
36566 +       /* Send a message to the target CPU and wait for it to respond */
36567 +       send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
36568 +
36569 +       /* Wait for response */
36570 +       while (atomic_read(&data.started) != cpus)
36571 +               cpu_relax();
36572 +
36573 +       if (!wait)
36574 +               return;
36575 +
36576 +       while (atomic_read(&data.finished) != cpus)
36577 +               cpu_relax();
36578 +}
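The protocol above is a two-phase rendezvous: the sender publishes call_data, fires the IPI, then spins on started (every target has picked up the pointer) and, if wait, on finished (every target has run func). That is what lets the on-stack data go out of scope safely. A user-space sketch of the same counters, using C11 atomics and a thread in place of an IPI (an analogue, not the kernel's API):

#include <stdio.h>
#include <stdatomic.h>
#include <pthread.h>

struct call_data {
	void (*func)(void *);
	void *info;
	atomic_int started;
	atomic_int finished;
};

static _Atomic(struct call_data *) call_data;

static void *other_cpu(void *unused)
{
	struct call_data *d;

	while (!(d = atomic_load(&call_data)))	/* "wait for the IPI" */
		;
	void (*func)(void *) = d->func;
	void *info = d->info;
	atomic_fetch_add(&d->started, 1);	/* sender may proceed */
	func(info);
	atomic_fetch_add(&d->finished, 1);	/* sender may reuse its stack */
	return unused;
}

static void say(void *info) { puts((const char *)info); }

int main(void)
{
	pthread_t t;
	struct call_data data = { say, "hello from the other side", 0, 0 };

	pthread_create(&t, NULL, other_cpu, NULL);
	atomic_store(&call_data, &data);	/* publish: the wmb() step */
	while (atomic_load(&data.started) != 1)
		;
	while (atomic_load(&data.finished) != 1)	/* the wait == 1 path */
		;
	pthread_join(t, NULL);
	return 0;
}

The spin on started is what makes the on-stack data safe: no receiver can still be dereferencing call_data once every target has bumped the counter, and with wait the finished counter extends that guarantee past func's execution.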
36579 +
36580 +/*
36581 + * smp_call_function_single - Run a function on another CPU
36582 + * @func: The function to run. This must be fast and non-blocking.
36583 + * @info: An arbitrary pointer to pass to the function.
36584 + * @nonatomic: Currently unused.
36585 + * @wait: If true, wait until function has completed on other CPUs.
36586 + *
36587 + * Returns 0 on success, else a negative status code.
36588 + *
36589 + * Does not return until the remote CPU is nearly ready to execute <func>
36590 + * or has executed it.
36591 + */
36592 +
36593 +int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
36594 +       int nonatomic, int wait)
36595 +{
36596 +       /* prevent preemption and reschedule on another processor */
36597 +       int me = get_cpu();
36598 +       if (cpu == me) {
36599 +               WARN_ON(1);
36600 +               put_cpu();
36601 +               return -EBUSY;
36602 +       }
36603 +       spin_lock_bh(&call_lock);
36604 +       __smp_call_function_single(cpu, func, info, nonatomic, wait);
36605 +       spin_unlock_bh(&call_lock);
36606 +       put_cpu();
36607 +       return 0;
36608 +}
36609 +
36610 +/*
36611 + * This function sends a 'generic call function' IPI to all other CPUs
36612 + * in the system.
36613 + */
36614 +static void __smp_call_function (void (*func) (void *info), void *info,
36615 +                               int nonatomic, int wait)
36616 +{
36617 +       struct call_data_struct data;
36618 +       int cpus = num_online_cpus()-1;
36619 +
36620 +       if (!cpus)
36621 +               return;
36622 +
36623 +       data.func = func;
36624 +       data.info = info;
36625 +       atomic_set(&data.started, 0);
36626 +       data.wait = wait;
36627 +       if (wait)
36628 +               atomic_set(&data.finished, 0);
36629 +
36630 +       call_data = &data;
36631 +       wmb();
36632 +       /* Send a message to all other CPUs and wait for them to respond */
36633 +       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
36634 +
36635 +       /* Wait for response */
36636 +       while (atomic_read(&data.started) != cpus)
36637 +#ifndef CONFIG_XEN
36638 +               cpu_relax();
36639 +#else
36640 +               barrier();
36641 +#endif
36642 +
36643 +       if (!wait)
36644 +               return;
36645 +
36646 +       while (atomic_read(&data.finished) != cpus)
36647 +#ifndef CONFIG_XEN
36648 +               cpu_relax();
36649 +#else
36650 +               barrier();
36651 +#endif
36652 +}
36653 +
36654 +/*
36655 + * smp_call_function - run a function on all other CPUs.
36656 + * @func: The function to run. This must be fast and non-blocking.
36657 + * @info: An arbitrary pointer to pass to the function.
36658 + * @nonatomic: currently unused.
36659 + * @wait: If true, wait (atomically) until function has completed on other
36660 + *        CPUs.
36661 + *
36662 + * Returns 0 on success, else a negative status code. Does not return until
36663 + * remote CPUs are nearly ready to execute func or have executed it.
36664 + *
36665 + * You must not call this function with disabled interrupts or from a
36666 + * hardware interrupt handler or from a bottom half handler.
36667 + * Actually there are a few legal cases, like panic.
36668 + */
36669 +int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
36670 +                       int wait)
36671 +{
36672 +       spin_lock(&call_lock);
36673 +       __smp_call_function(func, info, nonatomic, wait);
36674 +       spin_unlock(&call_lock);
36675 +       return 0;
36676 +}
36677 +
36678 +void smp_stop_cpu(void)
36679 +{
36680 +       unsigned long flags;
36681 +       /*
36682 +        * Remove this CPU:
36683 +        */
36684 +       cpu_clear(smp_processor_id(), cpu_online_map);
36685 +       local_irq_save(flags);
36686 +#ifndef CONFIG_XEN
36687 +       disable_local_APIC();
36688 +#endif
36689 +       local_irq_restore(flags); 
36690 +}
36691 +
36692 +static void smp_really_stop_cpu(void *dummy)
36693 +{
36694 +       smp_stop_cpu(); 
36695 +       for (;;) 
36696 +               asm("hlt"); 
36697 +} 
36698 +
36699 +void smp_send_stop(void)
36700 +{
36701 +       int nolock = 0;
36702 +#ifndef CONFIG_XEN
36703 +       if (reboot_force)
36704 +               return;
36705 +#endif
36706 +       /* Don't deadlock on the call lock in panic */
36707 +       if (!spin_trylock(&call_lock)) {
36708 +               /* ignore locking because we have panicked anyway */
36709 +               nolock = 1;
36710 +       }
36711 +       __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
36712 +       if (!nolock)
36713 +               spin_unlock(&call_lock);
36714 +
36715 +       local_irq_disable();
36716 +#ifndef CONFIG_XEN
36717 +       disable_local_APIC();
36718 +#endif
36719 +       local_irq_enable();
36720 +}
36721 +
36722 +/*
36723 + * Reschedule callback. Nothing to do;
36724 + * all the work is done automatically when
36725 + * we return from the interrupt.
36726 + */
36727 +#ifndef CONFIG_XEN
36728 +asmlinkage void smp_reschedule_interrupt(void)
36729 +#else
36730 +asmlinkage irqreturn_t smp_reschedule_interrupt(void)
36731 +#endif
36732 +{
36733 +#ifndef CONFIG_XEN
36734 +       ack_APIC_irq();
36735 +#else
36736 +       return IRQ_HANDLED;
36737 +#endif
36738 +}
36739 +
36740 +#ifndef CONFIG_XEN
36741 +asmlinkage void smp_call_function_interrupt(void)
36742 +#else
36743 +asmlinkage irqreturn_t smp_call_function_interrupt(void)
36744 +#endif
36745 +{
36746 +       void (*func) (void *info) = call_data->func;
36747 +       void *info = call_data->info;
36748 +       int wait = call_data->wait;
36749 +
36750 +#ifndef CONFIG_XEN
36751 +       ack_APIC_irq();
36752 +#endif
36753 +       /*
36754 +        * Notify initiating CPU that I've grabbed the data and am
36755 +        * about to execute the function
36756 +        */
36757 +       mb();
36758 +       atomic_inc(&call_data->started);
36759 +       /*
36760 +        * At this point the info structure may be out of scope unless wait==1
36761 +        */
36762 +       exit_idle();
36763 +       irq_enter();
36764 +       (*func)(info);
36765 +       irq_exit();
36766 +       if (wait) {
36767 +               mb();
36768 +               atomic_inc(&call_data->finished);
36769 +       }
36770 +#ifdef CONFIG_XEN
36771 +       return IRQ_HANDLED;
36772 +#endif
36773 +}
36774 +
36775 +int safe_smp_processor_id(void)
36776 +{
36777 +#ifdef CONFIG_XEN
36778 +       return smp_processor_id();
36779 +#else
36780 +       int apicid, i;
36781 +
36782 +       if (disable_apic)
36783 +               return 0;
36784 +
36785 +       apicid = hard_smp_processor_id();
36786 +       if (x86_cpu_to_apicid[apicid] == apicid)
36787 +               return apicid;
36788 +
36789 +       for (i = 0; i < NR_CPUS; ++i) {
36790 +               if (x86_cpu_to_apicid[i] == apicid)
36791 +                       return i;
36792 +       }
36793 +
36794 +       /* No entries in x86_cpu_to_apicid?  Either no MPS|ACPI,
36795 +        * or called too early.  Either way, we must be CPU 0. */
36796 +       if (x86_cpu_to_apicid[0] == BAD_APICID)
36797 +               return 0;
36798 +
36799 +       return 0; /* Should not happen */
36800 +#endif
36801 +}
36802 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/traps-xen.c linux-2.6.16/arch/x86_64/kernel/traps-xen.c
36803 --- linux-2.6.16.orig/arch/x86_64/kernel/traps-xen.c    1970-01-01 01:00:00.000000000 +0100
36804 +++ linux-2.6.16/arch/x86_64/kernel/traps-xen.c 2006-06-26 09:51:32.000000000 +0200
36805 @@ -0,0 +1,1023 @@
36806 +/*
36807 + *  linux/arch/x86-64/traps.c
36808 + *
36809 + *  Copyright (C) 1991, 1992  Linus Torvalds
36810 + *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
36811 + *
36812 + *  Pentium III FXSR, SSE support
36813 + *     Gareth Hughes <gareth@valinux.com>, May 2000
36814 + *
36815 + *  $Id$
36816 + */
36817 +
36818 +/*
36819 + * 'Traps.c' handles hardware traps and faults after we have saved some
36820 + * state in 'entry.S'.
36821 + */
36822 +#include <linux/config.h>
36823 +#include <linux/sched.h>
36824 +#include <linux/kernel.h>
36825 +#include <linux/string.h>
36826 +#include <linux/errno.h>
36827 +#include <linux/ptrace.h>
36828 +#include <linux/timer.h>
36829 +#include <linux/mm.h>
36830 +#include <linux/init.h>
36831 +#include <linux/delay.h>
36832 +#include <linux/spinlock.h>
36833 +#include <linux/interrupt.h>
36834 +#include <linux/module.h>
36835 +#include <linux/moduleparam.h>
36836 +#include <linux/nmi.h>
36837 +#include <linux/kprobes.h>
36838 +
36839 +#include <asm/system.h>
36840 +#include <asm/uaccess.h>
36841 +#include <asm/io.h>
36842 +#include <asm/atomic.h>
36843 +#include <asm/debugreg.h>
36844 +#include <asm/desc.h>
36845 +#include <asm/i387.h>
36846 +#include <asm/kdebug.h>
36847 +#include <asm/processor.h>
36848 +
36849 +#include <asm/smp.h>
36850 +#include <asm/pgalloc.h>
36851 +#include <asm/pda.h>
36852 +#include <asm/proto.h>
36853 +#include <asm/nmi.h>
36854 +
36855 +#ifndef CONFIG_X86_NO_IDT
36856 +extern struct gate_struct idt_table[256]; 
36857 +#endif
36858 +
36859 +asmlinkage void divide_error(void);
36860 +asmlinkage void debug(void);
36861 +asmlinkage void nmi(void);
36862 +asmlinkage void int3(void);
36863 +asmlinkage void overflow(void);
36864 +asmlinkage void bounds(void);
36865 +asmlinkage void invalid_op(void);
36866 +asmlinkage void device_not_available(void);
36867 +asmlinkage void double_fault(void);
36868 +asmlinkage void coprocessor_segment_overrun(void);
36869 +asmlinkage void invalid_TSS(void);
36870 +asmlinkage void segment_not_present(void);
36871 +asmlinkage void stack_segment(void);
36872 +asmlinkage void general_protection(void);
36873 +asmlinkage void page_fault(void);
36874 +asmlinkage void coprocessor_error(void);
36875 +asmlinkage void simd_coprocessor_error(void);
36876 +asmlinkage void reserved(void);
36877 +asmlinkage void alignment_check(void);
36878 +asmlinkage void machine_check(void);
36879 +asmlinkage void spurious_interrupt_bug(void);
36880 +
36881 +struct notifier_block *die_chain;
36882 +static DEFINE_SPINLOCK(die_notifier_lock);
36883 +
36884 +int register_die_notifier(struct notifier_block *nb)
36885 +{
36886 +       int err = 0;
36887 +       unsigned long flags;
36888 +       spin_lock_irqsave(&die_notifier_lock, flags);
36889 +       err = notifier_chain_register(&die_chain, nb);
36890 +       spin_unlock_irqrestore(&die_notifier_lock, flags);
36891 +       return err;
36892 +}
36893 +
36894 +static inline void conditional_sti(struct pt_regs *regs)
36895 +{
36896 +       if (regs->eflags & X86_EFLAGS_IF)
36897 +               local_irq_enable();
36898 +}
36899 +
36900 +static inline void preempt_conditional_sti(struct pt_regs *regs)
36901 +{
36902 +       preempt_disable();
36903 +       if (regs->eflags & X86_EFLAGS_IF)
36904 +               local_irq_enable();
36905 +}
36906 +
36907 +static inline void preempt_conditional_cli(struct pt_regs *regs)
36908 +{
36909 +       if (regs->eflags & X86_EFLAGS_IF)
36910 +               local_irq_disable();
36911 +       preempt_enable_no_resched();
36912 +}
36913 +
36914 +static int kstack_depth_to_print = 10;
36915 +
36916 +#ifdef CONFIG_KALLSYMS
36917 +#include <linux/kallsyms.h> 
36918 +int printk_address(unsigned long address)
36919 +{ 
36920 +       unsigned long offset = 0, symsize;
36921 +       const char *symname;
36922 +       char *modname;
36923 +       char *delim = ":"; 
36924 +       char namebuf[128];
36925 +
36926 +       symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); 
36927 +       if (!symname) 
36928 +               return printk("[<%016lx>]", address);
36929 +       if (!modname) 
36930 +               modname = delim = "";           
36931 +       return printk("<%016lx>{%s%s%s%s%+ld}",
36932 +                     address, delim, modname, delim, symname, offset);
36933 +} 
36934 +#else
36935 +int printk_address(unsigned long address)
36936 +{ 
36937 +       return printk("[<%016lx>]", address);
36938 +} 
36939 +#endif
36940 +
36941 +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
36942 +                                       unsigned *usedp, const char **idp)
36943 +{
36944 +#ifndef CONFIG_X86_NO_TSS
36945 +       static char ids[][8] = {
36946 +               [DEBUG_STACK - 1] = "#DB",
36947 +               [NMI_STACK - 1] = "NMI",
36948 +               [DOUBLEFAULT_STACK - 1] = "#DF",
36949 +               [STACKFAULT_STACK - 1] = "#SS",
36950 +               [MCE_STACK - 1] = "#MC",
36951 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
36952 +               [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
36953 +#endif
36954 +       };
36955 +       unsigned k;
36956 +
36957 +       for (k = 0; k < N_EXCEPTION_STACKS; k++) {
36958 +               unsigned long end;
36959 +
36960 +               switch (k + 1) {
36961 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
36962 +               case DEBUG_STACK:
36963 +                       end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
36964 +                       break;
36965 +#endif
36966 +               default:
36967 +                       end = per_cpu(init_tss, cpu).ist[k];
36968 +                       break;
36969 +               }
36970 +               if (stack >= end)
36971 +                       continue;
36972 +               if (stack >= end - EXCEPTION_STKSZ) {
36973 +                       if (*usedp & (1U << k))
36974 +                               break;
36975 +                       *usedp |= 1U << k;
36976 +                       *idp = ids[k];
36977 +                       return (unsigned long *)end;
36978 +               }
36979 +#if DEBUG_STKSZ > EXCEPTION_STKSZ
36980 +               if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
36981 +                       unsigned j = N_EXCEPTION_STACKS - 1;
36982 +
36983 +                       do {
36984 +                               ++j;
36985 +                               end -= EXCEPTION_STKSZ;
36986 +                               ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
36987 +                       } while (stack < end - EXCEPTION_STKSZ);
36988 +                       if (*usedp & (1U << j))
36989 +                               break;
36990 +                       *usedp |= 1U << j;
36991 +                       *idp = ids[j];
36992 +                       return (unsigned long *)end;
36993 +               }
36994 +#endif
36995 +       }
36996 +#endif
36997 +       return NULL;
36998 +}
36999 +
37000 +/*
37001 + * x86-64 can have up to three kernel stacks:
37002 + * process stack
37003 + * interrupt stack
37004 + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
37005 + */
37006 +
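show_trace() below walks a frame chain across those three regions, using simple range checks (in_exception_stack() above, the irqstack bounds) to decide which stack an address lives on. A simplified sketch of that classification, with hypothetical bounds in place of the PDA/TSS lookups (assumes a 64-bit host):

#include <stdio.h>

struct range { unsigned long lo, hi; const char *name; };

/* Hypothetical bounds; the real code reads the PDA and the TSS ist[]. */
static const struct range stacks[] = {
	{ 0xffff810000000000UL, 0xffff810000004000UL, "interrupt stack" },
	{ 0xffff810000008000UL, 0xffff810000009000UL, "exception stack" },
};

static const char *classify(unsigned long addr)
{
	for (unsigned i = 0; i < sizeof(stacks) / sizeof(stacks[0]); i++)
		if (addr >= stacks[i].lo && addr < stacks[i].hi)
			return stacks[i].name;
	return "process stack";	/* the default assumption */
}

int main(void)
{
	unsigned long probe[] = {
		0xffff810000000100UL, 0xffff810000008800UL, 0xffff8100000badc0UL,
	};

	for (int i = 0; i < 3; i++)
		printf("%#lx -> %s\n", probe[i], classify(probe[i]));
	return 0;
}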
37007 +void show_trace(unsigned long *stack)
37008 +{
37009 +       const unsigned cpu = safe_smp_processor_id();
37010 +       unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
37011 +       int i;
37012 +       unsigned used = 0;
37013 +
37014 +       printk("\nCall Trace:");
37015 +
37016 +#define HANDLE_STACK(cond) \
37017 +       do while (cond) { \
37018 +               unsigned long addr = *stack++; \
37019 +               if (kernel_text_address(addr)) { \
37020 +                       if (i > 50) { \
37021 +                               printk("\n       "); \
37022 +                               i = 0; \
37023 +                       } \
37024 +                       else \
37025 +                               i += printk(" "); \
37026 +                       /* \
37027 +                        * If the address is either in the text segment of the \
37028 +                        * kernel, or in the region which contains vmalloc'ed \
37029 +                        * memory, it *may* be the address of a calling \
37030 +                        * routine; if so, print it so that someone tracing \
37031 +                        * down the cause of the crash will be able to figure \
37032 +                        * out the call path that was taken. \
37033 +                        */ \
37034 +                       i += printk_address(addr); \
37035 +               } \
37036 +       } while (0)
37037 +
37038 +       for(i = 11; ; ) {
37039 +               const char *id;
37040 +               unsigned long *estack_end;
37041 +               estack_end = in_exception_stack(cpu, (unsigned long)stack,
37042 +                                               &used, &id);
37043 +
37044 +               if (estack_end) {
37045 +                       i += printk(" <%s>", id);
37046 +                       HANDLE_STACK (stack < estack_end);
37047 +                       i += printk(" <EOE>");
37048 +                       stack = (unsigned long *) estack_end[-2];
37049 +                       continue;
37050 +               }
37051 +               if (irqstack_end) {
37052 +                       unsigned long *irqstack;
37053 +                       irqstack = irqstack_end -
37054 +                               (IRQSTACKSIZE - 64) / sizeof(*irqstack);
37055 +
37056 +                       if (stack >= irqstack && stack < irqstack_end) {
37057 +                               i += printk(" <IRQ>");
37058 +                               HANDLE_STACK (stack < irqstack_end);
37059 +                               stack = (unsigned long *) (irqstack_end[-1]);
37060 +                               irqstack_end = NULL;
37061 +                               i += printk(" <EOI>");
37062 +                               continue;
37063 +                       }
37064 +               }
37065 +               break;
37066 +       }
37067 +
37068 +       HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
37069 +#undef HANDLE_STACK
37070 +       printk("\n");
37071 +}
37072 +
37073 +void show_stack(struct task_struct *tsk, unsigned long * rsp)
37074 +{
37075 +       unsigned long *stack;
37076 +       int i;
37077 +       const int cpu = safe_smp_processor_id();
37078 +       unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
37079 +       unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
37080 +
37081 +       // debugging aid: "show_stack(NULL, NULL);" prints the
37082 +       // back trace for this cpu.
37083 +
37084 +       if (rsp == NULL) {
37085 +               if (tsk)
37086 +                       rsp = (unsigned long *)tsk->thread.rsp;
37087 +               else
37088 +                       rsp = (unsigned long *)&rsp;
37089 +       }
37090 +
37091 +       stack = rsp;
37092 +       for (i = 0; i < kstack_depth_to_print; i++) {
37093 +               if (stack >= irqstack && stack <= irqstack_end) {
37094 +                       if (stack == irqstack_end) {
37095 +                               stack = (unsigned long *) (irqstack_end[-1]);
37096 +                               printk(" <EOI> ");
37097 +                       }
37098 +               } else {
37099 +                       if (((long) stack & (THREAD_SIZE-1)) == 0)
37100 +                               break;
37101 +               }
37102 +               if (i && ((i % 4) == 0))
37103 +                       printk("\n       ");
37104 +               printk("%016lx ", *stack++);
37105 +               touch_nmi_watchdog();
37106 +       }
37107 +       show_trace((unsigned long *)rsp);
37108 +}
37109 +
37110 +/*
37111 + * The architecture-independent dump_stack generator
37112 + */
37113 +void dump_stack(void)
37114 +{
37115 +       unsigned long dummy;
37116 +       show_trace(&dummy);
37117 +}
37118 +
37119 +EXPORT_SYMBOL(dump_stack);
37120 +
37121 +void show_registers(struct pt_regs *regs)
37122 +{
37123 +       int i;
37124 +       int in_kernel = !user_mode(regs);
37125 +       unsigned long rsp;
37126 +       const int cpu = safe_smp_processor_id(); 
37127 +       struct task_struct *cur = cpu_pda(cpu)->pcurrent;
37128 +
37129 +               rsp = regs->rsp;
37130 +
37131 +       printk("CPU %d ", cpu);
37132 +       __show_regs(regs);
37133 +       printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
37134 +               cur->comm, cur->pid, task_thread_info(cur), cur);
37135 +
37136 +       /*
37137 +        * When in-kernel, we also print out the stack and code at the
37138 +        * time of the fault.
37139 +        */
37140 +       if (in_kernel) {
37141 +
37142 +               printk("Stack: ");
37143 +               show_stack(NULL, (unsigned long*)rsp);
37144 +
37145 +               printk("\nCode: ");
37146 +               if(regs->rip < PAGE_OFFSET)
37147 +                       goto bad;
37148 +
37149 +               for(i=0;i<20;i++)
37150 +               {
37151 +                       unsigned char c;
37152 +                       if(__get_user(c, &((unsigned char*)regs->rip)[i])) {
37153 +bad:
37154 +                               printk(" Bad RIP value.");
37155 +                               break;
37156 +                       }
37157 +                       printk("%02x ", c);
37158 +               }
37159 +       }
37160 +       printk("\n");
37161 +}      
37162 +
37163 +void handle_BUG(struct pt_regs *regs)
37164 +{ 
37165 +       struct bug_frame f;
37166 +       long len;
37167 +       const char *prefix = "";
37168 +
37169 +       if (user_mode(regs))
37170 +               return; 
37171 +       if (__copy_from_user(&f, (const void __user *) regs->rip,
37172 +                            sizeof(struct bug_frame)))
37173 +               return; 
37174 +       if (f.filename >= 0 ||
37175 +           f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) 
37176 +               return;
37177 +       len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
37178 +       if (len < 0 || len >= PATH_MAX)
37179 +               f.filename = (int)(long)"unmapped filename";
37180 +       else if (len > 50) {
37181 +               f.filename += len - 50;
37182 +               prefix = "...";
37183 +       }
37184 +       printk("----------- [cut here ] --------- [please bite here ] ---------\n");
37185 +       printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
37186 +} 
37187 +
37188 +#ifdef CONFIG_BUG
37189 +void out_of_line_bug(void)
37190 +{ 
37191 +       BUG(); 
37192 +} 
37193 +#endif
37194 +
37195 +static DEFINE_SPINLOCK(die_lock);
37196 +static int die_owner = -1;
37197 +
37198 +unsigned __kprobes long oops_begin(void)
37199 +{
37200 +       int cpu = safe_smp_processor_id();
37201 +       unsigned long flags;
37202 +
37203 +       /* racy, but better than risking deadlock. */
37204 +       local_irq_save(flags);
37205 +       if (!spin_trylock(&die_lock)) { 
37206 +               if (cpu == die_owner) 
37207 +                       /* nested oops. should stop eventually */;
37208 +               else
37209 +                       spin_lock(&die_lock);
37210 +       }
37211 +       die_owner = cpu;
37212 +       console_verbose();
37213 +       bust_spinlocks(1);
37214 +       return flags;
37215 +}
37216 +
37217 +void __kprobes oops_end(unsigned long flags)
37218 +{ 
37219 +       die_owner = -1;
37220 +       bust_spinlocks(0);
37221 +       spin_unlock_irqrestore(&die_lock, flags);
37222 +       if (panic_on_oops)
37223 +               panic("Oops");
37224 +}
37225 +
37226 +void __kprobes __die(const char * str, struct pt_regs * regs, long err)
37227 +{
37228 +       static int die_counter;
37229 +       printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
37230 +#ifdef CONFIG_PREEMPT
37231 +       printk("PREEMPT ");
37232 +#endif
37233 +#ifdef CONFIG_SMP
37234 +       printk("SMP ");
37235 +#endif
37236 +#ifdef CONFIG_DEBUG_PAGEALLOC
37237 +       printk("DEBUG_PAGEALLOC");
37238 +#endif
37239 +       printk("\n");
37240 +       notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
37241 +       show_registers(regs);
37242 +       /* Executive summary in case the oops scrolled away */
37243 +       printk(KERN_ALERT "RIP ");
37244 +       printk_address(regs->rip); 
37245 +       printk(" RSP <%016lx>\n", regs->rsp); 
37246 +}
37247 +
37248 +void die(const char * str, struct pt_regs * regs, long err)
37249 +{
37250 +       unsigned long flags = oops_begin();
37251 +
37252 +       handle_BUG(regs);
37253 +       __die(str, regs, err);
37254 +       oops_end(flags);
37255 +       do_exit(SIGSEGV); 
37256 +}
37257 +
37258 +#ifdef CONFIG_X86_LOCAL_APIC
37259 +void __kprobes die_nmi(char *str, struct pt_regs *regs)
37260 +{
37261 +       unsigned long flags = oops_begin();
37262 +
37263 +       /*
37264 +        * We are in trouble anyway, lets at least try
37265 +        * to get a message out.
37266 +        */
37267 +       printk(str, safe_smp_processor_id());
37268 +       show_registers(regs);
37269 +       if (panic_on_timeout || panic_on_oops)
37270 +               panic("nmi watchdog");
37271 +       printk("console shuts up ...\n");
37272 +       oops_end(flags);
37273 +       do_exit(SIGSEGV);
37274 +}
37275 +#endif
37276 +
37277 +static void __kprobes do_trap(int trapnr, int signr, char *str,
37278 +                             struct pt_regs * regs, long error_code,
37279 +                             siginfo_t *info)
37280 +{
37281 +       struct task_struct *tsk = current;
37282 +
37283 +       conditional_sti(regs);
37284 +
37285 +       tsk->thread.error_code = error_code;
37286 +       tsk->thread.trap_no = trapnr;
37287 +
37288 +       if (user_mode(regs)) {
37289 +               if (exception_trace && unhandled_signal(tsk, signr))
37290 +                       printk(KERN_INFO
37291 +                              "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
37292 +                              tsk->comm, tsk->pid, str,
37293 +                              regs->rip,regs->rsp,error_code); 
37294 +
37295 +               if (info)
37296 +                       force_sig_info(signr, info, tsk);
37297 +               else
37298 +                       force_sig(signr, tsk);
37299 +               return;
37300 +       }
37301 +
37302 +
37303 +       /* kernel trap */ 
37304 +       {            
37305 +               const struct exception_table_entry *fixup;
37306 +               fixup = search_exception_tables(regs->rip);
37307 +               if (fixup) {
37308 +                       regs->rip = fixup->fixup;
37309 +               } else  
37310 +                       die(str, regs, error_code);
37311 +               return;
37312 +       }
37313 +}
37314 +
37315 +#define DO_ERROR(trapnr, signr, str, name) \
37316 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
37317 +{ \
37318 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
37319 +                                                       == NOTIFY_STOP) \
37320 +               return; \
37321 +       do_trap(trapnr, signr, str, regs, error_code, NULL); \
37322 +}
37323 +
37324 +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
37325 +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
37326 +{ \
37327 +       siginfo_t info; \
37328 +       info.si_signo = signr; \
37329 +       info.si_errno = 0; \
37330 +       info.si_code = sicode; \
37331 +       info.si_addr = (void __user *)siaddr; \
37332 +       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
37333 +                                                       == NOTIFY_STOP) \
37334 +               return; \
37335 +       do_trap(trapnr, signr, str, regs, error_code, &info); \
37336 +}
37337 +
37338 +DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->rip)
37339 +DO_ERROR( 4, SIGSEGV, "overflow", overflow)
37340 +DO_ERROR( 5, SIGSEGV, "bounds", bounds)
37341 +DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
37342 +DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
37343 +DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
37344 +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
37345 +DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
37346 +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
37347 +DO_ERROR(18, SIGSEGV, "reserved", reserved)
37348 +DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
37349 +
37350 +asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
37351 +{
37352 +       static const char str[] = "double fault";
37353 +       struct task_struct *tsk = current;
37354 +
37355 +       /* Return value not checked because a double fault cannot be ignored */
37356 +       notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
37357 +
37358 +       tsk->thread.error_code = error_code;
37359 +       tsk->thread.trap_no = 8;
37360 +
37361 +       /* This is always a kernel trap and never fixable (and thus must
37362 +          never return). */
37363 +       for (;;)
37364 +               die(str, regs, error_code);
37365 +}
37366 +
37367 +asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
37368 +                                               long error_code)
37369 +{
37370 +       struct task_struct *tsk = current;
37371 +
37372 +       conditional_sti(regs);
37373 +
37374 +       tsk->thread.error_code = error_code;
37375 +       tsk->thread.trap_no = 13;
37376 +
37377 +       if (user_mode(regs)) {
37378 +               if (exception_trace && unhandled_signal(tsk, SIGSEGV))
37379 +                       printk(KERN_INFO
37380 +                      "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
37381 +                              tsk->comm, tsk->pid,
37382 +                              regs->rip,regs->rsp,error_code); 
37383 +
37384 +               force_sig(SIGSEGV, tsk);
37385 +               return;
37386 +       } 
37387 +
37388 +       /* kernel gp */
37389 +       {
37390 +               const struct exception_table_entry *fixup;
37391 +               fixup = search_exception_tables(regs->rip);
37392 +               if (fixup) {
37393 +                       regs->rip = fixup->fixup;
37394 +                       return;
37395 +               }
37396 +               if (notify_die(DIE_GPF, "general protection fault", regs,
37397 +                                       error_code, 13, SIGSEGV) == NOTIFY_STOP)
37398 +                       return;
37399 +               die("general protection fault", regs, error_code);
37400 +       }
37401 +}
37402 +
37403 +static __kprobes void
37404 +mem_parity_error(unsigned char reason, struct pt_regs * regs)
37405 +{
37406 +       printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
37407 +       printk("You probably have a hardware problem with your RAM chips\n");
37408 +
37409 +#if 0 /* XEN */
37410 +       /* Clear and disable the memory parity error line. */
37411 +       reason = (reason & 0xf) | 4;
37412 +       outb(reason, 0x61);
37413 +#endif /* XEN */
37414 +}
37415 +
37416 +static __kprobes void
37417 +io_check_error(unsigned char reason, struct pt_regs * regs)
37418 +{
37419 +       printk("NMI: IOCK error (debug interrupt?)\n");
37420 +       show_registers(regs);
37421 +
37422 +#if 0 /* XEN */
37423 +       /* Re-enable the IOCK line, wait for a few seconds */
37424 +       reason = (reason & 0xf) | 8;
37425 +       outb(reason, 0x61);
37426 +       mdelay(2000);
37427 +       reason &= ~8;
37428 +       outb(reason, 0x61);
37429 +#endif /* XEN */
37430 +}
37431 +
37432 +static __kprobes void
37433 +unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
37434 +{
37435 +       printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
37435 +       printk("Dazed and confused, but trying to continue\n");
37436 +       printk("Do you have a strange power saving mode enabled?\n");
37437 +}
37438 +
37439 +/* Runs on IST stack. This code must keep interrupts off all the time.
37440 +   Nested NMIs are prevented by the CPU. */
37441 +asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
37442 +{
37443 +       unsigned char reason = 0;
37444 +       int cpu;
37445 +
37446 +       cpu = smp_processor_id();
37447 +
37448 +       /* Only the BSP gets external NMIs from the system.  */
37449 +       if (!cpu)
37450 +               reason = get_nmi_reason();
37451 +
37452 +       if (!(reason & 0xc0)) {
37453 +               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
37454 +                                                               == NOTIFY_STOP)
37455 +                       return;
37456 +#ifdef CONFIG_X86_LOCAL_APIC
37457 +               /*
37458 +                * Ok, so this is none of the documented NMI sources,
37459 +                * so it must be the NMI watchdog.
37460 +                */
37461 +               if (nmi_watchdog > 0) {
37462 +                       nmi_watchdog_tick(regs,reason);
37463 +                       return;
37464 +               }
37465 +#endif
37466 +               unknown_nmi_error(reason, regs);
37467 +               return;
37468 +       }
37469 +       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
37470 +               return; 
37471 +
37472 +       /* AK: following checks seem to be broken on modern chipsets. FIXME */
37473 +
37474 +       if (reason & 0x80)
37475 +               mem_parity_error(reason, regs);
37476 +       if (reason & 0x40)
37477 +               io_check_error(reason, regs);
37478 +}
37479 +
37480 +/* runs on IST stack. */
37481 +asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
37482 +{
37483 +       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
37484 +               return;
37485 +       }
37486 +       do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
37487 +       return;
37488 +}
37489 +
37490 +/* Help handler running on IST stack to switch back to user stack
37491 +   for scheduling or signal handling. The actual stack switch is done in
37492 +   entry.S */
37493 +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
37494 +{
37495 +       struct pt_regs *regs = eregs;
37496 +       /* Already synced */
37497 +       if (eregs == (struct pt_regs *)eregs->rsp)
37498 +               ;
37499 +       /* Exception from user space */
37500 +       else if (user_mode(eregs))
37501 +               regs = task_pt_regs(current);
37502 +       /* Exception from kernel and interrupts are enabled. Move to
37503 +          kernel process stack. */
37504 +       else if (eregs->eflags & X86_EFLAGS_IF)
37505 +               regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
37506 +       if (eregs != regs)
37507 +               *regs = *eregs;
37508 +       return regs;
37509 +}
37510 +
37511 +/* runs on IST stack. */
37512 +asmlinkage void __kprobes do_debug(struct pt_regs * regs,
37513 +                                  unsigned long error_code)
37514 +{
37515 +       unsigned long condition;
37516 +       struct task_struct *tsk = current;
37517 +       siginfo_t info;
37518 +
37519 +       get_debugreg(condition, 6);
37520 +
37521 +       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
37522 +                                               SIGTRAP) == NOTIFY_STOP)
37523 +               return;
37524 +
37525 +       preempt_conditional_sti(regs);
37526 +
37527 +       /* Mask out spurious debug traps due to lazy DR7 setting */
37528 +       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
37529 +               if (!tsk->thread.debugreg7) { 
37530 +                       goto clear_dr7;
37531 +               }
37532 +       }
37533 +
37534 +       tsk->thread.debugreg6 = condition;
37535 +
37536 +       /* Mask out spurious TF errors due to lazy TF clearing */
37537 +       if (condition & DR_STEP) {
37538 +               /*
37539 +                * The TF error should be masked out only if the current
37540 +                * process is not traced and if the TRAP flag has been set
37541 +                * previously by a tracing process (condition detected by
37542 +                * the PT_DTRACE flag); remember that the i386 TRAP flag
37543 +                * can be modified by the process itself in user mode,
37544 +                * allowing programs to debug themselves without the ptrace()
37545 +                * interface.
37546 +                */
37547 +                if (!user_mode(regs))
37548 +                       goto clear_TF_reenable;
37549 +               /*
37550 +                * Was the TF flag set by a debugger? If so, clear it now,
37551 +                * so that register information is correct.
37552 +                */
37553 +               if (tsk->ptrace & PT_DTRACE) {
37554 +                       regs->eflags &= ~TF_MASK;
37555 +                       tsk->ptrace &= ~PT_DTRACE;
37556 +               }
37557 +       }
37558 +
37559 +       /* Ok, finally something we can handle */
37560 +       tsk->thread.trap_no = 1;
37561 +       tsk->thread.error_code = error_code;
37562 +       info.si_signo = SIGTRAP;
37563 +       info.si_errno = 0;
37564 +       info.si_code = TRAP_BRKPT;
37565 +       info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
37566 +       force_sig_info(SIGTRAP, &info, tsk);
37567 +
37568 +clear_dr7:
37569 +       set_debugreg(0UL, 7);
37570 +       preempt_conditional_cli(regs);
37571 +       return;
37572 +
37573 +clear_TF_reenable:
37574 +       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
37575 +       regs->eflags &= ~TF_MASK;
37576 +       preempt_conditional_cli(regs);
37577 +}
37578 +
37579 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
37580 +{
37581 +       const struct exception_table_entry *fixup;
37582 +       fixup = search_exception_tables(regs->rip);
37583 +       if (fixup) {
37584 +               regs->rip = fixup->fixup;
37585 +               return 1;
37586 +       }
37587 +       notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
37588 +       /* Illegal floating point operation in the kernel */
37589 +       current->thread.trap_no = trapnr;
37590 +       die(str, regs, 0);
37591 +       return 0;
37592 +}
37593 +
37594 +/*
37595 + * Note that we play around with the 'TS' bit in an attempt to get
37596 + * the correct behaviour even in the presence of the asynchronous
37597 + * IRQ13 behaviour
37598 + */
37599 +asmlinkage void do_coprocessor_error(struct pt_regs *regs)
37600 +{
37601 +       void __user *rip = (void __user *)(regs->rip);
37602 +       struct task_struct * task;
37603 +       siginfo_t info;
37604 +       unsigned short cwd, swd;
37605 +
37606 +       conditional_sti(regs);
37607 +       if (!user_mode(regs) &&
37608 +           kernel_math_error(regs, "kernel x87 math error", 16))
37609 +               return;
37610 +
37611 +       /*
37612 +        * Save the info for the exception handler and clear the error.
37613 +        */
37614 +       task = current;
37615 +       save_init_fpu(task);
37616 +       task->thread.trap_no = 16;
37617 +       task->thread.error_code = 0;
37618 +       info.si_signo = SIGFPE;
37619 +       info.si_errno = 0;
37620 +       info.si_code = __SI_FAULT;
37621 +       info.si_addr = rip;
37622 +       /*
37623 +        * (~cwd & swd) will mask out exceptions that are not set to unmasked
37624 +        * status.  0x3f is the exception bits in these regs, 0x200 is the
37625 +        * C1 reg you need in case of a stack fault, 0x040 is the stack
37626 +        * fault bit.  We should only be taking one exception at a time,
37627 +        * so if this combination doesn't produce any single exception,
37628 +        * then we have a bad program that isn't synchronizing its FPU usage
37629 +        * and it will suffer the consequences since we won't be able to
37630 +        * fully reproduce the context of the exception
37631 +        */
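+       /*
+        * Worked example (editor's sketch): with the invalid-operation
+        * exception unmasked in the control word (cwd = 0x037e) and a
+        * stack overflow recorded in the status word (swd = 0x0241),
+        * swd & ~cwd & 0x3f == 0x001, selecting the FPE_FLTINV case
+        * below; swd & 0x240 == 0x240 then identifies the overflow.
+        */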
37632 +       cwd = get_fpu_cwd(task);
37633 +       swd = get_fpu_swd(task);
37634 +       switch (swd & ~cwd & 0x3f) {
37635 +               case 0x000:
37636 +               default:
37637 +                       break;
37638 +               case 0x001: /* Invalid Op */
37639 +                       /*
37640 +                        * swd & 0x240 == 0x040: Stack Underflow
37641 +                        * swd & 0x240 == 0x240: Stack Overflow
37642 +                        * User must clear the SF bit (0x40) if set
37643 +                        */
37644 +                       info.si_code = FPE_FLTINV;
37645 +                       break;
37646 +               case 0x002: /* Denormalize */
37647 +               case 0x010: /* Underflow */
37648 +                       info.si_code = FPE_FLTUND;
37649 +                       break;
37650 +               case 0x004: /* Zero Divide */
37651 +                       info.si_code = FPE_FLTDIV;
37652 +                       break;
37653 +               case 0x008: /* Overflow */
37654 +                       info.si_code = FPE_FLTOVF;
37655 +                       break;
37656 +               case 0x020: /* Precision */
37657 +                       info.si_code = FPE_FLTRES;
37658 +                       break;
37659 +       }
37660 +       force_sig_info(SIGFPE, &info, task);
37661 +}
37662 +
37663 +asmlinkage void bad_intr(void)
37664 +{
37665 +       printk("bad interrupt"); 
37666 +}
37667 +
37668 +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
37669 +{
37670 +       void __user *rip = (void __user *)(regs->rip);
37671 +       struct task_struct * task;
37672 +       siginfo_t info;
37673 +       unsigned short mxcsr;
37674 +
37675 +       conditional_sti(regs);
37676 +       if (!user_mode(regs) &&
37677 +               kernel_math_error(regs, "kernel simd math error", 19))
37678 +               return;
37679 +
37680 +       /*
37681 +        * Save the info for the exception handler and clear the error.
37682 +        */
37683 +       task = current;
37684 +       save_init_fpu(task);
37685 +       task->thread.trap_no = 19;
37686 +       task->thread.error_code = 0;
37687 +       info.si_signo = SIGFPE;
37688 +       info.si_errno = 0;
37689 +       info.si_code = __SI_FAULT;
37690 +       info.si_addr = rip;
37691 +       /*
37692 +        * The SIMD FPU exceptions are handled a little differently, as there
37693 +        * is only a single status/control register.  Thus, to determine which
37694 +        * unmasked exception was caught we must mask the exception mask bits
37695 +        * at 0x1f80, and then use these to mask the exception bits at 0x3f.
37696 +        */
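+       /*
+        * Worked example (editor's sketch): the power-on MXCSR value
+        * 0x1f80 masks every exception.  Clearing the invalid-op mask
+        * (bit 7) gives 0x1f00; an invalid operation then sets flag
+        * bit 0, so mxcsr = 0x1f01 and
+        * ~((0x1f01 & 0x1f80) >> 7) & (0x1f01 & 0x3f) == 0x001,
+        * selecting FPE_FLTINV below.
+        */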
37697 +       mxcsr = get_fpu_mxcsr(task);
37698 +       switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
37699 +               case 0x000:
37700 +               default:
37701 +                       break;
37702 +               case 0x001: /* Invalid Op */
37703 +                       info.si_code = FPE_FLTINV;
37704 +                       break;
37705 +               case 0x002: /* Denormalize */
37706 +               case 0x010: /* Underflow */
37707 +                       info.si_code = FPE_FLTUND;
37708 +                       break;
37709 +               case 0x004: /* Zero Divide */
37710 +                       info.si_code = FPE_FLTDIV;
37711 +                       break;
37712 +               case 0x008: /* Overflow */
37713 +                       info.si_code = FPE_FLTOVF;
37714 +                       break;
37715 +               case 0x020: /* Precision */
37716 +                       info.si_code = FPE_FLTRES;
37717 +                       break;
37718 +       }
37719 +       force_sig_info(SIGFPE, &info, task);
37720 +}
37721 +
37722 +asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
37723 +{
37724 +}
37725 +
37726 +#if 0
37727 +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
37728 +{
37729 +}
37730 +#endif
37731 +
37732 +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
37733 +{
37734 +}
37735 +
37736 +/*
37737 + *  'math_state_restore()' saves the current math information in the
37738 + * old math state array, and gets the new ones from the current task
37739 + *
37740 + * Careful.. There are problems with IBM-designed IRQ13 behaviour.
37741 + * Don't touch unless you *really* know how it works.
37742 + */
37743 +asmlinkage void math_state_restore(void)
37744 +{
37745 +       struct task_struct *me = current;
37746 +        /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */
37747 +
37748 +       if (!used_math())
37749 +               init_fpu(me);
37750 +       restore_fpu_checking(&me->thread.i387.fxsave);
37751 +       task_thread_info(me)->status |= TS_USEDFPU;
37752 +}
37753 +
37754 +
37755 +/*
37756 + * NB. All these are "interrupt gates" (i.e. events_mask is set) because we
37757 + * specify <dpl>|4 in the second field.
37758 + */
37759 +static trap_info_t trap_table[] = {
37760 +        {  0, 0|4, __KERNEL_CS, (unsigned long)divide_error               },
37761 +        {  1, 0|4, __KERNEL_CS, (unsigned long)debug                      },
37762 +        {  3, 3|4, __KERNEL_CS, (unsigned long)int3                       },
37763 +        {  4, 3|4, __KERNEL_CS, (unsigned long)overflow                   },
37764 +        {  5, 0|4, __KERNEL_CS, (unsigned long)bounds                     },
37765 +        {  6, 0|4, __KERNEL_CS, (unsigned long)invalid_op                 },
37766 +        {  7, 0|4, __KERNEL_CS, (unsigned long)device_not_available       },
37767 +        {  9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun},
37768 +        { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS                },
37769 +        { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present        },
37770 +        { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment              },
37771 +        { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection         },
37772 +        { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault                 },
37773 +        { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug     },
37774 +        { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error          },
37775 +        { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check            },
37776 +#ifdef CONFIG_X86_MCE
37777 +        { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check              },
37778 +#endif
37779 +        { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error     },
37780 +#ifdef CONFIG_IA32_EMULATION
37781 +       { IA32_SYSCALL_VECTOR, 3|4, __KERNEL_CS, (unsigned long)ia32_syscall},
37782 +#endif
37783 +        {  0, 0,           0, 0                                              }
37784 +};
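+/*
+ * Editor's note (hedged): in each trap_info_t entry the second field
+ * packs the gate's DPL in bits 0-1, and Xen interprets bit 2 (the |4)
+ * as "disable event delivery on entry", giving the interrupt-gate
+ * semantics the comment above describes.  Vectors 3 and 4 use 3|4 so
+ * that user-mode int3/into instructions may invoke them.
+ */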
37785 +
37786 +void __init trap_init(void)
37787 +{
37788 +        int ret;
37789 +
37790 +        ret = HYPERVISOR_set_trap_table(trap_table);
37791 +        
37792 +        if (ret) 
37793 +                printk("HYPERVISOR_set_trap_table failed: error %d\n",
37794 +                       ret);
37795 +
37796 +       /*
37797 +        * Should be a barrier for any external CPU state.
37798 +        */
37799 +       cpu_init();
37800 +}
37801 +
37802 +void smp_trap_init(trap_info_t *trap_ctxt)
37803 +{
37804 +       trap_info_t *t = trap_table;
37805 +
37806 +       for (t = trap_table; t->address; t++) {
37807 +               trap_ctxt[t->vector].flags = t->flags;
37808 +               trap_ctxt[t->vector].cs = t->cs;
37809 +               trap_ctxt[t->vector].address = t->address;
37810 +       }
37811 +}
37812 +
37813 +
37814 +/* Actual parsing is done early in setup.c. */
37815 +static int __init oops_dummy(char *s)
37816 +{ 
37817 +       panic_on_oops = 1;
37818 +       return -1; 
37819 +} 
37820 +__setup("oops=", oops_dummy); 
37821 +
37822 +static int __init kstack_setup(char *s)
37823 +{
37824 +       kstack_depth_to_print = simple_strtoul(s,NULL,0);
37825 +       return 0;
37826 +}
37827 +__setup("kstack=", kstack_setup);
37828 +
37829 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/vsyscall-xen.c linux-2.6.16/arch/x86_64/kernel/vsyscall-xen.c
37830 --- linux-2.6.16.orig/arch/x86_64/kernel/vsyscall-xen.c 1970-01-01 01:00:00.000000000 +0100
37831 +++ linux-2.6.16/arch/x86_64/kernel/vsyscall-xen.c      2006-06-26 09:51:32.000000000 +0200
37832 @@ -0,0 +1,239 @@
37833 +/*
37834 + *  linux/arch/x86_64/kernel/vsyscall.c
37835 + *
37836 + *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
37837 + *  Copyright 2003 Andi Kleen, SuSE Labs.
37838 + *
37839 + *  Thanks to hpa@transmeta.com for some useful hints.
37840 + *  Special thanks to Ingo Molnar for his early experience with
37841 + *  a different vsyscall implementation for Linux/IA32 and for the name.
37842 + *
37843 + *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
37844 + *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
37845 + *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
37846 + *  jumping out of line if necessary. We cannot add more with this
37847 + *  mechanism because older kernels won't return -ENOSYS.
37848 + *  If we want more than four we need a vDSO.
37849 + *
37850 + *  Note: the concept clashes with user mode linux. If you use UML and
37851 + *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
37852 + */
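+
+/*
+ * Editor's sketch (illustrative only): user code reaches these entry
+ * points at fixed addresses; slot n sits at 0xffffffffff600000 + n*1024,
+ * so a caller could do, e.g.:
+ *
+ *     struct timeval tv;
+ *     typedef int (*vgtod_t)(struct timeval *, struct timezone *);
+ *     ((vgtod_t)0xffffffffff600000UL)(&tv, NULL);
+ */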
37853 +
37854 +#include <linux/time.h>
37855 +#include <linux/init.h>
37856 +#include <linux/kernel.h>
37857 +#include <linux/timer.h>
37858 +#include <linux/seqlock.h>
37859 +#include <linux/jiffies.h>
37860 +#include <linux/sysctl.h>
37861 +
37862 +#include <asm/vsyscall.h>
37863 +#include <asm/pgtable.h>
37864 +#include <asm/page.h>
37865 +#include <asm/fixmap.h>
37866 +#include <asm/errno.h>
37867 +#include <asm/io.h>
37868 +
37869 +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
37870 +
37871 +int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
37872 +seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
37873 +
37874 +#include <asm/unistd.h>
37875 +
37876 +static __always_inline void timeval_normalize(struct timeval * tv)
37877 +{
37878 +       time_t __sec;
37879 +
37880 +       __sec = tv->tv_usec / 1000000;
37881 +       if (__sec) {
37882 +               tv->tv_usec %= 1000000;
37883 +               tv->tv_sec += __sec;
37884 +       }
37885 +}
37886 +
37887 +static __always_inline void do_vgettimeofday(struct timeval * tv)
37888 +{
37889 +       long sequence, t;
37890 +       unsigned long sec, usec;
37891 +
37892 +       do {
37893 +               sequence = read_seqbegin(&__xtime_lock);
37894 +               
37895 +               sec = __xtime.tv_sec;
37896 +               usec = (__xtime.tv_nsec / 1000) +
37897 +                       (__jiffies - __wall_jiffies) * (1000000 / HZ);
37898 +
37899 +               if (__vxtime.mode != VXTIME_HPET) {
37900 +                       t = get_cycles_sync();
37901 +                       if (t < __vxtime.last_tsc)
37902 +                               t = __vxtime.last_tsc;
37903 +                       usec += ((t - __vxtime.last_tsc) *
37904 +                                __vxtime.tsc_quot) >> 32;
37905 +                       /* See comment in x86_64 do_gettimeofday. */
37906 +               } else {
37907 +                       usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
37908 +                                 __vxtime.last) * __vxtime.quot) >> 32;
37909 +               }
37910 +       } while (read_seqretry(&__xtime_lock, sequence));
37911 +
37912 +       tv->tv_sec = sec + usec / 1000000;
37913 +       tv->tv_usec = usec % 1000000;
37914 +}
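+/*
+ * Editor's note (hedged): __vxtime.tsc_quot appears to be a 32.32
+ * fixed-point microseconds-per-cycle factor, so
+ * (delta_cycles * tsc_quot) >> 32 yields elapsed microseconds.  For a
+ * ~1 GHz TSC, tsc_quot would be roughly 0.001 * 2^32 ~= 4294967.
+ */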
37915 +
37916 +/* RED-PEN may want to re-add seq locking, but then the variable should be write-once. */
37917 +static __always_inline void do_get_tz(struct timezone * tz)
37918 +{
37919 +       *tz = __sys_tz;
37920 +}
37921 +
37922 +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
37923 +{
37924 +       int ret;
37925 +       asm volatile("vsysc2: syscall"
37926 +               : "=a" (ret)
37927 +               : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
37928 +       return ret;
37929 +}
37930 +
37931 +static __always_inline long time_syscall(long *t)
37932 +{
37933 +       long secs;
37934 +       asm volatile("vsysc1: syscall"
37935 +               : "=a" (secs)
37936 +               : "0" (__NR_time),"D" (t) : __syscall_clobber);
37937 +       return secs;
37938 +}
37939 +
37940 +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
37941 +{
37942 +       if (unlikely(!__sysctl_vsyscall))
37943 +               return gettimeofday(tv,tz);
37944 +       if (tv)
37945 +               do_vgettimeofday(tv);
37946 +       if (tz)
37947 +               do_get_tz(tz);
37948 +       return 0;
37949 +}
37950 +
37951 +/* This will break when the xtime seconds get inaccurate, but that is
37952 + * unlikely */
37953 +time_t __vsyscall(1) vtime(time_t *t)
37954 +{
37955 +       if (unlikely(!__sysctl_vsyscall))
37956 +               return time_syscall(t);
37957 +       else if (t)
37958 +               *t = __xtime.tv_sec;            
37959 +       return __xtime.tv_sec;
37960 +}
37961 +
37962 +long __vsyscall(2) venosys_0(void)
37963 +{
37964 +       return -ENOSYS;
37965 +}
37966 +
37967 +long __vsyscall(3) venosys_1(void)
37968 +{
37969 +       return -ENOSYS;
37970 +}
37971 +
37972 +#ifdef CONFIG_SYSCTL
37973 +
37974 +#define SYSCALL 0x050f
37975 +#define NOP2    0x9090
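+
+/*
+ * Editor's note: the x86-64 syscall instruction encodes as the bytes
+ * 0f 05, which read as the little-endian 16-bit value 0x050f at the
+ * vsysc1/vsysc2 labels; 0x9090 is two one-byte NOPs used to patch the
+ * syscall out when the fast vsyscall path is enabled.
+ */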
37976 +
37977 +/*
37978 + * NOP out syscall in vsyscall page when not needed.
37979 + */
37980 +static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
37981 +                        void __user *buffer, size_t *lenp, loff_t *ppos)
37982 +{
37983 +       extern u16 vsysc1, vsysc2;
37984 +       u16 *map1, *map2;
37985 +       int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
37986 +       if (!write)
37987 +               return ret;
37988 +       /* gcc has some trouble with __va(__pa()), so just do it this
37989 +          way. */
37990 +       map1 = ioremap(__pa_symbol(&vsysc1), 2);
37991 +       if (!map1)
37992 +               return -ENOMEM;
37993 +       map2 = ioremap(__pa_symbol(&vsysc2), 2);
37994 +       if (!map2) {
37995 +               ret = -ENOMEM;
37996 +               goto out;
37997 +       }
37998 +       if (!sysctl_vsyscall) {
37999 +               *map1 = SYSCALL;
38000 +               *map2 = SYSCALL;
38001 +       } else {
38002 +               *map1 = NOP2;
38003 +               *map2 = NOP2;
38004 +       }
38005 +       iounmap(map2);
38006 +out:
38007 +       iounmap(map1);
38008 +       return ret;
38009 +}
38010 +
38011 +static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
38012 +                               void __user *oldval, size_t __user *oldlenp,
38013 +                               void __user *newval, size_t newlen,
38014 +                               void **context)
38015 +{
38016 +       return -ENOSYS;
38017 +}
38018 +
38019 +static ctl_table kernel_table2[] = {
38020 +       { .ctl_name = 99, .procname = "vsyscall64",
38021 +         .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
38022 +         .strategy = vsyscall_sysctl_nostrat,
38023 +         .proc_handler = vsyscall_sysctl_change },
38024 +       { 0, }
38025 +};
38026 +
38027 +static ctl_table kernel_root_table2[] = {
38028 +       { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
38029 +         .child = kernel_table2 },
38030 +       { 0 },
38031 +};
38032 +
38033 +#endif
38034 +
38035 +static void __init map_vsyscall(void)
38036 +{
38037 +       extern char __vsyscall_0;
38038 +       unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
38039 +
38040 +       __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
38041 +}
38042 +
38043 +#ifdef CONFIG_XEN
38044 +static void __init map_vsyscall_user(void)
38045 +{
38046 +       extern void __set_fixmap_user(enum fixed_addresses, unsigned long, pgprot_t);
38047 +       extern char __vsyscall_0;
38048 +       unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
38049 +
38050 +       __set_fixmap_user(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
38051 +}
38052 +#endif
38053 +
38054 +static int __init vsyscall_init(void)
38055 +{
38056 +       BUG_ON(((unsigned long) &vgettimeofday !=
38057 +                       VSYSCALL_ADDR(__NR_vgettimeofday)));
38058 +       BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
38059 +       BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
38060 +       map_vsyscall();
38061 +#ifdef CONFIG_XEN
38062 +       map_vsyscall_user();
38063 +       sysctl_vsyscall = 0; /* disable vgettimeofday() */
38064 +#endif
38065 +#ifdef CONFIG_SYSCTL
38066 +       register_sysctl_table(kernel_root_table2, 0);
38067 +#endif
38068 +       return 0;
38069 +}
38070 +
38071 +__initcall(vsyscall_init);
38072 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/x8664_ksyms-xen.c linux-2.6.16/arch/x86_64/kernel/x8664_ksyms-xen.c
38073 --- linux-2.6.16.orig/arch/x86_64/kernel/x8664_ksyms-xen.c      1970-01-01 01:00:00.000000000 +0100
38074 +++ linux-2.6.16/arch/x86_64/kernel/x8664_ksyms-xen.c   2006-06-26 09:51:32.000000000 +0200
38075 @@ -0,0 +1,163 @@
38076 +#include <linux/config.h>
38077 +#include <linux/module.h>
38078 +#include <linux/smp.h>
38079 +#include <linux/user.h>
38080 +#include <linux/sched.h>
38081 +#include <linux/in6.h>
38082 +#include <linux/interrupt.h>
38083 +#include <linux/smp_lock.h>
38084 +#include <linux/pm.h>
38085 +#include <linux/pci.h>
38086 +#include <linux/apm_bios.h>
38087 +#include <linux/kernel.h>
38088 +#include <linux/string.h>
38089 +#include <linux/syscalls.h>
38090 +#include <linux/tty.h>
38091 +
38092 +#include <asm/semaphore.h>
38093 +#include <asm/processor.h>
38094 +#include <asm/i387.h>
38095 +#include <asm/uaccess.h>
38096 +#include <asm/checksum.h>
38097 +#include <asm/io.h>
38098 +#include <asm/delay.h>
38099 +#include <asm/irq.h>
38100 +#include <asm/mmx.h>
38101 +#include <asm/desc.h>
38102 +#include <asm/pgtable.h>
38103 +#include <asm/pgalloc.h>
38104 +#include <asm/nmi.h>
38105 +#include <asm/kdebug.h>
38106 +#include <asm/unistd.h>
38107 +#include <asm/tlbflush.h>
38108 +#include <asm/kdebug.h>
38109 +
38110 +#ifdef CONFIG_SMP
38111 +extern void __write_lock_failed(rwlock_t *rw);
38112 +extern void __read_lock_failed(rwlock_t *rw);
38113 +#endif
38114 +
38115 +/* platform dependent support */
38116 +EXPORT_SYMBOL(boot_cpu_data);
38117 +//EXPORT_SYMBOL(dump_fpu);
38118 +EXPORT_SYMBOL(kernel_thread);
38119 +EXPORT_SYMBOL(pm_idle);
38120 +EXPORT_SYMBOL(pm_power_off);
38121 +
38122 +EXPORT_SYMBOL(__down_failed);
38123 +EXPORT_SYMBOL(__down_failed_interruptible);
38124 +EXPORT_SYMBOL(__down_failed_trylock);
38125 +EXPORT_SYMBOL(__up_wakeup);
38126 +/* Networking helper routines. */
38127 +EXPORT_SYMBOL(csum_partial_copy_nocheck);
38128 +EXPORT_SYMBOL(ip_compute_csum);
38129 +/* Delay loops */
38130 +EXPORT_SYMBOL(__udelay);
38131 +EXPORT_SYMBOL(__ndelay);
38132 +EXPORT_SYMBOL(__delay);
38133 +EXPORT_SYMBOL(__const_udelay);
38134 +
38135 +EXPORT_SYMBOL(__get_user_1);
38136 +EXPORT_SYMBOL(__get_user_2);
38137 +EXPORT_SYMBOL(__get_user_4);
38138 +EXPORT_SYMBOL(__get_user_8);
38139 +EXPORT_SYMBOL(__put_user_1);
38140 +EXPORT_SYMBOL(__put_user_2);
38141 +EXPORT_SYMBOL(__put_user_4);
38142 +EXPORT_SYMBOL(__put_user_8);
38143 +
38144 +EXPORT_SYMBOL(strncpy_from_user);
38145 +EXPORT_SYMBOL(__strncpy_from_user);
38146 +EXPORT_SYMBOL(clear_user);
38147 +EXPORT_SYMBOL(__clear_user);
38148 +EXPORT_SYMBOL(copy_user_generic);
38149 +EXPORT_SYMBOL(copy_from_user);
38150 +EXPORT_SYMBOL(copy_to_user);
38151 +EXPORT_SYMBOL(copy_in_user);
38152 +EXPORT_SYMBOL(strnlen_user);
38153 +
38154 +#ifdef CONFIG_PCI
38155 +EXPORT_SYMBOL(pci_mem_start);
38156 +#endif
38157 +
38158 +EXPORT_SYMBOL(copy_page);
38159 +EXPORT_SYMBOL(clear_page);
38160 +
38161 +EXPORT_SYMBOL(_cpu_pda);
38162 +#ifdef CONFIG_SMP
38163 +EXPORT_SYMBOL(__write_lock_failed);
38164 +EXPORT_SYMBOL(__read_lock_failed);
38165 +
38166 +EXPORT_SYMBOL(smp_call_function);
38167 +#endif
38168 +
38169 +#ifdef CONFIG_VT
38170 +EXPORT_SYMBOL(screen_info);
38171 +#endif
38172 +
38173 +EXPORT_SYMBOL(get_wchan);
38174 +
38175 +#ifdef CONFIG_X86_LOCAL_APIC
38176 +EXPORT_SYMBOL_GPL(set_nmi_callback);
38177 +EXPORT_SYMBOL_GPL(unset_nmi_callback);
38178 +#endif
38179 +
38180 +/* Export string functions. We normally rely on gcc builtin for most of these,
38181 +   but gcc sometimes decides not to inline them. */    
38182 +#undef memcpy
38183 +#undef memset
38184 +#undef memmove
38185 +#undef strlen
38186 +
38187 +extern void * memset(void *,int,__kernel_size_t);
38188 +extern size_t strlen(const char *);
38189 +extern void * memmove(void * dest,const void *src,size_t count);
38190 +extern void * memcpy(void *,const void *,__kernel_size_t);
38191 +extern void * __memcpy(void *,const void *,__kernel_size_t);
38192 +
38193 +EXPORT_SYMBOL(memset);
38194 +EXPORT_SYMBOL(strlen);
38195 +EXPORT_SYMBOL(memmove);
38196 +EXPORT_SYMBOL(memcpy);
38197 +EXPORT_SYMBOL(__memcpy);
38198 +
38199 +#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
38200 +/* prototypes are wrong, these are assembly with custom calling conventions */
38201 +extern void rwsem_down_read_failed_thunk(void);
38202 +extern void rwsem_wake_thunk(void);
38203 +extern void rwsem_downgrade_thunk(void);
38204 +extern void rwsem_down_write_failed_thunk(void);
38205 +EXPORT_SYMBOL(rwsem_down_read_failed_thunk);
38206 +EXPORT_SYMBOL(rwsem_wake_thunk);
38207 +EXPORT_SYMBOL(rwsem_downgrade_thunk);
38208 +EXPORT_SYMBOL(rwsem_down_write_failed_thunk);
38209 +#endif
38210 +
38211 +EXPORT_SYMBOL(empty_zero_page);
38212 +
38213 +EXPORT_SYMBOL(die_chain);
38214 +EXPORT_SYMBOL(register_die_notifier);
38215 +
38216 +#ifdef CONFIG_SMP
38217 +EXPORT_SYMBOL(cpu_sibling_map);
38218 +EXPORT_SYMBOL(smp_num_siblings);
38219 +#endif
38220 +
38221 +extern void do_softirq_thunk(void);
38222 +EXPORT_SYMBOL(do_softirq_thunk);
38223 +
38224 +#ifdef CONFIG_BUG
38225 +EXPORT_SYMBOL(out_of_line_bug);
38226 +#endif
38227 +
38228 +EXPORT_SYMBOL(init_level4_pgt);
38229 +
38230 +extern unsigned long __supported_pte_mask;
38231 +EXPORT_SYMBOL(__supported_pte_mask);
38232 +
38233 +#ifdef CONFIG_SMP
38234 +EXPORT_SYMBOL(flush_tlb_page);
38235 +#endif
38236 +
38237 +EXPORT_SYMBOL(load_gs_index);
38238 +
38239 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/kernel/xen_entry.S linux-2.6.16/arch/x86_64/kernel/xen_entry.S
38240 --- linux-2.6.16.orig/arch/x86_64/kernel/xen_entry.S    1970-01-01 01:00:00.000000000 +0100
38241 +++ linux-2.6.16/arch/x86_64/kernel/xen_entry.S 2006-06-26 09:51:32.000000000 +0200
38242 @@ -0,0 +1,40 @@
38243 +/*
38244 + * Copied from arch/xen/i386/kernel/entry.S
38245 + */                        
38246 +/* Offsets into shared_info_t. */                
38247 +#define evtchn_upcall_pending          /* 0 */
38248 +#define evtchn_upcall_mask             1
38249 +
38250 +#define sizeof_vcpu_shift              6
38251 +
38252 +#ifdef CONFIG_SMP
38253 +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg)
38254 +//#define preempt_enable(reg)  decl threadinfo_preempt_count(reg)
38255 +#define preempt_disable(reg)
38256 +#define preempt_enable(reg)
38257 +#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp)                   ; \
38258 +                               movq %gs:pda_cpunumber,reg              ; \
38259 +                               shl  $32, reg                           ; \
38260 +                               shr  $32-sizeof_vcpu_shift,reg          ; \
38261 +                               addq HYPERVISOR_shared_info,reg
38262 +#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp)                    ; \
38263 +#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
38264 +#else
38265 +#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg
38266 +#define XEN_PUT_VCPU_INFO(reg)
38267 +#define XEN_PUT_VCPU_INFO_fixup
38268 +#endif
38269 +
38270 +#define XEN_LOCKED_BLOCK_EVENTS(reg)   movb $1,evtchn_upcall_mask(reg)
38271 +#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
38272 +#define XEN_BLOCK_EVENTS(reg)  XEN_GET_VCPU_INFO(reg)                  ; \
38273 +                               XEN_LOCKED_BLOCK_EVENTS(reg)            ; \
38274 +                               XEN_PUT_VCPU_INFO(reg)
38275 +#define XEN_UNBLOCK_EVENTS(reg)        XEN_GET_VCPU_INFO(reg)                  ; \
38276 +                               XEN_LOCKED_UNBLOCK_EVENTS(reg)          ; \
38277 +                               XEN_PUT_VCPU_INFO(reg)
38278 +#define XEN_TEST_PENDING(reg)  testb $0xFF,evtchn_upcall_pending(reg)
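+
+/*
+ * Editor's note (hedged): evtchn_upcall_mask and evtchn_upcall_pending
+ * are per-VCPU bytes in the Xen shared info page.  Setting the mask
+ * byte blocks event (virtual interrupt) delivery much like cli, and
+ * clearing it re-enables delivery like sti; the pending byte is set by
+ * Xen while an upcall is outstanding, which XEN_TEST_PENDING checks.
+ */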
38279 +
38280 +VGCF_IN_SYSCALL = (1<<8)
38281 +        
38282 +       
38283 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/mm/Makefile linux-2.6.16/arch/x86_64/mm/Makefile
38284 --- linux-2.6.16.orig/arch/x86_64/mm/Makefile   2006-03-20 06:53:29.000000000 +0100
38285 +++ linux-2.6.16/arch/x86_64/mm/Makefile        2006-06-26 09:51:32.000000000 +0200
38286 @@ -9,3 +9,13 @@
38287  obj-$(CONFIG_ACPI_NUMA) += srat.o
38288  
38289  hugetlbpage-y = ../../i386/mm/hugetlbpage.o
38290 +
38291 +ifdef CONFIG_XEN
38292 +include $(srctree)/scripts/Makefile.xen
38293 +
38294 +ioremap-y      += ../../i386/mm/ioremap-xen.o
38295 +hypervisor-y   += ../../i386/mm/hypervisor.o
38296 +obj-y          += hypervisor.o
38297 +
38298 +obj-y := $(call cherrypickxen, $(obj-y))
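+# Editor's note (hedged): cherrypickxen, defined in scripts/Makefile.xen,
+# is believed to substitute foo-xen.o for foo.o wherever a Xen-specific
+# variant of a listed object exists.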
38299 +endif
38300 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/mm/fault-xen.c linux-2.6.16/arch/x86_64/mm/fault-xen.c
38301 --- linux-2.6.16.orig/arch/x86_64/mm/fault-xen.c        1970-01-01 01:00:00.000000000 +0100
38302 +++ linux-2.6.16/arch/x86_64/mm/fault-xen.c     2006-06-26 09:51:32.000000000 +0200
38303 @@ -0,0 +1,595 @@
38304 +/*
38305 + *  linux/arch/x86-64/mm/fault.c
38306 + *
38307 + *  Copyright (C) 1995  Linus Torvalds
38308 + *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
38309 + */
38310 +
38311 +#include <linux/config.h>
38312 +#include <linux/signal.h>
38313 +#include <linux/sched.h>
38314 +#include <linux/kernel.h>
38315 +#include <linux/errno.h>
38316 +#include <linux/string.h>
38317 +#include <linux/types.h>
38318 +#include <linux/ptrace.h>
38319 +#include <linux/mman.h>
38320 +#include <linux/mm.h>
38321 +#include <linux/smp.h>
38322 +#include <linux/smp_lock.h>
38323 +#include <linux/interrupt.h>
38324 +#include <linux/init.h>
38325 +#include <linux/tty.h>
38326 +#include <linux/vt_kern.h>             /* For unblank_screen() */
38327 +#include <linux/compiler.h>
38328 +#include <linux/module.h>
38329 +#include <linux/kprobes.h>
38330 +
38331 +#include <asm/system.h>
38332 +#include <asm/uaccess.h>
38333 +#include <asm/pgalloc.h>
38334 +#include <asm/smp.h>
38335 +#include <asm/tlbflush.h>
38336 +#include <asm/proto.h>
38337 +#include <asm/kdebug.h>
38338 +#include <asm-generic/sections.h>
38339 +
38340 +/* Page fault error code bits */
38341 +#define PF_PROT        (1<<0)          /* or no page found */
38342 +#define PF_WRITE       (1<<1)
38343 +#define PF_USER        (1<<2)
38344 +#define PF_RSVD        (1<<3)
38345 +#define PF_INSTR       (1<<4)
38346 +
38347 +void bust_spinlocks(int yes)
38348 +{
38349 +       int loglevel_save = console_loglevel;
38350 +       if (yes) {
38351 +               oops_in_progress = 1;
38352 +       } else {
38353 +#ifdef CONFIG_VT
38354 +               unblank_screen();
38355 +#endif
38356 +               oops_in_progress = 0;
38357 +               /*
38358 +                * OK, the message is on the console.  Now we call printk()
38359 +                * without oops_in_progress set so that printk will give klogd
38360 +                * a poke.  Hold onto your hats...
38361 +                */
38362 +               console_loglevel = 15;          /* NMI oopser may have shut the console up */
38363 +               printk(" ");
38364 +               console_loglevel = loglevel_save;
38365 +       }
38366 +}
38367 +
38368 +/* Sometimes the CPU reports invalid exceptions on prefetch.
38369 +   Check that here and ignore.
38370 +   Opcode checker based on code by Richard Brunner */
38371 +static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
38372 +                               unsigned long error_code)
38373 +{ 
38374 +       unsigned char *instr;
38375 +       int scan_more = 1;
38376 +       int prefetch = 0; 
38377 +       unsigned char *max_instr;
38378 +
38379 +       /* If it was an exec fault, ignore */
38380 +       if (error_code & PF_INSTR)
38381 +               return 0;
38382 +       
38383 +       instr = (unsigned char *)convert_rip_to_linear(current, regs);
38384 +       max_instr = instr + 15;
38385 +
38386 +       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
38387 +               return 0;
38388 +
38389 +       while (scan_more && instr < max_instr) { 
38390 +               unsigned char opcode;
38391 +               unsigned char instr_hi;
38392 +               unsigned char instr_lo;
38393 +
38394 +               if (__get_user(opcode, instr))
38395 +                       break; 
38396 +
38397 +               instr_hi = opcode & 0xf0; 
38398 +               instr_lo = opcode & 0x0f; 
38399 +               instr++;
38400 +
38401 +               switch (instr_hi) { 
38402 +               case 0x20:
38403 +               case 0x30:
38404 +                       /* Values 0x26,0x2E,0x36,0x3E are valid x86
38405 +                          prefixes.  In long mode, the CPU will signal
38406 +                          invalid opcode if some of these prefixes are
38407 +                          present so we will never get here anyway */
38408 +                       scan_more = ((instr_lo & 7) == 0x6);
38409 +                       break;
38410 +                       
38411 +               case 0x40:
38412 +                       /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
38413 +                          Need to figure out under what instruction mode the
38414 +                          instruction was issued ... */
38415 +                       /* Could check the LDT for lm, but for now it's good
38416 +                          enough to assume that long mode only uses well known
38417 +                          segments or kernel. */
38418 +                       scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
38419 +                       break;
38420 +                       
38421 +               case 0x60:
38422 +                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
38423 +                       scan_more = (instr_lo & 0xC) == 0x4;
38424 +                       break;          
38425 +               case 0xF0:
38426 +                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
38427 +                       scan_more = !instr_lo || (instr_lo>>1) == 1;
38428 +                       break;                  
38429 +               case 0x00:
38430 +                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
38431 +                       scan_more = 0;
38432 +                       if (__get_user(opcode, instr)) 
38433 +                               break;
38434 +                       prefetch = (instr_lo == 0xF) &&
38435 +                               (opcode == 0x0D || opcode == 0x18);
38436 +                       break;                  
38437 +               default:
38438 +                       scan_more = 0;
38439 +                       break;
38440 +               } 
38441 +       }
38442 +       return prefetch;
38443 +}
38444 +
38445 +static int bad_address(void *p) 
38446 +{ 
38447 +       unsigned long dummy;
38448 +       return __get_user(dummy, (unsigned long *)p);
38449 +} 
38450 +
38451 +void dump_pagetable(unsigned long address)
38452 +{
38453 +       pgd_t *pgd;
38454 +       pud_t *pud;
38455 +       pmd_t *pmd;
38456 +       pte_t *pte;
38457 +
38458 +       pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
38459 +       pgd += pgd_index(address);
38460 +       if (bad_address(pgd)) goto bad;
38461 +       printk("PGD %lx ", pgd_val(*pgd));
38462 +       if (!pgd_present(*pgd)) goto ret; 
38463 +
38464 +       pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
38465 +       if (bad_address(pud)) goto bad;
38466 +       printk("PUD %lx ", pud_val(*pud));
38467 +       if (!pud_present(*pud)) goto ret;
38468 +
38469 +       pmd = pmd_offset(pud, address);
38470 +       if (bad_address(pmd)) goto bad;
38471 +       printk("PMD %lx ", pmd_val(*pmd));
38472 +       if (!pmd_present(*pmd)) goto ret;        
38473 +
38474 +       pte = pte_offset_kernel(pmd, address);
38475 +       if (bad_address(pte)) goto bad;
38476 +       printk("PTE %lx", pte_val(*pte)); 
38477 +ret:
38478 +       printk("\n");
38479 +       return;
38480 +bad:
38481 +       printk("BAD\n");
38482 +}
38483 +
38484 +static const char errata93_warning[] = 
38485 +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 erratum #93\n"
38486 +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
38487 +KERN_ERR "******* Please consider a BIOS update.\n"
38488 +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
38489 +
38490 +/* Workaround for K8 erratum #93 & buggy BIOS.
38491 +   BIOS SMM functions are required to use a specific workaround
38492 +   to avoid corruption of the 64bit RIP register on C stepping K8. 
38493 +   A lot of BIOSes that didn't get tested properly miss this. 
38494 +   The OS sees this as a page fault with the upper 32bits of RIP cleared.
38495 +   Try to work around it here.
38496 +   Note we only handle faults in kernel here. */
38497 +
38498 +static int is_errata93(struct pt_regs *regs, unsigned long address) 
38499 +{
38500 +       static int warned;
38501 +       if (address != regs->rip)
38502 +               return 0;
38503 +       if ((address >> 32) != 0) 
38504 +               return 0;
38505 +       address |= 0xffffffffUL << 32;
38506 +       if ((address >= (u64)_stext && address <= (u64)_etext) || 
38507 +           (address >= MODULES_VADDR && address <= MODULES_END)) { 
38508 +               if (!warned) {
38509 +                       printk(errata93_warning);               
38510 +                       warned = 1;
38511 +               }
38512 +               regs->rip = address;
38513 +               return 1;
38514 +       }
38515 +       return 0;
38516 +} 
38517 +
38518 +int unhandled_signal(struct task_struct *tsk, int sig)
38519 +{
38520 +       if (tsk->pid == 1)
38521 +               return 1;
38522 +       if (tsk->ptrace & PT_PTRACED)
38523 +               return 0;
38524 +       return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
38525 +               (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
38526 +}
38527 +
38528 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
38529 +                                unsigned long error_code)
38530 +{
38531 +       unsigned long flags = oops_begin();
38532 +       struct task_struct *tsk;
38533 +
38534 +       printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
38535 +              current->comm, address);
38536 +       dump_pagetable(address);
38537 +       tsk = current;
38538 +       tsk->thread.cr2 = address;
38539 +       tsk->thread.trap_no = 14;
38540 +       tsk->thread.error_code = error_code;
38541 +       __die("Bad pagetable", regs, error_code);
38542 +       oops_end(flags);
38543 +       do_exit(SIGKILL);
38544 +}
38545 +
38546 +/*
38547 + * Handle a fault on the vmalloc area
38548 + *
38549 + * This assumes no large pages in there.
38550 + */
38551 +static int vmalloc_fault(unsigned long address)
38552 +{
38553 +       pgd_t *pgd, *pgd_ref;
38554 +       pud_t *pud, *pud_ref;
38555 +       pmd_t *pmd, *pmd_ref;
38556 +       pte_t *pte, *pte_ref;
38557 +
38558 +       /* Copy kernel mappings over when needed. This can also
38559 +          happen within a race in page table update. In the latter
38560 +          case just flush. */
38561 +
38562 +       /* On Xen the line below does not always work. Needs investigating! */
38563 +       /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
38564 +       pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
38565 +       pgd += pgd_index(address);
38566 +       pgd_ref = pgd_offset_k(address);
38567 +       if (pgd_none(*pgd_ref))
38568 +               return -1;
38569 +       if (pgd_none(*pgd))
38570 +               set_pgd(pgd, *pgd_ref);
38571 +
38572 +       /* Below here mismatches are bugs because these lower tables
38573 +          are shared */
38574 +
38575 +       pud = pud_offset(pgd, address);
38576 +       pud_ref = pud_offset(pgd_ref, address);
38577 +       if (pud_none(*pud_ref))
38578 +               return -1;
38579 +       if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
38580 +               BUG();
38581 +       pmd = pmd_offset(pud, address);
38582 +       pmd_ref = pmd_offset(pud_ref, address);
38583 +       if (pmd_none(*pmd_ref))
38584 +               return -1;
38585 +       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
38586 +               BUG();
38587 +       pte_ref = pte_offset_kernel(pmd_ref, address);
38588 +       if (!pte_present(*pte_ref))
38589 +               return -1;
38590 +       pte = pte_offset_kernel(pmd, address);
38591 +       /* Don't use pte_page here, because the mappings can point
38592 +          outside mem_map, and the NUMA hash lookup cannot handle
38593 +          that. */
38594 +       if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
38595 +               BUG();
38596 +       return 0;
38597 +}
38598 +
38599 +int page_fault_trace = 0;
38600 +int exception_trace = 1;
38601 +
38602 +
38603 +#define MEM_VERBOSE 1
38604 +
38605 +#ifdef MEM_VERBOSE
38606 +#define MEM_LOG(_f, _a...)                     \
38607 +       printk("fault.c:[%d]-> " _f "\n",       \
38608 +       __LINE__ , ## _a )
38609 +#else
38610 +#define MEM_LOG(_f, _a...) ((void)0)
38611 +#endif
38612 +
38613 +/*
38614 + * This routine handles page faults.  It determines the address,
38615 + * and the problem, and then passes it off to one of the appropriate
38616 + * routines.
38617 + */
38618 +asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
38619 +                                       unsigned long error_code)
38620 +{
38621 +       struct task_struct *tsk;
38622 +       struct mm_struct *mm;
38623 +       struct vm_area_struct * vma;
38624 +       unsigned long address;
38625 +       const struct exception_table_entry *fixup;
38626 +       int write;
38627 +       unsigned long flags;
38628 +       siginfo_t info;
38629 +
38630 +       if (!user_mode(regs))
38631 +               error_code &= ~PF_USER; /* means kernel */
38632 +
38633 +       /* get the address */
38634 +       address = HYPERVISOR_shared_info->vcpu_info[
38635 +               smp_processor_id()].arch.cr2;
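+       /*
+        * Editor's note (hedged): a paravirtualized guest cannot read
+        * %cr2 directly, so Xen saves the faulting address in the
+        * per-VCPU vcpu_info structure, which is read above in place
+        * of the cr2 register.
+        */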
38636 +       if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
38637 +                                       SIGSEGV) == NOTIFY_STOP)
38638 +               return;
38639 +
38640 +       if (likely(regs->eflags & X86_EFLAGS_IF))
38641 +               local_irq_enable();
38642 +
38643 +       if (unlikely(page_fault_trace))
38644 +               printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
38645 +                      regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 
38646 +
38647 +       tsk = current;
38648 +       mm = tsk->mm;
38649 +       info.si_code = SEGV_MAPERR;
38650 +
38651 +
38652 +       /*
38653 +        * We fault-in kernel-space virtual memory on-demand. The
38654 +        * 'reference' page table is init_mm.pgd.
38655 +        *
38656 +        * NOTE! We MUST NOT take any locks for this case. We may
38657 +        * be in an interrupt or a critical region, and should
38658 +        * only copy the information from the master page table,
38659 +        * nothing more.
38660 +        *
38661 +        * This verifies that the fault happens in kernel space
38662 +        * (error_code & 4) == 0, and that the fault was not a
38663 +        * protection error (error_code & 9) == 0.
38664 +        */
38665 +       if (unlikely(address >= TASK_SIZE64)) {
38666 +               /*
38667 +                * Must check for the entire kernel range here: with writable
38668 +                * page tables the hypervisor may temporarily clear PMD
38669 +                * entries.
38670 +                */
38671 +               if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
38672 +                   address >= PAGE_OFFSET) {
38673 +                       if (vmalloc_fault(address) < 0)
38674 +                               goto bad_area_nosemaphore;
38675 +                       return;
38676 +               }
38677 +               /*
38678 +                * Don't take the mm semaphore here. If we fixup a prefetch
38679 +                * fault we could otherwise deadlock.
38680 +                */
38681 +               goto bad_area_nosemaphore;
38682 +       }
38683 +
38684 +       if (unlikely(error_code & PF_RSVD))
38685 +               pgtable_bad(address, regs, error_code);
38686 +
38687 +       /*
38688 +        * If we're in an interrupt or have no user
38689 +        * context, we must not take the fault..
38690 +        */
38691 +       if (unlikely(in_atomic() || !mm))
38692 +               goto bad_area_nosemaphore;
38693 +
38694 + again:
38695 +       /* When running in the kernel we expect faults to occur only to
38696 +        * addresses in user space.  All other faults represent errors in the
38697 +        * kernel and should generate an OOPS.  Unfortunately, in the case of an
38698 +        * erroneous fault occurring in a code path which already holds mmap_sem
38699 +        * we will deadlock attempting to validate the fault against the
38700 +        * address space.  Luckily the kernel only validly references user
38701 +        * space from well defined areas of code, which are listed in the
38702 +        * exceptions table.
38703 +        *
38704 +        * As the vast majority of faults will be valid we will only perform
38705 +        * the source reference check when there is a possibility of a deadlock.
38706 +        * Attempt to lock the address space, if we cannot we then validate the
38707 +        * source.  If this is invalid we can skip the address space check,
38708 +        * thus avoiding the deadlock.
38709 +        */
38710 +       if (!down_read_trylock(&mm->mmap_sem)) {
38711 +               if ((error_code & PF_USER) == 0 &&
38712 +                   !search_exception_tables(regs->rip))
38713 +                       goto bad_area_nosemaphore;
38714 +               down_read(&mm->mmap_sem);
38715 +       }
38716 +
38717 +       vma = find_vma(mm, address);
38718 +       if (!vma)
38719 +               goto bad_area;
38720 +       if (likely(vma->vm_start <= address))
38721 +               goto good_area;
38722 +       if (!(vma->vm_flags & VM_GROWSDOWN))
38723 +               goto bad_area;
38724 +       if (error_code & 4) {
38725 +               // XXX: align red zone size with ABI 
38726 +               if (address + 128 < regs->rsp)
38727 +                       goto bad_area;
38728 +       }
38729 +       if (expand_stack(vma, address))
38730 +               goto bad_area;
38731 +/*
38732 + * Ok, we have a good vm_area for this memory access, so
38733 + * we can handle it..
38734 + */
38735 +good_area:
38736 +       info.si_code = SEGV_ACCERR;
38737 +       write = 0;
38738 +       switch (error_code & (PF_PROT|PF_WRITE)) {
38739 +               default:        /* 3: write, present */
38740 +                       /* fall through */
38741 +               case PF_WRITE:          /* write, not present */
38742 +                       if (!(vma->vm_flags & VM_WRITE))
38743 +                               goto bad_area;
38744 +                       write++;
38745 +                       break;
38746 +               case PF_PROT:           /* read, present */
38747 +                       goto bad_area;
38748 +               case 0:                 /* read, not present */
38749 +                       if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
38750 +                               goto bad_area;
38751 +       }
38752 +
38753 +       /*
38754 +        * If for any reason at all we couldn't handle the fault,
38755 +        * make sure we exit gracefully rather than endlessly redo
38756 +        * the fault.
38757 +        */
38758 +       switch (handle_mm_fault(mm, vma, address, write)) {
38759 +       case VM_FAULT_MINOR:
38760 +               tsk->min_flt++;
38761 +               break;
38762 +       case VM_FAULT_MAJOR:
38763 +               tsk->maj_flt++;
38764 +               break;
38765 +       case VM_FAULT_SIGBUS:
38766 +               goto do_sigbus;
38767 +       default:
38768 +               goto out_of_memory;
38769 +       }
38770 +
38771 +       up_read(&mm->mmap_sem);
38772 +       return;
38773 +
38774 +/*
38775 + * Something tried to access memory that isn't in our memory map..
38776 + * Fix it, but check if it's kernel or user first..
38777 + */
38778 +bad_area:
38779 +       up_read(&mm->mmap_sem);
38780 +
38781 +bad_area_nosemaphore:
38782 +       /* User mode accesses just cause a SIGSEGV */
38783 +       if (error_code & PF_USER) {
38784 +               if (is_prefetch(regs, address, error_code))
38785 +                       return;
38786 +
38787 +               /* Work around K8 erratum #100 K8 in compat mode
38788 +                  occasionally jumps to illegal addresses >4GB.  We
38789 +                  catch this here in the page fault handler because
38790 +                  these addresses are not reachable. Just detect this
38791 +                  case and return.  Any code segment in LDT is
38792 +                  compatibility mode. */
38793 +               if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
38794 +                   (address >> 32))
38795 +                       return;
38796 +
38797 +               if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
38798 +                       printk(
38799 +                      "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
38800 +                                       tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
38801 +                                       tsk->comm, tsk->pid, address, regs->rip,
38802 +                                       regs->rsp, error_code);
38803 +               }
38804 +       
38805 +               tsk->thread.cr2 = address;
38806 +               /* Kernel addresses are always protection faults */
38807 +               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
38808 +               tsk->thread.trap_no = 14;
38809 +               info.si_signo = SIGSEGV;
38810 +               info.si_errno = 0;
38811 +               /* info.si_code has been set above */
38812 +               info.si_addr = (void __user *)address;
38813 +               force_sig_info(SIGSEGV, &info, tsk);
38814 +               return;
38815 +       }
38816 +
38817 +no_context:
38818 +       
38819 +       /* Are we prepared to handle this kernel fault?  */
38820 +       fixup = search_exception_tables(regs->rip);
38821 +       if (fixup) {
38822 +               regs->rip = fixup->fixup;
38823 +               return;
38824 +       }
38825 +
38826 +       /* 
38827 +        * Hall of shame of CPU/BIOS bugs.
38828 +        */
38829 +
38830 +       if (is_prefetch(regs, address, error_code))
38831 +               return;
38832 +
38833 +       if (is_errata93(regs, address))
38834 +               return; 
38835 +
38836 +/*
38837 + * Oops. The kernel tried to access some bad page. We'll have to
38838 + * terminate things with extreme prejudice.
38839 + */
38840 +
38841 +       flags = oops_begin();
38842 +
38843 +       if (address < PAGE_SIZE)
38844 +               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
38845 +       else
38846 +               printk(KERN_ALERT "Unable to handle kernel paging request");
38847 +       printk(" at %016lx RIP: \n" KERN_ALERT,address);
38848 +       printk_address(regs->rip);
38849 +       printk("\n");
38850 +       dump_pagetable(address);
38851 +       tsk->thread.cr2 = address;
38852 +       tsk->thread.trap_no = 14;
38853 +       tsk->thread.error_code = error_code;
38854 +       __die("Oops", regs, error_code);
38855 +       /* Executive summary in case the body of the oops scrolled away */
38856 +       printk(KERN_EMERG "CR2: %016lx\n", address);
38857 +       oops_end(flags);
38858 +       do_exit(SIGKILL);
38859 +
38860 +/*
38861 + * We ran out of memory, or some other thing happened to us that made
38862 + * us unable to handle the page fault gracefully.
38863 + */
38864 +out_of_memory:
38865 +       up_read(&mm->mmap_sem);
38866 +       if (current->pid == 1) { 
38867 +               yield();
38868 +               goto again;
38869 +       }
38870 +       printk("VM: killing process %s\n", tsk->comm);
38871 +       if (error_code & 4)
38872 +               do_exit(SIGKILL);
38873 +       goto no_context;
38874 +
38875 +do_sigbus:
38876 +       up_read(&mm->mmap_sem);
38877 +
38878 +       /* Kernel mode? Handle exceptions or die */
38879 +       if (!(error_code & PF_USER))
38880 +               goto no_context;
38881 +
38882 +       tsk->thread.cr2 = address;
38883 +       tsk->thread.error_code = error_code;
38884 +       tsk->thread.trap_no = 14;
38885 +       info.si_signo = SIGBUS;
38886 +       info.si_errno = 0;
38887 +       info.si_code = BUS_ADRERR;
38888 +       info.si_addr = (void __user *)address;
38889 +       force_sig_info(SIGBUS, &info, tsk);
38890 +       return;
38891 +}
38892 +
38893 +static int __init enable_pagefaulttrace(char *str)
38894 +{
38895 +       page_fault_trace = 1;
38896 +       return 0;
38897 +}
38898 +__setup("pagefaulttrace", enable_pagefaulttrace);
38899 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/mm/init-xen.c linux-2.6.16/arch/x86_64/mm/init-xen.c
38900 --- linux-2.6.16.orig/arch/x86_64/mm/init-xen.c 1970-01-01 01:00:00.000000000 +0100
38901 +++ linux-2.6.16/arch/x86_64/mm/init-xen.c      2006-06-26 09:51:32.000000000 +0200
38902 @@ -0,0 +1,1101 @@
38903 +/*
38904 + *  linux/arch/x86_64/mm/init.c
38905 + *
38906 + *  Copyright (C) 1995  Linus Torvalds
38907 + *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
38908 + *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
38909 + *
38910 + *  Jun Nakajima <jun.nakajima@intel.com>
38911 + *     Modified for Xen.
38912 + */
38913 +
38914 +#include <linux/config.h>
38915 +#include <linux/signal.h>
38916 +#include <linux/sched.h>
38917 +#include <linux/kernel.h>
38918 +#include <linux/errno.h>
38919 +#include <linux/string.h>
38920 +#include <linux/types.h>
38921 +#include <linux/ptrace.h>
38922 +#include <linux/mman.h>
38923 +#include <linux/mm.h>
38924 +#include <linux/swap.h>
38925 +#include <linux/smp.h>
38926 +#include <linux/init.h>
38927 +#include <linux/pagemap.h>
38928 +#include <linux/bootmem.h>
38929 +#include <linux/proc_fs.h>
38930 +#include <linux/pci.h>
38931 +#include <linux/dma-mapping.h>
38932 +#include <linux/module.h>
38933 +#include <linux/memory_hotplug.h>
38934 +
38935 +#include <asm/processor.h>
38936 +#include <asm/system.h>
38937 +#include <asm/uaccess.h>
38938 +#include <asm/pgtable.h>
38939 +#include <asm/pgalloc.h>
38940 +#include <asm/dma.h>
38941 +#include <asm/fixmap.h>
38942 +#include <asm/e820.h>
38943 +#include <asm/apic.h>
38944 +#include <asm/tlb.h>
38945 +#include <asm/mmu_context.h>
38946 +#include <asm/proto.h>
38947 +#include <asm/smp.h>
38948 +#include <asm/sections.h>
38949 +#include <asm/dma-mapping.h>
38950 +#include <asm/swiotlb.h>
38951 +
38952 +#include <xen/features.h>
38953 +
38954 +#ifndef Dprintk
38955 +#define Dprintk(x...)
38956 +#endif
38957 +
38958 +struct dma_mapping_ops* dma_ops;
38959 +EXPORT_SYMBOL(dma_ops);
38960 +
38961 +extern unsigned long *contiguous_bitmap;
38962 +
38963 +static unsigned long dma_reserve __initdata;
38964 +
38965 +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
38966 +extern unsigned long start_pfn;
38967 +
38968 +/*
38969 + * Use this until direct mapping is established, i.e. before __va() is 
38970 + * available in init_memory_mapping().
38971 + */
38972 +
38973 +#define addr_to_page(addr, page)                               \
38974 +       (addr) &= PHYSICAL_PAGE_MASK;                           \
38975 +       (page) = ((unsigned long *) ((unsigned long)            \
38976 +       (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
38977 +       __START_KERNEL_map)))
38978 +
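+/*
+ * Illustrative walk (the pattern early_make_page_readonly() below uses):
+ * the boot-time page tables hold machine addresses, so each step
+ * translates machine frame -> pseudo-physical frame -> virtual address
+ * within the kernel image:
+ *
+ *     addr = page[pgd_index(va)];
+ *     addr_to_page(addr, page);       (page now points at the pud page)
+ */
+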
38979 +static void early_make_page_readonly(void *va, unsigned int feature)
38980 +{
38981 +       unsigned long addr, _va = (unsigned long)va;
38982 +       pte_t pte, *ptep;
38983 +       unsigned long *page = (unsigned long *) init_level4_pgt;
38984 +
38985 +       if (xen_feature(feature))
38986 +               return;
38987 +
38988 +       addr = (unsigned long) page[pgd_index(_va)];
38989 +       addr_to_page(addr, page);
38990 +
38991 +       addr = page[pud_index(_va)];
38992 +       addr_to_page(addr, page);
38993 +
38994 +       addr = page[pmd_index(_va)];
38995 +       addr_to_page(addr, page);
38996 +
38997 +       ptep = (pte_t *) &page[pte_index(_va)];
38998 +
38999 +       pte.pte = ptep->pte & ~_PAGE_RW;
39000 +       if (HYPERVISOR_update_va_mapping(_va, pte, 0))
39001 +               BUG();
39002 +}
39003 +
39004 +void make_page_readonly(void *va, unsigned int feature)
39005 +{
39006 +       pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
39007 +       unsigned long addr = (unsigned long) va;
39008 +
39009 +       if (xen_feature(feature))
39010 +               return;
39011 +
39012 +       pgd = pgd_offset_k(addr);
39013 +       pud = pud_offset(pgd, addr);
39014 +       pmd = pmd_offset(pud, addr);
39015 +       ptep = pte_offset_kernel(pmd, addr);
39016 +
39017 +       pte.pte = ptep->pte & ~_PAGE_RW;
39018 +       if (HYPERVISOR_update_va_mapping(addr, pte, 0))
39019 +               xen_l1_entry_update(ptep, pte); /* fallback */
39020 +
39021 +       if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
39022 +               make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
39023 +}
39024 +
39025 +void make_page_writable(void *va, unsigned int feature)
39026 +{
39027 +       pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
39028 +       unsigned long addr = (unsigned long) va;
39029 +
39030 +       if (xen_feature(feature))
39031 +               return;
39032 +
39033 +       pgd = pgd_offset_k(addr);
39034 +       pud = pud_offset(pgd, addr);
39035 +       pmd = pmd_offset(pud, addr);
39036 +       ptep = pte_offset_kernel(pmd, addr);
39037 +
39038 +       pte.pte = ptep->pte | _PAGE_RW;
39039 +       if (HYPERVISOR_update_va_mapping(addr, pte, 0))
39040 +               xen_l1_entry_update(ptep, pte); /* fallback */
39041 +
39042 +       if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
39043 +               make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
39044 +}
39045 +
39046 +void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
39047 +{
39048 +       if (xen_feature(feature))
39049 +               return;
39050 +
39051 +       while (nr-- != 0) {
39052 +               make_page_readonly(va, feature);
39053 +               va = (void*)((unsigned long)va + PAGE_SIZE);
39054 +       }
39055 +}
39056 +
39057 +void make_pages_writable(void *va, unsigned nr, unsigned int feature)
39058 +{
39059 +       if (xen_feature(feature))
39060 +               return;
39061 +
39062 +       while (nr-- != 0) {
39063 +               make_page_writable(va, feature);
39064 +               va = (void*)((unsigned long)va + PAGE_SIZE);
39065 +       }
39066 +}
39067 +
39068 +/*
39069 + * NOTE: pagetable_init() allocates all the fixmap pagetables contiguously
39070 + * in physical space, so we can cache the location of the first one and
39071 + * move around without checking the pgd every time.
39072 + */
39073 +
39074 +void show_mem(void)
39075 +{
39076 +       long i, total = 0, reserved = 0;
39077 +       long shared = 0, cached = 0;
39078 +       pg_data_t *pgdat;
39079 +       struct page *page;
39080 +
39081 +       printk(KERN_INFO "Mem-info:\n");
39082 +       show_free_areas();
39083 +       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
39084 +
39085 +       for_each_pgdat(pgdat) {
39086 +               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
39087 +                       page = pfn_to_page(pgdat->node_start_pfn + i);
39088 +                       total++;
39089 +                       if (PageReserved(page))
39090 +                               reserved++;
39091 +                       else if (PageSwapCache(page))
39092 +                               cached++;
39093 +                       else if (page_count(page))
39094 +                               shared += page_count(page) - 1;
39095 +               }
39096 +       }
39097 +       printk(KERN_INFO "%lu pages of RAM\n", total);
39098 +       printk(KERN_INFO "%lu reserved pages\n",reserved);
39099 +       printk(KERN_INFO "%lu pages shared\n",shared);
39100 +       printk(KERN_INFO "%lu pages swap cached\n",cached);
39101 +}
39102 +
39103 +/* References to section boundaries */
39104 +
39105 +int after_bootmem;
39106 +
39107 +static void *spp_getpage(void)
39108 +{ 
39109 +       void *ptr;
39110 +       if (after_bootmem)
39111 +               ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
39112 +       else
39113 +               ptr = alloc_bootmem_pages(PAGE_SIZE);
39114 +       if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
39115 +               panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
39116 +
39117 +       Dprintk("spp_getpage %p\n", ptr);
39118 +       return ptr;
39119 +} 
39120 +
39121 +#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))
39122 +
39123 +static inline pud_t *pud_offset_u(unsigned long address)
39124 +{
39125 +       pud_t *pud = level3_user_pgt;
39126 +
39127 +       return pud + pud_index(address);
39128 +}
39129 +
39130 +static void set_pte_phys(unsigned long vaddr,
39131 +                        unsigned long phys, pgprot_t prot, int user_mode)
39132 +{
39133 +       pgd_t *pgd;
39134 +       pud_t *pud;
39135 +       pmd_t *pmd;
39136 +       pte_t *pte, new_pte;
39137 +
39138 +       Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
39139 +
39140 +       pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
39141 +       if (pgd_none(*pgd)) {
39142 +               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
39143 +               return;
39144 +       }
39145 +       pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
39146 +       if (pud_none(*pud)) {
39147 +               pmd = (pmd_t *) spp_getpage(); 
39148 +               make_page_readonly(pmd, XENFEAT_writable_page_tables);
39149 +               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
39150 +               if (pmd != pmd_offset(pud, 0)) {
39151 +                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
39152 +                       return;
39153 +               }
39154 +       }
39155 +       pmd = pmd_offset(pud, vaddr);
39156 +       if (pmd_none(*pmd)) {
39157 +               pte = (pte_t *) spp_getpage();
39158 +               make_page_readonly(pte, XENFEAT_writable_page_tables);
39159 +               set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
39160 +               if (pte != pte_offset_kernel(pmd, 0)) {
39161 +                       printk("PAGETABLE BUG #02!\n");
39162 +                       return;
39163 +               }
39164 +       }
39165 +       new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
39166 +
39167 +       pte = pte_offset_kernel(pmd, vaddr);
39168 +       if (!pte_none(*pte) &&
39169 +           pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
39170 +               pte_ERROR(*pte);
39171 +       set_pte(pte, new_pte);
39172 +
39173 +       /*
39174 +        * It's enough to flush this one mapping.
39175 +        * (PGE mappings get flushed as well)
39176 +        */
39177 +       __flush_tlb_one(vaddr);
39178 +}
39179 +
39180 +static void set_pte_phys_ma(unsigned long vaddr,
39181 +                        unsigned long phys, pgprot_t prot)
39182 +{
39183 +       pgd_t *pgd;
39184 +       pud_t *pud;
39185 +       pmd_t *pmd;
39186 +       pte_t *pte, new_pte;
39187 +
39188 +       Dprintk("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
39189 +
39190 +       pgd = pgd_offset_k(vaddr);
39191 +       if (pgd_none(*pgd)) {
39192 +               printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
39193 +               return;
39194 +       }
39195 +       pud = pud_offset(pgd, vaddr);
39196 +       if (pud_none(*pud)) {
39197 +
39198 +               pmd = (pmd_t *) spp_getpage(); 
39199 +               make_page_readonly(pmd, XENFEAT_writable_page_tables);
39200 +
39201 +               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
39202 +
39203 +               if (pmd != pmd_offset(pud, 0)) {
39204 +                       printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
39205 +                       return;
39206 +               }
39207 +       }
39208 +       pmd = pmd_offset(pud, vaddr);
39209 +
39210 +       if (pmd_none(*pmd)) {
39211 +               pte = (pte_t *) spp_getpage();
39212 +               make_page_readonly(pte, XENFEAT_writable_page_tables);
39213 +
39214 +               set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
39215 +               if (pte != pte_offset_kernel(pmd, 0)) {
39216 +                       printk("PAGETABLE BUG #02!\n");
39217 +                       return;
39218 +               }
39219 +       }
39220 +
39221 +       new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
39222 +       pte = pte_offset_kernel(pmd, vaddr);
39223 +
39224 +       /* 
39225 +        * Note that the pte page is already RO, thus we want to use
39226 +        * xen_l1_entry_update(), not set_pte().
39227 +        */
39228 +       xen_l1_entry_update(pte, new_pte);
39230 +
39231 +       /*
39232 +        * It's enough to flush this one mapping.
39233 +        * (PGE mappings get flushed as well)
39234 +        */
39235 +       __flush_tlb_one(vaddr);
39236 +}
39237 +
39238 +#define SET_FIXMAP_KERNEL 0
39239 +#define SET_FIXMAP_USER   1
39240 +
39241 +/* NOTE: this is meant to be run only at boot */
39242 +void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
39243 +{
39244 +       unsigned long address = __fix_to_virt(idx);
39245 +
39246 +       if (idx >= __end_of_fixed_addresses) {
39247 +               printk("Invalid __set_fixmap\n");
39248 +               return;
39249 +       }
39250 +       switch (idx) {
39251 +       case VSYSCALL_FIRST_PAGE:
39252 +               set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
39253 +               break;
39254 +       default:
39255 +               set_pte_phys_ma(address, phys, prot);
39256 +               break;
39257 +       }
39258 +}
39259 +
39260 +/*
39261 + * At this point it only supports the vsyscall area.
39262 + */
39263 +void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
39264 +{
39265 +       unsigned long address = __fix_to_virt(idx);
39266 +
39267 +       if (idx >= __end_of_fixed_addresses) {
39268 +               printk("Invalid __set_fixmap\n");
39269 +               return;
39270 +       }
39271 +
39272 +       set_pte_phys(address, phys, prot, SET_FIXMAP_USER); 
39273 +}
39274 +
39275 +unsigned long __initdata table_start, tables_space; 
39276 +
39277 +unsigned long get_machine_pfn(unsigned long addr)
39278 +{
39279 +       pud_t* pud = pud_offset_k(NULL, addr);
39280 +       pmd_t* pmd = pmd_offset(pud, addr);
39281 +       pte_t *pte = pte_offset_kernel(pmd, addr);
39282 +
39283 +       return pte_mfn(*pte);
39284 +} 
39285 +
39286 +static __meminit void *alloc_static_page(unsigned long *phys)
39287 +{
39288 +       unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
39289 +
39290 +       if (after_bootmem) {
39291 +               void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
39292 +
39293 +               *phys = __pa(adr);
39294 +               return adr;
39295 +       }
39296 +
39297 +       *phys = start_pfn << PAGE_SHIFT;
39298 +       start_pfn++;
39299 +       memset((void *)va, 0, PAGE_SIZE);
39300 +       return (void *)va;
39301 +} 
39302 +
39303 +#define PTE_SIZE PAGE_SIZE
39304 +
39305 +static inline void __set_pte(pte_t *dst, pte_t val)
39306 +{
39307 +       *dst = val;
39308 +}
39309 +
39310 +static inline int make_readonly(unsigned long paddr)
39311 +{
39312 +       int readonly = 0;
39313 +
39314 +       /* Make old and new page tables read-only. */
39315 +       if (!xen_feature(XENFEAT_writable_page_tables)
39316 +           && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
39317 +           && (paddr < ((table_start << PAGE_SHIFT) + tables_space)))
39318 +               readonly = 1;
39319 +       /*
39320 +        * No need for writable mapping of kernel image. This also ensures that
39321 +        * page and descriptor tables embedded inside don't have writable
39322 +        * mappings. 
39323 +        */
39324 +       if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
39325 +               readonly = 1;
39326 +
39327 +       return readonly;
39328 +}
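+
+/*
+ * Example (illustrative): with the initial page tables placed just above
+ * the kernel image, make_readonly() flags both the _text.._end range and
+ * the pt_base..(table_start << PAGE_SHIFT) + tables_space range, so the
+ * direct mapping built by phys_pmd_init() below never maps the kernel
+ * image or a live page table writably.
+ */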
39329 +
39330 +static void __meminit
39331 +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
39332 +{
39333 +       int i, k;
39334 +
39335 +       for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
39336 +               unsigned long pte_phys;
39337 +               pte_t *pte, *pte_save;
39338 +
39339 +               if (address >= end) {
39340 +                       for (; i < PTRS_PER_PMD; i++, pmd++)
39341 +                               set_pmd(pmd, __pmd(0));
39342 +                       break;
39343 +               }
39344 +               pte = alloc_static_page(&pte_phys);
39345 +               pte_save = pte;
39346 +               for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
39347 +                       if ((address >= end) ||
39348 +                           ((address >> PAGE_SHIFT) >=
39349 +                            xen_start_info->nr_pages)) { 
39350 +                               __set_pte(pte, __pte(0)); 
39351 +                               continue;
39352 +                       }
39353 +                       if (make_readonly(address)) {
39354 +                               __set_pte(pte, 
39355 +                                         __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
39356 +                               continue;
39357 +                       }
39358 +                       __set_pte(pte, __pte(address | _KERNPG_TABLE));
39359 +               }
39360 +               pte = pte_save;
39361 +               early_make_page_readonly(pte, XENFEAT_writable_page_tables);
39362 +               set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
39363 +       }
39364 +}
39365 +
39366 +static void __meminit
39367 +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
39368 +{
39369 +       pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
39370 +
39371 +       if (pmd_none(*pmd)) {
39372 +               spin_lock(&init_mm.page_table_lock);
39373 +               phys_pmd_init(pmd, address, end);
39374 +               spin_unlock(&init_mm.page_table_lock);
39375 +               __flush_tlb_all();
39376 +       }
39377 +}
39378 +
39379 +static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
39380 +{ 
39381 +       long i = pud_index(address);
39382 +
39383 +       pud = pud + i;
39384 +
39385 +       if (after_bootmem && pud_val(*pud)) {
39386 +               phys_pmd_update(pud, address, end);
39387 +               return;
39388 +       }
39389 +
39390 +       for (; i < PTRS_PER_PUD; pud++, i++) {
39391 +               unsigned long paddr, pmd_phys;
39392 +               pmd_t *pmd;
39393 +
39394 +               paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
39395 +               if (paddr >= end)
39396 +                       break;
39397 +
39398 +               pmd = alloc_static_page(&pmd_phys);
39399 +               early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
39400 +               spin_lock(&init_mm.page_table_lock);
39401 +               set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
39402 +               phys_pmd_init(pmd, paddr, end);
39403 +               spin_unlock(&init_mm.page_table_lock);
39404 +       }
39405 +       __flush_tlb();
39406 +} 
39407 +
39408 +void __init xen_init_pt(void)
39409 +{
39410 +       unsigned long addr, *page;
39411 +
39412 +       memset((void *)init_level4_pgt,   0, PAGE_SIZE);
39413 +       memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
39414 +       memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);
39415 +
39416 +       /* Find the initial pte page that was built for us. */
39417 +       page = (unsigned long *)xen_start_info->pt_base;
39418 +       addr = page[pgd_index(__START_KERNEL_map)];
39419 +       addr_to_page(addr, page);
39420 +       addr = page[pud_index(__START_KERNEL_map)];
39421 +       addr_to_page(addr, page);
39422 +
39423 +       /* Construct mapping of initial pte page in our own directories. */
39424 +       init_level4_pgt[pgd_index(__START_KERNEL_map)] = 
39425 +               mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
39426 +       level3_kernel_pgt[pud_index(__START_KERNEL_map)] = 
39427 +               __pud(__pa_symbol(level2_kernel_pgt) |
39428 +                     _KERNPG_TABLE | _PAGE_USER);
39429 +       memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);
39430 +
39431 +       early_make_page_readonly(init_level4_pgt,
39432 +                                XENFEAT_writable_page_tables);
39433 +       early_make_page_readonly(init_level4_user_pgt,
39434 +                                XENFEAT_writable_page_tables);
39435 +       early_make_page_readonly(level3_kernel_pgt,
39436 +                                XENFEAT_writable_page_tables);
39437 +       early_make_page_readonly(level3_user_pgt,
39438 +                                XENFEAT_writable_page_tables);
39439 +       early_make_page_readonly(level2_kernel_pgt,
39440 +                                XENFEAT_writable_page_tables);
39441 +
39442 +       xen_pgd_pin(__pa_symbol(init_level4_pgt));
39443 +       xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
39444 +
39445 +       set_pgd((pgd_t *)(init_level4_user_pgt + 511), 
39446 +               mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
39447 +}
39448 +
39449 +void __init extend_init_mapping(void) 
39450 +{
39451 +       unsigned long va = __START_KERNEL_map;
39452 +       unsigned long phys, addr, *pte_page;
39453 +       pmd_t *pmd;
39454 +       pte_t *pte, new_pte;
39455 +       unsigned long *page = (unsigned long *)init_level4_pgt;
39456 +
39457 +       addr = page[pgd_index(va)];
39458 +       addr_to_page(addr, page);
39459 +       addr = page[pud_index(va)];
39460 +       addr_to_page(addr, page);
39461 +
39462 +       /* Kill mapping of low 1MB. */
39463 +       while (va < (unsigned long)&_text) {
39464 +               HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
39465 +               va += PAGE_SIZE;
39466 +       }
39467 +
39468 +       /* Ensure init mappings cover kernel text/data and initial tables. */
39469 +       while (va < (__START_KERNEL_map
39470 +                    + (start_pfn << PAGE_SHIFT)
39471 +                    + tables_space)) {
39472 +               pmd = (pmd_t *)&page[pmd_index(va)];
39473 +               if (pmd_none(*pmd)) {
39474 +                       pte_page = alloc_static_page(&phys);
39475 +                       early_make_page_readonly(
39476 +                               pte_page, XENFEAT_writable_page_tables);
39477 +                       set_pmd(pmd, __pmd(phys | _KERNPG_TABLE | _PAGE_USER));
39478 +               } else {
39479 +                       addr = page[pmd_index(va)];
39480 +                       addr_to_page(addr, pte_page);
39481 +               }
39482 +               pte = (pte_t *)&pte_page[pte_index(va)];
39483 +               if (pte_none(*pte)) {
39484 +                       new_pte = pfn_pte(
39485 +                               (va - __START_KERNEL_map) >> PAGE_SHIFT, 
39486 +                               __pgprot(_KERNPG_TABLE | _PAGE_USER));
39487 +                       xen_l1_entry_update(pte, new_pte);
39488 +               }
39489 +               va += PAGE_SIZE;
39490 +       }
39491 +
39492 +       /* Finally, blow away any spurious initial mappings. */
39493 +       while (1) {
39494 +               pmd = (pmd_t *)&page[pmd_index(va)];
39495 +               if (pmd_none(*pmd))
39496 +                       break;
39497 +               HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
39498 +               va += PAGE_SIZE;
39499 +       }
39500 +}
39501 +
39502 +static void __init find_early_table_space(unsigned long end)
39503 +{
39504 +       unsigned long puds, pmds, ptes; 
39505 +
39506 +       puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
39507 +       pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
39508 +       ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
39509 +
39510 +       tables_space =
39511 +               round_up(puds * 8, PAGE_SIZE) + 
39512 +               round_up(pmds * 8, PAGE_SIZE) + 
39513 +               round_up(ptes * 8, PAGE_SIZE); 
39514 +
39515 +       extend_init_mapping();
39516 +
39517 +       table_start = start_pfn;
39518 +
39519 +       early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
39520 +               end, table_start << PAGE_SHIFT, start_pfn << PAGE_SHIFT);
39521 +}
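+
+/*
+ * Sizing sketch (illustrative): for end = 4GB this gives puds = 4,
+ * pmds = 2048 and ptes = 2^20, i.e. tables_space = 4KB + 16KB + 8MB --
+ * eight bytes per entry at each level, each level rounded up to whole
+ * pages.
+ */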
39522 +
39523 +/* Set up the direct mapping of the physical memory at PAGE_OFFSET.
39524 +   This runs before bootmem is initialized and gets pages directly from
39525 +   physical memory. To access them they are temporarily mapped. */
39526 +void __meminit init_memory_mapping(unsigned long start, unsigned long end)
39527 +{ 
39528 +       unsigned long next; 
39529 +
39530 +       Dprintk("init_memory_mapping\n");
39531 +
39532 +       /* 
39533 +        * Find space for the kernel direct mapping tables.
39534 +        * Later we should allocate these tables in the local node of the
39535 +        * memory mapped. Unfortunately this is currently done before the
39536 +        * nodes are discovered.
39537 +        */
39538 +       if (!after_bootmem)
39539 +               find_early_table_space(end);
39540 +
39541 +       start = (unsigned long)__va(start);
39542 +       end = (unsigned long)__va(end);
39543 +
39544 +       for (; start < end; start = next) {
39545 +               unsigned long pud_phys; 
39546 +               pgd_t *pgd = pgd_offset_k(start);
39547 +               pud_t *pud;
39548 +
39549 +               if (after_bootmem) {
39550 +                       pud = pud_offset_k(pgd, __PAGE_OFFSET);
39551 +                       make_page_readonly(pud, XENFEAT_writable_page_tables);
39552 +                       pud_phys = __pa(pud);
39553 +               } else {
39554 +                       pud = alloc_static_page(&pud_phys);
39555 +                       early_make_page_readonly(pud, XENFEAT_writable_page_tables);
39556 +               }
39557 +               next = start + PGDIR_SIZE;
39558 +               if (next > end) 
39559 +                       next = end; 
39560 +               phys_pud_init(pud, __pa(start), __pa(next));
39561 +               if (!after_bootmem)
39562 +                       set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
39563 +       }
39564 +
39565 +       BUG_ON(!after_bootmem && start_pfn != table_start + (tables_space >> PAGE_SHIFT));
39566 +
39567 +       __flush_tlb_all();
39568 +}
39569 +
39570 +void __cpuinit zap_low_mappings(int cpu)
39571 +{
39572 +       /* this is not required for Xen */
39573 +#if 0
39574 +       swap_low_mappings();
39575 +#endif
39576 +}
39577 +
39578 +/* Compute zone sizes for the DMA and DMA32 zones in a node. */
39579 +__init void
39580 +size_zones(unsigned long *z, unsigned long *h,
39581 +          unsigned long start_pfn, unsigned long end_pfn)
39582 +{
39583 +       int i;
39584 +#ifndef CONFIG_XEN
39585 +       unsigned long w;
39586 +#endif
39587 +
39588 +       for (i = 0; i < MAX_NR_ZONES; i++)
39589 +               z[i] = 0;
39590 +
39591 +#ifndef CONFIG_XEN
39592 +       if (start_pfn < MAX_DMA_PFN)
39593 +               z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
39594 +       if (start_pfn < MAX_DMA32_PFN) {
39595 +               unsigned long dma32_pfn = MAX_DMA32_PFN;
39596 +               if (dma32_pfn > end_pfn)
39597 +                       dma32_pfn = end_pfn;
39598 +               z[ZONE_DMA32] = dma32_pfn - start_pfn;
39599 +       }
39600 +       z[ZONE_NORMAL] = end_pfn - start_pfn;
39601 +
39602 +       /* Remove lower zones from higher ones. */
39603 +       w = 0;
39604 +       for (i = 0; i < MAX_NR_ZONES; i++) {
39605 +               if (z[i])
39606 +                       z[i] -= w;
39607 +               w += z[i];
39608 +       }
39609 +
39610 +       /* Compute holes */
39611 +       w = start_pfn;
39612 +       for (i = 0; i < MAX_NR_ZONES; i++) {
39613 +               unsigned long s = w;
39614 +               w += z[i];
39615 +               h[i] = e820_hole_size(s, w);
39616 +       }
39617 +
39618 +       /* Add the space needed for mem_map to the holes too. */
39619 +       for (i = 0; i < MAX_NR_ZONES; i++)
39620 +               h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
39621 +
39622 +       /* The 16MB DMA zone has the kernel and other misc mappings.
39623 +          Account them too */
39624 +       if (h[ZONE_DMA]) {
39625 +               h[ZONE_DMA] += dma_reserve;
39626 +               if (h[ZONE_DMA] >= z[ZONE_DMA]) {
39627 +                       printk(KERN_WARNING
39628 +                               "Kernel too large and filling up ZONE_DMA?\n");
39629 +                       h[ZONE_DMA] = z[ZONE_DMA];
39630 +               }
39631 +       }
39632 +#else
39633 +       z[ZONE_DMA] = end_pfn;
39634 +       for (i = 0; i < MAX_NR_ZONES; i++)
39635 +               h[i] = 0;
39636 +#endif
39637 +}
39638 +
39639 +#ifndef CONFIG_NUMA
39640 +void __init paging_init(void)
39641 +{
39642 +       unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
39643 +       int i;
39644 +
39645 +       memory_present(0, 0, end_pfn);
39646 +       sparse_init();
39647 +       size_zones(zones, holes, 0, end_pfn);
39648 +       free_area_init_node(0, NODE_DATA(0), zones,
39649 +                           __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
39650 +
39651 +       if (!xen_feature(XENFEAT_auto_translated_physmap) ||
39652 +           xen_start_info->shared_info >= xen_start_info->nr_pages) {
39653 +               /* Switch to the real shared_info page, and clear the
39654 +                * dummy page. */
39655 +               set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
39656 +               HYPERVISOR_shared_info =
39657 +                       (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
39658 +               memset(empty_zero_page, 0, sizeof(empty_zero_page));
39659 +       }
39660 +
39661 +       init_mm.context.pinned = 1;
39662 +
39663 +       /* Setup mapping of lower 1st MB */
39664 +       for (i = 0; i < NR_FIX_ISAMAPS; i++)
39665 +               if (xen_start_info->flags & SIF_PRIVILEGED)
39666 +                       set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
39667 +               else
39668 +                       __set_fixmap(FIX_ISAMAP_BEGIN - i,
39669 +                                    virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
39670 +                                    PAGE_KERNEL_RO);
39671 +}
39672 +#endif
39673 +
39674 +/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
39675 +   from the CPU leading to inconsistent cache lines. address and size
39676 +   must be aligned to 2MB boundaries. 
39677 +   Does nothing when the mapping doesn't exist. */
39678 +void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
39679 +{
39680 +       unsigned long end = address + size;
39681 +
39682 +       BUG_ON(address & ~LARGE_PAGE_MASK);
39683 +       BUG_ON(size & ~LARGE_PAGE_MASK); 
39684 +       
39685 +       for (; address < end; address += LARGE_PAGE_SIZE) { 
39686 +               pgd_t *pgd = pgd_offset_k(address);
39687 +               pud_t *pud;
39688 +               pmd_t *pmd;
39689 +               if (pgd_none(*pgd))
39690 +                       continue;
39691 +               pud = pud_offset(pgd, address);
39692 +               if (pud_none(*pud))
39693 +                       continue; 
39694 +               pmd = pmd_offset(pud, address);
39695 +               if (!pmd || pmd_none(*pmd))
39696 +                       continue; 
39697 +               if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
39698 +                       /* Could handle this, but it should not happen currently. */
39699 +                       printk(KERN_ERR 
39700 +              "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
39701 +                       pmd_ERROR(*pmd); 
39702 +               }
39703 +               set_pmd(pmd, __pmd(0));                 
39704 +       }
39705 +       __flush_tlb_all();
39706 +} 
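+
+/*
+ * Typical caller (an assumption based on mainline usage, not shown in
+ * this patch): the GART IOMMU unmaps its aperture from the direct
+ * mapping this way before handing it to devices, e.g.
+ *
+ *     clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
+ */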
39707 +
39708 +/*
39709 + * Memory hotplug specific functions
39710 + * These are only for non-NUMA machines right now.
39711 + */
39712 +#ifdef CONFIG_MEMORY_HOTPLUG
39713 +
39714 +void online_page(struct page *page)
39715 +{
39716 +       ClearPageReserved(page);
39717 +       set_page_count(page, 1);
39718 +       __free_page(page);
39719 +       totalram_pages++;
39720 +       num_physpages++;
39721 +}
39722 +
39723 +int add_memory(u64 start, u64 size)
39724 +{
39725 +       struct pglist_data *pgdat = NODE_DATA(0);
39726 +       struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
39727 +       unsigned long start_pfn = start >> PAGE_SHIFT;
39728 +       unsigned long nr_pages = size >> PAGE_SHIFT;
39729 +       int ret;
39730 +
39731 +       ret = __add_pages(zone, start_pfn, nr_pages);
39732 +       if (ret)
39733 +               goto error;
39734 +
39735 +       init_memory_mapping(start, (start + size -1));
39736 +
39737 +       return ret;
39738 +error:
39739 +       printk("%s: Problem encountered in __add_pages!\n", __func__);
39740 +       return ret;
39741 +}
39742 +EXPORT_SYMBOL_GPL(add_memory);
39743 +
39744 +int remove_memory(u64 start, u64 size)
39745 +{
39746 +       return -EINVAL;
39747 +}
39748 +EXPORT_SYMBOL_GPL(remove_memory);
39749 +
39750 +#endif
39751 +
39752 +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
39753 +                        kcore_vsyscall;
39754 +
39755 +void __init mem_init(void)
39756 +{
39757 +       long codesize, reservedpages, datasize, initsize;
39758 +
39759 +       contiguous_bitmap = alloc_bootmem_low_pages(
39760 +               (end_pfn + 2*BITS_PER_LONG) >> 3);
39761 +       BUG_ON(!contiguous_bitmap);
39762 +       memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);
39763 +
39764 +#if defined(CONFIG_SWIOTLB)
39765 +       pci_swiotlb_init();     
39766 +#endif
39767 +       no_iommu_init();
39768 +
39769 +       /* How many end-of-memory variables you have, grandma! */
39770 +       max_low_pfn = end_pfn;
39771 +       max_pfn = end_pfn;
39772 +       num_physpages = end_pfn;
39773 +       high_memory = (void *) __va(end_pfn * PAGE_SIZE);
39774 +
39775 +       /* clear the zero-page */
39776 +       memset(empty_zero_page, 0, PAGE_SIZE);
39777 +
39778 +       reservedpages = 0;
39779 +
39780 +       /* this will put all low memory onto the freelists */
39781 +#ifdef CONFIG_NUMA
39782 +       totalram_pages = numa_free_all_bootmem();
39783 +#else
39784 +       totalram_pages = free_all_bootmem();
39785 +#endif
39786 +       reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
39787 +
39788 +       after_bootmem = 1;
39789 +
39790 +       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
39791 +       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
39792 +       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
39793 +
39794 +       /* Register memory areas for /proc/kcore */
39795 +       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
39796 +       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
39797 +                  VMALLOC_END-VMALLOC_START);
39798 +       kclist_add(&kcore_kernel, &_stext, _end - _stext);
39799 +       kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
39800 +       kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
39801 +                                VSYSCALL_END - VSYSCALL_START);
39802 +
39803 +       printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
39804 +               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
39805 +               end_pfn << (PAGE_SHIFT-10),
39806 +               codesize >> 10,
39807 +               reservedpages << (PAGE_SHIFT-10),
39808 +               datasize >> 10,
39809 +               initsize >> 10);
39810 +
39811 +#ifndef CONFIG_XEN
39812 +#ifdef CONFIG_SMP
39813 +       /*
39814 +        * Sync boot_level4_pgt mappings with the init_level4_pgt
39815 +        * except for the low identity mappings which are already zapped
39816 +        * in init_level4_pgt. This sync-up is essential for AP's bringup
39817 +        */
39818 +       memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
39819 +#endif
39820 +#endif
39821 +}
39822 +
39823 +void free_initmem(void)
39824 +{
39825 +#ifdef __DO_LATER__
39826 +       /*
39827 +        * Some pages can be pinned, but some are not. Unpinning such pages 
39828 +        * triggers BUG(). 
39829 +        */
39830 +       unsigned long addr;
39831 +
39832 +       addr = (unsigned long)(&__init_begin);
39833 +       for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
39834 +               ClearPageReserved(virt_to_page(addr));
39835 +               set_page_count(virt_to_page(addr), 1);
39836 +               memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
39837 +               make_page_writable(
39838 +                       __va(__pa(addr)), XENFEAT_writable_page_tables);
39839 +               /*
39840 +                * Make pages from __PAGE_OFFSET address as well
39841 +                */
39842 +               make_page_writable(
39843 +                       (void *)addr, XENFEAT_writable_page_tables);
39844 +               free_page(addr);
39845 +               totalram_pages++;
39846 +       }
39847 +       memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
39848 +       printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
39849 +#endif
39850 +}
39851 +
39852 +#ifdef CONFIG_DEBUG_RODATA
39853 +
39854 +extern char __start_rodata, __end_rodata;
39855 +void mark_rodata_ro(void)
39856 +{
39857 +       unsigned long addr = (unsigned long)&__start_rodata;
39858 +
39859 +       for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
39860 +               change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
39861 +
39862 +       printk ("Write protecting the kernel read-only data: %luk\n",
39863 +                       (&__end_rodata - &__start_rodata) >> 10);
39864 +
39865 +       /*
39866 +        * change_page_attr_addr() requires a global_flush_tlb() call after it.
39867 +        * We do this after the printk so that if something went wrong in the
39868 +        * change, the printk gets out at least to give a better debug hint
39869 +        * of who is the culprit.
39870 +        */
39871 +       global_flush_tlb();
39872 +}
39873 +#endif
39874 +
39875 +#ifdef CONFIG_BLK_DEV_INITRD
39876 +void free_initrd_mem(unsigned long start, unsigned long end)
39877 +{
39878 +       if (start >= end)
39879 +               return;
39880 +       printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
39881 +       for (; start < end; start += PAGE_SIZE) {
39882 +               ClearPageReserved(virt_to_page(start));
39883 +               set_page_count(virt_to_page(start), 1);
39884 +               free_page(start);
39885 +               totalram_pages++;
39886 +       }
39887 +}
39888 +#endif
39889 +
39890 +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
39891 +{ 
39892 +       /* Should check here against the e820 map to avoid double free */ 
39893 +#ifdef CONFIG_NUMA
39894 +       int nid = phys_to_nid(phys);
39895 +       reserve_bootmem_node(NODE_DATA(nid), phys, len);
39896 +#else                  
39897 +       reserve_bootmem(phys, len);    
39898 +#endif
39899 +       if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
39900 +               dma_reserve += len / PAGE_SIZE;
39901 +}
39902 +
39903 +int kern_addr_valid(unsigned long addr) 
39904 +{ 
39905 +       unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
39906 +       pgd_t *pgd;
39907 +       pud_t *pud;
39908 +       pmd_t *pmd;
39909 +       pte_t *pte;
39910 +
39911 +       if (above != 0 && above != -1UL)
39912 +               return 0; 
39913 +       
39914 +       pgd = pgd_offset_k(addr);
39915 +       if (pgd_none(*pgd))
39916 +               return 0;
39917 +
39918 +       pud = pud_offset_k(pgd, addr);
39919 +       if (pud_none(*pud))
39920 +               return 0; 
39921 +
39922 +       pmd = pmd_offset(pud, addr);
39923 +       if (pmd_none(*pmd))
39924 +               return 0;
39925 +       if (pmd_large(*pmd))
39926 +               return pfn_valid(pmd_pfn(*pmd));
39927 +
39928 +       pte = pte_offset_kernel(pmd, addr);
39929 +       if (pte_none(*pte))
39930 +               return 0;
39931 +       return pfn_valid(pte_pfn(*pte));
39932 +}
39933 +
39934 +#ifdef CONFIG_SYSCTL
39935 +#include <linux/sysctl.h>
39936 +
39937 +extern int exception_trace, page_fault_trace;
39938 +
39939 +static ctl_table debug_table2[] = {
39940 +       { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
39941 +         proc_dointvec },
39942 +       { 0, }
39943 +}; 
39944 +
39945 +static ctl_table debug_root_table2[] = { 
39946 +       { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, 
39947 +          .child = debug_table2 }, 
39948 +       { 0 }, 
39949 +}; 
39950 +
39951 +static __init int x8664_sysctl_init(void)
39952 +{ 
39953 +       register_sysctl_table(debug_root_table2, 1);
39954 +       return 0;
39955 +}
39956 +__initcall(x8664_sysctl_init);
39957 +#endif
39958 +
39959 +/* A pseudo VMA to allow ptrace access to the vsyscall page.  This only
39960 +   covers the 64-bit vsyscall page now. 32-bit has a real VMA and does
39961 +   not need special handling anymore. */
39962 +
39963 +static struct vm_area_struct gate_vma = {
39964 +       .vm_start = VSYSCALL_START,
39965 +       .vm_end = VSYSCALL_END,
39966 +       .vm_page_prot = PAGE_READONLY
39967 +};
39968 +
39969 +struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
39970 +{
39971 +#ifdef CONFIG_IA32_EMULATION
39972 +       if (test_tsk_thread_flag(tsk, TIF_IA32))
39973 +               return NULL;
39974 +#endif
39975 +       return &gate_vma;
39976 +}
39977 +
39978 +int in_gate_area(struct task_struct *task, unsigned long addr)
39979 +{
39980 +       struct vm_area_struct *vma = get_gate_vma(task);
39981 +       if (!vma)
39982 +               return 0;
39983 +       return (addr >= vma->vm_start) && (addr < vma->vm_end);
39984 +}
39985 +
39986 +/* Use this when you have no reliable task/vma, typically from interrupt
39987 + * context.  It is less reliable than using the task's vma and may give
39988 + * false positives.
39989 + */
39990 +int in_gate_area_no_task(unsigned long addr)
39991 +{
39992 +       return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
39993 +}
39994 +
39995 +/*
39996 + * Local variables:
39997 + *  c-file-style: "linux"
39998 + *  indent-tabs-mode: t
39999 + *  c-indent-level: 8
40000 + *  c-basic-offset: 8
40001 + *  tab-width: 8
40002 + * End:
40003 + */
40004 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/mm/pageattr-xen.c linux-2.6.16/arch/x86_64/mm/pageattr-xen.c
40005 --- linux-2.6.16.orig/arch/x86_64/mm/pageattr-xen.c     1970-01-01 01:00:00.000000000 +0100
40006 +++ linux-2.6.16/arch/x86_64/mm/pageattr-xen.c  2006-06-26 09:51:32.000000000 +0200
40007 @@ -0,0 +1,401 @@
40008 +/* 
40009 + * Copyright 2002 Andi Kleen, SuSE Labs. 
40010 + * Thanks to Ben LaHaise for precious feedback.
40011 + */ 
40012 +
40013 +#include <linux/config.h>
40014 +#include <linux/mm.h>
40015 +#include <linux/sched.h>
40016 +#include <linux/highmem.h>
40017 +#include <linux/module.h>
40018 +#include <linux/slab.h>
40019 +#include <asm/uaccess.h>
40020 +#include <asm/processor.h>
40021 +#include <asm/tlbflush.h>
40022 +#include <asm/io.h>
40023 +
40024 +#ifdef CONFIG_XEN
40025 +#include <asm/pgalloc.h>
40026 +#include <asm/mmu_context.h>
40027 +
40028 +LIST_HEAD(mm_unpinned);
40029 +DEFINE_SPINLOCK(mm_unpinned_lock);
40030 +
40031 +static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
40032 +{
40033 +       struct page *page = virt_to_page(pt);
40034 +       unsigned long pfn = page_to_pfn(page);
40035 +
40036 +       BUG_ON(HYPERVISOR_update_va_mapping(
40037 +                      (unsigned long)__va(pfn << PAGE_SHIFT),
40038 +                      pfn_pte(pfn, flags), 0));
40039 +}
40040 +
40041 +static void mm_walk(struct mm_struct *mm, pgprot_t flags)
40042 +{
40043 +       pgd_t       *pgd;
40044 +       pud_t       *pud;
40045 +       pmd_t       *pmd;
40046 +       pte_t       *pte;
40047 +       int          g,u,m;
40048 +
40049 +       pgd = mm->pgd;
40050 +       /*
40051 +        * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not
40052 +        * be the 'current' task's pagetables (e.g., current may be 32-bit,
40053 +        * but the pagetables may be for a 64-bit task).
40054 +        * Subtracting 1 from TASK_SIZE64 means the loop limit is correct
40055 +        * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE.
40056 +        */
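+       /*
+        * Worked example, assuming the usual x86-64 layout (PGDIR_SIZE =
+        * 512GB, TASK_SIZE64 just under 2^47): (TASK_SIZE64-1)/PGDIR_SIZE
+        * = 255, so g covers exactly the 256 user pgd slots and never
+        * touches the kernel half of the pgd.
+        */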
40057 +       for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
40058 +               if (pgd_none(*pgd))
40059 +                       continue;
40060 +               pud = pud_offset(pgd, 0);
40061 +               if (PTRS_PER_PUD > 1) /* not folded */ 
40062 +                       mm_walk_set_prot(pud,flags);
40063 +               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
40064 +                       if (pud_none(*pud))
40065 +                               continue;
40066 +                       pmd = pmd_offset(pud, 0);
40067 +                       if (PTRS_PER_PMD > 1) /* not folded */ 
40068 +                               mm_walk_set_prot(pmd,flags);
40069 +                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
40070 +                               if (pmd_none(*pmd))
40071 +                                       continue;
40072 +                               pte = pte_offset_kernel(pmd,0);
40073 +                               mm_walk_set_prot(pte,flags);
40074 +                       }
40075 +               }
40076 +       }
40077 +}
40078 +
40079 +void mm_pin(struct mm_struct *mm)
40080 +{
40081 +       if (xen_feature(XENFEAT_writable_page_tables))
40082 +               return;
40083 +
40084 +       spin_lock(&mm->page_table_lock);
40085 +
40086 +       mm_walk(mm, PAGE_KERNEL_RO);
40087 +       BUG_ON(HYPERVISOR_update_va_mapping(
40088 +                      (unsigned long)mm->pgd,
40089 +                      pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
40090 +                      UVMF_TLB_FLUSH));
40091 +       BUG_ON(HYPERVISOR_update_va_mapping(
40092 +                      (unsigned long)__user_pgd(mm->pgd),
40093 +                      pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL_RO),
40094 +                      UVMF_TLB_FLUSH));
40095 +       xen_pgd_pin(__pa(mm->pgd)); /* kernel */
40096 +       xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
40097 +       mm->context.pinned = 1;
40098 +       spin_lock(&mm_unpinned_lock);
40099 +       list_del(&mm->context.unpinned);
40100 +       spin_unlock(&mm_unpinned_lock);
40101 +
40102 +       spin_unlock(&mm->page_table_lock);
40103 +}
40104 +
40105 +void mm_unpin(struct mm_struct *mm)
40106 +{
40107 +       if (xen_feature(XENFEAT_writable_page_tables))
40108 +               return;
40109 +
40110 +       spin_lock(&mm->page_table_lock);
40111 +
40112 +       xen_pgd_unpin(__pa(mm->pgd));
40113 +       xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
40114 +       BUG_ON(HYPERVISOR_update_va_mapping(
40115 +                      (unsigned long)mm->pgd,
40116 +                      pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
40117 +       BUG_ON(HYPERVISOR_update_va_mapping(
40118 +                      (unsigned long)__user_pgd(mm->pgd),
40119 +                      pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL), 0));
40120 +       mm_walk(mm, PAGE_KERNEL);
40121 +       xen_tlb_flush();
40122 +       mm->context.pinned = 0;
40123 +       spin_lock(&mm_unpinned_lock);
40124 +       list_add(&mm->context.unpinned, &mm_unpinned);
40125 +       spin_unlock(&mm_unpinned_lock);
40126 +
40127 +       spin_unlock(&mm->page_table_lock);
40128 +}
40129 +
40130 +void mm_pin_all(void)
40131 +{
40132 +       if (xen_feature(XENFEAT_writable_page_tables))
40133 +               return;
40134 +
40135 +       while (!list_empty(&mm_unpinned))       
40136 +               mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
40137 +                                 context.unpinned));
40138 +}
40139 +
40140 +void _arch_exit_mmap(struct mm_struct *mm)
40141 +{
40142 +    struct task_struct *tsk = current;
40143 +
40144 +    task_lock(tsk);
40145 +
40146 +    /*
40147 +     * We aggressively remove the defunct pgd from cr3: unmap_vmas() runs
40148 +     * *much* faster, since avoiding TLB flushes allows bigger wrpt batches.
40149 +     */
40150 +    if ( tsk->active_mm == mm )
40151 +    {
40152 +        tsk->active_mm = &init_mm;
40153 +        atomic_inc(&init_mm.mm_count);
40154 +
40155 +        switch_mm(mm, &init_mm, tsk);
40156 +
40157 +        atomic_dec(&mm->mm_count);
40158 +        BUG_ON(atomic_read(&mm->mm_count) == 0);
40159 +    }
40160 +
40161 +    task_unlock(tsk);
40162 +
40163 +    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
40164 +        mm_unpin(mm);
40165 +}
40166 +
40167 +void pte_free(struct page *pte)
40168 +{
40169 +       unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
40170 +
40171 +       if (!pte_write(*virt_to_ptep(va)))
40172 +               BUG_ON(HYPERVISOR_update_va_mapping(
40173 +                       va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
40174 +       __free_page(pte);
40175 +}
40176 +#endif /* CONFIG_XEN */
40177 +
40178 +static inline pte_t *lookup_address(unsigned long address) 
40179 +{ 
40180 +       pgd_t *pgd = pgd_offset_k(address);
40181 +       pud_t *pud;
40182 +       pmd_t *pmd;
40183 +       pte_t *pte;
40184 +       if (pgd_none(*pgd))
40185 +               return NULL;
40186 +       pud = pud_offset(pgd, address);
40187 +       if (!pud_present(*pud))
40188 +               return NULL; 
40189 +       pmd = pmd_offset(pud, address);
40190 +       if (!pmd_present(*pmd))
40191 +               return NULL; 
40192 +       if (pmd_large(*pmd))
40193 +               return (pte_t *)pmd;
40194 +       pte = pte_offset_kernel(pmd, address);
40195 +       if (pte && !pte_present(*pte))
40196 +               pte = NULL; 
40197 +       return pte;
40198 +} 
40199 +
40200 +static struct page *split_large_page(unsigned long address, pgprot_t prot,
40201 +                                    pgprot_t ref_prot)
40202 +{ 
40203 +       int i; 
40204 +       unsigned long addr;
40205 +       struct page *base = alloc_pages(GFP_KERNEL, 0);
40206 +       pte_t *pbase;
40207 +       if (!base) 
40208 +               return NULL;
40209 +       address = __pa(address);
40210 +       addr = address & LARGE_PAGE_MASK; 
40211 +       pbase = (pte_t *)page_address(base);
40212 +       for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
40213 +               pbase[i] = pfn_pte(addr >> PAGE_SHIFT, 
40214 +                                  addr == address ? prot : ref_prot);
40215 +       }
40216 +       return base;
40217 +} 
40218 +
40219 +
40220 +static void flush_kernel_map(void *address) 
40221 +{
40222 +       if (0 && address && cpu_has_clflush) {
40223 +               /* is this worth it? */ 
40224 +               int i;
40225 +               for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) 
40226 +                       asm volatile("clflush (%0)" :: "r" (address + i)); 
40227 +       } else
40228 +               asm volatile("wbinvd":::"memory"); 
40229 +       if (address)
40230 +               __flush_tlb_one(address);
40231 +       else
40232 +               __flush_tlb_all();
40233 +}
40234 +
40235 +
40236 +static inline void flush_map(unsigned long address)
40237 +{      
40238 +       on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
40239 +}
40240 +
40241 +struct deferred_page { 
40242 +       struct deferred_page *next; 
40243 +       struct page *fpage;
40244 +       unsigned long address;
40245 +}; 
40246 +static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
40247 +
40248 +static inline void save_page(unsigned long address, struct page *fpage)
40249 +{
40250 +       struct deferred_page *df;
40251 +       df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); 
40252 +       if (!df) {
40253 +               flush_map(address);
40254 +               __free_page(fpage);
40255 +       } else { 
40256 +               df->next = df_list;
40257 +               df->fpage = fpage;
40258 +               df->address = address;
40259 +               df_list = df;
40260 +       }                       
40261 +}
40262 +
40263 +/* 
40264 + * No more special protections in this 2/4MB area - revert to a
40265 + * large page again. 
40266 + */
40267 +static void revert_page(unsigned long address, pgprot_t ref_prot)
40268 +{
40269 +       pgd_t *pgd;
40270 +       pud_t *pud;
40271 +       pmd_t *pmd;
40272 +       pte_t large_pte;
40273 +
40274 +       pgd = pgd_offset_k(address);
40275 +       BUG_ON(pgd_none(*pgd));
40276 +       pud = pud_offset(pgd,address);
40277 +       BUG_ON(pud_none(*pud));
40278 +       pmd = pmd_offset(pud, address);
40279 +       BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
40280 +       pgprot_val(ref_prot) |= _PAGE_PSE;
40281 +       large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
40282 +       set_pte((pte_t *)pmd, large_pte);
40283 +}      
40284 +
40285 +static int
40286 +__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
40287 +                                  pgprot_t ref_prot)
40288 +{ 
40289 +       pte_t *kpte; 
40290 +       struct page *kpte_page;
40291 +       unsigned kpte_flags;
40292 +       pgprot_t ref_prot2;
40293 +       kpte = lookup_address(address);
40294 +       if (!kpte) return 0;
40295 +       kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
40296 +       kpte_flags = pte_val(*kpte); 
40297 +       if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
40298 +               if ((kpte_flags & _PAGE_PSE) == 0) { 
40299 +                       set_pte(kpte, pfn_pte(pfn, prot));
40300 +               } else {
40301 +                       /*
40302 +                        * split_large_page will take the reference for this change_page_attr
40303 +                        * on the split page.
40304 +                        */
40305 +
40306 +                       struct page *split;
40307 +                       ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
40308 +
40309 +                       split = split_large_page(address, prot, ref_prot2);
40310 +                       if (!split)
40311 +                               return -ENOMEM;
40312 +                       set_pte(kpte,mk_pte(split, ref_prot2));
40313 +                       kpte_page = split;
40314 +               }       
40315 +               get_page(kpte_page);
40316 +       } else if ((kpte_flags & _PAGE_PSE) == 0) { 
40317 +               set_pte(kpte, pfn_pte(pfn, ref_prot));
40318 +               __put_page(kpte_page);
40319 +       } else
40320 +               BUG();
40321 +
40322 +       /* on x86-64 the direct mapping set at boot is not using 4k pages */
40323 +       /*
40324 +        * ..., but the XEN guest kernels (currently) do:
40325 +        * If the pte was reserved, it means it was created at boot
40326 +        * time (not via split_large_page) and in turn we must not
40327 +        * replace it with a large page.
40328 +        */
40329 +#ifndef CONFIG_XEN
40330 +       BUG_ON(PageReserved(kpte_page));
40331 +#else
40332 +       if (!PageReserved(kpte_page))
40333 +#endif
40334 +               switch (page_count(kpte_page)) {
40335 +               case 1:
40336 +                       save_page(address, kpte_page);               
40337 +                       revert_page(address, ref_prot);
40338 +                       break;
40339 +               case 0:
40340 +                       BUG(); /* memleak and failed 2M page regeneration */
40341 +               }
40342 +       return 0;
40343 +} 
40344 +
40345 +/*
40346 + * Change the page attributes of a page in the linear mapping.
40347 + *
40348 + * This should be used when a page is mapped with a different caching policy
40349 + * than write-back somewhere - some CPUs do not like it when mappings with
40350 + * different caching policies exist. This changes the page attributes of
40351 + * the kernel linear mapping too.
40352 + * 
40353 + * The caller needs to ensure that there are no conflicting mappings elsewhere.
40354 + * This function only deals with the kernel linear map.
40355 + * 
40356 + * Caller must call global_flush_tlb() after this.
40357 + */
40358 +int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
40359 +{
40360 +       int err = 0; 
40361 +       int i; 
40362 +
40363 +       down_write(&init_mm.mmap_sem);
40364 +       for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
40365 +               unsigned long pfn = __pa(address) >> PAGE_SHIFT;
40366 +
40367 +               err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
40368 +               if (err) 
40369 +                       break; 
40370 +               /* Handle the kernel mapping too, which aliases part of
40371 +                * lowmem */
40372 +               if (__pa(address) < KERNEL_TEXT_SIZE) {
40373 +                       unsigned long addr2;
40374 +                       pgprot_t prot2 = prot;
40375 +                       addr2 = __START_KERNEL_map + __pa(address);
40376 +                       pgprot_val(prot2) &= ~_PAGE_NX;
40377 +                       err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
40378 +               } 
40379 +       }       
40380 +       up_write(&init_mm.mmap_sem); 
40381 +       return err;
40382 +}
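+
+/*
+ * Typical call sequence (mark_rodata_ro() in init-xen.c follows this
+ * pattern):
+ *
+ *     change_page_attr_addr(addr, npages, PAGE_KERNEL_RO);
+ *     ...
+ *     global_flush_tlb();
+ */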
40383 +
40384 +/* Don't call this for MMIO areas that may not have a mem_map entry */
40385 +int change_page_attr(struct page *page, int numpages, pgprot_t prot)
40386 +{
40387 +       unsigned long addr = (unsigned long)page_address(page);
40388 +       return change_page_attr_addr(addr, numpages, prot);
40389 +}
40390 +
40391 +void global_flush_tlb(void)
40392 +{ 
40393 +       struct deferred_page *df, *next_df;
40394 +
40395 +       down_read(&init_mm.mmap_sem);
40396 +       df = xchg(&df_list, NULL);
40397 +       up_read(&init_mm.mmap_sem);
40398 +       flush_map((df && !df->next) ? df->address : 0);
40399 +       for (; df; df = next_df) { 
40400 +               next_df = df->next;
40401 +               if (df->fpage) 
40402 +                       __free_page(df->fpage);
40403 +               kfree(df);
40404 +       } 
40405 +} 
40406 +
40407 +EXPORT_SYMBOL(change_page_attr);
40408 +EXPORT_SYMBOL(global_flush_tlb);
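
A minimal usage sketch of the API exported above (not part of the patch; the
NOCACHE protection, the alloc_page() source and the error handling are
illustrative), showing the documented contract that every attribute change
must be followed by global_flush_tlb():

	/* Sketch: make one kernel page uncacheable, then restore it. */
	struct page *pg = alloc_page(GFP_KERNEL);

	if (pg && change_page_attr(pg, 1, PAGE_KERNEL_NOCACHE) == 0) {
		global_flush_tlb();	/* flush before relying on it */
		/* ... use the page with its new caching policy ... */
		change_page_attr(pg, 1, PAGE_KERNEL);	/* revert */
		global_flush_tlb();
		__free_page(pg);
	}
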
40409 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/arch/x86_64/pci/Makefile linux-2.6.16/arch/x86_64/pci/Makefile
40410 --- linux-2.6.16.orig/arch/x86_64/pci/Makefile  2006-03-20 06:53:29.000000000 +0100
40411 +++ linux-2.6.16/arch/x86_64/pci/Makefile       2006-06-26 09:51:32.000000000 +0200
40412 @@ -15,10 +15,22 @@
40413  
40414  obj-$(CONFIG_NUMA)     += k8-bus.o
40415  
40416 +# pcifront should be after mmconfig.o and direct.o as it should only
40417 +# take over if direct access to the PCI bus is unavailable
40418 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront.o
40419 +
40420  direct-y += ../../i386/pci/direct.o
40421  acpi-y   += ../../i386/pci/acpi.o
40422 +pcifront-y += ../../i386/pci/pcifront.o
40423  legacy-y += ../../i386/pci/legacy.o
40424  irq-y    += ../../i386/pci/irq.o
40425  common-y += ../../i386/pci/common.o
40426  fixup-y  += ../../i386/pci/fixup.o
40427  i386-y  += ../../i386/pci/i386.o
40428 +
40429 +ifdef CONFIG_XEN
40430 +irq-y          := ../../i386/pci/irq-xen.o
40431 +include $(srctree)/scripts/Makefile.xen
40432 +
40433 +obj-y := $(call cherrypickxen, $(obj-y))
40434 +endif
40435 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/Makefile linux-2.6.16/drivers/Makefile
40436 --- linux-2.6.16.orig/drivers/Makefile  2006-06-26 09:49:45.000000000 +0200
40437 +++ linux-2.6.16/drivers/Makefile       2006-06-26 09:51:32.000000000 +0200
40438 @@ -33,6 +33,7 @@
40439  obj-$(CONFIG_NUBUS)            += nubus/
40440  obj-$(CONFIG_ATM)              += atm/
40441  obj-$(CONFIG_PPC_PMAC)         += macintosh/
40442 +obj-$(CONFIG_XEN)              += xen/
40443  obj-$(CONFIG_IDE)              += ide/
40444  obj-$(CONFIG_FC4)              += fc4/
40445  obj-$(CONFIG_SCSI)             += scsi/
40446 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/acpi/Kconfig linux-2.6.16/drivers/acpi/Kconfig
40447 --- linux-2.6.16.orig/drivers/acpi/Kconfig      2006-03-20 06:53:29.000000000 +0100
40448 +++ linux-2.6.16/drivers/acpi/Kconfig   2006-06-26 09:51:32.000000000 +0200
40449 @@ -46,7 +46,7 @@
40450  
40451  config ACPI_SLEEP
40452         bool "Sleep States"
40453 -       depends on X86 && (!SMP || SUSPEND_SMP)
40454 +       depends on X86 && (!SMP || SUSPEND_SMP) && !XEN
40455         depends on PM
40456         default y
40457         ---help---
40458 @@ -287,6 +287,7 @@
40459  config X86_PM_TIMER
40460         bool "Power Management Timer Support" if EMBEDDED
40461         depends on X86
40462 +       depends on !XEN
40463         default y
40464         help
40465           The Power Management Timer is available on all ACPI-capable,
40466 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/acpi/tables.c linux-2.6.16/drivers/acpi/tables.c
40467 --- linux-2.6.16.orig/drivers/acpi/tables.c     2006-03-20 06:53:29.000000000 +0100
40468 +++ linux-2.6.16/drivers/acpi/tables.c  2006-06-26 09:51:32.000000000 +0200
40469 @@ -572,6 +572,11 @@
40470   * 
40471   * result: sdt_entry[] is initialized
40472   */
40473 +#if defined(CONFIG_X86_XEN) || defined(CONFIG_X86_64_XEN)
40474 +#define acpi_rsdp_phys_to_va(rsdp_phys) isa_bus_to_virt(rsdp_phys)
40475 +#else
40476 +#define acpi_rsdp_phys_to_va(rsdp_phys) __va(rsdp_phys)
40477 +#endif
40478  
40479  int __init acpi_table_init(void)
40480  {
40481 @@ -587,7 +592,7 @@
40482                 return -ENODEV;
40483         }
40484  
40485 -       rsdp = (struct acpi_table_rsdp *)__va(rsdp_phys);
40486 +       rsdp = (struct acpi_table_rsdp *)acpi_rsdp_phys_to_va(rsdp_phys);
40487         if (!rsdp) {
40488                 printk(KERN_WARNING PREFIX "Unable to map RSDP\n");
40489                 return -ENODEV;
40490 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/char/mem.c linux-2.6.16/drivers/char/mem.c
40491 --- linux-2.6.16.orig/drivers/char/mem.c        2006-03-20 06:53:29.000000000 +0100
40492 +++ linux-2.6.16/drivers/char/mem.c     2006-06-26 09:51:32.000000000 +0200
40493 @@ -108,6 +108,7 @@
40494  }
40495  #endif
40496  
40497 +#ifndef ARCH_HAS_DEV_MEM
40498  /*
40499   * This funcion reads the *physical* memory. The f_pos points directly to the 
40500   * memory location. 
40501 @@ -232,6 +233,7 @@
40502         *ppos += written;
40503         return written;
40504  }
40505 +#endif
40506  
40507  #ifndef __HAVE_PHYS_MEM_ACCESS_PROT
40508  static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
40509 @@ -773,6 +775,7 @@
40510  #define open_kmem      open_mem
40511  #define open_oldmem    open_mem
40512  
40513 +#ifndef ARCH_HAS_DEV_MEM
40514  static struct file_operations mem_fops = {
40515         .llseek         = memory_lseek,
40516         .read           = read_mem,
40517 @@ -780,6 +783,9 @@
40518         .mmap           = mmap_mem,
40519         .open           = open_mem,
40520  };
40521 +#else
40522 +extern struct file_operations mem_fops;
40523 +#endif
40524  
40525  static struct file_operations kmem_fops = {
40526         .llseek         = memory_lseek,
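
With ARCH_HAS_DEV_MEM defined, the generic read_mem/write_mem pair and the
mem_fops definition above compile out, and the extern declaration binds to an
implementation supplied elsewhere; in this patch that is presumably the
Xen-aware char/ code wired up in the drivers/xen Makefile below. A hedged
sketch of the shape such a provider takes (xen_read_mem is a made-up name):

	/* Sketch only: the provider defines mem_fops non-static so the
	 * extern declaration in drivers/char/mem.c resolves to it. */
	static ssize_t xen_read_mem(struct file *file, char __user *buf,
				    size_t count, loff_t *ppos)
	{
		/* treat *ppos as a machine address and copy it out ... */
		return -ENXIO;			/* body elided */
	}

	struct file_operations mem_fops = {
		.llseek	= memory_lseek,
		.read	= xen_read_mem,
		/* .write, .mmap, .open filled in analogously */
	};
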
40527 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/char/tpm/Kconfig linux-2.6.16/drivers/char/tpm/Kconfig
40528 --- linux-2.6.16.orig/drivers/char/tpm/Kconfig  2006-03-20 06:53:29.000000000 +0100
40529 +++ linux-2.6.16/drivers/char/tpm/Kconfig       2006-06-26 09:51:32.000000000 +0200
40530 @@ -22,7 +22,7 @@
40531  
40532  config TCG_NSC
40533         tristate "National Semiconductor TPM Interface"
40534 -       depends on TCG_TPM
40535 +       depends on TCG_TPM && !XEN_UNPRIVILEGED_GUEST
40536         ---help---
40537           If you have a TPM security chip from National Semicondutor 
40538           say Yes and it will be accessible from within Linux.  To 
40539 @@ -31,7 +31,7 @@
40540  
40541  config TCG_ATMEL
40542         tristate "Atmel TPM Interface"
40543 -       depends on TCG_TPM
40544 +       depends on TCG_TPM && !XEN_UNPRIVILEGED_GUEST
40545         ---help---
40546           If you have a TPM security chip from Atmel say Yes and it 
40547           will be accessible from within Linux.  To compile this driver 
40548 @@ -49,5 +49,15 @@
40549           Further information on this driver and the supported hardware
40550           can be found at http://www.prosec.rub.de/tpm
40551  
40552 +config TCG_XEN
40553 +       tristate "XEN TPM Interface"
40554 +       depends on TCG_TPM && XEN && XEN_TPMDEV_FRONTEND
40555 +       ---help---
40556 +         If you want to make TPM support available to a Xen
40557 +         user domain, say Yes and it will be accessible from
40558 +         within Linux. To compile this driver as a module,
40559 +         choose M here; the module will be called
40560 +         tpm_xen.
40561 +
40562  endmenu
40563  
40564 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/char/tpm/Makefile linux-2.6.16/drivers/char/tpm/Makefile
40565 --- linux-2.6.16.orig/drivers/char/tpm/Makefile 2006-03-20 06:53:29.000000000 +0100
40566 +++ linux-2.6.16/drivers/char/tpm/Makefile      2006-06-26 09:51:32.000000000 +0200
40567 @@ -8,3 +8,4 @@
40568  obj-$(CONFIG_TCG_NSC) += tpm_nsc.o
40569  obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o
40570  obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o
40571 +obj-$(CONFIG_TCG_XEN) += tpm_xen.o
40572 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/char/tpm/tpm.c linux-2.6.16/drivers/char/tpm/tpm.c
40573 --- linux-2.6.16.orig/drivers/char/tpm/tpm.c    2006-03-20 06:53:29.000000000 +0100
40574 +++ linux-2.6.16/drivers/char/tpm/tpm.c 2006-06-26 09:51:32.000000000 +0200
40575 @@ -30,7 +30,8 @@
40576  
40577  enum tpm_const {
40578         TPM_MINOR = 224,        /* officially assigned */
40579 -       TPM_BUFSIZE = 2048,
40580 +       TPM_MIN_BUFSIZE = 2048,
40581 +       TPM_MAX_BUFSIZE = 64 * 1024,
40582         TPM_NUM_DEVICES = 256,
40583         TPM_NUM_MASK_ENTRIES = TPM_NUM_DEVICES / (8 * sizeof(int))
40584  };
40585 @@ -52,14 +53,14 @@
40586  
40587         down(&chip->buffer_mutex);
40588         atomic_set(&chip->data_pending, 0);
40589 -       memset(chip->data_buffer, 0, TPM_BUFSIZE);
40590 +       memset(chip->data_buffer, 0, get_chip_buffersize(chip));
40591         up(&chip->buffer_mutex);
40592  }
40593  
40594  /*
40595   * Internal kernel interface to transmit TPM commands
40596   */
40597 -static ssize_t tpm_transmit(struct tpm_chip *chip, const char *buf,
40598 +static ssize_t tpm_transmit(struct tpm_chip * chip, const char *buf,
40599                             size_t bufsiz)
40600  {
40601         ssize_t rc;
40602 @@ -351,7 +352,7 @@
40603  
40604         spin_unlock(&driver_lock);
40605  
40606 -       chip->data_buffer = kmalloc(TPM_BUFSIZE * sizeof(u8), GFP_KERNEL);
40607 +       chip->data_buffer = kmalloc(get_chip_buffersize(chip) * sizeof(u8), GFP_KERNEL);
40608         if (chip->data_buffer == NULL) {
40609                 chip->num_opens--;
40610                 put_device(chip->dev);
40611 @@ -399,8 +400,8 @@
40612  
40613         down(&chip->buffer_mutex);
40614  
40615 -       if (in_size > TPM_BUFSIZE)
40616 -               in_size = TPM_BUFSIZE;
40617 +       if (in_size > get_chip_buffersize(chip))
40618 +               in_size = get_chip_buffersize(chip);
40619  
40620         if (copy_from_user
40621             (chip->data_buffer, (void __user *) buf, in_size)) {
40622 @@ -409,9 +410,11 @@
40623         }
40624  
40625         /* atomic tpm command send and result receive */
40626 -       out_size = tpm_transmit(chip, chip->data_buffer, TPM_BUFSIZE);
40627 +       out_size = tpm_transmit(chip, chip->data_buffer, 
40628 +                               get_chip_buffersize(chip));
40629  
40630         atomic_set(&chip->data_pending, out_size);
40631 +       atomic_set(&chip->data_position, 0);
40632         up(&chip->buffer_mutex);
40633  
40634         /* Set a timeout by which the reader must come claim the result */
40635 @@ -427,20 +430,33 @@
40636  {
40637         struct tpm_chip *chip = file->private_data;
40638         int ret_size;
40639 +       int pos, pending = 0;
40640  
40641 -       del_singleshot_timer_sync(&chip->user_read_timer);
40642 -       flush_scheduled_work();
40643         ret_size = atomic_read(&chip->data_pending);
40644 -       atomic_set(&chip->data_pending, 0);
40645         if (ret_size > 0) {     /* relay data */
40646                 if (size < ret_size)
40647                         ret_size = size;
40648  
40649 +               pos = atomic_read(&chip->data_position);
40650 +
40651                 down(&chip->buffer_mutex);
40652 -               if (copy_to_user(buf, chip->data_buffer, ret_size))
40653 +               if (copy_to_user(buf, &chip->data_buffer[pos], ret_size)) {
40654                         ret_size = -EFAULT;
40655 +               } else {
40656 +                       pending = atomic_read(&chip->data_pending) - ret_size;
40657 +                       if (pending) {
40658 +                               atomic_set(&chip->data_pending, pending);
40659 +                               atomic_set(&chip->data_position, pos + ret_size);
40660 +                       }
40661 +               }
40662                 up(&chip->buffer_mutex);
40663         }
40664 +
40665 +       if (ret_size <= 0 || pending == 0) {
40666 +               atomic_set(&chip->data_pending, 0);
40667 +               del_singleshot_timer_sync(&chip->user_read_timer);
40668 +               flush_scheduled_work();
40669 +       }
40670  
40671         return ret_size;
40672  }
40673 @@ -544,6 +560,12 @@
40674         chip->user_read_timer.data = (unsigned long) chip;
40675  
40676         chip->vendor = entry;
40677 +       
40678 +       if (entry->buffersize < TPM_MIN_BUFSIZE) {
40679 +               entry->buffersize = TPM_MIN_BUFSIZE;
40680 +       } else if (entry->buffersize > TPM_MAX_BUFSIZE) {
40681 +               entry->buffersize = TPM_MAX_BUFSIZE;
40682 +       }
40683  
40684         chip->dev_num = -1;
40685  
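
The new data_position field makes /dev/tpm reads resumable: a response larger
than the caller's buffer is no longer discarded after the first read(), since
the unread remainder stays pending and the position advances. A hypothetical
user-space reader exercising this (device path and buffer size are
illustrative):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		unsigned char resp[65536];	/* TPM_MAX_BUFSIZE above */
		size_t got = 0;
		ssize_t n;
		int fd = open("/dev/tpm0", O_RDWR);

		if (fd < 0)
			return 1;
		/* ... write() a TPM command here, omitted ... */

		/* Each read() resumes at data_position; the driver only
		 * resets data_pending once everything has been drained. */
		while ((n = read(fd, resp + got, sizeof(resp) - got)) > 0)
			got += n;
		printf("received %zu bytes\n", got);
		close(fd);
		return 0;
	}
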
40686 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/char/tpm/tpm.h linux-2.6.16/drivers/char/tpm/tpm.h
40687 --- linux-2.6.16.orig/drivers/char/tpm/tpm.h    2006-03-20 06:53:29.000000000 +0100
40688 +++ linux-2.6.16/drivers/char/tpm/tpm.h 2006-06-26 09:51:32.000000000 +0200
40689 @@ -50,6 +50,7 @@
40690         u8 req_complete_mask;
40691         u8 req_complete_val;
40692         u8 req_canceled;
40693 +       u32 buffersize;
40694         void __iomem *iobase;           /* ioremapped address */
40695         unsigned long base;             /* TPM base address */
40696  
40697 @@ -74,6 +75,7 @@
40698         /* Data passed to and from the tpm via the read/write calls */
40699         u8 *data_buffer;
40700         atomic_t data_pending;
40701 +       atomic_t data_position;
40702         struct semaphore buffer_mutex;
40703  
40704         struct timer_list user_read_timer;      /* user needs to claim result */
40705 @@ -99,6 +101,11 @@
40706         outb(value & 0xFF, base+1);
40707  }
40708  
40709 +static inline u32 get_chip_buffersize(struct tpm_chip *chip)
40710 +{
40711 +       return chip->vendor->buffersize;
40712 +}
40713 +
40714  extern int tpm_register_hardware(struct device *,
40715                                  struct tpm_vendor_specific *);
40716  extern int tpm_open(struct inode *, struct file *);
40717 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/char/tpm/tpm_xen.c linux-2.6.16/drivers/char/tpm/tpm_xen.c
40718 --- linux-2.6.16.orig/drivers/char/tpm/tpm_xen.c        1970-01-01 01:00:00.000000000 +0100
40719 +++ linux-2.6.16/drivers/char/tpm/tpm_xen.c     2006-06-26 09:51:32.000000000 +0200
40720 @@ -0,0 +1,536 @@
40721 +/*
40722 + * Copyright (C) 2004 IBM Corporation
40723 + *
40724 + * Authors:
40725 + * Leendert van Doorn <leendert@watson.ibm.com>
40726 + * Dave Safford <safford@watson.ibm.com>
40727 + * Reiner Sailer <sailer@watson.ibm.com>
40728 + * Kylene Hall <kjhall@us.ibm.com>
40729 + * Stefan Berger <stefanb@us.ibm.com>
40730 + *
40731 + * Maintained by: <tpmdd_devel@lists.sourceforge.net>
40732 + *
40733 + * Device driver for TCG/TCPA TPM (trusted platform module) for XEN.
40734 + * Specifications at www.trustedcomputinggroup.org
40735 + *
40736 + * This program is free software; you can redistribute it and/or
40737 + * modify it under the terms of the GNU General Public License as
40738 + * published by the Free Software Foundation, version 2 of the
40739 + * License.
40740 + *
40741 + */
40742 +
40743 +#include <asm/uaccess.h>
40744 +#include <linux/list.h>
40745 +#include <xen/tpmfe.h>
40746 +#include <linux/device.h>
40747 +#include <linux/interrupt.h>
40748 +#include <linux/platform_device.h>
40749 +#include "tpm.h"
40750 +
40751 +/* read status bits */
40752 +enum {
40753 +       STATUS_BUSY = 0x01,
40754 +       STATUS_DATA_AVAIL = 0x02,
40755 +       STATUS_READY = 0x04
40756 +};
40757 +
40758 +#define MIN(x,y)  (((x) < (y)) ? (x) : (y))
40759 +
40760 +struct transmission {
40761 +       struct list_head next;
40762 +       unsigned char *request;
40763 +       unsigned int request_len;
40764 +       unsigned char *rcv_buffer;
40765 +       unsigned int  buffersize;
40766 +       unsigned int flags;
40767 +};
40768 +
40769 +enum {
40770 +       TRANSMISSION_FLAG_WAS_QUEUED = 0x1
40771 +};
40772 +
40773 +struct data_exchange {
40774 +       struct transmission *current_request;
40775 +       spinlock_t           req_list_lock;
40776 +       wait_queue_head_t    req_wait_queue;
40777 +
40778 +       struct list_head     queued_requests;
40779 +
40780 +       struct transmission *current_response;
40781 +       spinlock_t           resp_list_lock;
40782 +       wait_queue_head_t    resp_wait_queue;     /* processes waiting for responses */
40783 +
40784 +       struct transmission *req_cancelled;       /* if a cancellation was encountered */
40785 +
40786 +       unsigned int         fe_status;
40787 +       unsigned int         flags;
40788 +};
40789 +
40790 +enum {
40791 +       DATAEX_FLAG_QUEUED_ONLY = 0x1
40792 +};
40793 +
40794 +static struct data_exchange dataex;
40795 +
40796 +static unsigned long disconnect_time;
40797 +
40798 +static struct tpmfe_device tpmfe;
40799 +
40800 +/* local function prototypes */
40801 +static void __exit cleanup_xen(void);
40802 +
40803 +
40804 +/* =============================================================
40805 + * Some utility functions
40806 + * =============================================================
40807 + */
40808 +static inline struct transmission *
40809 +transmission_alloc(void)
40810 +{
40811 +       return kzalloc(sizeof(struct transmission), GFP_KERNEL);
40812 +}
40813 +
40814 +static inline unsigned char *
40815 +transmission_set_buffer(struct transmission *t,
40816 +                        unsigned char *buffer, unsigned int len)
40817 +{
40818 +       kfree(t->request);
40819 +       t->request = kmalloc(len, GFP_KERNEL);
40820 +       if (t->request) {
40821 +               memcpy(t->request,
40822 +                      buffer,
40823 +                      len);
40824 +               t->request_len = len;
40825 +       }
40826 +       return t->request;
40827 +}
40828 +
40829 +static inline void
40830 +transmission_free(struct transmission *t)
40831 +{
40832 +       kfree(t->request);
40833 +       kfree(t->rcv_buffer);
40834 +       kfree(t);
40835 +}
40836 +
40837 +/* =============================================================
40838 + * Interface with the TPM shared memory driver for XEN
40839 + * =============================================================
40840 + */
40841 +static int tpm_recv(const u8 *buffer, size_t count, const void *ptr)
40842 +{
40843 +       int ret_size = 0;
40844 +       struct transmission *t;
40845 +
40846 +       /*
40847 +        * The list with requests must contain one request
40848 +        * only and the element there must be the one that
40849 +        * was passed to me from the front-end.
40850 +        */
40851 +       if (dataex.current_request != ptr) {
40852 +               printk("WARNING: The request pointer is different than the "
40853 +                      "pointer the shared memory driver returned to me. "
40854 +                      "%p != %p\n",
40855 +                      dataex.current_request, ptr);
40856 +       }
40857 +
40858 +       /*
40859 +        * If the request has been cancelled, just quit here
40860 +        */
40861 +       if (dataex.req_cancelled == (struct transmission *)ptr) {
40862 +               if (dataex.current_request == dataex.req_cancelled) {
40863 +                       dataex.current_request = NULL;
40864 +               }
40865 +               transmission_free(dataex.req_cancelled);
40866 +               dataex.req_cancelled = NULL;
40867 +               return 0;
40868 +       }
40869 +
40870 +       if (NULL != (t = dataex.current_request)) {
40871 +               transmission_free(t);
40872 +               dataex.current_request = NULL;
40873 +       }
40874 +
40875 +       t = transmission_alloc();
40876 +       if (t) {
40877 +               unsigned long flags;
40878 +               t->rcv_buffer = kmalloc(count, GFP_KERNEL);
40879 +               if (!t->rcv_buffer) {
40880 +                       transmission_free(t);
40881 +                       return -ENOMEM;
40882 +               }
40883 +               t->buffersize = count;
40884 +               memcpy(t->rcv_buffer, buffer, count);
40885 +               ret_size = count;
40886 +
40887 +               spin_lock_irqsave(&dataex.resp_list_lock, flags);
40888 +               dataex.current_response = t;
40889 +               spin_unlock_irqrestore(&dataex.resp_list_lock, flags);
40890 +               wake_up_interruptible(&dataex.resp_wait_queue);
40891 +       }
40892 +       return ret_size;
40893 +}
40894 +
40895 +
40896 +static void tpm_fe_status(unsigned int flags)
40897 +{
40898 +       dataex.fe_status = flags;
40899 +       if ((dataex.fe_status & TPMFE_STATUS_CONNECTED) == 0) {
40900 +               disconnect_time = jiffies;
40901 +       }
40902 +}
40903 +
40904 +/* =============================================================
40905 + * Interface with the generic TPM driver
40906 + * =============================================================
40907 + */
40908 +static int tpm_xen_recv(struct tpm_chip *chip, u8 * buf, size_t count)
40909 +{
40910 +       unsigned long flags;
40911 +       int rc = 0;
40912 +
40913 +       spin_lock_irqsave(&dataex.resp_list_lock, flags);
40914 +       /*
40915 +        * Check if the previous operation only queued the command.
40916 +        * In this case there won't be a response, so I just
40917 +        * return from here and reset that flag. In any other
40918 +        * case I should receive a response from the back-end.
40919 +        */
40920 +       if ((dataex.flags & DATAEX_FLAG_QUEUED_ONLY) != 0) {
40921 +               dataex.flags &= ~DATAEX_FLAG_QUEUED_ONLY;
40922 +               spin_unlock_irqrestore(&dataex.resp_list_lock, flags);
40923 +               /*
40924 +                * A little hack here. The first few measurements
40925 +                * are queued since there's no way to talk to the
40926 +                * TPM yet (due to slowness of the control channel).
40927 +                * So we just make IMA happy by giving it 30 NULL
40928 +                * bytes back, where the most important part is
40929 +                * that the result code is '0'.
40930 +                */
40931 +
40932 +               count = MIN(count, 30);
40933 +               memset(buf, 0x0, count);
40934 +               return count;
40935 +       }
40936 +       /*
40937 +        * Check whether something is in the responselist and if
40938 +        * there's nothing in the list wait for something to appear.
40939 +        */
40940 +
40941 +       if (NULL == dataex.current_response) {
40942 +               spin_unlock_irqrestore(&dataex.resp_list_lock, flags);
40943 +               interruptible_sleep_on_timeout(&dataex.resp_wait_queue,
40944 +                                              1000);
40945 +               spin_lock_irqsave(&dataex.resp_list_lock, flags);
40946 +       }
40947 +
40948 +       if (NULL != dataex.current_response) {
40949 +               struct transmission *t = dataex.current_response;
40950 +               dataex.current_response = NULL;
40951 +               rc = MIN(count, t->buffersize);
40952 +               memcpy(buf, t->rcv_buffer, rc);
40953 +               transmission_free(t);
40954 +       }
40955 +
40956 +       spin_unlock_irqrestore(&dataex.resp_list_lock, flags);
40957 +       return rc;
40958 +}
40959 +
40960 +static int tpm_xen_send(struct tpm_chip *chip, u8 * buf, size_t count)
40961 +{
40962 +       /*
40963 +        * We simply pass the packet onto the XEN shared
40964 +        * memory driver.
40965 +        */
40966 +       unsigned long flags;
40967 +       int rc;
40968 +       struct transmission *t = transmission_alloc();
40969 +
40970 +       spin_lock_irqsave(&dataex.req_list_lock, flags);
40971 +       /*
40972 +        * If there's a current request, it must be the
40973 +        * previous request that has timed out.
40974 +        */
40975 +       if (dataex.current_request != NULL) {
40976 +               printk("WARNING: Sending although there is a request outstanding.\n"
40977 +                      "         Previous request must have timed out.\n");
40978 +               transmission_free(dataex.current_request);
40979 +               dataex.current_request = NULL;
40980 +       }
40981 +
40982 +       if (t != NULL) {
40983 +               unsigned int error = 0;
40984 +               /*
40985 +                * Queue the packet if the driver below is not
40986 +                * ready, yet, or there is any packet already
40987 +                * in the queue.
40988 +                * If the driver below is ready, unqueue all
40989 +                * packets first before sending our current
40990 +                * packet.
40991 +                * For each unqueued packet, except for the
40992 +                * last (=current) packet, call the function
40993 +                * tpm_xen_recv to wait for the response to come
40994 +                * back.
40995 +                */
40996 +               if ((dataex.fe_status & TPMFE_STATUS_CONNECTED) == 0) {
40997 +                       if (time_after(jiffies, disconnect_time + HZ * 10)) {
40998 +                               rc = -ENOENT;
40999 +                       } else {
41000 +                               /*
41001 +                                * copy the request into the buffer
41002 +                                */
41003 +                               if (transmission_set_buffer(t, buf, count)
41004 +                                   == NULL) {
41005 +                                       transmission_free(t);
41006 +                                       rc = -ENOMEM;
41007 +                                       goto exit;
41008 +                               }
41009 +                               dataex.flags |= DATAEX_FLAG_QUEUED_ONLY;
41010 +                               list_add_tail(&t->next, &dataex.queued_requests);
41011 +                               rc = 0;
41012 +                       }
41013 +               } else {
41014 +                       /*
41015 +                        * Check whether there are any packets in the queue
41016 +                        */
41017 +                       while (!list_empty(&dataex.queued_requests)) {
41018 +                               /*
41019 +                                * Need to dequeue them.
41020 +                                * Read the result into a dummy buffer.
41021 +                                */
41022 +                               unsigned char buffer[1];
41023 +                               struct transmission *qt = list_entry(dataex.queued_requests.next, struct transmission, next);
41024 +                               list_del(&qt->next);
41025 +                               dataex.current_request = qt;
41026 +                               spin_unlock_irqrestore(&dataex.req_list_lock,
41027 +                                                      flags);
41028 +
41029 +                               rc = tpm_fe_send(tpmfe.tpm_private,
41030 +                                                qt->request,
41031 +                                                qt->request_len,
41032 +                                                qt);
41033 +
41034 +                               if (rc < 0) {
41035 +                                       spin_lock_irqsave(&dataex.req_list_lock, flags);
41036 +                                       if ((qt = dataex.current_request) != NULL) {
41037 +                                               /*
41038 +                                                * requeue it at the beginning
41039 +                                                * of the list
41040 +                                                */
41041 +                                               list_add(&qt->next,
41042 +                                                        &dataex.queued_requests);
41043 +                                       }
41044 +                                       dataex.current_request = NULL;
41045 +                                       error = 1;
41046 +                                       break;
41047 +                               }
41048 +                               /*
41049 +                                * After this point qt is not valid anymore!
41050 +                                * It is freed when the front-end delivers the data
41051 +                                * by calling tpm_recv.
41052 +                                */
41053 +
41054 +                               /*
41055 +                                * Try to receive the response now into the provided dummy
41056 +                                * buffer (the response is simply discarded, since
41057 +                                * there is no receiver for it anymore).
41058 +                                */
41059 +                               rc = tpm_xen_recv(chip, buffer, sizeof(buffer));
41060 +
41061 +                               spin_lock_irqsave(&dataex.req_list_lock, flags);
41062 +                       }
41063 +
41064 +                       if (error == 0) {
41065 +                               /*
41066 +                                * Finally, send the current request.
41067 +                                */
41068 +                               dataex.current_request = t;
41069 +                               /*
41070 +                                * Call the shared memory driver
41071 +                                * Pass to it the buffer with the request, the
41072 +                                * amount of bytes in the request and
41073 +                                * a void * pointer (here: transmission structure)
41074 +                                */
41075 +                               rc = tpm_fe_send(tpmfe.tpm_private,
41076 +                                                buf, count, t);
41077 +                               /*
41078 +                                * The generic TPM driver will call
41079 +                                * the function to receive the response.
41080 +                                */
41081 +                               if (rc < 0) {
41082 +                                       dataex.current_request = NULL;
41083 +                                       goto queue_it;
41084 +                               }
41085 +                       } else {
41086 +queue_it:
41087 +                               if (transmission_set_buffer(t, buf, count) == NULL) {
41088 +                                       transmission_free(t);
41089 +                                       rc = -ENOMEM;
41090 +                                       goto exit;
41091 +                               }
41092 +                               /*
41093 +                                * An error occurred. Don't even try
41094 +                                * to send the current request. Just
41095 +                                * queue it.
41096 +                                */
41097 +                               dataex.flags |= DATAEX_FLAG_QUEUED_ONLY;
41098 +                               list_add_tail(&t->next,
41099 +                                             &dataex.queued_requests);
41100 +                               rc = 0;
41101 +                       }
41102 +               }
41103 +       } else {
41104 +               rc = -ENOMEM;
41105 +       }
41106 +
41107 +exit:
41108 +       spin_unlock_irqrestore(&dataex.req_list_lock, flags);
41109 +       return rc;
41110 +}
41111 +
41112 +static void tpm_xen_cancel(struct tpm_chip *chip)
41113 +{
41114 +       unsigned long flags;
41115 +       spin_lock_irqsave(&dataex.resp_list_lock, flags);
41116 +
41117 +       dataex.req_cancelled = dataex.current_request;
41118 +
41119 +       spin_unlock_irqrestore(&dataex.resp_list_lock, flags);
41120 +}
41121 +
41122 +static u8 tpm_xen_status(struct tpm_chip *chip)
41123 +{
41124 +       unsigned long flags;
41125 +       u8 rc = 0;
41126 +       spin_lock_irqsave(&dataex.resp_list_lock, flags);
41127 +       /*
41128 +        * Data are available if:
41129 +        *  - there's a current response
41130 +        *  - the last packet was queued only (this is fake, but necessary to
41131 +        *      get the generic TPM layer to call the receive function.)
41132 +        */
41133 +       if (NULL != dataex.current_response ||
41134 +           0 != (dataex.flags & DATAEX_FLAG_QUEUED_ONLY)) {
41135 +               rc = STATUS_DATA_AVAIL;
41136 +       }
41137 +       spin_unlock_irqrestore(&dataex.resp_list_lock, flags);
41138 +       return rc;
41139 +}
41140 +
41141 +static struct file_operations tpm_xen_ops = {
41142 +       .owner = THIS_MODULE,
41143 +       .llseek = no_llseek,
41144 +       .open = tpm_open,
41145 +       .read = tpm_read,
41146 +       .write = tpm_write,
41147 +       .release = tpm_release,
41148 +};
41149 +
41150 +static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
41151 +static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
41152 +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
41153 +static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel);
41154 +
41155 +static struct attribute *xen_attrs[] = {
41156 +       &dev_attr_pubek.attr,
41157 +       &dev_attr_pcrs.attr,
41158 +       &dev_attr_caps.attr,
41159 +       &dev_attr_cancel.attr,
41160 +       NULL,
41161 +};
41162 +
41163 +static struct attribute_group xen_attr_grp = { .attrs = xen_attrs };
41164 +
41165 +static struct tpm_vendor_specific tpm_xen = {
41166 +       .recv = tpm_xen_recv,
41167 +       .send = tpm_xen_send,
41168 +       .cancel = tpm_xen_cancel,
41169 +       .status = tpm_xen_status,
41170 +       .req_complete_mask = STATUS_BUSY | STATUS_DATA_AVAIL,
41171 +       .req_complete_val  = STATUS_DATA_AVAIL,
41172 +       .req_canceled = STATUS_READY,
41173 +       .base = 0,
41174 +       .attr_group = &xen_attr_grp,
41175 +       .miscdev.fops = &tpm_xen_ops,
41176 +       .buffersize = 64 * 1024,
41177 +};
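
The req_complete_mask/req_complete_val pair above is consumed by the generic
layer: tpm_transmit() keeps polling the status callback until the masked
status matches. A simplified sketch of that loop (condensed from
drivers/char/tpm/tpm.c; mutex and timeout handling elided):

	u8 status = chip->vendor->status(chip);

	while ((status & chip->vendor->req_complete_mask) !=
	       chip->vendor->req_complete_val) {
		msleep(TPM_TIMEOUT);			/* sleep, re-poll */
		status = chip->vendor->status(chip);
	}
	chip->vendor->recv(chip, buf, bufsiz);		/* fetch response */

For tpm_xen a completed (or queued-only) exchange reports STATUS_DATA_AVAIL,
which equals req_complete_val once masked with STATUS_BUSY | STATUS_DATA_AVAIL,
so the generic layer proceeds straight to tpm_xen_recv().
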
41178 +
41179 +static struct platform_device *pdev;
41180 +
41181 +static struct tpmfe_device tpmfe = {
41182 +       .receive = tpm_recv,
41183 +       .status  = tpm_fe_status,
41184 +};
41185 +
41186 +
41187 +static int __init init_xen(void)
41188 +{
41189 +       int rc;
41190 +
41191 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
41192 +               return -EPERM;
41193 +       }
41194 +       /*
41195 +        * Register the device with the low-level front-end
41196 +        * driver.
41197 +        */
41198 +       if ((rc = tpm_fe_register_receiver(&tpmfe)) < 0) {
41199 +               goto err_exit;
41200 +       }
41201 +
41202 +       /*
41203 +        * Register our device with the system.
41204 +        */
41205 +       pdev = platform_device_register_simple("tpm_vtpm", -1, NULL, 0);
41206 +       if (IS_ERR(pdev)) {
41207 +               rc = PTR_ERR(pdev);
41208 +               goto err_unreg_fe;
41209 +       }
41210 +
41211 +       tpm_xen.buffersize = tpmfe.max_tx_size;
41212 +
41213 +       if ((rc = tpm_register_hardware(&pdev->dev, &tpm_xen)) < 0) {
41214 +               goto err_unreg_pdev;
41215 +       }
41216 +
41217 +       dataex.current_request = NULL;
41218 +       spin_lock_init(&dataex.req_list_lock);
41219 +       init_waitqueue_head(&dataex.req_wait_queue);
41220 +       INIT_LIST_HEAD(&dataex.queued_requests);
41221 +
41222 +       dataex.current_response = NULL;
41223 +       spin_lock_init(&dataex.resp_list_lock);
41224 +       init_waitqueue_head(&dataex.resp_wait_queue);
41225 +
41226 +       disconnect_time = jiffies;
41227 +
41228 +       return 0;
41229 +
41230 +
41231 +err_unreg_pdev:
41232 +       platform_device_unregister(pdev);
41233 +err_unreg_fe:
41234 +       tpm_fe_unregister_receiver();
41235 +
41236 +err_exit:
41237 +       return rc;
41238 +}
41239 +
41240 +static void __exit cleanup_xen(void)
41241 +{
41242 +       struct tpm_chip *chip = dev_get_drvdata(&pdev->dev);
41243 +       if (chip) {
41244 +               tpm_remove_hardware(chip->dev);
41245 +               platform_device_unregister(pdev);
41246 +               tpm_fe_unregister_receiver();
41247 +       }
41248 +}
41249 +
41250 +module_init(init_xen);
41251 +module_exit(cleanup_xen);
41252 +
41253 +MODULE_AUTHOR("Stefan Berger (stefanb@us.ibm.com)");
41254 +MODULE_DESCRIPTION("TPM Driver for XEN (shared memory)");
41255 +MODULE_VERSION("1.0");
41256 +MODULE_LICENSE("GPL");
41257 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/char/tty_io.c linux-2.6.16/drivers/char/tty_io.c
41258 --- linux-2.6.16.orig/drivers/char/tty_io.c     2006-06-26 09:49:46.000000000 +0200
41259 +++ linux-2.6.16/drivers/char/tty_io.c  2006-06-26 09:51:32.000000000 +0200
41260 @@ -133,6 +133,8 @@
41261     vt.c for deeply disgusting hack reasons */
41262  DECLARE_MUTEX(tty_sem);
41263  
41264 +int console_use_vt = 1;
41265 +
41266  #ifdef CONFIG_UNIX98_PTYS
41267  extern struct tty_driver *ptm_driver;  /* Unix98 pty masters; for /dev/ptmx */
41268  extern int pty_limit;          /* Config limit on Unix98 ptys */
41269 @@ -2055,7 +2057,7 @@
41270                 goto got_driver;
41271         }
41272  #ifdef CONFIG_VT
41273 -       if (device == MKDEV(TTY_MAJOR,0)) {
41274 +       if (console_use_vt && (device == MKDEV(TTY_MAJOR,0))) {
41275                 extern struct tty_driver *console_driver;
41276                 driver = console_driver;
41277                 index = fg_console;
41278 @@ -3247,6 +3249,8 @@
41279  #endif
41280  
41281  #ifdef CONFIG_VT
41282 +       if (!console_use_vt)
41283 +               goto out_vt;
41284         cdev_init(&vc0_cdev, &console_fops);
41285         if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) ||
41286             register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0)
41287 @@ -3255,6 +3259,7 @@
41288         class_device_create(tty_class, NULL, MKDEV(TTY_MAJOR, 0), NULL, "tty0");
41289  
41290         vty_init();
41291 + out_vt:
41292  #endif
41293         return 0;
41294  }
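
console_use_vt gives an alternative console a way to keep the VT layer from
claiming tty0 in both hunks above (the open fast path and the cdev
registration). A hedged sketch of a consumer; the real one is the Xen console
driver elsewhere in this patch, and the function name and initcall level here
are assumptions:

	extern int console_use_vt;

	/* Sketch: an early-initialised console driver opts the system
	 * out of the VT console; tty0 then resolves through the normal
	 * driver lookup instead of the VT branch patched above. */
	static int __init xencons_claim_console(void)
	{
		console_use_vt = 0;
		return 0;
	}
	console_initcall(xencons_claim_console);
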
41295 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/firmware/Kconfig linux-2.6.16/drivers/firmware/Kconfig
41296 --- linux-2.6.16.orig/drivers/firmware/Kconfig  2006-03-20 06:53:29.000000000 +0100
41297 +++ linux-2.6.16/drivers/firmware/Kconfig       2006-06-26 09:51:32.000000000 +0200
41298 @@ -8,7 +8,7 @@
41299  config EDD
41300         tristate "BIOS Enhanced Disk Drive calls determine boot disk (EXPERIMENTAL)"
41301         depends on EXPERIMENTAL
41302 -       depends on !IA64
41303 +       depends on !IA64 && !XEN
41304         help
41305           Say Y or M here if you want to enable BIOS Enhanced Disk Drive
41306           Services real mode BIOS calls to determine which disk
41307 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/pci/Kconfig linux-2.6.16/drivers/pci/Kconfig
41308 --- linux-2.6.16.orig/drivers/pci/Kconfig       2006-03-20 06:53:29.000000000 +0100
41309 +++ linux-2.6.16/drivers/pci/Kconfig    2006-06-26 09:51:32.000000000 +0200
41310 @@ -5,6 +5,7 @@
41311         bool "Message Signaled Interrupts (MSI and MSI-X)"
41312         depends on PCI
41313         depends on (X86_LOCAL_APIC && X86_IO_APIC) || IA64
41314 +       depends on !XEN
41315         help
41316            This allows device drivers to enable MSI (Message Signaled
41317            Interrupts).  Message Signaled Interrupts enable a device to
41318 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/serial/Kconfig linux-2.6.16/drivers/serial/Kconfig
41319 --- linux-2.6.16.orig/drivers/serial/Kconfig    2006-03-20 06:53:29.000000000 +0100
41320 +++ linux-2.6.16/drivers/serial/Kconfig 2006-06-26 09:51:32.000000000 +0200
41321 @@ -11,6 +11,7 @@
41322  config SERIAL_8250
41323         tristate "8250/16550 and compatible serial support"
41324         depends on (BROKEN || !SPARC)
41325 +       depends on !XEN_DISABLE_SERIAL
41326         select SERIAL_CORE
41327         ---help---
41328           This selects whether you want to include the driver for the standard
41329 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/video/Kconfig linux-2.6.16/drivers/video/Kconfig
41330 --- linux-2.6.16.orig/drivers/video/Kconfig     2006-06-26 09:49:45.000000000 +0200
41331 +++ linux-2.6.16/drivers/video/Kconfig  2006-06-26 09:51:32.000000000 +0200
41332 @@ -537,7 +537,7 @@
41333  
41334  config VIDEO_SELECT
41335         bool
41336 -       depends on (FB = y) && X86
41337 +       depends on (FB = y) && X86 && !XEN
41338         default y
41339  
41340  config FB_SGIVW
41341 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/Kconfig linux-2.6.16/drivers/xen/Kconfig
41342 --- linux-2.6.16.orig/drivers/xen/Kconfig       1970-01-01 01:00:00.000000000 +0100
41343 +++ linux-2.6.16/drivers/xen/Kconfig    2006-06-26 09:51:32.000000000 +0200
41344 @@ -0,0 +1,212 @@
41345 +#
41346 +# This Kconfig describes the Xen options
41347 +#
41348 +
41349 +mainmenu "Xen Configuration"
41350 +
41351 +config XEN
41352 +       bool
41353 +       default y if X86_XEN || X86_64_XEN
41354 +       help
41355 +         This is the Linux Xen port.
41356 +
41357 +if XEN
41358 +config XEN_INTERFACE_VERSION
41359 +       hex
41360 +       default 0x00030101
41361 +
41362 +menu "XEN"
41363 +
41364 +config XEN_PRIVILEGED_GUEST
41365 +       bool "Privileged Guest (domain 0)"
41366 +       depends on XEN
41367 +       default n
41368 +       help
41369 +         Support for privileged operation (domain 0)
41370 +
41371 +config XEN_UNPRIVILEGED_GUEST
41372 +       bool
41373 +       default !XEN_PRIVILEGED_GUEST
41374 +
41375 +config XEN_PCIDEV_BACKEND
41376 +       tristate "PCI device backend driver"
41377 +       depends on PCI
41378 +       default XEN_PRIVILEGED_GUEST
41379 +       help
41380 +         The PCI device backend driver allows the kernel to export arbitrary
41381 +         PCI devices to other guests. If you select this to be a module, you
41382 +         will need to make sure no other driver has bound to the device(s)
41383 +         you want to make visible to other guests.
41384 +
41385 +choice
41386 +       prompt "PCI Backend Mode"
41387 +       depends on XEN_PCIDEV_BACKEND
41388 +       default XEN_PCIDEV_BACKEND_VPCI
41389 +
41390 +config XEN_PCIDEV_BACKEND_VPCI
41391 +       bool "Virtual PCI"
41392 +       ---help---
41393 +         This PCI Backend hides the true PCI topology and makes the frontend
41394 +         think there is a single PCI bus with only the exported devices on it.
41395 +         For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
41396 +         second device at 02:1a.0 will be re-assigned to 00:01.0.
41397 +
41398 +config XEN_PCIDEV_BACKEND_PASS
41399 +       bool "Passthrough"
41400 +       ---help---
41401 +         This PCI Backend provides a real view of the PCI topology to the
41402 +         frontend (for example, a device at 06:01.b will still appear at
41403 +         06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
41404 +         PCI devices to its driver domains. This may be required for drivers
41405 +         which depend on finding their hardware in certain bus/slot
41406 +         locations.
41407 +
41408 +endchoice
41409 +
41410 +config XEN_PCIDEV_BE_DEBUG
41411 +       bool "PCI Backend Debugging"
41412 +       depends on XEN_PCIDEV_BACKEND
41413 +       default n
41414 +
41415 +config XEN_BLKDEV_BACKEND
41416 +       tristate "Block-device backend driver"
41417 +       default y
41418 +       help
41419 +         The block-device backend driver allows the kernel to export its
41420 +         block devices to other guests via a high-performance shared-memory
41421 +         interface.
41422 +
41423 +config XEN_BLKDEV_TAP_BE
41424 +        tristate "Block Tap support for backend driver (DANGEROUS)"
41425 +        depends on XEN_BLKDEV_BACKEND
41426 +        default n
41427 +        help
41428 +          If you intend to use the block tap driver, the backend domain will
41429 +          not know the domain id of the real frontend, and so will not be able
41430 +          to map its data pages.  This modifies the backend to attempt to map
41431 +          from both the tap domain and the real frontend.  This presents a
41432 +          security risk, and so should ONLY be used for development
41433 +          with the blktap.  This option will be removed as the block drivers are
41434 +          modified to use grant tables.
41435 +
41436 +config XEN_NETDEV_BACKEND
41437 +       tristate "Network-device backend driver"
41438 +       default y
41439 +       help
41440 +         The network-device backend driver allows the kernel to export its
41441 +         network devices to other guests via a high-performance shared-memory
41442 +         interface.
41443 +
41444 +config XEN_NETDEV_PIPELINED_TRANSMITTER
41445 +       bool "Pipelined transmitter (DANGEROUS)"
41446 +       depends on XEN_NETDEV_BACKEND
41447 +       default n
41448 +       help
41449 +         If the net backend is a dumb domain, such as a transparent Ethernet
41450 +         bridge with no local IP interface, it is safe to say Y here to get
41451 +         slightly lower network overhead.
41452 +         If the backend has a local IP interface; or may be doing smart things
41453 +         like reassembling packets to perform firewall filtering; or if you
41454 +         are unsure; or if you experience network hangs when this option is
41455 +         enabled; then you must say N here.
41456 +
41457 +config XEN_NETDEV_LOOPBACK
41458 +       tristate "Network-device loopback driver"
41459 +       depends on XEN_NETDEV_BACKEND
41460 +       default y
41461 +       help
41462 +         A two-interface loopback device to emulate a local netfront-netback
41463 +         connection.
41464 +
41465 +config XEN_TPMDEV_BACKEND
41466 +       tristate "TPM-device backend driver"
41467 +       default n
41468 +       help
41469 +         The TPM-device backend driver services virtual TPM requests submitted by guest domains.
41470 +
41471 +config XEN_TPMDEV_CLOSE_IF_VTPM_FAILS
41472 +       bool "TPM backend closes upon vTPM failure"
41473 +       depends on XEN_TPMDEV_BACKEND
41474 +       default n
41475 +       help
41476 +         The TPM backend closes the channel if the vTPM in userspace indicates
41477 +         a failure. The corresponding domain's channel will be closed.
41478 +         Say Y if you want this feature.
41479 +
41480 +config XEN_BLKDEV_FRONTEND
41481 +       tristate "Block-device frontend driver"
41482 +       depends on XEN
41483 +       default y
41484 +       help
41485 +         The block-device frontend driver allows the kernel to access block
41486 +         devices mounted within another guest OS. Unless you are building a
41487 +         dedicated device-driver domain or your master control domain
41488 +         (domain 0), you almost certainly want to say Y here.
41489 +
41490 +config XEN_NETDEV_FRONTEND
41491 +       tristate "Network-device frontend driver"
41492 +       depends on XEN
41493 +       default y
41494 +       help
41495 +         The network-device frontend driver allows the kernel to access
41496 +         network interfaces within another guest OS. Unless you are building a
41497 +         dedicated device-driver domain or your master control domain
41498 +         (domain 0), you almost certainly want to say Y here.
41499 +
41500 +config XEN_BLKDEV_TAP
41501 +       tristate "Block device tap driver"
41502 +       default n
41503 +       help
41504 +         This driver allows a VM to interact on block device channels
41505 +         to other VMs.  Block messages may be passed through or redirected
41506 +         to a character device, allowing device prototyping in application
41507 +         space.  Odds are that you want to say N here.
41508 +
41509 +config XEN_TPMDEV_FRONTEND
41510 +       tristate "TPM-device frontend driver"
41511 +       default n
41512 +       select TCG_TPM
41513 +       select TCG_XEN
41514 +       help
41515 +         The TPM-device frontend driver gives an unprivileged domain access to its virtual TPM.
41516 +
41517 +config XEN_SCRUB_PAGES
41518 +       bool "Scrub memory before freeing it to Xen"
41519 +       default y
41520 +       help
41521 +         Erase memory contents before freeing it back to Xen's global
41522 +         pool. This ensures that any secrets contained within that
41523 +         memory (e.g., private keys) cannot be found by other guests that
41524 +         may be running on the machine. Most people will want to say Y here.
41525 +         If security is not a concern then you may increase performance by
41526 +         saying N.
41527 +
41528 +config XEN_DISABLE_SERIAL
41529 +       bool "Disable serial port drivers"
41530 +       default y
41531 +       help
41532 +         Disable serial port drivers, allowing the Xen console driver
41533 +         to provide a serial console at ttyS0.
41534 +
41535 +config XEN_SYSFS
41536 +       tristate "Export Xen attributes in sysfs"
41537 +       depends on SYSFS
41538 +       default y
41539 +       help
41540 +         Xen hypervisor attributes will show up under /sys/hypervisor/.
41541 +
41542 +endmenu
41543 +
41544 +config HAVE_ARCH_ALLOC_SKB
41545 +       bool
41546 +       default y
41547 +
41548 +config HAVE_ARCH_DEV_ALLOC_SKB
41549 +       bool
41550 +       default y
41551 +
41552 +config NO_IDLE_HZ
41553 +       bool
41554 +       default y
41555 +
41556 +endif
41557 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/Makefile linux-2.6.16/drivers/xen/Makefile
41558 --- linux-2.6.16.orig/drivers/xen/Makefile      1970-01-01 01:00:00.000000000 +0100
41559 +++ linux-2.6.16/drivers/xen/Makefile   2006-06-26 09:51:32.000000000 +0200
41560 @@ -0,0 +1,22 @@
41561 +
41562 +obj-y  += net_driver_util.o
41563 +obj-y  += util.o
41564 +
41565 +obj-y  += core/
41566 +obj-y  += char/
41567 +obj-y  += console/
41568 +obj-y  += evtchn/
41569 +obj-y  += balloon/
41570 +obj-y  += privcmd/
41571 +obj-y  += xenbus/
41572 +
41573 +obj-$(CONFIG_XEN_BLKDEV_BACKEND)       += blkback/
41574 +obj-$(CONFIG_XEN_NETDEV_BACKEND)       += netback/
41575 +obj-$(CONFIG_XEN_TPMDEV_BACKEND)       += tpmback/
41576 +obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += blkfront/
41577 +obj-$(CONFIG_XEN_NETDEV_FRONTEND)      += netfront/
41578 +obj-$(CONFIG_XEN_BLKDEV_TAP)           += blktap/
41579 +obj-$(CONFIG_XEN_TPMDEV_FRONTEND)      += tpmfront/
41580 +obj-$(CONFIG_XEN_PCIDEV_BACKEND)       += pciback/
41581 +obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront/
41582 +
41583 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/balloon/Makefile linux-2.6.16/drivers/xen/balloon/Makefile
41584 --- linux-2.6.16.orig/drivers/xen/balloon/Makefile      1970-01-01 01:00:00.000000000 +0100
41585 +++ linux-2.6.16/drivers/xen/balloon/Makefile   2006-06-26 09:51:32.000000000 +0200
41586 @@ -0,0 +1,2 @@
41587 +
41588 +obj-y += balloon.o
41589 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/balloon/balloon.c linux-2.6.16/drivers/xen/balloon/balloon.c
41590 --- linux-2.6.16.orig/drivers/xen/balloon/balloon.c     1970-01-01 01:00:00.000000000 +0100
41591 +++ linux-2.6.16/drivers/xen/balloon/balloon.c  2006-06-26 09:51:32.000000000 +0200
41592 @@ -0,0 +1,592 @@
41593 +/******************************************************************************
41594 + * balloon.c
41595 + *
41596 + * Xen balloon driver - enables returning/claiming memory to/from Xen.
41597 + *
41598 + * Copyright (c) 2003, B Dragovic
41599 + * Copyright (c) 2003-2004, M Williamson, K Fraser
41600 + * Copyright (c) 2005 Dan M. Smith, IBM Corporation
41601 + * 
41602 + * This program is free software; you can redistribute it and/or
41603 + * modify it under the terms of the GNU General Public License version 2
41604 + * as published by the Free Software Foundation; or, when distributed
41605 + * separately from the Linux kernel or incorporated into other
41606 + * software packages, subject to the following license:
41607 + * 
41608 + * Permission is hereby granted, free of charge, to any person obtaining a copy
41609 + * of this source file (the "Software"), to deal in the Software without
41610 + * restriction, including without limitation the rights to use, copy, modify,
41611 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
41612 + * and to permit persons to whom the Software is furnished to do so, subject to
41613 + * the following conditions:
41614 + * 
41615 + * The above copyright notice and this permission notice shall be included in
41616 + * all copies or substantial portions of the Software.
41617 + * 
41618 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41619 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
41620 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41621 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41622 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
41623 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
41624 + * IN THE SOFTWARE.
41625 + */
41626 +
41627 +#include <linux/config.h>
41628 +#include <linux/kernel.h>
41629 +#include <linux/module.h>
41630 +#include <linux/sched.h>
41631 +#include <linux/errno.h>
41632 +#include <linux/mm.h>
41633 +#include <linux/mman.h>
41634 +#include <linux/smp_lock.h>
41635 +#include <linux/pagemap.h>
41636 +#include <linux/bootmem.h>
41637 +#include <linux/highmem.h>
41638 +#include <linux/vmalloc.h>
41639 +#include <xen/xen_proc.h>
41640 +#include <asm/hypervisor.h>
41641 +#include <xen/balloon.h>
41642 +#include <xen/interface/memory.h>
41643 +#include <asm/pgalloc.h>
41644 +#include <asm/pgtable.h>
41645 +#include <asm/uaccess.h>
41646 +#include <asm/tlb.h>
41647 +#include <linux/list.h>
41648 +
41649 +#include <xen/xenbus.h>
41650 +
41651 +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
41652 +
41653 +static struct proc_dir_entry *balloon_pde;
41654 +
41655 +static DECLARE_MUTEX(balloon_mutex);
41656 +
41657 +/*
41658 + * Protects atomic reservation decrease/increase against concurrent increases.
41659 + * Also protects non-atomic updates of current_pages and driver_pages, and
41660 + * balloon lists.
41661 + */
41662 +spinlock_t balloon_lock = SPIN_LOCK_UNLOCKED;
41663 +
41664 +/* We aim for 'current allocation' == 'target allocation'. */
41665 +static unsigned long current_pages;
41666 +static unsigned long target_pages;
41667 +
41668 +/* VM /proc information for memory */
41669 +extern unsigned long totalram_pages;
41670 +
41671 +/* We may hit the hard limit in Xen. If we do then we remember it. */
41672 +static unsigned long hard_limit;
41673 +
41674 +/*
41675 + * Drivers may alter the memory reservation independently, but they must
41676 + * inform the balloon driver so that we can avoid hitting the hard limit.
41677 + */
41678 +static unsigned long driver_pages;
41679 +
41680 +/* List of ballooned pages, threaded through the mem_map array. */
41681 +static LIST_HEAD(ballooned_pages);
41682 +static unsigned long balloon_low, balloon_high;
41683 +
41684 +/* Main work function, always executed in process context. */
41685 +static void balloon_process(void *unused);
41686 +static DECLARE_WORK(balloon_worker, balloon_process, NULL);
41687 +static struct timer_list balloon_timer;
41688 +
41689 +#define PAGE_TO_LIST(p) (&(p)->ballooned)
41690 +#define LIST_TO_PAGE(l) list_entry((l), struct page, ballooned)
41691 +#define UNLIST_PAGE(p)                         \
41692 +       do {                                    \
41693 +               list_del(PAGE_TO_LIST(p));      \
41694 +               PAGE_TO_LIST(p)->next = NULL;   \
41695 +               PAGE_TO_LIST(p)->prev = NULL;   \
41696 +       } while(0)
41697 +
41698 +#define IPRINTK(fmt, args...) \
41699 +       printk(KERN_INFO "xen_mem: " fmt, ##args)
41700 +#define WPRINTK(fmt, args...) \
41701 +       printk(KERN_WARNING "xen_mem: " fmt, ##args)
41702 +
41703 +/* balloon_append: add the given page to the balloon. */
41704 +static void balloon_append(struct page *page)
41705 +{
41706 +       /* Lowmem is re-populated first, so highmem pages go at list tail. */
41707 +       if (PageHighMem(page)) {
41708 +               list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
41709 +               balloon_high++;
41710 +       } else {
41711 +               list_add(PAGE_TO_LIST(page), &ballooned_pages);
41712 +               balloon_low++;
41713 +       }
41714 +}
41715 +
41716 +/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
41717 +static struct page *balloon_retrieve(void)
41718 +{
41719 +       struct page *page;
41720 +
41721 +       if (list_empty(&ballooned_pages))
41722 +               return NULL;
41723 +
41724 +       page = LIST_TO_PAGE(ballooned_pages.next);
41725 +       UNLIST_PAGE(page);
41726 +
41727 +       if (PageHighMem(page))
41728 +               balloon_high--;
41729 +       else
41730 +               balloon_low--;
41731 +
41732 +       return page;
41733 +}
41734 +
41735 +static struct page *balloon_first_page(void)
41736 +{
41737 +       if (list_empty(&ballooned_pages))
41738 +               return NULL;
41739 +       return LIST_TO_PAGE(ballooned_pages.next);
41740 +}
41741 +
41742 +static struct page *balloon_next_page(struct page *page)
41743 +{
41744 +       struct list_head *next = PAGE_TO_LIST(page)->next;
41745 +       if (next == &ballooned_pages)
41746 +               return NULL;
41747 +       return LIST_TO_PAGE(next);
41748 +}
41749 +
41750 +static void balloon_alarm(unsigned long unused)
41751 +{
41752 +       schedule_work(&balloon_worker);
41753 +}
41754 +
41755 +static unsigned long current_target(void)
41756 +{
41757 +       unsigned long target = min(target_pages, hard_limit);
41758 +       if (target > (current_pages + balloon_low + balloon_high))
41759 +               target = current_pages + balloon_low + balloon_high;
41760 +       return target;
41761 +}
41762 +
41763 +static int increase_reservation(unsigned long nr_pages)
41764 +{
41765 +       unsigned long *frame_list, pfn, i, flags;
41766 +       struct page   *page;
41767 +       long           rc;
41768 +       struct xen_memory_reservation reservation = {
41769 +               .address_bits = 0,
41770 +               .extent_order = 0,
41771 +               .domid        = DOMID_SELF
41772 +       };
41773 +
41774 +       if (nr_pages > (PAGE_SIZE / sizeof(unsigned long)))
41775 +               nr_pages = PAGE_SIZE / sizeof(unsigned long);
41776 +
41777 +       frame_list = (unsigned long *)__get_free_page(GFP_KERNEL);
41778 +       if (frame_list == NULL)
41779 +               return -ENOMEM;
41780 +
41781 +       balloon_lock(flags);
41782 +
41783 +       page = balloon_first_page();
41784 +       for (i = 0; i < nr_pages; i++) {
41785 +               BUG_ON(page == NULL);
41786 +               frame_list[i] = page_to_pfn(page);
41787 +               page = balloon_next_page(page);
41788 +       }
41789 +
41790 +       reservation.extent_start = frame_list;
41791 +       reservation.nr_extents   = nr_pages;
41792 +       rc = HYPERVISOR_memory_op(
41793 +               XENMEM_populate_physmap, &reservation);
41794 +       if (rc < nr_pages) {
41795 +               int ret;
41796 +               /* We hit the Xen hard limit: reprobe. */
41797 +               reservation.extent_start = frame_list;
41798 +               reservation.nr_extents   = rc;
41799 +               ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
41800 +                               &reservation);
41801 +               BUG_ON(ret != rc);
41802 +               hard_limit = current_pages + rc - driver_pages;
41803 +               goto out;
41804 +       }
41805 +
41806 +       for (i = 0; i < nr_pages; i++) {
41807 +               page = balloon_retrieve();
41808 +               BUG_ON(page == NULL);
41809 +
41810 +               pfn = page_to_pfn(page);
41811 +               BUG_ON(phys_to_machine_mapping_valid(pfn));
41812 +
41813 +               /* Update P->M and M->P tables. */
41814 +               set_phys_to_machine(pfn, frame_list[i]);
41815 +               xen_machphys_update(frame_list[i], pfn);
41816 +
41817 +               /* Link back into the page tables if not highmem. */
41818 +               if (pfn < max_low_pfn) {
41819 +                       int ret;
41820 +                       ret = HYPERVISOR_update_va_mapping(
41821 +                               (unsigned long)__va(pfn << PAGE_SHIFT),
41822 +                               pfn_pte_ma(frame_list[i], PAGE_KERNEL),
41823 +                               0);
41824 +                       BUG_ON(ret);
41825 +               }
41826 +
41827 +               /* Relinquish the page back to the allocator. */
41828 +               ClearPageReserved(page);
41829 +               set_page_count(page, 1);
41830 +               __free_page(page);
41831 +       }
41832 +
41833 +       current_pages += nr_pages;
41834 +       totalram_pages = current_pages;
41835 +
41836 + out:
41837 +       balloon_unlock(flags);
41838 +
41839 +       free_page((unsigned long)frame_list);
41840 +
41841 +       return 0;
41842 +}
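A note on batch size (worked numbers, assuming 4 KiB pages): frame_list occupies a single page, so one pass through increase_reservation() or decrease_reservation() handles at most PAGE_SIZE / sizeof(unsigned long) frames, i.e. 4096 / 4 = 1024 frames (4 MiB) on i386, or 4096 / 8 = 512 frames (2 MiB) on x86-64. balloon_process() below keeps calling them until the remaining credit reaches zero.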
41843 +
41844 +static int decrease_reservation(unsigned long nr_pages)
41845 +{
41846 +       unsigned long *frame_list, pfn, i, flags;
41847 +       struct page   *page;
41848 +       void          *v;
41849 +       int            need_sleep = 0;
41850 +       int ret;
41851 +       struct xen_memory_reservation reservation = {
41852 +               .address_bits = 0,
41853 +               .extent_order = 0,
41854 +               .domid        = DOMID_SELF
41855 +       };
41856 +
41857 +       if (nr_pages > (PAGE_SIZE / sizeof(unsigned long)))
41858 +               nr_pages = PAGE_SIZE / sizeof(unsigned long);
41859 +
41860 +       frame_list = (unsigned long *)__get_free_page(GFP_KERNEL);
41861 +       if (frame_list == NULL)
41862 +               return -ENOMEM;
41863 +
41864 +       for (i = 0; i < nr_pages; i++) {
41865 +               if ((page = alloc_page(GFP_HIGHUSER)) == NULL) {
41866 +                       nr_pages = i;
41867 +                       need_sleep = 1;
41868 +                       break;
41869 +               }
41870 +
41871 +               pfn = page_to_pfn(page);
41872 +               frame_list[i] = pfn_to_mfn(pfn);
41873 +
41874 +               if (!PageHighMem(page)) {
41875 +                       v = phys_to_virt(pfn << PAGE_SHIFT);
41876 +                       scrub_pages(v, 1);
41877 +                       ret = HYPERVISOR_update_va_mapping(
41878 +                               (unsigned long)v, __pte_ma(0), 0);
41879 +                       BUG_ON(ret);
41880 +               }
41881 +#ifdef CONFIG_XEN_SCRUB_PAGES
41882 +               else {
41883 +                       v = kmap(page);
41884 +                       scrub_pages(v, 1);
41885 +                       kunmap(page);
41886 +               }
41887 +#endif
41888 +       }
41889 +
41890 +       /* Ensure that ballooned highmem pages don't have kmaps. */
41891 +       kmap_flush_unused();
41892 +       flush_tlb_all();
41893 +
41894 +       balloon_lock(flags);
41895 +
41896 +       /* No more mappings: invalidate P2M and add to balloon. */
41897 +       for (i = 0; i < nr_pages; i++) {
41898 +               pfn = mfn_to_pfn(frame_list[i]);
41899 +               set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
41900 +               balloon_append(pfn_to_page(pfn));
41901 +       }
41902 +
41903 +       reservation.extent_start = frame_list;
41904 +       reservation.nr_extents   = nr_pages;
41905 +       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
41906 +       BUG_ON(ret != nr_pages);
41907 +
41908 +       current_pages -= nr_pages;
41909 +       totalram_pages = current_pages;
41910 +
41911 +       balloon_unlock(flags);
41912 +
41913 +       free_page((unsigned long)frame_list);
41914 +
41915 +       return need_sleep;
41916 +}
41917 +
41918 +/*
41919 + * We avoid multiple worker processes conflicting via the balloon mutex.
41920 + * We may of course race updates of the target counts (which are protected
41921 + * by the balloon lock), or with changes to the Xen hard limit, but we will
41922 + * recover from these in time.
41923 + */
41924 +static void balloon_process(void *unused)
41925 +{
41926 +       int need_sleep = 0;
41927 +       long credit;
41928 +
41929 +       down(&balloon_mutex);
41930 +
41931 +       do {
41932 +               credit = current_target() - current_pages;
41933 +               if (credit > 0)
41934 +                       need_sleep = (increase_reservation(credit) != 0);
41935 +               if (credit < 0)
41936 +                       need_sleep = (decrease_reservation(-credit) != 0);
41937 +
41938 +#ifndef CONFIG_PREEMPT
41939 +               if (need_resched())
41940 +                       schedule();
41941 +#endif
41942 +       } while ((credit != 0) && !need_sleep);
41943 +
41944 +       /* Schedule more work if there is some still to be done. */
41945 +       if (current_target() != current_pages)
41946 +               mod_timer(&balloon_timer, jiffies + HZ);
41947 +
41948 +       up(&balloon_mutex);
41949 +}
41950 +
41951 +/* Resets the Xen limit, sets new target, and kicks off processing. */
41952 +static void set_new_target(unsigned long target)
41953 +{
41954 +       /* No need for a lock: these are not read-modify-write updates. */
41955 +       hard_limit   = ~0UL;
41956 +       target_pages = target;
41957 +       schedule_work(&balloon_worker);
41958 +}
41959 +
41960 +static struct xenbus_watch target_watch =
41961 +{
41962 +       .node = "memory/target"
41963 +};
41964 +
41965 +/* React to a change in the target key */
41966 +static void watch_target(struct xenbus_watch *watch,
41967 +                        const char **vec, unsigned int len)
41968 +{
41969 +       unsigned long long new_target;
41970 +       int err;
41971 +
41972 +       err = xenbus_scanf(XBT_NULL, "memory", "target", "%llu", &new_target);
41973 +       if (err != 1) {
41974 +               /* This is ok (for domain0 at least) - so just return */
41975 +               return;
41976 +       }
41977 +
41978 +       /* The given memory/target value is in KiB, so it needs converting to
41979 +          pages.  PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
41980 +       */
41981 +       set_new_target(new_target >> (PAGE_SHIFT - 10));
41982 +
41983 +}
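Worked example of the conversion above (assuming PAGE_SHIFT == 12, i.e. 4 KiB pages): a memory/target value of 524288 KiB is shifted right by PAGE_SHIFT - 10 = 2, giving target_pages = 131072; in the other direction, PAGES2KB(131072) = 131072 << 2 = 524288 KiB again.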
41984 +
41985 +static int balloon_init_watcher(struct notifier_block *notifier,
41986 +                                unsigned long event,
41987 +                                void *data)
41988 +{
41989 +       int err;
41990 +
41991 +       err = register_xenbus_watch(&target_watch);
41992 +       if (err)
41993 +               printk(KERN_ERR "Failed to set balloon watcher\n");
41994 +
41995 +       return NOTIFY_DONE;
41996 +
41997 +}
41998 +
41999 +static int balloon_write(struct file *file, const char __user *buffer,
42000 +                         unsigned long count, void *data)
42001 +{
42002 +       char memstring[64], *endchar;
42003 +       unsigned long long target_bytes;
42004 +
42005 +       if (!capable(CAP_SYS_ADMIN))
42006 +               return -EPERM;
42007 +
42008 +       if (count <= 1)
42009 +               return -EBADMSG; /* runt */
42010 +       if (count >= sizeof(memstring))
42011 +               return -EFBIG;   /* too long */
42012 +
42013 +       if (copy_from_user(memstring, buffer, count))
42014 +               return -EFAULT;
42015 +       memstring[count] = '\0';
42016 +
42017 +       target_bytes = memparse(memstring, &endchar);
42018 +       set_new_target(target_bytes >> PAGE_SHIFT);
42019 +
42020 +       return count;
42021 +}
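Usage sketch (hypothetical session; memparse() understands the usual K/M/G suffixes):

        echo 512M > /proc/xen/balloon

"512M" parses to 536870912 bytes, so with 4 KiB pages target_pages becomes 131072 and the worker inflates or deflates the balloon towards that figure.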
42022 +
42023 +static int balloon_read(char *page, char **start, off_t off,
42024 +                        int count, int *eof, void *data)
42025 +{
42026 +       int len;
42027 +
42028 +       len = sprintf(
42029 +               page,
42030 +               "Current allocation: %8lu kB\n"
42031 +               "Requested target:   %8lu kB\n"
42032 +               "Low-mem balloon:    %8lu kB\n"
42033 +               "High-mem balloon:   %8lu kB\n"
42034 +               "Xen hard limit:     ",
42035 +               PAGES2KB(current_pages), PAGES2KB(target_pages), 
42036 +               PAGES2KB(balloon_low), PAGES2KB(balloon_high));
42037 +
42038 +       if (hard_limit != ~0UL) {
42039 +               len += sprintf(
42040 +                       page + len, 
42041 +                       "%8lu kB (inc. %8lu kB driver headroom)\n",
42042 +                       PAGES2KB(hard_limit), PAGES2KB(driver_pages));
42043 +       } else {
42044 +               len += sprintf(
42045 +                       page + len,
42046 +                       "     ??? kB\n");
42047 +       }
42048 +
42049 +       *eof = 1;
42050 +       return len;
42051 +}
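Illustrative read of the same file (values invented; here the hard limit has never been hit, so the "???" branch is taken):

        Current allocation:   262144 kB
        Requested target:     262144 kB
        Low-mem balloon:           0 kB
        High-mem balloon:          0 kB
        Xen hard limit:          ??? kB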
42052 +
42053 +static struct notifier_block xenstore_notifier;
42054 +
42055 +static int __init balloon_init(void)
42056 +{
42057 +       unsigned long pfn;
42058 +       struct page *page;
42059 +
42060 +       IPRINTK("Initialising balloon driver.\n");
42061 +
42062 +       if (xen_init() < 0)
42063 +               return -1;
42064 +
42065 +       current_pages = min(xen_start_info->nr_pages, max_pfn);
42066 +       totalram_pages = current_pages;
42067 +       target_pages  = current_pages;
42068 +       balloon_low   = 0;
42069 +       balloon_high  = 0;
42070 +       driver_pages  = 0UL;
42071 +       hard_limit    = ~0UL;
42072 +
42073 +       init_timer(&balloon_timer);
42074 +       balloon_timer.data = 0;
42075 +       balloon_timer.function = balloon_alarm;
42076 +
42077 +       if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
42078 +               WPRINTK("Unable to create /proc/xen/balloon.\n");
42079 +               return -1;
42080 +       }
42081 +
42082 +       balloon_pde->read_proc  = balloon_read;
42083 +       balloon_pde->write_proc = balloon_write;
42084 +
42085 +       /* Initialise the balloon with excess memory space. */
42086 +       for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
42087 +               page = pfn_to_page(pfn);
42088 +               if (!PageReserved(page))
42089 +                       balloon_append(page);
42090 +       }
42091 +
42092 +       target_watch.callback = watch_target;
42093 +       xenstore_notifier.notifier_call = balloon_init_watcher;
42094 +
42095 +       register_xenstore_notifier(&xenstore_notifier);
42096 +
42097 +       return 0;
42098 +}
42099 +
42100 +subsys_initcall(balloon_init);
42101 +
42102 +void balloon_update_driver_allowance(long delta)
42103 +{
42104 +       unsigned long flags;
42105 +
42106 +       balloon_lock(flags);
42107 +       driver_pages += delta;
42108 +       balloon_unlock(flags);
42109 +}
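A minimal sketch of the contract described earlier ("drivers must inform the balloon driver"); balloon_update_driver_allowance() is the real export from <xen/balloon.h>, while the surrounding driver code is invented for illustration:

	/* Hypothetical backend driver handing nr_pages of its own
	 * allocation to another domain: record the headroom first... */
	static void my_backend_reserve(unsigned long nr_pages)
	{
		balloon_update_driver_allowance((long)nr_pages);
		/* ... then actually donate/map the pages ... */
	}

	/* ... and shrink the recorded headroom when they come back. */
	static void my_backend_release(unsigned long nr_pages)
	{
		balloon_update_driver_allowance(-(long)nr_pages);
	}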
42110 +
42111 +static int dealloc_pte_fn(
42112 +       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
42113 +{
42114 +       unsigned long mfn = pte_mfn(*pte);
42115 +       int ret;
42116 +       struct xen_memory_reservation reservation = {
42117 +               .extent_start = &mfn,
42118 +               .nr_extents   = 1,
42119 +               .extent_order = 0,
42120 +               .domid        = DOMID_SELF
42121 +       };
42122 +       set_pte_at(&init_mm, addr, pte, __pte_ma(0));
42123 +       set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
42124 +       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
42125 +       BUG_ON(ret != 1);
42126 +       return 0;
42127 +}
42128 +
42129 +struct page *balloon_alloc_empty_page_range(unsigned long nr_pages)
42130 +{
42131 +       unsigned long vstart, flags;
42132 +       unsigned int  order = get_order(nr_pages * PAGE_SIZE);
42133 +       int ret;
42134 +
42135 +       vstart = __get_free_pages(GFP_KERNEL, order);
42136 +       if (vstart == 0)
42137 +               return NULL;
42138 +
42139 +       scrub_pages(vstart, 1 << order);
42140 +
42141 +       balloon_lock(flags);
42142 +       ret = apply_to_page_range(&init_mm, vstart,
42143 +                                 PAGE_SIZE << order, dealloc_pte_fn, NULL);
42144 +       BUG_ON(ret);
42145 +       current_pages -= 1UL << order;
42146 +       totalram_pages = current_pages;
42147 +       balloon_unlock(flags);
42148 +
42149 +       schedule_work(&balloon_worker);
42150 +
42151 +       flush_tlb_all();
42152 +
42153 +       return virt_to_page(vstart);
42154 +}
42155 +
42156 +void balloon_dealloc_empty_page_range(
42157 +       struct page *page, unsigned long nr_pages)
42158 +{
42159 +       unsigned long i, flags;
42160 +       unsigned int  order = get_order(nr_pages * PAGE_SIZE);
42161 +
42162 +       balloon_lock(flags);
42163 +       for (i = 0; i < (1UL << order); i++)
42164 +               balloon_append(page + i);
42165 +       balloon_unlock(flags);
42166 +
42167 +       schedule_work(&balloon_worker);
42168 +}
42169 +
42170 +EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
42171 +EXPORT_SYMBOL_GPL(balloon_alloc_empty_page_range);
42172 +EXPORT_SYMBOL_GPL(balloon_dealloc_empty_page_range);
42173 +
42174 +MODULE_LICENSE("Dual BSD/GPL");
42175 +
42176 +/*
42177 + * Local variables:
42178 + *  c-file-style: "linux"
42179 + *  indent-tabs-mode: t
42180 + *  c-indent-level: 8
42181 + *  c-basic-offset: 8
42182 + *  tab-width: 8
42183 + * End:
42184 + */
42185 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blkback/Makefile linux-2.6.16/drivers/xen/blkback/Makefile
42186 --- linux-2.6.16.orig/drivers/xen/blkback/Makefile      1970-01-01 01:00:00.000000000 +0100
42187 +++ linux-2.6.16/drivers/xen/blkback/Makefile   2006-06-26 09:51:32.000000000 +0200
42188 @@ -0,0 +1,3 @@
42189 +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
42190 +
42191 +blkbk-y        := blkback.o xenbus.o interface.o vbd.o
42192 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blkback/blkback.c linux-2.6.16/drivers/xen/blkback/blkback.c
42193 --- linux-2.6.16.orig/drivers/xen/blkback/blkback.c     1970-01-01 01:00:00.000000000 +0100
42194 +++ linux-2.6.16/drivers/xen/blkback/blkback.c  2006-06-26 09:51:32.000000000 +0200
42195 @@ -0,0 +1,604 @@
42196 +/******************************************************************************
42197 + * arch/xen/drivers/blkif/backend/main.c
42198 + * 
42199 + * Back-end of the driver for virtual block devices. This portion of the
42200 + * driver exports a 'unified' block-device interface that can be accessed
42201 + * by any operating system that implements a compatible front end. A 
42202 + * reference front-end implementation can be found in:
42203 + *  arch/xen/drivers/blkif/frontend
42204 + * 
42205 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
42206 + * Copyright (c) 2005, Christopher Clark
42207 + * 
42208 + * This program is free software; you can redistribute it and/or
42209 + * modify it under the terms of the GNU General Public License version 2
42210 + * as published by the Free Software Foundation; or, when distributed
42211 + * separately from the Linux kernel or incorporated into other
42212 + * software packages, subject to the following license:
42213 + * 
42214 + * Permission is hereby granted, free of charge, to any person obtaining a copy
42215 + * of this source file (the "Software"), to deal in the Software without
42216 + * restriction, including without limitation the rights to use, copy, modify,
42217 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
42218 + * and to permit persons to whom the Software is furnished to do so, subject to
42219 + * the following conditions:
42220 + * 
42221 + * The above copyright notice and this permission notice shall be included in
42222 + * all copies or substantial portions of the Software.
42223 + * 
42224 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
42225 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42226 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
42227 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42228 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
42229 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
42230 + * IN THE SOFTWARE.
42231 + */
42232 +
42233 +#include <linux/spinlock.h>
42234 +#include <linux/kthread.h>
42235 +#include <linux/list.h>
42236 +#include <xen/balloon.h>
42237 +#include <asm/hypervisor.h>
42238 +#include "common.h"
42239 +
42240 +/*
42241 + * These are rather arbitrary. They are fairly large because adjacent requests
42242 + * pulled from a communication ring are quite likely to end up being part of
42243 + * the same scatter/gather request at the disc.
42244 + * 
42245 + * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
42246 + * 
42247 + * This will increase the chances of being able to write whole tracks.
42248 + * 64 should be enough to keep us competitive with Linux.
42249 + */
42250 +static int blkif_reqs = 64;
42251 +module_param_named(reqs, blkif_reqs, int, 0);
42252 +MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
42253 +
42254 +static int mmap_pages;
42255 +
42256 +/* Run-time switchable: /sys/module/blkbk/parameters/ */
42257 +static int log_stats = 0;
42258 +static int debug_lvl = 0;
42259 +module_param(log_stats, int, 0644);
42260 +module_param(debug_lvl, int, 0644);
42261 +
42262 +/*
42263 + * Each outstanding request that we've passed to the lower device layers has a
42264 + * 'pending_req' allocated to it. Each bio that completes decrements the
42265 + * pendcnt towards zero. When it hits zero, the specified domain has a
42266 + * response queued for it, with the saved 'id' passed back.
42267 + */
42268 +typedef struct {
42269 +       blkif_t       *blkif;
42270 +       unsigned long  id;
42271 +       int            nr_pages;
42272 +       atomic_t       pendcnt;
42273 +       unsigned short operation;
42274 +       int            status;
42275 +       struct list_head free_list;
42276 +} pending_req_t;
42277 +
42278 +static pending_req_t *pending_reqs;
42279 +static struct list_head pending_free;
42280 +static spinlock_t pending_free_lock = SPIN_LOCK_UNLOCKED;
42281 +static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
42282 +
42283 +#define BLKBACK_INVALID_HANDLE (~0)
42284 +
42285 +static unsigned long mmap_vstart;
42286 +static unsigned long *pending_vaddrs;
42287 +static grant_handle_t *pending_grant_handles;
42288 +
42289 +static inline int vaddr_pagenr(pending_req_t *req, int seg)
42290 +{
42291 +       return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
42292 +}
42293 +
42294 +static inline unsigned long vaddr(pending_req_t *req, int seg)
42295 +{
42296 +       return pending_vaddrs[vaddr_pagenr(req, seg)];
42297 +}
42298 +
42299 +#define pending_handle(_req, _seg) \
42300 +       (pending_grant_handles[vaddr_pagenr(_req, _seg)])
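Worked example of the index scheme above (assuming BLKIF_MAX_SEGMENTS_PER_REQUEST == 11, its usual value in this interface): for the pending_req in slot 2 and segment 5, vaddr_pagenr() yields 2 * 11 + 5 = 27, so vaddr() returns pending_vaddrs[27], the 28th page of the region reserved in blkif_init() below, and pending_handle() picks the matching grant handle.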
42301 +
42302 +
42303 +#ifdef CONFIG_XEN_BLKDEV_TAP_BE
42304 +/*
42305 + * If the tap driver is used, we may get pages belonging to either the tap
42306 + * or (more likely) the real frontend.  The backend must specify which domain
42307 + * a given page belongs to in update_va_mapping though.  For the moment, 
42308 + * the tap rewrites the ID field of the request to contain the request index
42309 + * and the id of the real front end domain.
42310 + */
42311 +#define BLKTAP_COOKIE 0xbeadfeed
42312 +static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
42313 +#endif
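For illustration only (the exact layout is assumed here, not spelled out in this patch): if the tap packs the id as (frontend_domid << 16) | ring_index, then ID_TO_DOM(id) recovers the domain from the top 16 bits, and (id & 0xffff) would recover the ring index.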
42314 +
42315 +static int do_block_io_op(blkif_t *blkif);
42316 +static void dispatch_rw_block_io(blkif_t *blkif,
42317 +                                blkif_request_t *req,
42318 +                                pending_req_t *pending_req);
42319 +static void make_response(blkif_t *blkif, unsigned long id, 
42320 +                          unsigned short op, int st);
42321 +
42322 +/******************************************************************
42323 + * misc small helpers
42324 + */
42325 +static pending_req_t* alloc_req(void)
42326 +{
42327 +       pending_req_t *req = NULL;
42328 +       unsigned long flags;
42329 +
42330 +       spin_lock_irqsave(&pending_free_lock, flags);
42331 +       if (!list_empty(&pending_free)) {
42332 +               req = list_entry(pending_free.next, pending_req_t, free_list);
42333 +               list_del(&req->free_list);
42334 +       }
42335 +       spin_unlock_irqrestore(&pending_free_lock, flags);
42336 +       return req;
42337 +}
42338 +
42339 +static void free_req(pending_req_t *req)
42340 +{
42341 +       unsigned long flags;
42342 +       int was_empty;
42343 +
42344 +       spin_lock_irqsave(&pending_free_lock, flags);
42345 +       was_empty = list_empty(&pending_free);
42346 +       list_add(&req->free_list, &pending_free);
42347 +       spin_unlock_irqrestore(&pending_free_lock, flags);
42348 +       if (was_empty)
42349 +               wake_up(&pending_free_wq);
42350 +}
42351 +
42352 +static void unplug_queue(blkif_t *blkif)
42353 +{
42354 +       if (blkif->plug == NULL)
42355 +               return;
42356 +       if (blkif->plug->unplug_fn)
42357 +               blkif->plug->unplug_fn(blkif->plug);
42358 +       blk_put_queue(blkif->plug);
42359 +       blkif->plug = NULL;
42360 +}
42361 +
42362 +static void plug_queue(blkif_t *blkif, struct bio *bio)
42363 +{
42364 +       request_queue_t *q = bdev_get_queue(bio->bi_bdev);
42365 +
42366 +       if (q == blkif->plug)
42367 +               return;
42368 +       unplug_queue(blkif);
42369 +       blk_get_queue(q);
42370 +       blkif->plug = q;
42371 +}
42372 +
42373 +static void fast_flush_area(pending_req_t *req)
42374 +{
42375 +       struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
42376 +       unsigned int i, invcount = 0;
42377 +       grant_handle_t handle;
42378 +       int ret;
42379 +
42380 +       for (i = 0; i < req->nr_pages; i++) {
42381 +               handle = pending_handle(req, i);
42382 +               if (handle == BLKBACK_INVALID_HANDLE)
42383 +                       continue;
42384 +               unmap[invcount].host_addr    = vaddr(req, i);
42385 +               unmap[invcount].dev_bus_addr = 0;
42386 +               unmap[invcount].handle       = handle;
42387 +               pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
42388 +               invcount++;
42389 +       }
42390 +
42391 +       ret = HYPERVISOR_grant_table_op(
42392 +               GNTTABOP_unmap_grant_ref, unmap, invcount);
42393 +       BUG_ON(ret);
42394 +}
42395 +
42396 +/******************************************************************
42397 + * SCHEDULER FUNCTIONS
42398 + */
42399 +
42400 +static void print_stats(blkif_t *blkif)
42401 +{
42402 +       printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
42403 +              current->comm, blkif->st_oo_req,
42404 +              blkif->st_rd_req, blkif->st_wr_req);
42405 +       blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
42406 +       blkif->st_rd_req = 0;
42407 +       blkif->st_wr_req = 0;
42408 +       blkif->st_oo_req = 0;
42409 +}
42410 +
42411 +int blkif_schedule(void *arg)
42412 +{
42413 +       blkif_t *blkif = arg;
42414 +
42415 +       blkif_get(blkif);
42416 +
42417 +       if (debug_lvl)
42418 +               printk(KERN_DEBUG "%s: started\n", current->comm);
42419 +
42420 +       /*
42421 +        * This thread may start before we are connected to the frontend
42422 +        * driver. In that case we must wait to be fully connected.
42423 +        */
42424 +       wait_event_interruptible(
42425 +               blkif->wq,
42426 +               blkif_connected(blkif) || kthread_should_stop());
42427 +
42428 +       while (!kthread_should_stop()) {
42429 +               wait_event_interruptible(
42430 +                       blkif->wq,
42431 +                       atomic_read(&blkif->io_pending) ||
42432 +                       kthread_should_stop());
42433 +               wait_event_interruptible(
42434 +                       pending_free_wq,
42435 +                       !list_empty(&pending_free) ||
42436 +                       kthread_should_stop());
42437 +
42438 +               atomic_set(&blkif->io_pending, 0);
42439 +               if (do_block_io_op(blkif))
42440 +                       atomic_inc(&blkif->io_pending);
42441 +               unplug_queue(blkif);
42442 +
42443 +               if (log_stats && time_after(jiffies, blkif->st_print))
42444 +                       print_stats(blkif);
42445 +       }
42446 +
42447 +       if (log_stats)
42448 +               print_stats(blkif);
42449 +       if (debug_lvl)
42450 +               printk(KERN_DEBUG "%s: exiting\n", current->comm);
42451 +
42452 +       blkif->xenblkd = NULL;
42453 +       blkif_put(blkif);
42454 +
42455 +       return 0;
42456 +}
42457 +
42458 +/******************************************************************
42459 + * COMPLETION CALLBACK -- Called as bh->b_end_io()
42460 + */
42461 +
42462 +static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
42463 +{
42464 +       /* An error fails the entire request. */
42465 +       if (!uptodate) {
42466 +               DPRINTK("Buffer not up-to-date at end of operation\n");
42467 +               pending_req->status = BLKIF_RSP_ERROR;
42468 +       }
42469 +
42470 +       if (atomic_dec_and_test(&pending_req->pendcnt)) {
42471 +               fast_flush_area(pending_req);
42472 +               make_response(pending_req->blkif, pending_req->id,
42473 +                             pending_req->operation, pending_req->status);
42474 +               blkif_put(pending_req->blkif);
42475 +               free_req(pending_req);
42476 +       }
42477 +}
42478 +
42479 +static int end_block_io_op(struct bio *bio, unsigned int done, int error)
42480 +{
42481 +       if (bio->bi_size != 0)
42482 +               return 1;
42483 +       __end_block_io_op(bio->bi_private, !error);
42484 +       bio_put(bio);
42485 +       return error;
42486 +}
42487 +
42488 +
42489 +/******************************************************************************
42490 + * NOTIFICATION FROM GUEST OS.
42491 + */
42492 +
42493 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
42494 +{
42495 +       blkif_t *blkif = dev_id;
42496 +
42497 +       atomic_inc(&blkif->io_pending);
42498 +       wake_up(&blkif->wq);
42499 +       return IRQ_HANDLED;
42500 +}
42501 +
42502 +
42503 +
42504 +/******************************************************************
42505 + * DOWNWARD CALLS -- These interface with the block-device layer proper.
42506 + */
42507 +
42508 +static int do_block_io_op(blkif_t *blkif)
42509 +{
42510 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
42511 +       blkif_request_t *req;
42512 +       pending_req_t *pending_req;
42513 +       RING_IDX rc, rp;
42514 +       int more_to_do = 0;
42515 +
42516 +       rc = blk_ring->req_cons;
42517 +       rp = blk_ring->sring->req_prod;
42518 +       rmb(); /* Ensure we see queued requests up to 'rp'. */
42519 +
42520 +       while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
42521 +
42522 +               pending_req = alloc_req();
42523 +               if (pending_req == NULL) {
42524 +                       blkif->st_oo_req++;
42525 +                       more_to_do = 1;
42526 +                       break;
42527 +               }
42528 +
42529 +               req = RING_GET_REQUEST(blk_ring, rc);
42530 +               blk_ring->req_cons = ++rc; /* before make_response() */
42531 +
42532 +               switch (req->operation) {
42533 +               case BLKIF_OP_READ:
42534 +                       blkif->st_rd_req++;
42535 +                       dispatch_rw_block_io(blkif, req, pending_req);
42536 +                       break;
42537 +               case BLKIF_OP_WRITE:
42538 +                       blkif->st_wr_req++;
42539 +                       dispatch_rw_block_io(blkif, req, pending_req);
42540 +                       break;
42541 +               default:
42542 +                       DPRINTK("error: unknown block io operation [%d]\n",
42543 +                               req->operation);
42544 +                       make_response(blkif, req->id, req->operation,
42545 +                                     BLKIF_RSP_ERROR);
42546 +                       free_req(pending_req);
42547 +                       break;
42548 +               }
42549 +       }
42550 +       return more_to_do;
42551 +}
42552 +
42553 +static void dispatch_rw_block_io(blkif_t *blkif,
42554 +                                blkif_request_t *req,
42555 +                                pending_req_t *pending_req)
42556 +{
42557 +
42558 +       int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
42559 +       struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
42560 +       struct phys_req preq;
42561 +       struct { 
42562 +               unsigned long buf; unsigned int nsec;
42563 +       } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
42564 +       unsigned int nseg;
42565 +       struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
42566 +       int ret, i, nbio = 0;
42567 +
42568 +       /* Check that number of segments is sane. */
42569 +       nseg = req->nr_segments;
42570 +       if (unlikely(nseg == 0) || 
42571 +           unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
42572 +               DPRINTK("Bad number of segments in request (%d)\n", nseg);
42573 +               goto fail_response;
42574 +       }
42575 +
42576 +       preq.dev           = req->handle;
42577 +       preq.sector_number = req->sector_number;
42578 +       preq.nr_sects      = 0;
42579 +
42580 +       pending_req->blkif     = blkif;
42581 +       pending_req->id        = req->id;
42582 +       pending_req->operation = operation;
42583 +       pending_req->status    = BLKIF_RSP_OKAY;
42584 +       pending_req->nr_pages  = nseg;
42585 +
42586 +       for (i = 0; i < nseg; i++) {
42587 +               seg[i].nsec = req->seg[i].last_sect -
42588 +                       req->seg[i].first_sect + 1;
42589 +
42590 +               if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
42591 +                   (seg[i].nsec <= 0))
42592 +                       goto fail_response;
42593 +               preq.nr_sects += seg[i].nsec;
42594 +
42595 +               map[i].host_addr = vaddr(pending_req, i);
42596 +               map[i].dom = blkif->domid;
42597 +               map[i].ref = req->seg[i].gref;
42598 +               map[i].flags = GNTMAP_host_map;
42599 +               if (operation == WRITE)
42600 +                       map[i].flags |= GNTMAP_readonly;
42601 +       }
42602 +
42603 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
42604 +       BUG_ON(ret);
42605 +
42606 +       for (i = 0; i < nseg; i++) {
42607 +               if (unlikely(map[i].status != 0)) {
42608 +                       DPRINTK("invalid buffer -- could not remap it\n");
42609 +                       goto fail_flush;
42610 +               }
42611 +
42612 +               pending_handle(pending_req, i) = map[i].handle;
42613 +#ifdef __ia64__
42614 +               pending_vaddrs[vaddr_pagenr(pending_req, i)] =
42615 +                       (unsigned long)gnttab_map_vaddr(map[i]);
42616 +#else
42617 +               set_phys_to_machine(__pa(vaddr(
42618 +                       pending_req, i)) >> PAGE_SHIFT,
42619 +                       FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
42620 +#endif
42621 +               seg[i].buf  = map[i].dev_bus_addr | 
42622 +                       (req->seg[i].first_sect << 9);
42623 +       }
42624 +
42625 +       if (vbd_translate(&preq, blkif, operation) != 0) {
42626 +               DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
42627 +                       operation == READ ? "read" : "write",
42628 +                       preq.sector_number,
42629 +                       preq.sector_number + preq.nr_sects, preq.dev); 
42630 +               goto fail_flush;
42631 +       }
42632 +
42633 +       for (i = 0; i < nseg; i++) {
42634 +               if (((int)preq.sector_number|(int)seg[i].nsec) &
42635 +                   ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
42636 +                       DPRINTK("Misaligned I/O request from domain %d",
42637 +                               blkif->domid);
42638 +                       goto fail_put_bio;
42639 +               }
42640 +
42641 +               while ((bio == NULL) ||
42642 +                      (bio_add_page(bio,
42643 +                                    virt_to_page(vaddr(pending_req, i)),
42644 +                                    seg[i].nsec << 9,
42645 +                                    seg[i].buf & ~PAGE_MASK) == 0)) {
42646 +                       bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
42647 +                       if (unlikely(bio == NULL))
42648 +                               goto fail_put_bio;
42649 +
42650 +                       bio->bi_bdev    = preq.bdev;
42651 +                       bio->bi_private = pending_req;
42652 +                       bio->bi_end_io  = end_block_io_op;
42653 +                       bio->bi_sector  = preq.sector_number;
42654 +               }
42655 +
42656 +               preq.sector_number += seg[i].nsec;
42657 +       }
42658 +
42659 +       plug_queue(blkif, bio);
42660 +       atomic_set(&pending_req->pendcnt, nbio);
42661 +       blkif_get(blkif);
42662 +
42663 +       for (i = 0; i < nbio; i++)
42664 +               submit_bio(operation, biolist[i]);
42665 +
42666 +       return;
42667 +
42668 + fail_put_bio:
42669 +       for (i = 0; i < (nbio-1); i++)
42670 +               bio_put(biolist[i]);
42671 + fail_flush:
42672 +       fast_flush_area(pending_req);
42673 + fail_response:
42674 +       make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
42675 +       free_req(pending_req);
42676 +}
42677 +
42678 +
42679 +
42680 +/******************************************************************
42681 + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
42682 + */
42683 +
42684 +
42685 +static void make_response(blkif_t *blkif, unsigned long id, 
42686 +                          unsigned short op, int st)
42687 +{
42688 +       blkif_response_t *resp;
42689 +       unsigned long     flags;
42690 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
42691 +       int more_to_do = 0;
42692 +       int notify;
42693 +
42694 +       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
42695 +
42696 +       /* Place on the response ring for the relevant domain. */ 
42697 +       resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
42698 +       resp->id        = id;
42699 +       resp->operation = op;
42700 +       resp->status    = st;
42701 +       blk_ring->rsp_prod_pvt++;
42702 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
42703 +
42704 +       if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
42705 +               /*
42706 +                * Tail check for pending requests. Allows frontend to avoid
42707 +                * notifications if requests are already in flight (lower
42708 +                * overheads and promotes batching).
42709 +                */
42710 +               RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
42711 +
42712 +       } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
42713 +               more_to_do = 1;
42714 +
42715 +       }
42716 +       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
42717 +
42718 +       if (more_to_do) {
42719 +               atomic_inc(&blkif->io_pending);
42720 +               wake_up(&blkif->wq);
42721 +       }
42722 +       if (notify)
42723 +               notify_remote_via_irq(blkif->irq);
42724 +}
42725 +
42726 +static int __init blkif_init(void)
42727 +{
42728 +       struct page *page;
42729 +       int i;
42730 +
42731 +       if (xen_init() < 0)
42732 +               return -ENODEV;
42733 +
42734 +       mmap_pages            = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
42735 +       pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
42736 +                                       blkif_reqs, GFP_KERNEL);
42737 +       pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
42738 +                                       mmap_pages, GFP_KERNEL);
42739 +       pending_vaddrs        = kmalloc(sizeof(pending_vaddrs[0]) *
42740 +                                       mmap_pages, GFP_KERNEL);
42741 +       if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) {
42742 +               kfree(pending_reqs);
42743 +               kfree(pending_grant_handles);
42744 +               kfree(pending_vaddrs);
42745 +               printk(KERN_ERR "%s: out of memory\n", __FUNCTION__);
42746 +               return -ENOMEM;
42747 +       }
42748 +
42749 +       blkif_interface_init();
42750 +
42751 +#ifdef __ia64__
42752 +       extern unsigned long alloc_empty_foreign_map_page_range(
42753 +               unsigned long pages);
42754 +       mmap_vstart = (unsigned long)
42755 +               alloc_empty_foreign_map_page_range(mmap_pages);
42756 +#else /* ! ia64 */
42757 +       page = balloon_alloc_empty_page_range(mmap_pages);
42758 +       BUG_ON(page == NULL);
42759 +       mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
42760 +#endif
42761 +       printk(KERN_INFO "%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
42762 +              __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart);
42763 +       BUG_ON(mmap_vstart == 0);
42764 +       for (i = 0; i < mmap_pages; i++) {
42765 +               pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
42766 +               pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
42767 +       }
42768 +
42769 +       memset(pending_reqs, 0, sizeof(pending_reqs[0]) * blkif_reqs);
42770 +       INIT_LIST_HEAD(&pending_free);
42771 +
42772 +       for (i = 0; i < blkif_reqs; i++)
42773 +               list_add_tail(&pending_reqs[i].free_list, &pending_free);
42774 +
42775 +       blkif_xenbus_init();
42776 +       __unsafe(THIS_MODULE);
42777 +       return 0;
42778 +}
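Worked numbers for the defaults above (reqs = 64, and assuming BLKIF_MAX_SEGMENTS_PER_REQUEST == 11 with 4 KiB pages): mmap_pages = 64 * 11 = 704, so balloon_alloc_empty_page_range() requests 704 empty pages, which get_order() rounds up to an order-10 block (1024 pages, 4 MiB) whose machine frames are handed back to Xen; grant mappings of frontend pages are later installed into these slots.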
42779 +
42780 +module_init(blkif_init);
42781 +
42782 +static void blkif_exit(void)
42783 +{
42784 +       BUG();
42785 +}
42786 +
42787 +module_exit(blkif_exit);
42788 +
42789 +MODULE_LICENSE("Dual BSD/GPL");
42790 +
42791 +/*
42792 + * Local variables:
42793 + *  c-file-style: "linux"
42794 + *  indent-tabs-mode: t
42795 + *  c-indent-level: 8
42796 + *  c-basic-offset: 8
42797 + *  tab-width: 8
42798 + * End:
42799 + */
42800 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blkback/common.h linux-2.6.16/drivers/xen/blkback/common.h
42801 --- linux-2.6.16.orig/drivers/xen/blkback/common.h      1970-01-01 01:00:00.000000000 +0100
42802 +++ linux-2.6.16/drivers/xen/blkback/common.h   2006-06-26 09:51:32.000000000 +0200
42803 @@ -0,0 +1,150 @@
42804 +/* 
42805 + * This program is free software; you can redistribute it and/or
42806 + * modify it under the terms of the GNU General Public License version 2
42807 + * as published by the Free Software Foundation; or, when distributed
42808 + * separately from the Linux kernel or incorporated into other
42809 + * software packages, subject to the following license:
42810 + * 
42811 + * Permission is hereby granted, free of charge, to any person obtaining a copy
42812 + * of this source file (the "Software"), to deal in the Software without
42813 + * restriction, including without limitation the rights to use, copy, modify,
42814 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
42815 + * and to permit persons to whom the Software is furnished to do so, subject to
42816 + * the following conditions:
42817 + * 
42818 + * The above copyright notice and this permission notice shall be included in
42819 + * all copies or substantial portions of the Software.
42820 + * 
42821 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
42822 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42823 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
42824 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42825 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
42826 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
42827 + * IN THE SOFTWARE.
42828 + */
42829 +
42830 +#ifndef __BLKIF__BACKEND__COMMON_H__
42831 +#define __BLKIF__BACKEND__COMMON_H__
42832 +
42833 +#include <linux/config.h>
42834 +#include <linux/version.h>
42835 +#include <linux/module.h>
42836 +#include <linux/interrupt.h>
42837 +#include <linux/slab.h>
42838 +#include <linux/blkdev.h>
42839 +#include <linux/vmalloc.h>
42840 +#include <asm/io.h>
42841 +#include <asm/setup.h>
42842 +#include <asm/pgalloc.h>
42843 +#include <xen/evtchn.h>
42844 +#include <asm/hypervisor.h>
42845 +#include <xen/interface/io/blkif.h>
42846 +#include <xen/interface/io/ring.h>
42847 +#include <xen/gnttab.h>
42848 +#include <xen/driver_util.h>
42849 +
42850 +#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
42851 +                                    __FILE__ , __LINE__ , ## _a )
42852 +
42853 +struct vbd {
42854 +       blkif_vdev_t   handle;      /* what the domain refers to this vbd as */
42855 +       unsigned char  readonly;    /* Non-zero -> read-only */
42856 +       unsigned char  type;        /* VDISK_xxx */
42857 +       u32            pdevice;     /* phys device that this vbd maps to */
42858 +       struct block_device *bdev;
42859 +}; 
42860 +
42861 +struct backend_info; 
42862 +
42863 +typedef struct blkif_st {
42864 +       /* Unique identifier for this interface. */
42865 +       domid_t           domid;
42866 +       unsigned int      handle;
42867 +       /* Physical parameters of the comms window. */
42868 +       unsigned int      evtchn;
42869 +       unsigned int      irq;
42870 +       /* Comms information. */
42871 +       blkif_back_ring_t blk_ring;
42872 +       struct vm_struct *blk_ring_area;
42873 +       /* The VBD attached to this interface. */
42874 +       struct vbd        vbd;
42875 +       /* Back pointer to the backend_info. */
42876 +       struct backend_info *be; 
42877 +       /* Private fields. */
42878 +       enum { DISCONNECTED, CONNECTED } status;
42879 +#ifdef CONFIG_XEN_BLKDEV_TAP_BE
42880 +       /* Is this a blktap frontend */
42881 +       unsigned int     is_blktap;
42882 +#endif
42883 +       spinlock_t       blk_ring_lock;
42884 +       atomic_t         refcnt;
42885 +
42886 +       wait_queue_head_t   wq;
42887 +       struct task_struct  *xenblkd;
42888 +       atomic_t            io_pending;
42889 +       request_queue_t     *plug;
42890 +
42891 +       /* statistics */
42892 +       unsigned long       st_print;
42893 +       int                 st_rd_req;
42894 +       int                 st_wr_req;
42895 +       int                 st_oo_req;
42896 +
42897 +       struct work_struct free_work;
42898 +
42899 +       grant_handle_t shmem_handle;
42900 +       grant_ref_t    shmem_ref;
42901 +} blkif_t;
42902 +
42903 +blkif_t *alloc_blkif(domid_t domid);
42904 +void free_blkif_callback(blkif_t *blkif);
42905 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
42906 +
42907 +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
42908 +#define blkif_put(_b)                                  \
42909 +       do {                                            \
42910 +               if (atomic_dec_and_test(&(_b)->refcnt)) \
42911 +                       free_blkif_callback(_b);        \
42912 +       } while (0)
42913 +
42914 +/* Create a vbd. */
42915 +int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
42916 +              unsigned minor, int readonly);
42917 +void vbd_free(struct vbd *vbd);
42918 +
42919 +unsigned long vbd_size(struct vbd *vbd);
42920 +unsigned int vbd_info(struct vbd *vbd);
42921 +unsigned long vbd_secsize(struct vbd *vbd);
42922 +
42923 +struct phys_req {
42924 +       unsigned short       dev;
42925 +       unsigned short       nr_sects;
42926 +       struct block_device *bdev;
42927 +       blkif_sector_t       sector_number;
42928 +};
42929 +
42930 +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); 
42931 +
42932 +void blkif_interface_init(void);
42933 +
42934 +void blkif_xenbus_init(void);
42935 +
42936 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
42937 +int blkif_schedule(void *arg);
42938 +
42939 +void update_blkif_status(blkif_t *blkif); 
42940 +
42941 +int blkif_connected(blkif_t *blkif);
42942 +
42943 +#endif /* __BLKIF__BACKEND__COMMON_H__ */
42944 +
42945 +/*
42946 + * Local variables:
42947 + *  c-file-style: "linux"
42948 + *  indent-tabs-mode: t
42949 + *  c-indent-level: 8
42950 + *  c-basic-offset: 8
42951 + *  tab-width: 8
42952 + * End:
42953 + */
42954 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blkback/interface.c linux-2.6.16/drivers/xen/blkback/interface.c
42955 --- linux-2.6.16.orig/drivers/xen/blkback/interface.c   1970-01-01 01:00:00.000000000 +0100
42956 +++ linux-2.6.16/drivers/xen/blkback/interface.c        2006-06-26 09:51:32.000000000 +0200
42957 @@ -0,0 +1,188 @@
42958 +/******************************************************************************
42959 + * arch/xen/drivers/blkif/backend/interface.c
42960 + * 
42961 + * Block-device interface management.
42962 + * 
42963 + * Copyright (c) 2004, Keir Fraser
42964 + * 
42965 + * This program is free software; you can redistribute it and/or
42966 + * modify it under the terms of the GNU General Public License version 2
42967 + * as published by the Free Software Foundation; or, when distributed
42968 + * separately from the Linux kernel or incorporated into other
42969 + * software packages, subject to the following license:
42970 + * 
42971 + * Permission is hereby granted, free of charge, to any person obtaining a copy
42972 + * of this source file (the "Software"), to deal in the Software without
42973 + * restriction, including without limitation the rights to use, copy, modify,
42974 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
42975 + * and to permit persons to whom the Software is furnished to do so, subject to
42976 + * the following conditions:
42977 + * 
42978 + * The above copyright notice and this permission notice shall be included in
42979 + * all copies or substantial portions of the Software.
42980 + * 
42981 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
42982 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42983 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
42984 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42985 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
42986 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
42987 + * IN THE SOFTWARE.
42988 + */
42989 +
42990 +#include "common.h"
42991 +#include <xen/evtchn.h>
42992 +
42993 +static kmem_cache_t *blkif_cachep;
42994 +
42995 +blkif_t *alloc_blkif(domid_t domid)
42996 +{
42997 +       blkif_t *blkif;
42998 +
42999 +       blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
43000 +       if (!blkif)
43001 +               return ERR_PTR(-ENOMEM);
43002 +
43003 +       memset(blkif, 0, sizeof(*blkif));
43004 +       blkif->domid = domid;
43005 +       blkif->status = DISCONNECTED;
43006 +       spin_lock_init(&blkif->blk_ring_lock);
43007 +       atomic_set(&blkif->refcnt, 1);
43008 +       init_waitqueue_head(&blkif->wq);
43009 +       blkif->st_print = jiffies;
43010 +
43011 +       return blkif;
43012 +}
43013 +
43014 +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
43015 +{
43016 +       struct gnttab_map_grant_ref op;
43017 +       int ret;
43018 +
43019 +       op.host_addr = (unsigned long)blkif->blk_ring_area->addr;
43020 +       op.flags     = GNTMAP_host_map;
43021 +       op.ref       = shared_page;
43022 +       op.dom       = blkif->domid;
43023 +
43024 +       lock_vm_area(blkif->blk_ring_area);
43025 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
43026 +       unlock_vm_area(blkif->blk_ring_area);
43027 +       BUG_ON(ret);
43028 +
43029 +       if (op.status) {
43030 +               DPRINTK("Grant table operation failure!\n");
43031 +               return op.status;
43032 +       }
43033 +
43034 +       blkif->shmem_ref = shared_page;
43035 +       blkif->shmem_handle = op.handle;
43036 +
43037 +#ifdef __ia64__
43038 +       /* on some architectures, map_grant_ref behaves like mmap, in that the
43039 +        * passed address is a hint and a different address may be returned */
43040 +       blkif->blk_ring_area->addr = gnttab_map_vaddr(op);
43041 +#endif
43042 +
43043 +       return 0;
43044 +}
43045 +
43046 +static void unmap_frontend_page(blkif_t *blkif)
43047 +{
43048 +       struct gnttab_unmap_grant_ref op;
43049 +       int ret;
43050 +
43051 +       op.host_addr    = (unsigned long)blkif->blk_ring_area->addr;
43052 +       op.handle       = blkif->shmem_handle;
43053 +       op.dev_bus_addr = 0;
43054 +
43055 +       lock_vm_area(blkif->blk_ring_area);
43056 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
43057 +       unlock_vm_area(blkif->blk_ring_area);
43058 +       BUG_ON(ret);
43059 +}
43060 +
43061 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
43062 +{
43063 +       blkif_sring_t *sring;
43064 +       int err;
43065 +       evtchn_op_t op = {
43066 +               .cmd = EVTCHNOP_bind_interdomain,
43067 +               .u.bind_interdomain.remote_dom = blkif->domid,
43068 +               .u.bind_interdomain.remote_port = evtchn };
43069 +
43070 +       /* Already connected through? */
43071 +       if (blkif->irq)
43072 +               return 0;
43073 +
43074 +       if ((blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL)
43075 +               return -ENOMEM;
43076 +
43077 +       err = map_frontend_page(blkif, shared_page);
43078 +       if (err) {
43079 +               free_vm_area(blkif->blk_ring_area);
43080 +               return err;
43081 +       }
43082 +
43083 +       err = HYPERVISOR_event_channel_op(&op);
43084 +       if (err) {
43085 +               unmap_frontend_page(blkif);
43086 +               free_vm_area(blkif->blk_ring_area);
43087 +               return err;
43088 +       }
43089 +
43090 +       blkif->evtchn = op.u.bind_interdomain.local_port;
43091 +
43092 +       sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
43093 +       BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
43094 +
43095 +       blkif->irq = bind_evtchn_to_irqhandler(
43096 +               blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
43097 +
43098 +       /* We're potentially connected now */
43099 +       update_blkif_status(blkif); 
43100 +
43101 +       return 0;
43102 +}
43103 +
43104 +static void free_blkif(void *arg)
43105 +{
43106 +       blkif_t *blkif = (blkif_t *)arg;
43107 +
43108 +       /* Already disconnected? */
43109 +       if (blkif->irq) {
43110 +               unbind_from_irqhandler(blkif->irq, blkif);
43111 +               blkif->irq = 0;
43112 +       }
43113 +
43114 +       vbd_free(&blkif->vbd);
43115 +
43116 +       if (blkif->blk_ring.sring) {
43117 +               unmap_frontend_page(blkif);
43118 +               free_vm_area(blkif->blk_ring_area);
43119 +               blkif->blk_ring.sring = NULL;
43120 +       }
43121 +
43122 +       kmem_cache_free(blkif_cachep, blkif);
43123 +}
43124 +
43125 +void free_blkif_callback(blkif_t *blkif)
43126 +{
43127 +       INIT_WORK(&blkif->free_work, free_blkif, (void *)blkif);
43128 +       schedule_work(&blkif->free_work);
43129 +}
43130 +
43131 +void __init blkif_interface_init(void)
43132 +{
43133 +       blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 
43134 +                                        0, 0, NULL, NULL);
43135 +}
43136 +
43137 +/*
43138 + * Local variables:
43139 + *  c-file-style: "linux"
43140 + *  indent-tabs-mode: t
43141 + *  c-indent-level: 8
43142 + *  c-basic-offset: 8
43143 + *  tab-width: 8
43144 + * End:
43145 + */
43146 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blkback/vbd.c linux-2.6.16/drivers/xen/blkback/vbd.c
43147 --- linux-2.6.16.orig/drivers/xen/blkback/vbd.c 1970-01-01 01:00:00.000000000 +0100
43148 +++ linux-2.6.16/drivers/xen/blkback/vbd.c      2006-06-26 09:51:32.000000000 +0200
43149 @@ -0,0 +1,126 @@
43150 +/******************************************************************************
43151 + * blkback/vbd.c
43152 + * 
43153 + * Routines for managing virtual block devices (VBDs).
43154 + * 
43155 + * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
43156 + * 
43157 + * This program is free software; you can redistribute it and/or
43158 + * modify it under the terms of the GNU General Public License version 2
43159 + * as published by the Free Software Foundation; or, when distributed
43160 + * separately from the Linux kernel or incorporated into other
43161 + * software packages, subject to the following license:
43162 + * 
43163 + * Permission is hereby granted, free of charge, to any person obtaining a copy
43164 + * of this source file (the "Software"), to deal in the Software without
43165 + * restriction, including without limitation the rights to use, copy, modify,
43166 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
43167 + * and to permit persons to whom the Software is furnished to do so, subject to
43168 + * the following conditions:
43169 + * 
43170 + * The above copyright notice and this permission notice shall be included in
43171 + * all copies or substantial portions of the Software.
43172 + * 
43173 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
43174 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
43175 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43176 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
43177 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
43178 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
43179 + * IN THE SOFTWARE.
43180 + */
43181 +
43182 +#include "common.h"
43183 +#include <xen/xenbus.h>
43184 +
43185 +#define vbd_sz(_v)   ((_v)->bdev->bd_part ?                            \
43186 +       (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity)
43187 +
43188 +unsigned long vbd_size(struct vbd *vbd)
43189 +{
43190 +       return vbd_sz(vbd);
43191 +}
43192 +
43193 +unsigned int vbd_info(struct vbd *vbd)
43194 +{
43195 +       return vbd->type | (vbd->readonly?VDISK_READONLY:0);
43196 +}
43197 +
43198 +unsigned long vbd_secsize(struct vbd *vbd)
43199 +{
43200 +       return bdev_hardsect_size(vbd->bdev);
43201 +}
43202 +
43203 +int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
43204 +              unsigned minor, int readonly)
43205 +{
43206 +       struct vbd *vbd;
43207 +
43208 +       vbd = &blkif->vbd;
43209 +       vbd->handle   = handle; 
43210 +       vbd->readonly = readonly;
43211 +       vbd->type     = 0;
43212 +
43213 +       vbd->pdevice  = MKDEV(major, minor);
43214 +
43215 +       vbd->bdev = open_by_devnum(
43216 +               vbd->pdevice,
43217 +               vbd->readonly ? FMODE_READ : FMODE_WRITE);
43218 +       if (IS_ERR(vbd->bdev)) {
43219 +               DPRINTK("vbd_create: device %08x doesn't exist.\n",
43220 +                       vbd->pdevice);
43221 +               return -ENOENT;
43222 +       }
43223 +
43224 +       if (vbd->bdev->bd_disk == NULL) {
43225 +               DPRINTK("vbd_create: device %08x has no gendisk.\n",
43226 +                       vbd->pdevice);
43227 +               vbd_free(vbd);
43228 +               return -ENOENT;
43229 +       }
43230 +
43231 +       if (vbd->bdev->bd_disk->flags & GENHD_FL_CD)
43232 +               vbd->type |= VDISK_CDROM;
43233 +       if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
43234 +               vbd->type |= VDISK_REMOVABLE;
43235 +
43236 +       DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
43237 +               handle, blkif->domid);
43238 +       return 0;
43239 +}
43240 +
43241 +void vbd_free(struct vbd *vbd)
43242 +{
43243 +       if (vbd->bdev)
43244 +               blkdev_put(vbd->bdev);
43245 +       vbd->bdev = NULL;
43246 +}
43247 +
43248 +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
43249 +{
43250 +       struct vbd *vbd = &blkif->vbd;
43251 +       int rc = -EACCES;
43252 +
43253 +       if ((operation == WRITE) && vbd->readonly)
43254 +               goto out;
43255 +
43256 +       if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
43257 +               goto out;
43258 +
43259 +       req->dev  = vbd->pdevice;
43260 +       req->bdev = vbd->bdev;
43261 +       rc = 0;
43262 +
43263 + out:
43264 +       return rc;
43265 +}
43266 +
43267 +/*
43268 + * Local variables:
43269 + *  c-file-style: "linux"
43270 + *  indent-tabs-mode: t
43271 + *  c-indent-level: 8
43272 + *  c-basic-offset: 8
43273 + *  tab-width: 8
43274 + * End:
43275 + */
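vbd_translate() above admits a request only when it is not a write to a read-only vbd and the range [sector_number, sector_number + nr_sects) fits inside the device; either violation yields -EACCES. A simplified stand-alone C sketch of those two checks (not part of the patch; toy_vbd and the plain -1 error code are illustrative stand-ins for struct vbd and -EACCES):

#include <stdio.h>

struct toy_vbd {
        unsigned long long nr_sects;    /* device size in 512-byte sectors */
        int readonly;
};

static int toy_translate(const struct toy_vbd *v,
                         unsigned long long sector,
                         unsigned long long nr_sects,
                         int is_write)
{
        if (is_write && v->readonly)
                return -1;              /* write to a read-only device */
        if (sector + nr_sects > v->nr_sects)
                return -1;              /* request runs off the device */
        return 0;
}

int main(void)
{
        struct toy_vbd v = { 1000, 1 };

        printf("%d\n", toy_translate(&v, 0, 8, 0));     /* 0: in range    */
        printf("%d\n", toy_translate(&v, 0, 8, 1));     /* -1: read-only  */
        printf("%d\n", toy_translate(&v, 996, 8, 0));   /* -1: overruns   */
        return 0;
}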
43276 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blkback/xenbus.c linux-2.6.16/drivers/xen/blkback/xenbus.c
43277 --- linux-2.6.16.orig/drivers/xen/blkback/xenbus.c      1970-01-01 01:00:00.000000000 +0100
43278 +++ linux-2.6.16/drivers/xen/blkback/xenbus.c   2006-06-26 09:51:32.000000000 +0200
43279 @@ -0,0 +1,430 @@
43280 +/*  Xenbus code for blkif backend
43281 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
43282 +    Copyright (C) 2005 XenSource Ltd
43283 +
43284 +    This program is free software; you can redistribute it and/or modify
43285 +    it under the terms of the GNU General Public License as published by
43286 +    the Free Software Foundation; either version 2 of the License, or
43287 +    (at your option) any later version.
43288 +
43289 +    This program is distributed in the hope that it will be useful,
43290 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
43291 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
43292 +    GNU General Public License for more details.
43293 +
43294 +    You should have received a copy of the GNU General Public License
43295 +    along with this program; if not, write to the Free Software
43296 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
43297 +*/
43298 +
43299 +
43300 +#include <stdarg.h>
43301 +#include <linux/module.h>
43302 +#include <linux/kthread.h>
43303 +#include <xen/xenbus.h>
43304 +#include "common.h"
43305 +
43306 +#undef DPRINTK
43307 +#define DPRINTK(fmt, args...) \
43308 +    pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
43309 +
43310 +
43311 +struct backend_info
43312 +{
43313 +       struct xenbus_device *dev;
43314 +       blkif_t *blkif;
43315 +       struct xenbus_watch backend_watch;
43316 +
43317 +       unsigned major;
43318 +       unsigned minor;
43319 +       char *mode;
43320 +};
43321 +
43322 +
43323 +static void maybe_connect(struct backend_info *);
43324 +static void connect(struct backend_info *);
43325 +static int connect_ring(struct backend_info *);
43326 +static void backend_changed(struct xenbus_watch *, const char **,
43327 +                           unsigned int);
43328 +
43329 +int blkif_connected(blkif_t *blkif)
43330 +{
43331 +       return (blkif->be->dev->state == XenbusStateConnected);
43332 +}
43333 +
43334 +void update_blkif_status(blkif_t *blkif)
43335 +{ 
43336 +       if (blkif->irq && blkif->vbd.bdev) {
43337 +               blkif->status = CONNECTED; 
43338 +               (void)blkif_be_int(0, blkif, NULL); 
43339 +       }
43340 +       maybe_connect(blkif->be); 
43341 +}
43342 +
43343 +
43344 +static ssize_t show_physical_device(struct device *_dev,
43345 +                                   struct device_attribute *attr, char *buf)
43346 +{
43347 +       struct xenbus_device *dev = to_xenbus_device(_dev);
43348 +       struct backend_info *be = dev->data;
43349 +       return sprintf(buf, "%x:%x\n", be->major, be->minor);
43350 +}
43351 +DEVICE_ATTR(physical_device, S_IRUSR | S_IRGRP | S_IROTH,
43352 +           show_physical_device, NULL);
43353 +
43354 +
43355 +static ssize_t show_mode(struct device *_dev, struct device_attribute *attr,
43356 +                        char *buf)
43357 +{
43358 +       struct xenbus_device *dev = to_xenbus_device(_dev);
43359 +       struct backend_info *be = dev->data;
43360 +       return sprintf(buf, "%s\n", be->mode);
43361 +}
43362 +DEVICE_ATTR(mode, S_IRUSR | S_IRGRP | S_IROTH, show_mode, NULL);
43363 +
43364 +
43365 +static int blkback_remove(struct xenbus_device *dev)
43366 +{
43367 +       struct backend_info *be = dev->data;
43368 +
43369 +       DPRINTK("");
43370 +
43371 +       if (be->backend_watch.node) {
43372 +               unregister_xenbus_watch(&be->backend_watch);
43373 +               kfree(be->backend_watch.node);
43374 +               be->backend_watch.node = NULL;
43375 +       }
43376 +       if (be->blkif) {
43377 +               be->blkif->status = DISCONNECTED; 
43378 +               if (be->blkif->xenblkd)
43379 +                       kthread_stop(be->blkif->xenblkd);
43380 +               blkif_put(be->blkif);
43381 +               be->blkif = NULL;
43382 +       }
43383 +
43384 +       device_remove_file(&dev->dev, &dev_attr_physical_device);
43385 +       device_remove_file(&dev->dev, &dev_attr_mode);
43386 +
43387 +       kfree(be);
43388 +       dev->data = NULL;
43389 +       return 0;
43390 +}
43391 +
43392 +
43393 +/**
43394 + * Entry point to this code when a new device is created.  Allocate the basic
43395 + * structures, and watch the store waiting for the hotplug scripts to tell us
43396 + * the device's physical major and minor numbers.  Switch to InitWait.
43397 + */
43398 +static int blkback_probe(struct xenbus_device *dev,
43399 +                        const struct xenbus_device_id *id)
43400 +{
43401 +       int err;
43402 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
43403 +                                         GFP_KERNEL);
43404 +       if (!be) {
43405 +               xenbus_dev_fatal(dev, -ENOMEM,
43406 +                                "allocating backend structure");
43407 +               return -ENOMEM;
43408 +       }
43409 +       be->dev = dev;
43410 +       dev->data = be;
43411 +
43412 +       be->blkif = alloc_blkif(dev->otherend_id);
43413 +       if (IS_ERR(be->blkif)) {
43414 +               err = PTR_ERR(be->blkif);
43415 +               be->blkif = NULL;
43416 +               xenbus_dev_fatal(dev, err, "creating block interface");
43417 +               goto fail;
43418 +       }
43419 +
43420 +       /* setup back pointer */
43421 +       be->blkif->be = be; 
43422 +
43423 +       err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
43424 +                                &be->backend_watch, backend_changed);
43425 +       if (err)
43426 +               goto fail;
43427 +
43428 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
43429 +       if (err)
43430 +               goto fail;
43431 +
43432 +       return 0;
43433 +
43434 +fail:
43435 +       DPRINTK("failed");
43436 +       blkback_remove(dev);
43437 +       return err;
43438 +}
43439 +
43440 +
43441 +/**
43442 + * Callback received when the hotplug scripts have placed the physical-device
43443 + * node.  Read it and the mode node, and create a vbd.  If the frontend is
43444 + * ready, connect.
43445 + */
43446 +static void backend_changed(struct xenbus_watch *watch,
43447 +                           const char **vec, unsigned int len)
43448 +{
43449 +       int err;
43450 +       unsigned major;
43451 +       unsigned minor;
43452 +       struct backend_info *be
43453 +               = container_of(watch, struct backend_info, backend_watch);
43454 +       struct xenbus_device *dev = be->dev;
43455 +
43456 +       DPRINTK("");
43457 +
43458 +       err = xenbus_scanf(XBT_NULL, dev->nodename, "physical-device", "%x:%x",
43459 +                          &major, &minor);
43460 +       if (XENBUS_EXIST_ERR(err)) {
43461 +               /* Since this watch will fire once immediately after it is
43462 +                  registered, we expect this.  Ignore it, and wait for the
43463 +                  hotplug scripts. */
43464 +               return;
43465 +       }
43466 +       if (err != 2) {
43467 +               xenbus_dev_fatal(dev, err, "reading physical-device");
43468 +               return;
43469 +       }
43470 +
43471 +       if (be->major && be->minor &&
43472 +           (be->major != major || be->minor != minor)) {
43473 +               printk(KERN_WARNING
43474 +                      "blkback: changing physical device (from %x:%x to "
43475 +                      "%x:%x) not supported.\n", be->major, be->minor,
43476 +                      major, minor);
43477 +               return;
43478 +       }
43479 +
43480 +       be->mode = xenbus_read(XBT_NULL, dev->nodename, "mode", NULL);
43481 +       if (IS_ERR(be->mode)) {
43482 +               err = PTR_ERR(be->mode);
43483 +               be->mode = NULL;
43484 +               xenbus_dev_fatal(dev, err, "reading mode");
43485 +               return;
43486 +       }
43487 +
43488 +       if (be->major == 0 && be->minor == 0) {
43489 +               /* Front end dir is a number, which is used as the handle. */
43490 +
43491 +               char *p = strrchr(dev->otherend, '/') + 1;
43492 +               long handle = simple_strtoul(p, NULL, 0);
43493 +
43494 +               be->major = major;
43495 +               be->minor = minor;
43496 +
43497 +               err = vbd_create(be->blkif, handle, major, minor,
43498 +                                (NULL == strchr(be->mode, 'w')));
43499 +               if (err) {
43500 +                       be->major = 0;
43501 +                       be->minor = 0;
43502 +                       xenbus_dev_fatal(dev, err, "creating vbd structure");
43503 +                       return;
43504 +               }
43505 +
43506 +               be->blkif->xenblkd = kthread_run(blkif_schedule, be->blkif,
43507 +                                                "xvd %d %02x:%02x",
43508 +                                                be->blkif->domid,
43509 +                                                be->major, be->minor);
43510 +               if (IS_ERR(be->blkif->xenblkd)) {
43511 +                       err = PTR_ERR(be->blkif->xenblkd);
43512 +                       be->blkif->xenblkd = NULL;
43513 +                       xenbus_dev_error(dev, err, "start xenblkd");
43514 +                       return;
43515 +               }
43516 +
43517 +               device_create_file(&dev->dev, &dev_attr_physical_device);
43518 +               device_create_file(&dev->dev, &dev_attr_mode);
43519 +
43520 +               /* We're potentially connected now */
43521 +               update_blkif_status(be->blkif); 
43522 +       }
43523 +}
43524 +
43525 +
43526 +/**
43527 + * Callback received when the frontend's state changes.
43528 + */
43529 +static void frontend_changed(struct xenbus_device *dev,
43530 +                            XenbusState frontend_state)
43531 +{
43532 +       struct backend_info *be = dev->data;
43533 +       int err;
43534 +
43535 +       DPRINTK("");
43536 +
43537 +       switch (frontend_state) {
43538 +       case XenbusStateInitialising:
43539 +               break;
43540 +
43541 +       case XenbusStateInitialised:
43542 +       case XenbusStateConnected:
43543 +               /* Ensure we connect even when two watches fire in 
43544 +                  close succession and we miss the intermediate value
43545 +                  of frontend_state. */
43546 +               if (dev->state == XenbusStateConnected)
43547 +                       break;
43548 +
43549 +               err = connect_ring(be);
43550 +               if (err)
43551 +                       break;
43552 +               update_blkif_status(be->blkif);
43553 +               break;
43554 +
43555 +       case XenbusStateClosing:
43556 +               xenbus_switch_state(dev, XenbusStateClosing);
43557 +               break;
43558 +
43559 +       case XenbusStateClosed:
43560 +               device_unregister(&dev->dev);
43561 +               break;
43562 +
43563 +       case XenbusStateUnknown:
43564 +       case XenbusStateInitWait:
43565 +       default:
43566 +               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
43567 +                                frontend_state);
43568 +               break;
43569 +       }
43570 +}
43571 +
43572 +
43573 +/* ** Connection ** */
43574 +
43575 +
43576 +static void maybe_connect(struct backend_info *be)
43577 +{
43578 +       if ((be->major != 0 || be->minor != 0) &&
43579 +           be->blkif->status == CONNECTED)
43580 +               connect(be);
43581 +}
43582 +
43583 +
43584 +/**
43585 + * Write the physical details regarding the block device to the store, and
43586 + * switch to Connected state.
43587 + */
43588 +static void connect(struct backend_info *be)
43589 +{
43590 +       xenbus_transaction_t xbt;
43591 +       int err;
43592 +       struct xenbus_device *dev = be->dev;
43593 +
43594 +       DPRINTK("%s", dev->otherend);
43595 +
43596 +       /* Supply the information about the device the frontend needs */
43597 +again:
43598 +       err = xenbus_transaction_start(&xbt);
43599 +
43600 +       if (err) {
43601 +               xenbus_dev_fatal(dev, err, "starting transaction");
43602 +               return;
43603 +       }
43604 +
43605 +       err = xenbus_printf(xbt, dev->nodename, "sectors", "%lu",
43606 +                           vbd_size(&be->blkif->vbd));
43607 +       if (err) {
43608 +               xenbus_dev_fatal(dev, err, "writing %s/sectors",
43609 +                                dev->nodename);
43610 +               goto abort;
43611 +       }
43612 +
43613 +       /* FIXME: use a typename instead */
43614 +       err = xenbus_printf(xbt, dev->nodename, "info", "%u",
43615 +                           vbd_info(&be->blkif->vbd));
43616 +       if (err) {
43617 +               xenbus_dev_fatal(dev, err, "writing %s/info",
43618 +                                dev->nodename);
43619 +               goto abort;
43620 +       }
43621 +       err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
43622 +                           vbd_secsize(&be->blkif->vbd));
43623 +       if (err) {
43624 +               xenbus_dev_fatal(dev, err, "writing %s/sector-size",
43625 +                                dev->nodename);
43626 +               goto abort;
43627 +       }
43628 +
43629 +       err = xenbus_transaction_end(xbt, 0);
43630 +       if (err == -EAGAIN)
43631 +               goto again;
43632 +       if (err)
43633 +               xenbus_dev_fatal(dev, err, "ending transaction");
43634 +
43635 +       err = xenbus_switch_state(dev, XenbusStateConnected);
43636 +       if (err)
43637 +               xenbus_dev_fatal(dev, err, "switching to Connected state",
43638 +                                dev->nodename);
43639 +
43640 +       return;
43641 + abort:
43642 +       xenbus_transaction_end(xbt, 1);
43643 +}
43644 +
43645 +
43646 +static int connect_ring(struct backend_info *be)
43647 +{
43648 +       struct xenbus_device *dev = be->dev;
43649 +       unsigned long ring_ref;
43650 +       unsigned int evtchn;
43651 +       int err;
43652 +
43653 +       DPRINTK("%s", dev->otherend);
43654 +
43655 +       err = xenbus_gather(XBT_NULL, dev->otherend, "ring-ref", "%lu", &ring_ref,
43656 +                           "event-channel", "%u", &evtchn, NULL);
43657 +       if (err) {
43658 +               xenbus_dev_fatal(dev, err,
43659 +                                "reading %s/ring-ref and event-channel",
43660 +                                dev->otherend);
43661 +               return err;
43662 +       }
43663 +
43664 +       /* Map the shared frame, irq etc. */
43665 +       err = blkif_map(be->blkif, ring_ref, evtchn);
43666 +       if (err) {
43667 +               xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
43668 +                                ring_ref, evtchn);
43669 +               return err;
43670 +       }
43671 +
43672 +       return 0;
43673 +}
43674 +
43675 +
43676 +/* ** Driver Registration ** */
43677 +
43678 +
43679 +static struct xenbus_device_id blkback_ids[] = {
43680 +       { "vbd" },
43681 +       { "" }
43682 +};
43683 +
43684 +
43685 +static struct xenbus_driver blkback = {
43686 +       .name = "vbd",
43687 +       .owner = THIS_MODULE,
43688 +       .ids = blkback_ids,
43689 +       .probe = blkback_probe,
43690 +       .remove = blkback_remove,
43691 +       .otherend_changed = frontend_changed
43692 +};
43693 +
43694 +
43695 +void blkif_xenbus_init(void)
43696 +{
43697 +       xenbus_register_backend(&blkback);
43698 +}
43699 +
43700 +
43701 +/*
43702 + * Local variables:
43703 + *  c-file-style: "linux"
43704 + *  indent-tabs-mode: t
43705 + *  c-indent-level: 8
43706 + *  c-basic-offset: 8
43707 + *  tab-width: 8
43708 + * End:
43709 + */
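connect() above publishes sectors, info and sector-size inside a single xenstore transaction; xenstore transactions are optimistic, so when xenbus_transaction_end() reports -EAGAIN a concurrent writer has invalidated the batch and the code jumps back to again: to replay every write. A stand-alone C sketch of that retry shape (not part of the patch; the tx_* stubs and the single simulated -EAGAIN are assumptions for illustration):

#include <stdio.h>

static int attempts;

/* Stubs standing in for xenbus_transaction_start/printf/end; the real
 * calls talk to xenstored.  tx_end() fails the first commit once, as
 * if another writer had raced with us. */
static int tx_start(void) { return 0; }

static int tx_write(const char *key, const char *val)
{
        printf("write %s = %s\n", key, val);
        return 0;
}

static int tx_end(int abort)
{
        if (!abort && ++attempts == 1)
                return -11;             /* -EAGAIN */
        return 0;
}

static int publish(void)
{
        int err;
again:
        err = tx_start();
        if (err)
                return err;

        err = tx_write("sectors", "204800");
        if (!err)
                err = tx_write("sector-size", "512");
        if (err) {
                tx_end(1);              /* abort: discard partial writes */
                return err;
        }

        err = tx_end(0);                /* try to commit */
        if (err == -11)
                goto again;             /* raced: replay the whole batch */
        return err;
}

int main(void)
{
        return publish() ? 1 : 0;       /* prints each write twice here */
}

The writes must be safe to repeat, which is why everything between again: and the commit is pure xenstore output with no side effects on driver state.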
43710 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blkfront/Makefile linux-2.6.16/drivers/xen/blkfront/Makefile
43711 --- linux-2.6.16.orig/drivers/xen/blkfront/Makefile     1970-01-01 01:00:00.000000000 +0100
43712 +++ linux-2.6.16/drivers/xen/blkfront/Makefile  2006-06-26 09:51:32.000000000 +0200
43713 @@ -0,0 +1,5 @@
43714 +
43715 +obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      := xenblk.o
43716 +
43717 +xenblk-objs := blkfront.o vbd.o
43718 +
43719 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blkfront/blkfront.c linux-2.6.16/drivers/xen/blkfront/blkfront.c
43720 --- linux-2.6.16.orig/drivers/xen/blkfront/blkfront.c   1970-01-01 01:00:00.000000000 +0100
43721 +++ linux-2.6.16/drivers/xen/blkfront/blkfront.c        2006-06-26 09:51:32.000000000 +0200
43722 @@ -0,0 +1,819 @@
43723 +/******************************************************************************
43724 + * blkfront.c
43725 + * 
43726 + * XenLinux virtual block-device driver.
43727 + * 
43728 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
43729 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
43730 + * Copyright (c) 2004, Christian Limpach
43731 + * Copyright (c) 2004, Andrew Warfield
43732 + * Copyright (c) 2005, Christopher Clark
43733 + * Copyright (c) 2005, XenSource Ltd
43734 + * 
43735 + * This program is free software; you can redistribute it and/or
43736 + * modify it under the terms of the GNU General Public License version 2
43737 + * as published by the Free Software Foundation; or, when distributed
43738 + * separately from the Linux kernel or incorporated into other
43739 + * software packages, subject to the following license:
43740 + * 
43741 + * Permission is hereby granted, free of charge, to any person obtaining a copy
43742 + * of this source file (the "Software"), to deal in the Software without
43743 + * restriction, including without limitation the rights to use, copy, modify,
43744 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
43745 + * and to permit persons to whom the Software is furnished to do so, subject to
43746 + * the following conditions:
43747 + * 
43748 + * The above copyright notice and this permission notice shall be included in
43749 + * all copies or substantial portions of the Software.
43750 + * 
43751 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
43752 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
43753 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43754 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
43755 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
43756 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
43757 + * IN THE SOFTWARE.
43758 + */
43759 +
43760 +#include <linux/version.h>
43761 +#include "block.h"
43762 +#include <linux/cdrom.h>
43763 +#include <linux/sched.h>
43764 +#include <linux/interrupt.h>
43765 +#include <scsi/scsi.h>
43766 +#include <xen/evtchn.h>
43767 +#include <xen/xenbus.h>
43768 +#include <xen/interface/grant_table.h>
43769 +#include <xen/gnttab.h>
43770 +#include <asm/hypervisor.h>
43771 +
43772 +#define BLKIF_STATE_DISCONNECTED 0
43773 +#define BLKIF_STATE_CONNECTED    1
43774 +#define BLKIF_STATE_SUSPENDED    2
43775 +
43776 +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
43777 +    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
43778 +#define GRANT_INVALID_REF      0
43779 +
43780 +static void connect(struct blkfront_info *);
43781 +static void blkfront_closing(struct xenbus_device *);
43782 +static int blkfront_remove(struct xenbus_device *);
43783 +static int talk_to_backend(struct xenbus_device *, struct blkfront_info *);
43784 +static int setup_blkring(struct xenbus_device *, struct blkfront_info *);
43785 +
43786 +static void kick_pending_request_queues(struct blkfront_info *);
43787 +
43788 +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs);
43789 +static void blkif_restart_queue(void *arg);
43790 +static void blkif_recover(struct blkfront_info *);
43791 +static void blkif_completion(struct blk_shadow *);
43792 +static void blkif_free(struct blkfront_info *, int);
43793 +
43794 +
43795 +/**
43796 + * Entry point to this code when a new device is created.  Allocate the basic
43797 + * structures and the ring buffer for communication with the backend, and
43798 + * inform the backend of the appropriate details for those.  Switch to
43799 + * Initialised state.
43800 + */
43801 +static int blkfront_probe(struct xenbus_device *dev,
43802 +                         const struct xenbus_device_id *id)
43803 +{
43804 +       int err, vdevice, i;
43805 +       struct blkfront_info *info;
43806 +
43807 +       /* FIXME: Use dynamic device id if this is not set. */
43808 +       err = xenbus_scanf(XBT_NULL, dev->nodename,
43809 +                          "virtual-device", "%i", &vdevice);
43810 +       if (err != 1) {
43811 +               xenbus_dev_fatal(dev, err, "reading virtual-device");
43812 +               return err;
43813 +       }
43814 +
43815 +       info = kzalloc(sizeof(*info), GFP_KERNEL);
43816 +       if (!info) {
43817 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
43818 +               return -ENOMEM;
43819 +       }
43820 +
43821 +       info->xbdev = dev;
43822 +       info->vdevice = vdevice;
43823 +       info->connected = BLKIF_STATE_DISCONNECTED;
43824 +       INIT_WORK(&info->work, blkif_restart_queue, (void *)info);
43825 +
43826 +       for (i = 0; i < BLK_RING_SIZE; i++)
43827 +               info->shadow[i].req.id = i+1;
43828 +       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
43829 +
43830 +       /* Front end dir is a number, which is used as the id. */
43831 +       info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
43832 +       dev->data = info;
43833 +
43834 +       err = talk_to_backend(dev, info);
43835 +       if (err) {
43836 +               kfree(info);
43837 +               dev->data = NULL;
43838 +               return err;
43839 +       }
43840 +
43841 +       return 0;
43842 +}
43843 +
43844 +
43845 +/**
43846 + * We are reconnecting to the backend, due to a suspend/resume, or a backend
43847 + * driver restart.  We tear down our blkif structure and recreate it, but
43848 + * leave the device-layer structures intact so that this is transparent to the
43849 + * rest of the kernel.
43850 + */
43851 +static int blkfront_resume(struct xenbus_device *dev)
43852 +{
43853 +       struct blkfront_info *info = dev->data;
43854 +       int err;
43855 +
43856 +       DPRINTK("blkfront_resume: %s\n", dev->nodename);
43857 +
43858 +       blkif_free(info, 1);
43859 +
43860 +       err = talk_to_backend(dev, info);
43861 +       if (!err)
43862 +               blkif_recover(info);
43863 +
43864 +       return err;
43865 +}
43866 +
43867 +
43868 +/* Common code used when first setting up, and when resuming. */
43869 +static int talk_to_backend(struct xenbus_device *dev,
43870 +                          struct blkfront_info *info)
43871 +{
43872 +       const char *message = NULL;
43873 +       xenbus_transaction_t xbt;
43874 +       int err;
43875 +
43876 +       /* Create shared ring, alloc event channel. */
43877 +       err = setup_blkring(dev, info);
43878 +       if (err)
43879 +               goto out;
43880 +
43881 +again:
43882 +       err = xenbus_transaction_start(&xbt);
43883 +       if (err) {
43884 +               xenbus_dev_fatal(dev, err, "starting transaction");
43885 +               goto destroy_blkring;
43886 +       }
43887 +
43888 +       err = xenbus_printf(xbt, dev->nodename,
43889 +                           "ring-ref","%u", info->ring_ref);
43890 +       if (err) {
43891 +               message = "writing ring-ref";
43892 +               goto abort_transaction;
43893 +       }
43894 +       err = xenbus_printf(xbt, dev->nodename,
43895 +                           "event-channel", "%u", info->evtchn);
43896 +       if (err) {
43897 +               message = "writing event-channel";
43898 +               goto abort_transaction;
43899 +       }
43900 +
43901 +       err = xenbus_transaction_end(xbt, 0);
43902 +       if (err) {
43903 +               if (err == -EAGAIN)
43904 +                       goto again;
43905 +               xenbus_dev_fatal(dev, err, "completing transaction");
43906 +               goto destroy_blkring;
43907 +       }
43908 +
43909 +       xenbus_switch_state(dev, XenbusStateInitialised);
43910 +
43911 +       return 0;
43912 +
43913 + abort_transaction:
43914 +       xenbus_transaction_end(xbt, 1);
43915 +       if (message)
43916 +               xenbus_dev_fatal(dev, err, "%s", message);
43917 + destroy_blkring:
43918 +       blkif_free(info, 0);
43919 + out:
43920 +       return err;
43921 +}
43922 +
43923 +
43924 +static int setup_blkring(struct xenbus_device *dev,
43925 +                        struct blkfront_info *info)
43926 +{
43927 +       blkif_sring_t *sring;
43928 +       int err;
43929 +
43930 +       info->ring_ref = GRANT_INVALID_REF;
43931 +
43932 +       sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
43933 +       if (!sring) {
43934 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
43935 +               return -ENOMEM;
43936 +       }
43937 +       SHARED_RING_INIT(sring);
43938 +       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
43939 +
43940 +       err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
43941 +       if (err < 0) {
43942 +               free_page((unsigned long)sring);
43943 +               info->ring.sring = NULL;
43944 +               goto fail;
43945 +       }
43946 +       info->ring_ref = err;
43947 +
43948 +       err = xenbus_alloc_evtchn(dev, &info->evtchn);
43949 +       if (err)
43950 +               goto fail;
43951 +
43952 +       err = bind_evtchn_to_irqhandler(
43953 +               info->evtchn, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
43954 +       if (err <= 0) {
43955 +               xenbus_dev_fatal(dev, err,
43956 +                                "bind_evtchn_to_irqhandler failed");
43957 +               goto fail;
43958 +       }
43959 +       info->irq = err;
43960 +
43961 +       return 0;
43962 +fail:
43963 +       blkif_free(info, 0);
43964 +       return err;
43965 +}
43966 +
43967 +
43968 +/**
43969 + * Callback received when the backend's state changes.
43970 + */
43971 +static void backend_changed(struct xenbus_device *dev,
43972 +                           XenbusState backend_state)
43973 +{
43974 +       struct blkfront_info *info = dev->data;
43975 +       struct block_device *bd;
43976 +
43977 +       DPRINTK("blkfront:backend_changed.\n");
43978 +
43979 +       switch (backend_state) {
43980 +       case XenbusStateUnknown:
43981 +       case XenbusStateInitialising:
43982 +       case XenbusStateInitWait:
43983 +       case XenbusStateInitialised:
43984 +       case XenbusStateClosed:
43985 +               break;
43986 +
43987 +       case XenbusStateConnected:
43988 +               connect(info);
43989 +               break;
43990 +
43991 +       case XenbusStateClosing:
43992 +               bd = bdget(info->dev);
43993 +               if (bd == NULL) {
43994 +                       xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
43995 +                       break;
43995 +               }
43996 +               down(&bd->bd_sem);
43997 +               if (info->users > 0)
43998 +                       xenbus_dev_error(dev, -EBUSY,
43999 +                                        "Device in use; refusing to close");
44000 +               else
44001 +                       blkfront_closing(dev);
44002 +               up(&bd->bd_sem);
44003 +               bdput(bd);
44004 +               break;
44005 +       }
44006 +}
44007 +
44008 +
44009 +/* ** Connection ** */
44010 +
44011 +
44012 +/*
44013 + * Invoked when the backend is finally 'ready' (and has produced
44014 + * the details about the physical device - #sectors, size, etc).
44015 + */
44016 +static void connect(struct blkfront_info *info)
44017 +{
44018 +       unsigned long sectors, sector_size;
44019 +       unsigned int binfo;
44020 +       int err;
44021 +
44022 +       if ((info->connected == BLKIF_STATE_CONNECTED) ||
44023 +           (info->connected == BLKIF_STATE_SUSPENDED))
44024 +               return;
44025 +
44026 +       DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
44027 +
44028 +       err = xenbus_gather(XBT_NULL, info->xbdev->otherend,
44029 +                           "sectors", "%lu", &sectors,
44030 +                           "info", "%u", &binfo,
44031 +                           "sector-size", "%lu", &sector_size,
44032 +                           NULL);
44033 +       if (err) {
44034 +               xenbus_dev_fatal(info->xbdev, err,
44035 +                                "reading backend fields at %s",
44036 +                                info->xbdev->otherend);
44037 +               return;
44038 +       }
44039 +
44040 +       err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
44041 +       if (err) {
44042 +               xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
44043 +                                info->xbdev->otherend);
44044 +               return;
44045 +       }
44046 +
44047 +       (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
44048 +
44049 +       /* Kick pending requests. */
44050 +       spin_lock_irq(&blkif_io_lock);
44051 +       info->connected = BLKIF_STATE_CONNECTED;
44052 +       kick_pending_request_queues(info);
44053 +       spin_unlock_irq(&blkif_io_lock);
44054 +
44055 +       add_disk(info->gd);
44056 +}
44057 +
44058 +/**
44059 + * Handle the change of state of the backend to Closing.  We must delete our
44060 + * device-layer structures now, to ensure that writes are flushed through to
44061 + * the backend.  Once this is done, we can switch to Closed in
44062 + * acknowledgement.
44063 + */
44064 +static void blkfront_closing(struct xenbus_device *dev)
44065 +{
44066 +       struct blkfront_info *info = dev->data;
44067 +
44068 +       DPRINTK("blkfront_closing: %s removed\n", dev->nodename);
44069 +
44070 +       xlvbd_del(info);
44071 +
44072 +       xenbus_switch_state(dev, XenbusStateClosed);
44073 +}
44074 +
44075 +
44076 +static int blkfront_remove(struct xenbus_device *dev)
44077 +{
44078 +       struct blkfront_info *info = dev->data;
44079 +
44080 +       DPRINTK("blkfront_remove: %s removed\n", dev->nodename);
44081 +
44082 +       blkif_free(info, 0);
44083 +
44084 +       kfree(info);
44085 +
44086 +       return 0;
44087 +}
44088 +
44089 +
44090 +static inline int GET_ID_FROM_FREELIST(
44091 +       struct blkfront_info *info)
44092 +{
44093 +       unsigned long free = info->shadow_free;
44094 +       BUG_ON(free >= BLK_RING_SIZE);
44095 +       info->shadow_free = info->shadow[free].req.id;
44096 +       info->shadow[free].req.id = 0x0fffffee; /* debug */
44097 +       return free;
44098 +}
44099 +
44100 +static inline void ADD_ID_TO_FREELIST(
44101 +       struct blkfront_info *info, unsigned long id)
44102 +{
44103 +       info->shadow[id].req.id  = info->shadow_free;
44104 +       info->shadow[id].request = 0;
44105 +       info->shadow_free = id;
44106 +}
44107 +
44108 +static inline void flush_requests(struct blkfront_info *info)
44109 +{
44110 +       int notify;
44111 +
44112 +       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
44113 +
44114 +       if (notify)
44115 +               notify_remote_via_irq(info->irq);
44116 +}
44117 +
44118 +static void kick_pending_request_queues(struct blkfront_info *info)
44119 +{
44120 +       if (!RING_FULL(&info->ring)) {
44121 +               /* Re-enable calldowns. */
44122 +               blk_start_queue(info->rq);
44123 +               /* Kick things off immediately. */
44124 +               do_blkif_request(info->rq);
44125 +       }
44126 +}
44127 +
44128 +static void blkif_restart_queue(void *arg)
44129 +{
44130 +       struct blkfront_info *info = (struct blkfront_info *)arg;
44131 +       spin_lock_irq(&blkif_io_lock);
44132 +       kick_pending_request_queues(info);
44133 +       spin_unlock_irq(&blkif_io_lock);
44134 +}
44135 +
44136 +static void blkif_restart_queue_callback(void *arg)
44137 +{
44138 +       struct blkfront_info *info = (struct blkfront_info *)arg;
44139 +       schedule_work(&info->work);
44140 +}
44141 +
44142 +int blkif_open(struct inode *inode, struct file *filep)
44143 +{
44144 +       struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
44145 +       info->users++;
44146 +       return 0;
44147 +}
44148 +
44149 +
44150 +int blkif_release(struct inode *inode, struct file *filep)
44151 +{
44152 +       struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
44153 +       info->users--;
44154 +       if (info->users == 0) {
44155 +               /* Check whether we have been instructed to close.  We will
44156 +                  have ignored this request initially, as the device was
44157 +                  still mounted. */
44158 +               struct xenbus_device *dev = info->xbdev;
44159 +               XenbusState state = xenbus_read_driver_state(dev->otherend);
44160 +
44161 +               if (state == XenbusStateClosing)
44162 +                       blkfront_closing(dev);
44163 +       }
44164 +       return 0;
44165 +}
44166 +
44167 +
44168 +int blkif_ioctl(struct inode *inode, struct file *filep,
44169 +                unsigned command, unsigned long argument)
44170 +{
44171 +       int i;
44172 +
44173 +       DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
44174 +                     command, (long)argument, inode->i_rdev);
44175 +
44176 +       switch (command) {
44177 +       case HDIO_GETGEO:
44178 +               /* return ENOSYS to use defaults */
44179 +               return -ENOSYS;
44180 +
44181 +       case CDROMMULTISESSION:
44182 +               DPRINTK("FIXME: support multisession CDs later\n");
44183 +               for (i = 0; i < sizeof(struct cdrom_multisession); i++)
44184 +                       if (put_user(0, (char __user *)(argument + i)))
44185 +                               return -EFAULT;
44186 +               return 0;
44187 +
44188 +       default:
44189 +               /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
44190 +                 command);*/
44191 +               return -EINVAL; /* same return as native Linux */
44192 +       }
44193 +
44194 +       return 0;
44195 +}
44196 +
44197 +
44198 +/*
44199 + * blkif_queue_request
44200 + *
44201 + * request block io
44202 + *
44203 + * id: for guest use only.
44204 + * operation: BLKIF_OP_{READ,WRITE,PROBE}
44205 + * buffer: buffer to read/write into. this should be a
44206 + *   virtual address in the guest os.
44207 + */
44208 +static int blkif_queue_request(struct request *req)
44209 +{
44210 +       struct blkfront_info *info = req->rq_disk->private_data;
44211 +       unsigned long buffer_mfn;
44212 +       blkif_request_t *ring_req;
44213 +       struct bio *bio;
44214 +       struct bio_vec *bvec;
44215 +       int idx;
44216 +       unsigned long id;
44217 +       unsigned int fsect, lsect;
44218 +       int ref;
44219 +       grant_ref_t gref_head;
44220 +
44221 +       if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
44222 +               return 1;
44223 +
44224 +       if (gnttab_alloc_grant_references(
44225 +               BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
44226 +               gnttab_request_free_callback(
44227 +                       &info->callback,
44228 +                       blkif_restart_queue_callback,
44229 +                       info,
44230 +                       BLKIF_MAX_SEGMENTS_PER_REQUEST);
44231 +               return 1;
44232 +       }
44233 +
44234 +       /* Fill out a communications ring structure. */
44235 +       ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
44236 +       id = GET_ID_FROM_FREELIST(info);
44237 +       info->shadow[id].request = (unsigned long)req;
44238 +
44239 +       ring_req->id = id;
44240 +       ring_req->operation = rq_data_dir(req) ?
44241 +               BLKIF_OP_WRITE : BLKIF_OP_READ;
44242 +       ring_req->sector_number = (blkif_sector_t)req->sector;
44243 +       ring_req->handle = info->handle;
44244 +
44245 +       ring_req->nr_segments = 0;
44246 +       rq_for_each_bio (bio, req) {
44247 +               bio_for_each_segment (bvec, bio, idx) {
44248 +                       BUG_ON(ring_req->nr_segments
44249 +                              == BLKIF_MAX_SEGMENTS_PER_REQUEST);
44250 +                       buffer_mfn = page_to_phys(bvec->bv_page) >> PAGE_SHIFT;
44251 +                       fsect = bvec->bv_offset >> 9;
44252 +                       lsect = fsect + (bvec->bv_len >> 9) - 1;
44253 +                       /* install a grant reference. */
44254 +                       ref = gnttab_claim_grant_reference(&gref_head);
44255 +                       BUG_ON(ref == -ENOSPC);
44256 +
44257 +                       gnttab_grant_foreign_access_ref(
44258 +                               ref,
44259 +                               info->xbdev->otherend_id,
44260 +                               buffer_mfn,
44261 +                               rq_data_dir(req));
44262 +
44263 +                       info->shadow[id].frame[ring_req->nr_segments] =
44264 +                               mfn_to_pfn(buffer_mfn);
44265 +
44266 +                       ring_req->seg[ring_req->nr_segments] =
44267 +                               (struct blkif_request_segment) {
44268 +                                       .gref       = ref,
44269 +                                       .first_sect = fsect,
44270 +                                       .last_sect  = lsect };
44271 +
44272 +                       ring_req->nr_segments++;
44273 +               }
44274 +       }
44275 +
44276 +       info->ring.req_prod_pvt++;
44277 +
44278 +       /* Keep a private copy so we can reissue requests when recovering. */
44279 +       info->shadow[id].req = *ring_req;
44280 +
44281 +       gnttab_free_grant_references(gref_head);
44282 +
44283 +       return 0;
44284 +}
44285 +
44286 +/*
44287 + * do_blkif_request
44288 + *  read a block; request is in a request queue
44289 + */
44290 +void do_blkif_request(request_queue_t *rq)
44291 +{
44292 +       struct blkfront_info *info = NULL;
44293 +       struct request *req;
44294 +       int queued;
44295 +
44296 +       DPRINTK("Entered do_blkif_request\n");
44297 +
44298 +       queued = 0;
44299 +
44300 +       while ((req = elv_next_request(rq)) != NULL) {
44301 +               info = req->rq_disk->private_data;
44302 +               if (!blk_fs_request(req)) {
44303 +                       end_request(req, 0);
44304 +                       continue;
44305 +               }
44306 +
44307 +               if (RING_FULL(&info->ring))
44308 +                       goto wait;
44309 +
44310 +               DPRINTK("do_blk_req %p: cmd %p, sec %lx, "
44311 +                       "(%u/%li) buffer:%p [%s]\n",
44312 +                       req, req->cmd, req->sector, req->current_nr_sectors,
44313 +                       req->nr_sectors, req->buffer,
44314 +                       rq_data_dir(req) ? "write" : "read");
44315 +
44316 +
44317 +               blkdev_dequeue_request(req);
44318 +               if (blkif_queue_request(req)) {
44319 +                       blk_requeue_request(rq, req);
44320 +               wait:
44321 +                       /* Avoid pointless unplugs. */
44322 +                       blk_stop_queue(rq);
44323 +                       break;
44324 +               }
44325 +
44326 +               queued++;
44327 +       }
44328 +
44329 +       if (queued != 0)
44330 +               flush_requests(info);
44331 +}
44332 +
44333 +
44334 +static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
44335 +{
44336 +       struct request *req;
44337 +       blkif_response_t *bret;
44338 +       RING_IDX i, rp;
44339 +       unsigned long flags;
44340 +       struct blkfront_info *info = (struct blkfront_info *)dev_id;
44341 +
44342 +       spin_lock_irqsave(&blkif_io_lock, flags);
44343 +
44344 +       if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
44345 +               spin_unlock_irqrestore(&blkif_io_lock, flags);
44346 +               return IRQ_HANDLED;
44347 +       }
44348 +
44349 + again:
44350 +       rp = info->ring.sring->rsp_prod;
44351 +       rmb(); /* Ensure we see queued responses up to 'rp'. */
44352 +
44353 +       for (i = info->ring.rsp_cons; i != rp; i++) {
44354 +               unsigned long id;
44355 +               int ret;
44356 +
44357 +               bret = RING_GET_RESPONSE(&info->ring, i);
44358 +               id   = bret->id;
44359 +               req  = (struct request *)info->shadow[id].request;
44360 +
44361 +               blkif_completion(&info->shadow[id]);
44362 +
44363 +               ADD_ID_TO_FREELIST(info, id);
44364 +
44365 +               switch (bret->operation) {
44366 +               case BLKIF_OP_READ:
44367 +               case BLKIF_OP_WRITE:
44368 +                       if (unlikely(bret->status != BLKIF_RSP_OKAY))
44369 +                               DPRINTK("Bad return from blkdev data "
44370 +                                       "request: %x\n", bret->status);
44371 +
44372 +                       ret = end_that_request_first(
44373 +                               req, (bret->status == BLKIF_RSP_OKAY),
44374 +                               req->hard_nr_sectors);
44375 +                       BUG_ON(ret);
44376 +                       end_that_request_last(
44377 +                               req, (bret->status == BLKIF_RSP_OKAY));
44378 +                       break;
44379 +               default:
44380 +                       BUG();
44381 +               }
44382 +       }
44383 +
44384 +       info->ring.rsp_cons = i;
44385 +
44386 +       if (i != info->ring.req_prod_pvt) {
44387 +               int more_to_do;
44388 +               RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
44389 +               if (more_to_do)
44390 +                       goto again;
44391 +       } else
44392 +               info->ring.sring->rsp_event = i + 1;
44393 +
44394 +       kick_pending_request_queues(info);
44395 +
44396 +       spin_unlock_irqrestore(&blkif_io_lock, flags);
44397 +
44398 +       return IRQ_HANDLED;
44399 +}
44400 +
44401 +static void blkif_free(struct blkfront_info *info, int suspend)
44402 +{
44403 +       /* Prevent new requests being issued until we fix things up. */
44404 +       spin_lock_irq(&blkif_io_lock);
44405 +       info->connected = suspend ?
44406 +               BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
44407 +       spin_unlock_irq(&blkif_io_lock);
44408 +
44409 +       /* Free resources associated with old device channel. */
44410 +       if (info->ring_ref != GRANT_INVALID_REF) {
44411 +               gnttab_end_foreign_access(info->ring_ref, 0,
44412 +                                         (unsigned long)info->ring.sring);
44413 +               info->ring_ref = GRANT_INVALID_REF;
44414 +               info->ring.sring = NULL;
44415 +       }
44416 +       if (info->irq)
44417 +               unbind_from_irqhandler(info->irq, info);
44418 +       info->evtchn = info->irq = 0;
44419 +
44420 +}
44421 +
44422 +static void blkif_completion(struct blk_shadow *s)
44423 +{
44424 +       int i;
44425 +       for (i = 0; i < s->req.nr_segments; i++)
44426 +               gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
44427 +}
44428 +
44429 +static void blkif_recover(struct blkfront_info *info)
44430 +{
44431 +       int i;
44432 +       blkif_request_t *req;
44433 +       struct blk_shadow *copy;
44434 +       int j;
44435 +
44436 +       /* Stage 1: Make a safe copy of the shadow state. */
44437 +       copy = kmalloc(sizeof(info->shadow), GFP_KERNEL | __GFP_NOFAIL);
44438 +       memcpy(copy, info->shadow, sizeof(info->shadow));
44439 +
44440 +       /* Stage 2: Set up free list. */
44441 +       memset(&info->shadow, 0, sizeof(info->shadow));
44442 +       for (i = 0; i < BLK_RING_SIZE; i++)
44443 +               info->shadow[i].req.id = i+1;
44444 +       info->shadow_free = info->ring.req_prod_pvt;
44445 +       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
44446 +
44447 +       /* Stage 3: Find pending requests and requeue them. */
44448 +       for (i = 0; i < BLK_RING_SIZE; i++) {
44449 +               /* Not in use? */
44450 +               if (copy[i].request == 0)
44451 +                       continue;
44452 +
44453 +               /* Grab a request slot and copy shadow state into it. */
44454 +               req = RING_GET_REQUEST(
44455 +                       &info->ring, info->ring.req_prod_pvt);
44456 +               *req = copy[i].req;
44457 +
44458 +               /* We get a new request id, and must reset the shadow state. */
44459 +               req->id = GET_ID_FROM_FREELIST(info);
44460 +               memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
44461 +
44462 +               /* Rewrite any grant references invalidated by susp/resume. */
44463 +               for (j = 0; j < req->nr_segments; j++)
44464 +                       gnttab_grant_foreign_access_ref(
44465 +                               req->seg[j].gref,
44466 +                               info->xbdev->otherend_id,
44467 +                               pfn_to_mfn(info->shadow[req->id].frame[j]),
44468 +                               rq_data_dir(
44469 +                                       (struct request *)
44470 +                                       info->shadow[req->id].request));
44471 +               info->shadow[req->id].req = *req;
44472 +
44473 +               info->ring.req_prod_pvt++;
44474 +       }
44475 +
44476 +       kfree(copy);
44477 +
44478 +       (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
44479 +
44480 +       /* Now safe for us to use the shared ring */
44481 +       spin_lock_irq(&blkif_io_lock);
44482 +       info->connected = BLKIF_STATE_CONNECTED;
44483 +       spin_unlock_irq(&blkif_io_lock);
44484 +
44485 +       /* Send off requeued requests */
44486 +       flush_requests(info);
44487 +
44488 +       /* Kick any other new requests queued since we resumed */
44489 +       spin_lock_irq(&blkif_io_lock);
44490 +       kick_pending_request_queues(info);
44491 +       spin_unlock_irq(&blkif_io_lock);
44492 +}
44493 +
44494 +
44495 +/* ** Driver Registration ** */
44496 +
44497 +
44498 +static struct xenbus_device_id blkfront_ids[] = {
44499 +       { "vbd" },
44500 +       { "" }
44501 +};
44502 +
44503 +
44504 +static struct xenbus_driver blkfront = {
44505 +       .name = "vbd",
44506 +       .owner = THIS_MODULE,
44507 +       .ids = blkfront_ids,
44508 +       .probe = blkfront_probe,
44509 +       .remove = blkfront_remove,
44510 +       .resume = blkfront_resume,
44511 +       .otherend_changed = backend_changed,
44512 +};
44513 +
44514 +
44515 +static int __init xlblk_init(void)
44516 +{
44517 +       if (xen_init() < 0)
44518 +               return -ENODEV;
44519 +
44520 +       return xenbus_register_frontend(&blkfront);
44521 +}
44522 +module_init(xlblk_init);
44523 +
44524 +
44525 +static void xlblk_exit(void)
44526 +{
44527 +       return xenbus_unregister_driver(&blkfront);
44528 +}
44529 +module_exit(xlblk_exit);
44530 +
44531 +MODULE_LICENSE("Dual BSD/GPL");
44532 +
44533 +/*
44534 + * Local variables:
44535 + *  c-file-style: "linux"
44536 + *  indent-tabs-mode: t
44537 + *  c-indent-level: 8
44538 + *  c-basic-offset: 8
44539 + *  tab-width: 8
44540 + * End:
44541 + */
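GET_ID_FROM_FREELIST() and ADD_ID_TO_FREELIST() above thread a free list through the shadow[] array itself: while an entry is free, its req.id field holds the index of the next free entry, so allocating and releasing a request id are O(1) with no extra storage. A stand-alone C sketch of that intrusive index free list (not part of the patch; the size and names are illustrative):

#include <assert.h>
#include <stdio.h>

#define RING_SIZE    8
#define FREELIST_END 0x0fffffffUL       /* terminator, as in the patch */

/* One slot per possible in-flight request; a free slot's id field
 * stores the index of the next free slot. */
static struct { unsigned long id; } shadow[RING_SIZE];
static unsigned long shadow_free;

static void freelist_init(void)
{
        unsigned long i;

        for (i = 0; i < RING_SIZE; i++)
                shadow[i].id = i + 1;
        shadow[RING_SIZE - 1].id = FREELIST_END;
        shadow_free = 0;
}

static unsigned long get_id(void)
{
        unsigned long free = shadow_free;

        assert(free < RING_SIZE);       /* else the ring is full */
        shadow_free = shadow[free].id;  /* unlink the head */
        return free;
}

static void put_id(unsigned long id)
{
        shadow[id].id = shadow_free;    /* old head becomes our next */
        shadow_free = id;               /* freed slot is the new head */
}

int main(void)
{
        unsigned long a, b;

        freelist_init();
        a = get_id();
        b = get_id();
        printf("a=%lu b=%lu\n", a, b);          /* a=0 b=1 */
        put_id(a);
        printf("reused=%lu\n", get_id());       /* reused=0 */
        put_id(b);
        return 0;
}

blkfront_probe() seeds the list the same way (each id points at i+1, the last at 0x0fffffff), and blkif_recover() rebuilds it after a resume before requeueing the copied shadow entries.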
44542 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blkfront/block.h linux-2.6.16/drivers/xen/blkfront/block.h
44543 --- linux-2.6.16.orig/drivers/xen/blkfront/block.h      1970-01-01 01:00:00.000000000 +0100
44544 +++ linux-2.6.16/drivers/xen/blkfront/block.h   2006-06-26 09:51:32.000000000 +0200
44545 @@ -0,0 +1,165 @@
44546 +/******************************************************************************
44547 + * block.h
44548 + * 
44549 + * Shared definitions between all levels of XenLinux Virtual block devices.
44550 + * 
44551 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
44552 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
44553 + * Copyright (c) 2004-2005, Christian Limpach
44554 + * 
44555 + * This program is free software; you can redistribute it and/or
44556 + * modify it under the terms of the GNU General Public License version 2
44557 + * as published by the Free Software Foundation; or, when distributed
44558 + * separately from the Linux kernel or incorporated into other
44559 + * software packages, subject to the following license:
44560 + * 
44561 + * Permission is hereby granted, free of charge, to any person obtaining a copy
44562 + * of this source file (the "Software"), to deal in the Software without
44563 + * restriction, including without limitation the rights to use, copy, modify,
44564 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
44565 + * and to permit persons to whom the Software is furnished to do so, subject to
44566 + * the following conditions:
44567 + * 
44568 + * The above copyright notice and this permission notice shall be included in
44569 + * all copies or substantial portions of the Software.
44570 + * 
44571 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
44572 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
44573 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
44574 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44575 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
44576 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
44577 + * IN THE SOFTWARE.
44578 + */
44579 +
44580 +#ifndef __XEN_DRIVERS_BLOCK_H__
44581 +#define __XEN_DRIVERS_BLOCK_H__
44582 +
44583 +#include <linux/config.h>
44584 +#include <linux/version.h>
44585 +#include <linux/module.h>
44586 +#include <linux/kernel.h>
44587 +#include <linux/sched.h>
44588 +#include <linux/slab.h>
44589 +#include <linux/string.h>
44590 +#include <linux/errno.h>
44591 +#include <linux/fs.h>
44592 +#include <linux/hdreg.h>
44593 +#include <linux/blkdev.h>
44594 +#include <linux/major.h>
44595 +#include <linux/devfs_fs_kernel.h>
44596 +#include <asm/hypervisor.h>
44597 +#include <xen/xenbus.h>
44598 +#include <xen/gnttab.h>
44599 +#include <xen/interface/xen.h>
44600 +#include <xen/interface/io/blkif.h>
44601 +#include <xen/interface/io/ring.h>
44602 +#include <asm/io.h>
44603 +#include <asm/atomic.h>
44604 +#include <asm/uaccess.h>
44605 +
44606 +#if 1
44607 +#define IPRINTK(fmt, args...) \
44608 +    printk(KERN_INFO "xen_blk: " fmt, ##args)
44609 +#else
44610 +#define IPRINTK(fmt, args...) ((void)0)
44611 +#endif
44612 +
44613 +#if 1
44614 +#define WPRINTK(fmt, args...) \
44615 +    printk(KERN_WARNING "xen_blk: " fmt, ##args)
44616 +#else
44617 +#define WPRINTK(fmt, args...) ((void)0)
44618 +#endif
44619 +
44620 +#define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
44621 +
44622 +#if 0
44623 +#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
44624 +#else
44625 +#define DPRINTK_IOCTL(_f, _a...) ((void)0)
44626 +#endif
44627 +
44628 +struct xlbd_type_info
44629 +{
44630 +       int partn_shift;
44631 +       int disks_per_major;
44632 +       char *devname;
44633 +       char *diskname;
44634 +};
44635 +
44636 +struct xlbd_major_info
44637 +{
44638 +       int major;
44639 +       int index;
44640 +       int usage;
44641 +       struct xlbd_type_info *type;
44642 +};
44643 +
44644 +struct blk_shadow {
44645 +       blkif_request_t req;
44646 +       unsigned long request;
44647 +       unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
44648 +};
44649 +
44650 +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE)
44651 +
44652 +/*
44653 + * We have one of these per vbd, whether ide, scsi or 'other'.  They
44654 + * hang in private_data off the gendisk structure. We may end up
44655 + * putting all kinds of interesting stuff here :-)
44656 + */
44657 +struct blkfront_info
44658 +{
44659 +       struct xenbus_device *xbdev;
44660 +       dev_t dev;
44661 +       struct gendisk *gd;
44662 +       int vdevice;
44663 +       blkif_vdev_t handle;
44664 +       int connected;
44665 +       int ring_ref;
44666 +       blkif_front_ring_t ring;
44667 +       unsigned int evtchn, irq;
44668 +       struct xlbd_major_info *mi;
44669 +       request_queue_t *rq;
44670 +       struct work_struct work;
44671 +       struct gnttab_free_callback callback;
44672 +       struct blk_shadow shadow[BLK_RING_SIZE];
44673 +       unsigned long shadow_free;
44674 +
44675 +       /**
44676 +        * The number of people holding this device open.  We won't allow a
44677 +        * hot-unplug unless this is 0.
44678 +        */
44679 +       int users;
44680 +};
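
A note on the shadow ring above: each in-flight request keeps a copy in shadow[], and free slots are conventionally chained through the otherwise-unused req.id field, with shadow_free pointing at the head. A minimal sketch of that free-list discipline — the helper names and the 0x0fffffee poison value are illustrative assumptions, not declarations from this header:

/* Hypothetical helpers, assuming free slots chain through req.id. */
static unsigned long get_id_from_freelist(struct blkfront_info *info)
{
	unsigned long free = info->shadow_free;
	BUG_ON(free >= BLK_RING_SIZE);
	info->shadow_free = info->shadow[free].req.id;
	info->shadow[free].req.id = 0x0fffffee; /* debug: poison the link */
	return free;
}

static void add_id_to_freelist(struct blkfront_info *info, unsigned long id)
{
	info->shadow[id].req.id = info->shadow_free;
	info->shadow[id].request = 0;
	info->shadow_free = id;
}
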
44681 +
44682 +extern spinlock_t blkif_io_lock;
44683 +
44684 +extern int blkif_open(struct inode *inode, struct file *filep);
44685 +extern int blkif_release(struct inode *inode, struct file *filep);
44686 +extern int blkif_ioctl(struct inode *inode, struct file *filep,
44687 +                       unsigned command, unsigned long argument);
44688 +extern int blkif_check(dev_t dev);
44689 +extern int blkif_revalidate(dev_t dev);
44690 +extern void do_blkif_request (request_queue_t *rq);
44691 +
44692 +/* Virtual block-device subsystem. */
44693 +/* Note that xlvbd_add doesn't call add_disk for you: you're expected
44694 +   to call add_disk on info->gd once the disk is properly connected
44695 +   up; a usage sketch follows at the end of this file. */
44696 +int xlvbd_add(blkif_sector_t capacity, int device,
44697 +             u16 vdisk_info, u16 sector_size, struct blkfront_info *info);
44698 +void xlvbd_del(struct blkfront_info *info);
44699 +
44700 +#endif /* __XEN_DRIVERS_BLOCK_H__ */
44701 +
44702 +/*
44703 + * Local variables:
44704 + *  c-file-style: "linux"
44705 + *  indent-tabs-mode: t
44706 + *  c-indent-level: 8
44707 + *  c-basic-offset: 8
44708 + *  tab-width: 8
44709 + * End:
44710 + */
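
The note above makes the ownership explicit: xlvbd_add() prepares info->gd, and the frontend publishes it once the ring is connected. A minimal sketch of that calling sequence; the function and its parameters stand in for values the real frontend reads from xenstore:

/* Illustrative connect step, not part of this patch. */
static void example_connect(struct blkfront_info *info, blkif_sector_t sectors,
			    u16 binfo, u16 sector_size)
{
	int err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
	if (err) {
		WPRINTK("xlvbd_add failed (%d)\n", err);
		return;
	}

	/* xlvbd_add() allocated the gendisk but did not publish it. */
	add_disk(info->gd);
	info->connected = 1;
}
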
44711 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blkfront/vbd.c linux-2.6.16/drivers/xen/blkfront/vbd.c
44712 --- linux-2.6.16.orig/drivers/xen/blkfront/vbd.c        1970-01-01 01:00:00.000000000 +0100
44713 +++ linux-2.6.16/drivers/xen/blkfront/vbd.c     2006-06-26 09:51:32.000000000 +0200
44714 @@ -0,0 +1,327 @@
44715 +/******************************************************************************
44716 + * vbd.c
44717 + * 
44718 + * XenLinux virtual block-device driver (xvd).
44719 + * 
44720 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
44721 + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
44722 + * Copyright (c) 2004-2005, Christian Limpach
44723 + * 
44724 + * This program is free software; you can redistribute it and/or
44725 + * modify it under the terms of the GNU General Public License version 2
44726 + * as published by the Free Software Foundation; or, when distributed
44727 + * separately from the Linux kernel or incorporated into other
44728 + * software packages, subject to the following license:
44729 + * 
44730 + * Permission is hereby granted, free of charge, to any person obtaining a copy
44731 + * of this source file (the "Software"), to deal in the Software without
44732 + * restriction, including without limitation the rights to use, copy, modify,
44733 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
44734 + * and to permit persons to whom the Software is furnished to do so, subject to
44735 + * the following conditions:
44736 + * 
44737 + * The above copyright notice and this permission notice shall be included in
44738 + * all copies or substantial portions of the Software.
44739 + * 
44740 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
44741 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
44742 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
44743 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44744 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
44745 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
44746 + * IN THE SOFTWARE.
44747 + */
44748 +
44749 +#include "block.h"
44750 +#include <linux/blkdev.h>
44751 +#include <linux/list.h>
44752 +
44753 +#define BLKIF_MAJOR(dev) ((dev)>>8)
44754 +#define BLKIF_MINOR(dev) ((dev) & 0xff)
44755 +
44756 +/*
44757 + * For convenience we distinguish between ide, scsi and 'other' (i.e.,
44758 + * potentially combinations of the two) in the naming scheme and in a few other
44759 + * places.
44760 + */
44761 +
44762 +#define NUM_IDE_MAJORS 10
44763 +#define NUM_SCSI_MAJORS 9
44764 +#define NUM_VBD_MAJORS 1
44765 +
44766 +static struct xlbd_type_info xlbd_ide_type = {
44767 +       .partn_shift = 6,
44768 +       .disks_per_major = 2,
44769 +       .devname = "ide",
44770 +       .diskname = "hd",
44771 +};
44772 +
44773 +static struct xlbd_type_info xlbd_scsi_type = {
44774 +       .partn_shift = 4,
44775 +       .disks_per_major = 16,
44776 +       .devname = "sd",
44777 +       .diskname = "sd",
44778 +};
44779 +
44780 +static struct xlbd_type_info xlbd_vbd_type = {
44781 +       .partn_shift = 4,
44782 +       .disks_per_major = 16,
44783 +       .devname = "xvd",
44784 +       .diskname = "xvd",
44785 +};
44786 +
44787 +static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS +
44788 +                                        NUM_VBD_MAJORS];
44789 +
44790 +#define XLBD_MAJOR_IDE_START   0
44791 +#define XLBD_MAJOR_SCSI_START  (NUM_IDE_MAJORS)
44792 +#define XLBD_MAJOR_VBD_START   (NUM_IDE_MAJORS + NUM_SCSI_MAJORS)
44793 +
44794 +#define XLBD_MAJOR_IDE_RANGE   XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SCSI_START - 1
44795 +#define XLBD_MAJOR_SCSI_RANGE  XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1
44796 +#define XLBD_MAJOR_VBD_RANGE   XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1
44797 +
44798 +/* Information about our VBDs. */
44799 +#define MAX_VBDS 64
44800 +static LIST_HEAD(vbds_list);
44801 +
44802 +static struct block_device_operations xlvbd_block_fops =
44803 +{
44804 +       .owner = THIS_MODULE,
44805 +       .open = blkif_open,
44806 +       .release = blkif_release,
44807 +       .ioctl  = blkif_ioctl,
44808 +};
44809 +
44810 +spinlock_t blkif_io_lock = SPIN_LOCK_UNLOCKED;
44811 +
44812 +static struct xlbd_major_info *
44813 +xlbd_alloc_major_info(int major, int minor, int index)
44814 +{
44815 +       struct xlbd_major_info *ptr;
44816 +
44817 +       ptr = kzalloc(sizeof(struct xlbd_major_info), GFP_KERNEL);
44818 +       if (ptr == NULL)
44819 +               return NULL;
44820 +
44821 +       ptr->major = major;
44822 +
44823 +       switch (index) {
44824 +       case XLBD_MAJOR_IDE_RANGE:
44825 +               ptr->type = &xlbd_ide_type;
44826 +               ptr->index = index - XLBD_MAJOR_IDE_START;
44827 +               break;
44828 +       case XLBD_MAJOR_SCSI_RANGE:
44829 +               ptr->type = &xlbd_scsi_type;
44830 +               ptr->index = index - XLBD_MAJOR_SCSI_START;
44831 +               break;
44832 +       case XLBD_MAJOR_VBD_RANGE:
44833 +               ptr->type = &xlbd_vbd_type;
44834 +               ptr->index = index - XLBD_MAJOR_VBD_START;
44835 +               break;
44836 +       }
44837 +
44838 +       printk(KERN_INFO "Registering block device major %i\n", ptr->major);
44839 +       if (register_blkdev(ptr->major, ptr->type->devname)) {
44840 +               WPRINTK("can't get major %d with name %s\n",
44841 +                       ptr->major, ptr->type->devname);
44842 +               kfree(ptr);
44843 +               return NULL;
44844 +       }
44845 +
44846 +       devfs_mk_dir(ptr->type->devname);
44847 +       major_info[index] = ptr;
44848 +       return ptr;
44849 +}
44850 +
44851 +static struct xlbd_major_info *
44852 +xlbd_get_major_info(int vdevice)
44853 +{
44854 +       struct xlbd_major_info *mi;
44855 +       int major, minor, index;
44856 +
44857 +       major = BLKIF_MAJOR(vdevice);
44858 +       minor = BLKIF_MINOR(vdevice);
44859 +
44860 +       switch (major) {
44861 +       case IDE0_MAJOR: index = 0; break;
44862 +       case IDE1_MAJOR: index = 1; break;
44863 +       case IDE2_MAJOR: index = 2; break;
44864 +       case IDE3_MAJOR: index = 3; break;
44865 +       case IDE4_MAJOR: index = 4; break;
44866 +       case IDE5_MAJOR: index = 5; break;
44867 +       case IDE6_MAJOR: index = 6; break;
44868 +       case IDE7_MAJOR: index = 7; break;
44869 +       case IDE8_MAJOR: index = 8; break;
44870 +       case IDE9_MAJOR: index = 9; break;
44871 +       case SCSI_DISK0_MAJOR: index = 10; break;
44872 +       case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
44873 +               index = 11 + major - SCSI_DISK1_MAJOR;
44874 +               break;
44875 +       case SCSI_CDROM_MAJOR: index = 18; break;
44876 +       default: index = 19; break;
44877 +       }
44878 +
44879 +       mi = ((major_info[index] != NULL) ? major_info[index] :
44880 +             xlbd_alloc_major_info(major, minor, index));
44881 +       if (mi)
44882 +               mi->usage++;
44883 +       return mi;
44884 +}
44885 +
44886 +static void
44887 +xlbd_put_major_info(struct xlbd_major_info *mi)
44888 +{
44889 +       mi->usage--;
44890 +       /* XXX: release major if 0 */
44891 +}
44892 +
44893 +static int
44894 +xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
44895 +{
44896 +       request_queue_t *rq;
44897 +
44898 +       rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
44899 +       if (rq == NULL)
44900 +               return -1;
44901 +
44902 +       elevator_init(rq, "noop");
44903 +
44904 +       /* Hard sector size and max sectors impersonate the equiv. hardware. */
44905 +       blk_queue_hardsect_size(rq, sector_size);
44906 +       blk_queue_max_sectors(rq, 512);
44907 +
44908 +       /* Each segment in a request is up to an aligned page in size. */
44909 +       blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
44910 +       blk_queue_max_segment_size(rq, PAGE_SIZE);
44911 +
44912 +       /* Ensure a merged request will fit in a single I/O ring slot. */
44913 +       blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
44914 +       blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
44915 +
44916 +       /* Make sure buffer addresses are sector-aligned. */
44917 +       blk_queue_dma_alignment(rq, 511);
44918 +
44919 +       gd->queue = rq;
44920 +
44921 +       return 0;
44922 +}
44923 +
44924 +static int
44925 +xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice,
44926 +                   u16 vdisk_info, u16 sector_size,
44927 +                   struct blkfront_info *info)
44928 +{
44929 +       struct gendisk *gd;
44930 +       struct xlbd_major_info *mi;
44931 +       int nr_minors = 1;
44932 +       int err = -ENODEV;
44933 +
44934 +       BUG_ON(info->gd != NULL);
44935 +       BUG_ON(info->mi != NULL);
44936 +       BUG_ON(info->rq != NULL);
44937 +
44938 +       mi = xlbd_get_major_info(vdevice);
44939 +       if (mi == NULL)
44940 +               goto out;
44941 +       info->mi = mi;
44942 +
44943 +       if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0)
44944 +               nr_minors = 1 << mi->type->partn_shift;
44945 +
44946 +       gd = alloc_disk(nr_minors);
44947 +       if (gd == NULL)
44948 +               goto out;
44949 +
44950 +       if (nr_minors > 1)
44951 +               sprintf(gd->disk_name, "%s%c", mi->type->diskname,
44952 +                       'a' + mi->index * mi->type->disks_per_major +
44953 +                       (minor >> mi->type->partn_shift));
44954 +       else
44955 +               sprintf(gd->disk_name, "%s%c%d", mi->type->diskname,
44956 +                       'a' + mi->index * mi->type->disks_per_major +
44957 +                       (minor >> mi->type->partn_shift),
44958 +                       minor & ((1 << mi->type->partn_shift) - 1));
44959 +
44960 +       gd->major = mi->major;
44961 +       gd->first_minor = minor;
44962 +       gd->fops = &xlvbd_block_fops;
44963 +       gd->private_data = info;
44964 +       gd->driverfs_dev = &(info->xbdev->dev);
44965 +       set_capacity(gd, capacity);
44966 +
44967 +       if (xlvbd_init_blk_queue(gd, sector_size)) {
44968 +               put_disk(gd); /* not yet added via add_disk, so put, not del */
44969 +               goto out;
44970 +       }
44971 +
44972 +       info->rq = gd->queue;
44973 +
44974 +       if (vdisk_info & VDISK_READONLY)
44975 +               set_disk_ro(gd, 1);
44976 +
44977 +       if (vdisk_info & VDISK_REMOVABLE)
44978 +               gd->flags |= GENHD_FL_REMOVABLE;
44979 +
44980 +       if (vdisk_info & VDISK_CDROM)
44981 +               gd->flags |= GENHD_FL_CD;
44982 +
44983 +       info->gd = gd;
44984 +
44985 +       return 0;
44986 +
44987 + out:
44988 +       if (mi)
44989 +               xlbd_put_major_info(mi);
44990 +       info->mi = NULL;
44991 +       return err;
44992 +}
44993 +
44994 +int
44995 +xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
44996 +         u16 sector_size, struct blkfront_info *info)
44997 +{
44998 +       struct block_device *bd;
44999 +       int err = 0;
45000 +
45001 +       info->dev = MKDEV(BLKIF_MAJOR(vdevice), BLKIF_MINOR(vdevice));
45002 +
45003 +       bd = bdget(info->dev);
45004 +       if (bd == NULL)
45005 +               return -ENODEV;
45006 +
45007 +       err = xlvbd_alloc_gendisk(BLKIF_MINOR(vdevice), capacity, vdevice,
45008 +                                 vdisk_info, sector_size, info);
45009 +
45010 +       bdput(bd);
45011 +       return err;
45012 +}
45013 +
45014 +void
45015 +xlvbd_del(struct blkfront_info *info)
45016 +{
45017 +       if (info->mi == NULL)
45018 +               return;
45019 +
45020 +       BUG_ON(info->gd == NULL);
45021 +       del_gendisk(info->gd);
45022 +       put_disk(info->gd);
45023 +       info->gd = NULL;
45024 +
45025 +       xlbd_put_major_info(info->mi);
45026 +       info->mi = NULL;
45027 +
45028 +       BUG_ON(info->rq == NULL);
45029 +       blk_cleanup_queue(info->rq);
45030 +       info->rq = NULL;
45031 +}
45032 +
45033 +/*
45034 + * Local variables:
45035 + *  c-file-style: "linux"
45036 + *  indent-tabs-mode: t
45037 + *  c-indent-level: 8
45038 + *  c-basic-offset: 8
45039 + *  tab-width: 8
45040 + * End:
45041 + */
45042 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blktap/Makefile linux-2.6.16/drivers/xen/blktap/Makefile
45043 --- linux-2.6.16.orig/drivers/xen/blktap/Makefile       1970-01-01 01:00:00.000000000 +0100
45044 +++ linux-2.6.16/drivers/xen/blktap/Makefile    2006-06-26 09:51:32.000000000 +0200
45045 @@ -0,0 +1,3 @@
45046 +
45047 +obj-y  := xenbus.o interface.o blktap.o 
45048 +
45049 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blktap/blktap.c linux-2.6.16/drivers/xen/blktap/blktap.c
45050 --- linux-2.6.16.orig/drivers/xen/blktap/blktap.c       1970-01-01 01:00:00.000000000 +0100
45051 +++ linux-2.6.16/drivers/xen/blktap/blktap.c    2006-06-26 09:51:32.000000000 +0200
45052 @@ -0,0 +1,910 @@
45053 +/******************************************************************************
45054 + * arch/xen/drivers/blkif/blktap/blktap.c
45055 + * 
45056 + * This is a modified version of the block backend driver that remaps requests
45057 + * to a user-space memory region.  It is intended to be used to write 
45058 + * application-level servers that provide block interfaces to client VMs.
45059 + */
45060 +
45061 +#include <linux/kernel.h>
45062 +#include <linux/spinlock.h>
45063 +#include <xen/balloon.h>
45065 +#include <linux/fs.h>
45066 +#include <linux/mm.h>
45067 +#include <linux/miscdevice.h>
45068 +#include <linux/errno.h>
45069 +#include <linux/major.h>
45070 +#include <linux/gfp.h>
45071 +#include <linux/poll.h>
45072 +#include <asm/tlbflush.h>
45073 +#include "common.h"
45074 +
45075 +/* Only one process may open /dev/xen/blktap at any time. */
45076 +static unsigned long blktap_dev_inuse;
45077 +unsigned long blktap_ring_ok; /* make this ring->state */
45078 +
45079 +/* Rings up to user space. */
45080 +static blkif_front_ring_t blktap_ufe_ring;
45081 +
45082 +/* for poll: */
45083 +static wait_queue_head_t blktap_wait;
45084 +
45085 +/* current switching mode */
45086 +static unsigned long blktap_mode;
45087 +
45088 +/* local prototypes */
45089 +static int blktap_read_ufe_ring(void);
45090 +
45091 +
45092 +/* /dev/xen/blktap resides at device number major=10, minor=202        */
45093 +#define BLKTAP_MINOR 202
45094 +
45095 +/* blktap IOCTLs:                                                      */
45096 +#define BLKTAP_IOCTL_KICK_FE         1
45097 +#define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
45098 +#define BLKTAP_IOCTL_SETMODE         3
45099 +#define BLKTAP_IOCTL_PRINT_IDXS      100  
45100 +
45101 +/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
45102 +#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
45103 +#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
45104 +#define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp. */
45105 +#define BLKTAP_MODE_COPY_FE          0x00000004  /* unimp. */
45106 +#define BLKTAP_MODE_COPY_BE          0x00000008  /* unimp. */
45107 +#define BLKTAP_MODE_COPY_FE_PAGES    0x00000010  /* unimp. */
45108 +#define BLKTAP_MODE_COPY_BE_PAGES    0x00000020  /* unimp. */
45109 +
45110 +#define BLKTAP_MODE_INTERPOSE \
45111 +           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
45112 +
45113 +#define BLKTAP_MODE_COPY_BOTH \
45114 +           (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE)
45115 +
45116 +#define BLKTAP_MODE_COPY_BOTH_PAGES \
45117 +           (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES)
45118 +
45119 +static inline int BLKTAP_MODE_VALID(unsigned long arg)
45120 +{
45121 +       return ((arg == BLKTAP_MODE_PASSTHROUGH ) ||
45122 +               (arg == BLKTAP_MODE_INTERCEPT_FE) ||
45123 +               (arg == BLKTAP_MODE_INTERPOSE   ));
45124 +/*
45125 +  return (
45126 +  ( arg == BLKTAP_MODE_PASSTHROUGH  ) ||
45127 +  ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
45128 +  ( arg == BLKTAP_MODE_INTERCEPT_BE ) ||
45129 +  ( arg == BLKTAP_MODE_INTERPOSE    ) ||
45130 +  ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) ||
45131 +  ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) ||
45132 +  ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
45133 +  );
45134 +*/
45135 +}
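
Taken together, the device node, the SETMODE ioctl and the fixed-size mmap define the whole userspace contract. What follows is a sketch, not code from this patch: the ioctl and mode constants are copied from above, while TAP_PAGES assumes BLKIF_MAX_SEGMENTS_PER_REQUEST == 11 and 4 KiB pages, per the size check in blktap_mmap() below.

/* Hypothetical userspace attach sequence; constants mirror this file. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#define BLKTAP_IOCTL_SETMODE     3
#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
#define TAP_PAGES                (64 * 11 + 1)   /* MMAP_PAGES + RING_PAGES */

int tap_attach(void)
{
	void *area;
	int fd = open("/dev/xen/blktap", O_RDWR);

	if (fd < 0)
		return -1;

	/* Intercept frontend requests instead of passing them through. */
	if (ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERCEPT_FE) < 0)
		return -1;

	/* blktap_mmap() insists on exactly MMAP_PAGES + RING_PAGES pages:
	 * the shared ring page first, then the foreign data area. */
	area = mmap(NULL, TAP_PAGES * 4096, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (area == MAP_FAILED)
		return -1;

	return fd;   /* caller poll()s fd and consumes the front ring */
}
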
45136 +
45137 +
45138 +/******************************************************************
45139 + * MMAP REGION
45140 + */
45141 +
45142 +/*
45143 + * We use a big chunk of address space to map in-flight requests into,
45144 + * and export this region up to user-space.  See the comments in blkback
45145 + * about this -- the two must be kept in sync if the tap is used as a 
45146 + * passthrough.
45147 + */
45148 +
45149 +#define MAX_PENDING_REQS 64
45150 +#define BATCH_PER_DOMAIN 16
45151 +
45152 +/* immediately before the mmap area, we have a bunch of pages reserved
45153 + * for shared memory rings.
45154 + */
45155 +#define RING_PAGES 1 /* Front */ 
45156 +
45157 +/* Where things are inside the device mapping. */
45158 +struct vm_area_struct *blktap_vma = NULL;
45159 +unsigned long mmap_vstart;  /* Kernel pages for mapping in data. */
45160 +unsigned long rings_vstart; /* start of mmapped vma              */
45161 +unsigned long user_vstart;  /* start of user mappings            */
45162 +
45163 +#define MMAP_PAGES                                             \
45164 +       (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
45165 +#define MMAP_VADDR(_start, _req,_seg)                                  \
45166 +       (_start +                                                       \
45167 +        ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
45168 +        ((_seg) * PAGE_SIZE))
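
To make the layout concrete: assuming BLKIF_MAX_SEGMENTS_PER_REQUEST is 11 (its value in this era) and 4 KiB pages, each request owns a contiguous 11-page strip, so for example:

/*
 * Worked example (illustrative; assumes 11 segments/request, 4 KiB pages):
 *
 *   MMAP_VADDR(start, 3, 2) = start + (3 * 11 * 4096) + (2 * 4096)
 *                           = start + 35 * 4096
 *
 * i.e. segment 2 of request 3 is the 36th page of the region at 'start'.
 */
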
45169 +
45170 +/*
45171 + * Each outstanding request that we've passed to the lower device layers has a 
45172 + * 'pending_req' allocated to it. Each buffer_head that completes decrements 
45173 + * the pendcnt towards zero. When it hits zero, the specified domain has a 
45174 + * response queued for it, with the saved 'id' passed back.
45175 + */
45176 +typedef struct {
45177 +       blkif_t       *blkif;
45178 +       unsigned long  id;
45179 +       int            nr_pages;
45180 +       atomic_t       pendcnt;
45181 +       unsigned short operation;
45182 +       int            status;
45183 +} pending_req_t;
45184 +
45185 +/*
45186 + * We can't allocate pending_req's in order, since they may complete out of 
45187 + * order. We therefore maintain an allocation ring. This ring also indicates 
45188 + * when enough work has been passed down -- at that point the allocation ring 
45189 + * will be empty.
45190 + */
45191 +static pending_req_t pending_reqs[MAX_PENDING_REQS];
45192 +static unsigned char pending_ring[MAX_PENDING_REQS];
45193 +static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
45194 +/* NB. We use a different index type to differentiate from shared blk rings. */
45195 +typedef unsigned int PEND_RING_IDX;
45196 +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
45197 +static PEND_RING_IDX pending_prod, pending_cons;
45198 +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
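
The indices work like a bounded free-list: pending_prod - pending_cons counts free slots, so NR_PENDING_REQS is the in-flight count. The two operations, open-coded later in dispatch_rw_block_io() and blktap_read_ufe_ring(), amount to the following (helper names are illustrative):

/* Illustrative helpers; the driver open-codes these. */
static int alloc_pending_idx(void)
{
	/* caller must check NR_PENDING_REQS < MAX_PENDING_REQS first */
	return pending_ring[MASK_PEND_IDX(pending_cons++)];
}

static void free_pending_idx(int idx)
{
	unsigned long flags;
	spin_lock_irqsave(&pend_prod_lock, flags);
	pending_ring[MASK_PEND_IDX(pending_prod++)] = idx;
	spin_unlock_irqrestore(&pend_prod_lock, flags);
}
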
45199 +
45200 +/* Requests passing through the tap to the backend hijack the id field
45201 + * in the request message.  In it we put the AR index _AND_ the fe domid.
45202 + * the domid is used by the backend to map the pages properly.
45203 + */
45204 +
45205 +static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx)
45206 +{
45207 +       return ((fe_dom << 16) | MASK_PEND_IDX(idx));
45208 +}
45209 +
45210 +static inline PEND_RING_IDX ID_TO_IDX(unsigned long id)
45211 +{ 
45212 +       return (PEND_RING_IDX)(id & 0x0000ffff);
45213 +}
45214 +
45215 +static inline domid_t ID_TO_DOM(unsigned long id)
45216 +{ 
45217 +       return (domid_t)(id >> 16); 
45218 +}
45219 +
45220 +
45221 +
45222 +/******************************************************************
45223 + * GRANT HANDLES
45224 + */
45225 +
45226 +/* When grant tables are used to map a frame for device access, the
45227 + * returned handle must later be used to unmap the frame; this drops
45228 + * the ref count on the frame.
45229 + */
45230 +struct grant_handle_pair
45231 +{
45232 +       grant_handle_t kernel;
45233 +       grant_handle_t user;
45234 +};
45235 +static struct grant_handle_pair pending_grant_handles[MMAP_PAGES];
45236 +#define pending_handle(_idx, _i) \
45237 +    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
45238 +#define BLKTAP_INVALID_HANDLE(_g) \
45239 +    ((((_g)->kernel) == 0xFFFF) && (((_g)->user) == 0xFFFF))
45240 +#define BLKTAP_INVALIDATE_HANDLE(_g) do {       \
45241 +    (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \
45242 +    } while(0)
45243 +
45244 +
45245 +/******************************************************************
45246 + * BLKTAP VM OPS
45247 + */
45248 +
45249 +static struct page *blktap_nopage(struct vm_area_struct *vma,
45250 +                                 unsigned long address,
45251 +                                 int *type)
45252 +{
45253 +       /*
45254 +        * if the page has not been mapped in by the driver then generate
45255 +        * a SIGBUS to the domain.
45256 +        */
45257 +       force_sig(SIGBUS, current);
45258 +
45259 +       return 0;
45260 +}
45261 +
45262 +struct vm_operations_struct blktap_vm_ops = {
45263 +       .nopage = blktap_nopage,
45264 +};
45265 +
45266 +/******************************************************************
45267 + * BLKTAP FILE OPS
45268 + */
45269 +
45270 +static int blktap_open(struct inode *inode, struct file *filp)
45271 +{
45272 +       blkif_sring_t *sring;
45273 +
45274 +       if (test_and_set_bit(0, &blktap_dev_inuse))
45275 +               return -EBUSY;
45276 +    
45277 +       /* Allocate the fe ring. */
45278 +       sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
45279 +       if (sring == NULL)
45280 +               return -ENOMEM;
45281 +
45282 +       SetPageReserved(virt_to_page(sring));
45283 +    
45284 +       SHARED_RING_INIT(sring);
45285 +       FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE);
45286 +
45287 +       return 0;
45288 +}
45289 +
45290 +static int blktap_release(struct inode *inode, struct file *filp)
45291 +{
45292 +       blktap_dev_inuse = 0;
45293 +       blktap_ring_ok = 0;
45294 +
45295 +       /* Free the ring page. */
45296 +       ClearPageReserved(virt_to_page(blktap_ufe_ring.sring));
45297 +       free_page((unsigned long) blktap_ufe_ring.sring);
45298 +
45299 +       /* Clear any active mappings and free foreign map table */
45300 +       if (blktap_vma != NULL) {
45301 +               zap_page_range(
45302 +                       blktap_vma, blktap_vma->vm_start, 
45303 +                       blktap_vma->vm_end - blktap_vma->vm_start, NULL);
45304 +               blktap_vma = NULL;
45305 +       }
45306 +
45307 +       return 0;
45308 +}
45309 +
45310 +
45311 +/* Note on mmap:
45312 + * We need to map pages to user space in a way that will allow the block
45313 + * subsystem set up direct IO to them.  This couldn't be done before, because
45314 + * there isn't really a sane way to translate a user virtual address down to a 
45315 + * physical address when the page belongs to another domain.
45316 + *
45317 + * My first approach was to map the page in to kernel memory, add an entry
45318 + * for it in the physical frame list (using alloc_lomem_region as in blkback)
45319 + * and then attempt to map that page up to user space.  This is disallowed
45320 + * by xen though, which realizes that we don't really own the machine frame
45321 + * underlying the physical page.
45322 + *
45323 + * The new approach is to provide explicit support for this in xen linux.
45324 + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
45325 + * mapped from other vms.  vma->vm_private_data is set up as a mapping 
45326 + * from pages to actual page structs.  There is a new clause in get_user_pages
45327 + * that does the right thing for this sort of mapping.
45328 + */
45329 +static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
45330 +{
45331 +       int size;
45332 +       struct page **map;
45333 +       int i;
45334 +
45335 +       DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n",
45336 +               vma->vm_start, vma->vm_end);
45337 +
45338 +       vma->vm_flags |= VM_RESERVED;
45339 +       vma->vm_ops = &blktap_vm_ops;
45340 +
45341 +       size = vma->vm_end - vma->vm_start;
45342 +       if (size != ((MMAP_PAGES + RING_PAGES) << PAGE_SHIFT)) {
45343 +               printk(KERN_INFO 
45344 +                      "blktap: you _must_ map exactly %d pages!\n",
45345 +                      MMAP_PAGES + RING_PAGES);
45346 +               return -EAGAIN;
45347 +       }
45348 +
45349 +       size >>= PAGE_SHIFT;
45350 +       DPRINTK(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1);
45351 +    
45352 +       rings_vstart = vma->vm_start;
45353 +       user_vstart  = rings_vstart + (RING_PAGES << PAGE_SHIFT);
45354 +    
45355 +       /* Map the ring pages to the start of the region and reserve it. */
45356 +
45357 +       /* not sure if I really need to do this... */
45358 +       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
45359 +
45360 +       if (remap_pfn_range(vma, vma->vm_start, 
45361 +                           __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT, 
45362 +                           PAGE_SIZE, vma->vm_page_prot)) {
45363 +               WPRINTK("Mapping user ring failed!\n");
45364 +               goto fail;
45365 +       }
45366 +
45367 +       /* Mark this VM as containing foreign pages, and set up mappings. */
45368 +       map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
45369 +                     * sizeof(struct page *),
45370 +                     GFP_KERNEL);
45371 +       if (map == NULL) {
45372 +               WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
45373 +               goto fail;
45374 +       }
45375 +
45376 +       for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++)
45377 +               map[i] = NULL;
45378 +    
45379 +       vma->vm_private_data = map;
45380 +       vma->vm_flags |= VM_FOREIGN;
45381 +
45382 +       blktap_vma = vma;
45383 +       blktap_ring_ok = 1;
45384 +
45385 +       return 0;
45386 + fail:
45387 +       /* Clear any active mappings. */
45388 +       zap_page_range(vma, vma->vm_start, 
45389 +                      vma->vm_end - vma->vm_start, NULL);
45390 +
45391 +       return -ENOMEM;
45392 +}
45393 +
45394 +static int blktap_ioctl(struct inode *inode, struct file *filp,
45395 +                        unsigned int cmd, unsigned long arg)
45396 +{
45397 +       switch(cmd) {
45398 +       case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */
45399 +               return blktap_read_ufe_ring();
45400 +
45401 +       case BLKTAP_IOCTL_SETMODE:
45402 +               if (BLKTAP_MODE_VALID(arg)) {
45403 +                       blktap_mode = arg;
45404 +                       /* XXX: may need to flush rings here. */
45405 +                       printk(KERN_INFO "blktap: set mode to %lx\n", arg);
45406 +                       return 0;
45407 +               }
45408 +       case BLKTAP_IOCTL_PRINT_IDXS:
45409 +        {
45410 +               //print_fe_ring_idxs();
45411 +               WPRINTK("User Rings: \n-----------\n");
45412 +               WPRINTK("UF: rsp_cons: %2d, req_prod_pvt: %2d "
45413 +                       "| req_prod: %2d, rsp_prod: %2d\n",
45414 +                       blktap_ufe_ring.rsp_cons,
45415 +                       blktap_ufe_ring.req_prod_pvt,
45416 +                       blktap_ufe_ring.sring->req_prod,
45417 +                       blktap_ufe_ring.sring->rsp_prod);
45418 +            
45419 +        }
45420 +       }
45421 +       return -ENOIOCTLCMD;
45422 +}
45423 +
45424 +static unsigned int blktap_poll(struct file *file, poll_table *wait)
45425 +{
45426 +       poll_wait(file, &blktap_wait, wait);
45427 +       if (blktap_ufe_ring.req_prod_pvt != blktap_ufe_ring.sring->req_prod) {
45428 +               flush_tlb_all();
45429 +               RING_PUSH_REQUESTS(&blktap_ufe_ring);
45430 +               return POLLIN | POLLRDNORM;
45431 +       }
45432 +
45433 +       return 0;
45434 +}
45435 +
45436 +void blktap_kick_user(void)
45437 +{
45438 +       /* blktap_ring->req_prod = blktap_req_prod; */
45439 +       wake_up_interruptible(&blktap_wait);
45440 +}
45441 +
45442 +static struct file_operations blktap_fops = {
45443 +       .owner   = THIS_MODULE,
45444 +       .poll    = blktap_poll,
45445 +       .ioctl   = blktap_ioctl,
45446 +       .open    = blktap_open,
45447 +       .release = blktap_release,
45448 +       .mmap    = blktap_mmap,
45449 +};
45450 +
45451 +
45452 +
45453 +static int do_block_io_op(blkif_t *blkif, int max_to_do);
45454 +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
45455 +static void make_response(blkif_t *blkif, unsigned long id, 
45456 +                          unsigned short op, int st);
45457 +
45458 +
45459 +static void fast_flush_area(int idx, int nr_pages)
45460 +{
45461 +       struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
45462 +       unsigned int i, op = 0;
45463 +       struct grant_handle_pair *handle;
45464 +       uint64_t ptep;
45465 +       int ret;
45466 +
45467 +       for ( i = 0; i < nr_pages; i++)
45468 +       {
45469 +               handle = &pending_handle(idx, i);
45470 +               if (BLKTAP_INVALID_HANDLE(handle))
45471 +                       continue;
45472 +
45473 +               unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i);
45474 +               unmap[op].dev_bus_addr = 0;
45475 +               unmap[op].handle = handle->kernel;
45476 +               op++;
45477 +
45478 +               if (create_lookup_pte_addr(
45479 +                           blktap_vma->vm_mm,
45480 +                           MMAP_VADDR(user_vstart, idx, i), 
45481 +                           &ptep) !=0) {
45482 +                       DPRINTK("Couldn't get a pte addr!\n");
45483 +                       return;
45484 +               }
45485 +               unmap[op].host_addr    = ptep;
45486 +               unmap[op].dev_bus_addr = 0;
45487 +               unmap[op].handle       = handle->user;
45488 +               op++;
45489 +            
45490 +               BLKTAP_INVALIDATE_HANDLE(handle);
45491 +       }
45492 +
45493 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, op);
45494 +       BUG_ON(ret);
45495 +
45496 +       if (blktap_vma != NULL)
45497 +               zap_page_range(blktap_vma, 
45498 +                              MMAP_VADDR(user_vstart, idx, 0), 
45499 +                              nr_pages << PAGE_SHIFT, NULL);
45500 +}
45501 +
45502 +/******************************************************************
45503 + * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
45504 + */
45505 +
45506 +static struct list_head blkio_schedule_list;
45507 +static spinlock_t blkio_schedule_list_lock;
45508 +
45509 +static int __on_blkdev_list(blkif_t *blkif)
45510 +{
45511 +       return blkif->blkdev_list.next != NULL;
45512 +}
45513 +
45514 +static void remove_from_blkdev_list(blkif_t *blkif)
45515 +{
45516 +       unsigned long flags;
45517 +
45518 +       if (!__on_blkdev_list(blkif))
45519 +               return;
45520 +
45521 +       spin_lock_irqsave(&blkio_schedule_list_lock, flags);
45522 +       if (__on_blkdev_list(blkif)) {
45523 +               list_del(&blkif->blkdev_list);
45524 +               blkif->blkdev_list.next = NULL;
45525 +               blkif_put(blkif);
45526 +       }
45527 +       spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
45528 +}
45529 +
45530 +static void add_to_blkdev_list_tail(blkif_t *blkif)
45531 +{
45532 +       unsigned long flags;
45533 +
45534 +       if (__on_blkdev_list(blkif))
45535 +               return;
45536 +
45537 +       spin_lock_irqsave(&blkio_schedule_list_lock, flags);
45538 +       if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
45539 +               list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
45540 +               blkif_get(blkif);
45541 +       }
45542 +       spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
45543 +}
45544 +
45545 +
45546 +/******************************************************************
45547 + * SCHEDULER FUNCTIONS
45548 + */
45549 +
45550 +static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
45551 +
45552 +static int blkio_schedule(void *arg)
45553 +{
45554 +       DECLARE_WAITQUEUE(wq, current);
45555 +
45556 +       blkif_t          *blkif;
45557 +       struct list_head *ent;
45558 +
45559 +       daemonize("xenblkd");
45560 +
45561 +       for (;;) {
45562 +               /* Wait for work to do. */
45563 +               add_wait_queue(&blkio_schedule_wait, &wq);
45564 +               set_current_state(TASK_INTERRUPTIBLE);
45565 +               if ((NR_PENDING_REQS == MAX_PENDING_REQS) || 
45566 +                   list_empty(&blkio_schedule_list))
45567 +                       schedule();
45568 +               __set_current_state(TASK_RUNNING);
45569 +               remove_wait_queue(&blkio_schedule_wait, &wq);
45570 +
45571 +               /* Queue up a batch of requests. */
45572 +               while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
45573 +                      !list_empty(&blkio_schedule_list)) {
45574 +                       ent = blkio_schedule_list.next;
45575 +                       blkif = list_entry(ent, blkif_t, blkdev_list);
45576 +                       blkif_get(blkif);
45577 +                       remove_from_blkdev_list(blkif);
45578 +                       if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
45579 +                               add_to_blkdev_list_tail(blkif);
45580 +                       blkif_put(blkif);
45581 +               }
45582 +       }
45583 +}
45584 +
45585 +static void maybe_trigger_blkio_schedule(void)
45586 +{
45587 +       /*
45588 +        * Needed so that two processes, who together make the following
45589 +        * predicate true, don't both read stale values and evaluate the
45590 +        * predicate incorrectly. Incredibly unlikely to stall the scheduler
45591 +        * on the x86, but...
45592 +        */
45593 +       smp_mb();
45594 +
45595 +       if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
45596 +           !list_empty(&blkio_schedule_list))
45597 +               wake_up(&blkio_schedule_wait);
45598 +}
45599 +
45600 +
45601 +
45602 +/******************************************************************
45603 + * COMPLETION CALLBACK -- Called as bh->b_end_io()
45604 + */
45605 +
45606 +
45607 +static int blktap_read_ufe_ring(void)
45608 +{
45609 +       /* This is called to read responses from the UFE ring. */
45610 +
45611 +       RING_IDX i, j, rp;
45612 +       blkif_response_t *resp;
45613 +       blkif_t *blkif;
45614 +       int pending_idx;
45615 +       pending_req_t *pending_req;
45616 +       unsigned long     flags;
45617 +
45618 +       /* If we are forwarding from the UFE ring to the FE ring */
45619 +       if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {
45620 +
45621 +               /* for each outstanding message on the UFE ring */
45622 +               rp = blktap_ufe_ring.sring->rsp_prod;
45623 +               rmb();
45624 +        
45625 +               for (i = blktap_ufe_ring.rsp_cons; i != rp; i++) {
45626 +                       resp = RING_GET_RESPONSE(&blktap_ufe_ring, i);
45627 +                       pending_idx = MASK_PEND_IDX(ID_TO_IDX(resp->id));
45628 +                       pending_req = &pending_reqs[pending_idx];
45629 +            
45630 +                       blkif = pending_req->blkif;
45631 +                       for (j = 0; j < pending_req->nr_pages; j++) {
45632 +                               unsigned long vaddr;
45633 +                               struct page **map = blktap_vma->vm_private_data;
45634 +                               int offset; 
45635 +
45636 +                               vaddr  = MMAP_VADDR(user_vstart, pending_idx, j);
45637 +                               offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
45638 +
45639 +                               //ClearPageReserved(virt_to_page(vaddr));
45640 +                               ClearPageReserved((struct page *)map[offset]);
45641 +                               map[offset] = NULL;
45642 +                       }
45643 +
45644 +                       fast_flush_area(pending_idx, pending_req->nr_pages);
45645 +                       make_response(blkif, pending_req->id, resp->operation, 
45646 +                                     resp->status);
45647 +                       blkif_put(pending_req->blkif);
45648 +                       spin_lock_irqsave(&pend_prod_lock, flags);
45649 +                       pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
45650 +                       spin_unlock_irqrestore(&pend_prod_lock, flags);
45651 +               }
45652 +               blktap_ufe_ring.rsp_cons = i;
45653 +               maybe_trigger_blkio_schedule();
45654 +       }
45655 +       return 0;
45656 +}
45657 +
45658 +
45659 +/******************************************************************************
45660 + * NOTIFICATION FROM GUEST OS.
45661 + */
45662 +
45663 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
45664 +{
45665 +       blkif_t *blkif = dev_id;
45666 +       add_to_blkdev_list_tail(blkif);
45667 +       maybe_trigger_blkio_schedule();
45668 +       return IRQ_HANDLED;
45669 +}
45670 +
45671 +
45672 +
45673 +/******************************************************************
45674 + * DOWNWARD CALLS -- These interface with the block-device layer proper.
45675 + */
45676 +
45677 +static int do_block_io_op(blkif_t *blkif, int max_to_do)
45678 +{
45679 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
45680 +       blkif_request_t *req;
45681 +       RING_IDX i, rp;
45682 +       int more_to_do = 0;
45683 +    
45684 +       rp = blk_ring->sring->req_prod;
45685 +       rmb(); /* Ensure we see queued requests up to 'rp'. */
45686 +
45687 +       for (i = blk_ring->req_cons; 
45688 +            (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
45689 +            i++ ) {
45690 +               if ((max_to_do-- == 0) ||
45691 +                   (NR_PENDING_REQS == MAX_PENDING_REQS)) {
45692 +                       more_to_do = 1;
45693 +                       break;
45694 +               }
45695 +        
45696 +               req = RING_GET_REQUEST(blk_ring, i);
45697 +               switch (req->operation) {
45698 +               case BLKIF_OP_READ:
45699 +               case BLKIF_OP_WRITE:
45700 +                       dispatch_rw_block_io(blkif, req);
45701 +                       break;
45702 +
45703 +               default:
45704 +                       DPRINTK("error: unknown block io operation [%d]\n",
45705 +                               req->operation);
45706 +                       make_response(blkif, req->id, req->operation,
45707 +                                     BLKIF_RSP_ERROR);
45708 +                       break;
45709 +               }
45710 +       }
45711 +
45712 +       blk_ring->req_cons = i;
45713 +       blktap_kick_user();
45714 +
45715 +       return more_to_do;
45716 +}
45717 +
45718 +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
45719 +{
45720 +       blkif_request_t *target;
45721 +       int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
45722 +       pending_req_t *pending_req;
45723 +       struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
45724 +       int op, ret;
45725 +       unsigned int nseg;
45726 +       int retval;
45727 +
45728 +       /* Check that number of segments is sane. */
45729 +       nseg = req->nr_segments;
45730 +       if (unlikely(nseg == 0) || 
45731 +           unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
45732 +               DPRINTK("Bad number of segments in request (%d)\n", nseg);
45733 +               goto bad_descriptor;
45734 +       }
45735 +
45736 +       /* Make sure userspace is ready. */
45737 +       if (!blktap_ring_ok) {
45738 +               DPRINTK("blktap: ring not ready for requests!\n");
45739 +               goto bad_descriptor;
45740 +       }
45741 +    
45742 +
45743 +       if (RING_FULL(&blktap_ufe_ring)) {
45744 +               WPRINTK("blktap: fe_ring is full, can't add "
45745 +                       "(very broken!).\n");
45746 +               goto bad_descriptor;
45747 +       }
45748 +
45749 +       flush_cache_all(); /* a noop on intel... */
45750 +
45751 +       /* Map the foreign pages directly into the application. */
45752 +       op = 0;
45753 +       for (i = 0; i < req->nr_segments; i++) {
45754 +
45755 +               unsigned long uvaddr;
45756 +               unsigned long kvaddr;
45757 +               uint64_t ptep;
45758 +
45759 +               uvaddr = MMAP_VADDR(user_vstart, pending_idx, i);
45760 +               kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);
45761 +
45762 +               /* Map the remote page to kernel. */
45763 +               map[op].host_addr = kvaddr;
45764 +               map[op].dom   = blkif->domid;
45765 +               map[op].ref   = req->seg[i].gref;
45766 +               map[op].flags = GNTMAP_host_map;
45767 +               /* This needs a bit more thought in terms of interposition: 
45768 +                * If we want to be able to modify pages during write using 
45769 +                * grant table mappings, the guest will either need to allow 
45770 +                * it, or we'll need to incur a copy. Bit of an fbufs moment. ;) */
45771 +               if (req->operation == BLKIF_OP_WRITE)
45772 +                       map[op].flags |= GNTMAP_readonly;
45773 +               op++;
45774 +
45775 +               /* Now map it to user. */
45776 +               ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep);
45777 +               if (ret) {
45778 +                       DPRINTK("Couldn't get a pte addr!\n");
45779 +                       fast_flush_area(pending_idx, req->nr_segments);
45780 +                       goto bad_descriptor;
45781 +               }
45782 +
45783 +               map[op].host_addr = ptep;
45784 +               map[op].dom       = blkif->domid;
45785 +               map[op].ref       = req->seg[i].gref;
45786 +               map[op].flags     = GNTMAP_host_map | GNTMAP_application_map
45787 +                       | GNTMAP_contains_pte;
45788 +               /* Above interposition comment applies here as well. */
45789 +               if (req->operation == BLKIF_OP_WRITE)
45790 +                       map[op].flags |= GNTMAP_readonly;
45791 +               op++;
45792 +       }
45793 +
45794 +       retval = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
45795 +       BUG_ON(retval);
45796 +
45797 +       op = 0;
45798 +       for (i = 0; i < (req->nr_segments*2); i += 2) {
45799 +               unsigned long uvaddr;
45800 +               unsigned long kvaddr;
45801 +               unsigned long offset;
45802 +               int cancel = 0;
45803 +
45804 +               uvaddr = MMAP_VADDR(user_vstart, pending_idx, i/2);
45805 +               kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i/2);
45806 +
45807 +               if (unlikely(map[i].status)) {
45808 +                       DPRINTK("Error on kernel grant mapping (%d)\n",
45809 +                               map[i].status);
45810 +                       ret = map[i].status;
45811 +                       cancel = 1;
45812 +               }
45813 +
45814 +               if (unlikely(map[i+1].status)) {
45815 +                       DPRINTK("Error on user grant mapping (%d)\n",
45816 +                               map[i+1].status);
45817 +                       ret = map[i+1].status;
45818 +                       cancel = 1;
45819 +               }
45820 +
45821 +               if (cancel) {
45822 +                       fast_flush_area(pending_idx, req->nr_segments);
45823 +                       goto bad_descriptor;
45824 +               }
45825 +
45826 +               /* Set the necessary mappings in p2m and in the VM_FOREIGN 
45827 +                * vm_area_struct to allow user vaddr -> struct page lookups
45828 +                * to work.  This is needed for direct IO to foreign pages. */
45829 +               set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
45830 +                               FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
45831 +
45832 +               offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT;
45833 +               ((struct page **)blktap_vma->vm_private_data)[offset] =
45834 +                       pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT);
45835 +
45836 +               /* Save handles for unmapping later. */
45837 +               pending_handle(pending_idx, i/2).kernel = map[i].handle;
45838 +               pending_handle(pending_idx, i/2).user   = map[i+1].handle;
45839 +       }
45840 +
45841 +       /* Mark mapped pages as reserved: */
45842 +       for (i = 0; i < req->nr_segments; i++) {
45843 +               unsigned long kvaddr;
45844 +               kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);
45845 +               SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT));
45846 +       }
45847 +
45848 +       pending_req = &pending_reqs[pending_idx];
45849 +       pending_req->blkif     = blkif;
45850 +       pending_req->id        = req->id;
45851 +       pending_req->operation = req->operation;
45852 +       pending_req->status    = BLKIF_RSP_OKAY;
45853 +       pending_req->nr_pages  = nseg;
45854 +       req->id = MAKE_ID(blkif->domid, pending_idx);
45855 +       //atomic_set(&pending_req->pendcnt, nbio);
45856 +       pending_cons++;
45857 +       blkif_get(blkif);
45858 +
45859 +       /* Finally, write the request message to the user ring. */
45860 +       target = RING_GET_REQUEST(&blktap_ufe_ring,
45861 +                                 blktap_ufe_ring.req_prod_pvt);
45862 +       memcpy(target, req, sizeof(*req));
45863 +       blktap_ufe_ring.req_prod_pvt++;
45864 +       return;
45865 +
45866 + bad_descriptor:
45867 +       make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
45868 +} 
45869 +
45870 +
45871 +
45872 +/******************************************************************
45873 + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
45874 + */
45875 +
45876 +
45877 +static void make_response(blkif_t *blkif, unsigned long id, 
45878 +                          unsigned short op, int st)
45879 +{
45880 +       blkif_response_t *resp;
45881 +       unsigned long     flags;
45882 +       blkif_back_ring_t *blk_ring = &blkif->blk_ring;
45883 +
45884 +       /* Place on the response ring for the relevant domain. */ 
45885 +       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
45886 +       resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
45887 +       resp->id        = id;
45888 +       resp->operation = op;
45889 +       resp->status    = st;
45890 +       wmb(); /* Ensure other side can see the response fields. */
45891 +       blk_ring->rsp_prod_pvt++;
45892 +       RING_PUSH_RESPONSES(blk_ring);
45893 +       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
45894 +
45895 +       /* Kick the relevant domain. */
45896 +       notify_remote_via_irq(blkif->irq);
45897 +}
45898 +
45899 +static struct miscdevice blktap_miscdev = {
45900 +       .minor        = BLKTAP_MINOR,
45901 +       .name         = "blktap",
45902 +       .fops         = &blktap_fops,
45903 +       .devfs_name   = "misc/blktap",
45904 +};
45905 +
45906 +void blkif_deschedule(blkif_t *blkif)
45907 +{
45908 +       remove_from_blkdev_list(blkif);
45909 +}
45910 +
45911 +static int __init blkif_init(void)
45912 +{
45913 +       int i, j, err;
45914 +       struct page *page;
45915 +
45916 +       blkif_interface_init();
45917 +
45918 +       page = balloon_alloc_empty_page_range(MMAP_PAGES);
45919 +       BUG_ON(page == NULL);
45920 +       mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
45921 +
45922 +       pending_cons = 0;
45923 +       pending_prod = MAX_PENDING_REQS;
45924 +       memset(pending_reqs, 0, sizeof(pending_reqs));
45925 +       for ( i = 0; i < MAX_PENDING_REQS; i++ )
45926 +               pending_ring[i] = i;
45927 +    
45928 +       spin_lock_init(&blkio_schedule_list_lock);
45929 +       INIT_LIST_HEAD(&blkio_schedule_list);
45930 +
45931 +       i = kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES);
45932 +       BUG_ON(i<0);
45933 +
45934 +       blkif_xenbus_init();
45935 +
45936 +       for (i = 0; i < MAX_PENDING_REQS ; i++)
45937 +               for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
45938 +                       BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j));
45939 +
45940 +       err = misc_register(&blktap_miscdev);
45941 +       if (err != 0) {
45942 +               printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n",
45943 +                      err);
45944 +               return err;
45945 +       }
45946 +
45947 +       init_waitqueue_head(&blktap_wait);
45948 +
45949 +       return 0;
45950 +}
45951 +
45952 +__initcall(blkif_init);
45953 +
45954 +/*
45955 + * Local variables:
45956 + *  c-file-style: "linux"
45957 + *  indent-tabs-mode: t
45958 + *  c-indent-level: 8
45959 + *  c-basic-offset: 8
45960 + *  tab-width: 8
45961 + * End:
45962 + */
45963 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blktap/common.h linux-2.6.16/drivers/xen/blktap/common.h
45964 --- linux-2.6.16.orig/drivers/xen/blktap/common.h       1970-01-01 01:00:00.000000000 +0100
45965 +++ linux-2.6.16/drivers/xen/blktap/common.h    2006-06-26 09:51:32.000000000 +0200
45966 @@ -0,0 +1,110 @@
45967 +
45968 +#ifndef __BLKIF__BACKEND__COMMON_H__
45969 +#define __BLKIF__BACKEND__COMMON_H__
45970 +
45971 +#include <linux/config.h>
45972 +#include <linux/version.h>
45973 +#include <linux/module.h>
45974 +#include <linux/interrupt.h>
45975 +#include <linux/slab.h>
45976 +#include <linux/blkdev.h>
45977 +#include <linux/vmalloc.h>
45978 +#include <asm/io.h>
45979 +#include <asm/setup.h>
45980 +#include <asm/pgalloc.h>
45981 +#include <xen/evtchn.h>
45982 +#include <asm/hypervisor.h>
45983 +#include <xen/interface/io/blkif.h>
45984 +#include <xen/interface/io/ring.h>
45985 +#include <xen/gnttab.h>
45986 +#include <xen/driver_util.h>
45987 +
45988 +#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
45989 +                                    __FILE__ , __LINE__ , ## _a )
45990 +
45991 +#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
45992 +
45993 +struct vbd {
45994 +       blkif_vdev_t   handle;      /* what the domain refers to this vbd as */
45995 +       unsigned char  readonly;    /* Non-zero -> read-only */
45996 +       unsigned char  type;        /* VDISK_xxx */
45997 +       u32            pdevice;     /* phys device that this vbd maps to */
45998 +       struct block_device *bdev;
45999 +}; 
46000 +
46001 +typedef struct blkif_st {
46002 +       /* Unique identifier for this interface. */
46003 +       domid_t           domid;
46004 +       unsigned int      handle;
46005 +       /* Physical parameters of the comms window. */
46006 +       unsigned int      evtchn;
46007 +       unsigned int      irq;
46008 +       /* Comms information. */
46009 +       blkif_back_ring_t blk_ring;
46010 +       struct vm_struct *blk_ring_area;
46011 +       /* VBDs attached to this interface. */
46012 +       struct vbd        vbd;
46013 +       /* Private fields. */
46014 +       enum { DISCONNECTED, CONNECTED } status;
46015 +#ifdef CONFIG_XEN_BLKDEV_TAP_BE
46016 +       /* Is this a blktap frontend */
46017 +       unsigned int     is_blktap;
46018 +#endif
46019 +       struct list_head blkdev_list;
46020 +       spinlock_t       blk_ring_lock;
46021 +       atomic_t         refcnt;
46022 +
46023 +       struct work_struct free_work;
46024 +
46025 +       grant_handle_t   shmem_handle;
46026 +       grant_ref_t      shmem_ref;
46027 +} blkif_t;
46028 +
46029 +blkif_t *alloc_blkif(domid_t domid);
46030 +void free_blkif_callback(blkif_t *blkif);
46031 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
46032 +
46033 +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
46034 +#define blkif_put(_b)                             \
46035 +    do {                                          \
46036 +        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
46037 +            free_blkif_callback(_b);             \
46038 +    } while (0)
46039 +
46040 +/* Create a vbd. */
46041 +int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, u32 pdevice,
46042 +              int readonly);
46043 +void vbd_free(struct vbd *vbd);
46044 +
46045 +unsigned long vbd_size(struct vbd *vbd);
46046 +unsigned int vbd_info(struct vbd *vbd);
46047 +unsigned long vbd_secsize(struct vbd *vbd);
46048 +
46049 +struct phys_req {
46050 +       unsigned short       dev;
46051 +       unsigned short       nr_sects;
46052 +       struct block_device *bdev;
46053 +       blkif_sector_t       sector_number;
46054 +};
46055 +
46056 +int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); 
46057 +
46058 +void blkif_interface_init(void);
46059 +
46060 +void blkif_deschedule(blkif_t *blkif);
46061 +
46062 +void blkif_xenbus_init(void);
46063 +
46064 +irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
46065 +
46066 +#endif /* __BLKIF__BACKEND__COMMON_H__ */
46067 +
46068 +/*
46069 + * Local variables:
46070 + *  c-file-style: "linux"
46071 + *  indent-tabs-mode: t
46072 + *  c-indent-level: 8
46073 + *  c-basic-offset: 8
46074 + *  tab-width: 8
46075 + * End:
46076 + */
46077 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blktap/interface.c linux-2.6.16/drivers/xen/blktap/interface.c
46078 --- linux-2.6.16.orig/drivers/xen/blktap/interface.c    1970-01-01 01:00:00.000000000 +0100
46079 +++ linux-2.6.16/drivers/xen/blktap/interface.c 2006-06-26 09:51:32.000000000 +0200
46080 @@ -0,0 +1,146 @@
46081 +/******************************************************************************
46082 + * arch/xen/drivers/blkif/backend/interface.c
46083 + * 
46084 + * Block-device interface management.
46085 + * 
46086 + * Copyright (c) 2004, Keir Fraser
46087 + */
46088 +
46089 +#include "common.h"
46090 +#include <xen/evtchn.h>
46091 +
46092 +static kmem_cache_t *blkif_cachep;
46093 +
46094 +blkif_t *alloc_blkif(domid_t domid)
46095 +{
46096 +       blkif_t *blkif;
46097 +
46098 +       blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
46099 +       if (!blkif)
46100 +               return ERR_PTR(-ENOMEM);
46101 +
46102 +       memset(blkif, 0, sizeof(*blkif));
46103 +       blkif->domid = domid;
46104 +       blkif->status = DISCONNECTED;
46105 +       spin_lock_init(&blkif->blk_ring_lock);
46106 +       atomic_set(&blkif->refcnt, 1);
46107 +
46108 +       return blkif;
46109 +}
46110 +
46111 +static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
46112 +{
46113 +       struct gnttab_map_grant_ref op;
46114 +       int ret;
46115 +
46116 +       op.host_addr = (unsigned long)blkif->blk_ring_area->addr;
46117 +       op.flags     = GNTMAP_host_map;
46118 +       op.ref       = shared_page;
46119 +       op.dom       = blkif->domid;
46120 +
46121 +       lock_vm_area(blkif->blk_ring_area);
46122 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
46123 +       unlock_vm_area(blkif->blk_ring_area);
46124 +       BUG_ON(ret);
46125 +
46126 +       if (op.status) {
46127 +               DPRINTK("Grant table operation failure!\n");
46128 +               return op.status;
46129 +       }
46130 +
46131 +       blkif->shmem_ref    = shared_page;
46132 +       blkif->shmem_handle = op.handle;
46133 +
46134 +       return 0;
46135 +}
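+
+/*
+ * 'shared_page' is the grant reference published by the frontend (see
+ * xenbus.c); the hypercall above maps the granted frame at
+ * blk_ring_area->addr so both domains share the ring page.
+ */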
46136 +
46137 +static void unmap_frontend_page(blkif_t *blkif)
46138 +{
46139 +       struct gnttab_unmap_grant_ref op;
46140 +       int ret;
46141 +
46142 +       op.host_addr    = (unsigned long)blkif->blk_ring_area->addr;
46143 +       op.handle       = blkif->shmem_handle;
46144 +       op.dev_bus_addr = 0;
46145 +
46146 +       lock_vm_area(blkif->blk_ring_area);
46147 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
46148 +       unlock_vm_area(blkif->blk_ring_area);
46149 +       BUG_ON(ret);
46150 +}
46151 +
46152 +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
46153 +{
46154 +       blkif_sring_t *sring;
46155 +       int err;
46156 +       evtchn_op_t op = {
46157 +               .cmd = EVTCHNOP_bind_interdomain,
46158 +               .u.bind_interdomain.remote_dom  = blkif->domid,
46159 +               .u.bind_interdomain.remote_port = evtchn };
46160 +
46161 +       if ((blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL)
46162 +               return -ENOMEM;
46163 +
46164 +       err = map_frontend_page(blkif, shared_page);
46165 +       if (err) {
46166 +               free_vm_area(blkif->blk_ring_area);
46167 +               return err;
46168 +       }
46169 +
46170 +       err = HYPERVISOR_event_channel_op(&op);
46171 +       if (err) {
46172 +               unmap_frontend_page(blkif);
46173 +               free_vm_area(blkif->blk_ring_area);
46174 +               return err;
46175 +       }
46176 +
46177 +       blkif->evtchn = op.u.bind_interdomain.local_port;
46178 +
46179 +       sring = (blkif_sring_t *)blkif->blk_ring_area->addr;
46180 +       BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
46181 +
46182 +       blkif->irq = bind_evtchn_to_irqhandler(
46183 +               blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
46184 +
46185 +       blkif->status = CONNECTED;
46186 +
46187 +       return 0;
46188 +}
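+
+/*
+ * Connect sequence: map the frontend's shared ring page, bind an
+ * interdomain event channel to the frontend's port, initialise the
+ * back ring over the shared page, then wire the local port to
+ * blkif_be_int() before marking the interface CONNECTED.
+ */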
46189 +
46190 +static void free_blkif(void *arg)
46191 +{
46192 +       blkif_t *blkif = (blkif_t *)arg;
46193 +
46194 +       if (blkif->irq)
46195 +               unbind_from_irqhandler(blkif->irq, blkif);
46196 +
46197 +       if (blkif->blk_ring.sring) {
46198 +               unmap_frontend_page(blkif);
46199 +               free_vm_area(blkif->blk_ring_area);
46200 +               blkif->blk_ring.sring = NULL;
46201 +       }
46202 +
46203 +       kmem_cache_free(blkif_cachep, blkif);
46204 +}
46205 +
46206 +void free_blkif_callback(blkif_t *blkif)
46207 +{
46208 +       INIT_WORK(&blkif->free_work, free_blkif, (void *)blkif);
46209 +       schedule_work(&blkif->free_work);
46210 +}
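+
+/*
+ * free_blkif() runs from the workqueue scheduled above: it unbinds the
+ * irq handler, unmaps and frees the shared ring if one was connected,
+ * and returns the blkif to its slab cache.
+ */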
46211 +
46212 +void __init blkif_interface_init(void)
46213 +{
46214 +       blkif_cachep = kmem_cache_create(
46215 +               "blkif_cache", sizeof(blkif_t), 0, 0, NULL, NULL);
46216 +}
46217 +
46218 +/*
46219 + * Local variables:
46220 + *  c-file-style: "linux"
46221 + *  indent-tabs-mode: t
46222 + *  c-indent-level: 8
46223 + *  c-basic-offset: 8
46224 + *  tab-width: 8
46225 + * End:
46226 + */
46227 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/blktap/xenbus.c linux-2.6.16/drivers/xen/blktap/xenbus.c
46228 --- linux-2.6.16.orig/drivers/xen/blktap/xenbus.c       1970-01-01 01:00:00.000000000 +0100
46229 +++ linux-2.6.16/drivers/xen/blktap/xenbus.c    2006-06-26 09:51:32.000000000 +0200
46230 @@ -0,0 +1,233 @@
46231 +/*  Xenbus code for blkif tap
46232 +
46233 +    A Warfield.
46234 +
46235 +    Hastily modified from the original backend code:
46236 +
46237 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
46238 +
46239 +    This program is free software; you can redistribute it and/or modify
46240 +    it under the terms of the GNU General Public License as published by
46241 +    the Free Software Foundation; either version 2 of the License, or
46242 +    (at your option) any later version.
46243 +
46244 +    This program is distributed in the hope that it will be useful,
46245 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
46246 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
46247 +    GNU General Public License for more details.
46248 +
46249 +    You should have received a copy of the GNU General Public License
46250 +    along with this program; if not, write to the Free Software
46251 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
46252 +*/
46253 +
46254 +#include <stdarg.h>
46255 +#include <linux/module.h>
46256 +#include <xen/xenbus.h>
46257 +#include "common.h"
46258 +
46259 +struct backend_info
46260 +{
46261 +       struct xenbus_device *dev;
46262 +
46263 +       /* our communications channel */
46264 +       blkif_t *blkif;
46265 +
46266 +       long int frontend_id;
46267 +
46268 +       /* watch back end for changes */
46269 +       struct xenbus_watch backend_watch;
46270 +
46271 +       /* watch front end for changes */
46272 +       struct xenbus_watch watch;
46273 +       char *frontpath;
46274 +};
46275 +
46276 +static int blkback_remove(struct xenbus_device *dev)
46277 +{
46278 +       struct backend_info *be = dev->data;
46279 +
46280 +       if (be->watch.node)
46281 +               unregister_xenbus_watch(&be->watch);
46282 +       unregister_xenbus_watch(&be->backend_watch);
46283 +       if (be->blkif)
46284 +               blkif_put(be->blkif);
46285 +       kfree(be->frontpath);
46286 +       kfree(be);
46287 +       return 0;
46288 +}
46289 +
46290 +/* Frontend tells us the shared-ring grant reference and event channel. */
46291 +static void frontend_changed(struct xenbus_watch *watch,
46292 +                            const char **vec, unsigned int len)
46293 +{
46294 +       unsigned long ring_ref;
46295 +       unsigned int evtchn;
46296 +       int err;
46297 +       struct backend_info *be
46298 +               = container_of(watch, struct backend_info, watch);
46299 +
46300 +       /* If other end is gone, delete ourself. */
46301 +       if (vec && !xenbus_exists(XBT_NULL, be->frontpath, "")) {
46302 +               xenbus_rm(XBT_NULL, be->dev->nodename, "");
46303 +               device_unregister(&be->dev->dev);
46304 +               return;
46305 +       }
46306 +       if (be->blkif == NULL || be->blkif->status == CONNECTED)
46307 +               return;
46308 +
46309 +       err = xenbus_gather(XBT_NULL, be->frontpath,
46310 +                           "ring-ref", "%lu", &ring_ref,
46311 +                           "event-channel", "%u", &evtchn, NULL);
46311 +       if (err) {
46312 +               xenbus_dev_error(be->dev, err,
46313 +                                "reading %s/ring-ref and event-channel",
46314 +                                be->frontpath);
46315 +               return;
46316 +       }
46317 +
46318 +       /* Map the shared frame, irq etc. */
46319 +       err = blkif_map(be->blkif, ring_ref, evtchn);
46320 +       if (err) {
46321 +               xenbus_dev_error(be->dev, err, "mapping ring-ref %lu port %u",
46322 +                                ring_ref, evtchn);
46323 +               goto abort;
46324 +       }
46325 +
46326 +       xenbus_dev_ok(be->dev);
46327 +
46328 +       return;
46329 +
46330 +abort:
46331 +       xenbus_transaction_end(1);
46332 +}
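+
+/*
+ * The frontend publishes "ring-ref" (a grant reference for the shared
+ * ring page) and "event-channel" (its unbound port) under its xenbus
+ * directory; frontend_changed() gathers both and hands them to
+ * blkif_map() to complete the connection.
+ */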
46333 +
46334 +/*
46335 +   Setup supplies the physical device.
46336 +   The backend watches its own xenbus node for configuration changes;
46337 +   the frontend supplies the shared ring frame and event channel.
46338 + */
46339 +static void backend_changed(struct xenbus_watch *watch,
46340 +                           const char **vec, unsigned int len)
46341 +{
46342 +       int err;
46343 +       char *p;
46344 +       long int handle;
46345 +       struct backend_info *be
46346 +               = container_of(watch, struct backend_info, backend_watch);
46347 +       struct xenbus_device *dev = be->dev;
46348 +
46349 +       if (be->blkif == NULL) {
46350 +               /* Front end dir is a number, which is used as the handle. */
46351 +               p = strrchr(be->frontpath, '/') + 1;
46352 +               handle = simple_strtoul(p, NULL, 0);
46353 +
46354 +               be->blkif = alloc_blkif(be->frontend_id);
46355 +               if (IS_ERR(be->blkif)) {
46356 +                       err = PTR_ERR(be->blkif);
46357 +                       be->blkif = NULL;
46358 +                       xenbus_dev_error(dev, err, "creating block interface");
46359 +                       return;
46360 +               }
46361 +
46362 +               /* Pass a NULL vec to skip the existence test. */
46363 +               frontend_changed(&be->watch, NULL, 0);
46364 +       }
46365 +}
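+
+/*
+ * Note that the handle parsed from the frontend path is not stored
+ * anywhere; in this hastily-adapted blktap variant only the frontend
+ * domid is needed to allocate the interface.
+ */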
46366 +
46367 +static int blkback_probe(struct xenbus_device *dev,
46368 +                        const struct xenbus_device_id *id)
46369 +{
46370 +       struct backend_info *be;
46371 +       char *frontend;
46372 +       int err;
46373 +
46374 +       be = kzalloc(sizeof(*be), GFP_KERNEL);
46375 +       if (!be) {
46376 +               xenbus_dev_error(dev, -ENOMEM, "allocating backend structure");
46377 +               return -ENOMEM;
46378 +       }
46379 +
46380 +       frontend = NULL;
46381 +       err = xenbus_gather(XBT_NULL, dev->nodename,
46382 +                           "frontend-id", "%li", &be->frontend_id,
46383 +                           "frontend", NULL, &frontend,
46384 +                           NULL);
46385 +       if (XENBUS_EXIST_ERR(err))
46386 +               goto free_be;
46387 +       if (err < 0) {
46388 +               xenbus_dev_error(dev, err,
46389 +                                "reading %s/frontend or frontend-id",
46390 +                                dev->nodename);
46391 +               goto free_be;
46392 +       }
46393 +       if (strlen(frontend) == 0 || !xenbus_exists(XBT_NULL, frontend, "")) {
46394 +               /* If we can't get a frontend path and a frontend-id,
46395 +                * then our bus-id is no longer valid and we need to
46396 +                * destroy the backend device.
46397 +                */
46398 +               err = -ENOENT;
46399 +               goto free_be;
46400 +       }
46401 +
46402 +       be->dev = dev;
46403 +       be->backend_watch.node = dev->nodename;
46404 +       be->backend_watch.callback = backend_changed;
46405 +       /* Registration implicitly fires backend_changed once. */
46406 +       err = register_xenbus_watch(&be->backend_watch);
46407 +       if (err) {
46408 +               be->backend_watch.node = NULL;
46409 +               xenbus_dev_error(dev, err, "adding backend watch on %s",
46410 +                                dev->nodename);
46411 +               goto free_be;
46412 +       }
46413 +
46414 +       be->frontpath = frontend;
46415 +       be->watch.node = be->frontpath;
46416 +       be->watch.callback = frontend_changed;
46417 +       err = register_xenbus_watch(&be->watch);
46418 +       if (err) {
46419 +               be->watch.node = NULL;
46420 +               xenbus_dev_error(dev, err,
46421 +                                "adding frontend watch on %s",
46422 +                                be->frontpath);
46423 +               goto free_be;
46424 +       }
46425 +
46426 +       dev->data = be;
46427 +       return 0;
46428 +
46429 + free_be:
46430 +       if (be->backend_watch.node)
46431 +               unregister_xenbus_watch(&be->backend_watch);
46432 +       kfree(frontend);
46433 +       kfree(be);
46434 +       return err;
46435 +}
46436 +
46437 +static struct xenbus_device_id blkback_ids[] = {
46438 +       { "vbd" },
46439 +       { "" }
46440 +};
46441 +
46442 +static struct xenbus_driver blkback = {
46443 +       .name = "vbd",
46444 +       .owner = THIS_MODULE,
46445 +       .ids = blkback_ids,
46446 +       .probe = blkback_probe,
46447 +       .remove = blkback_remove,
46448 +};
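+
+/*
+ * This blktap backend registers for the same "vbd" device class as the
+ * regular block backend, so presumably only one of the two may be
+ * active in a given kernel.
+ */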
46449 +
46450 +void blkif_xenbus_init(void)
46451 +{
46452 +       xenbus_register_backend(&blkback);
46453 +}
46454 +
46455 +/*
46456 + * Local variables:
46457 + *  c-file-style: "linux"
46458 + *  indent-tabs-mode: t
46459 + *  c-indent-level: 8
46460 + *  c-basic-offset: 8
46461 + *  tab-width: 8
46462 + * End:
46463 + */
46464 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/char/Makefile linux-2.6.16/drivers/xen/char/Makefile
46465 --- linux-2.6.16.orig/drivers/xen/char/Makefile 1970-01-01 01:00:00.000000000 +0100
46466 +++ linux-2.6.16/drivers/xen/char/Makefile      2006-06-26 09:51:32.000000000 +0200
46467 @@ -0,0 +1,2 @@
46468 +
46469 +obj-y  := mem.o
46470 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/char/mem.c linux-2.6.16/drivers/xen/char/mem.c
46471 --- linux-2.6.16.orig/drivers/xen/char/mem.c    1970-01-01 01:00:00.000000000 +0100
46472 +++ linux-2.6.16/drivers/xen/char/mem.c 2006-06-26 09:51:32.000000000 +0200
46473 @@ -0,0 +1,192 @@
46474 +/*
46475 + *  Originally from linux/drivers/char/mem.c
46476 + *
46477 + *  Copyright (C) 1991, 1992  Linus Torvalds
46478 + *
46479 + *  Added devfs support. 
46480 + *    Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
46481 + *  Shared /dev/zero mmapping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
46482 + */
46483 +
46484 +#include <linux/config.h>
46485 +#include <linux/mm.h>
46486 +#include <linux/miscdevice.h>
46487 +#include <linux/slab.h>
46488 +#include <linux/vmalloc.h>
46489 +#include <linux/mman.h>
46490 +#include <linux/random.h>
46491 +#include <linux/init.h>
46492 +#include <linux/raw.h>
46493 +#include <linux/tty.h>
46494 +#include <linux/capability.h>
46495 +#include <linux/smp_lock.h>
46496 +#include <linux/devfs_fs_kernel.h>
46497 +#include <linux/ptrace.h>
46498 +#include <linux/device.h>
46499 +#include <asm/pgalloc.h>
46500 +#include <asm/uaccess.h>
46501 +#include <asm/io.h>
46502 +#include <asm/hypervisor.h>
46503 +
46504 +static inline int uncached_access(struct file *file)
46505 +{
46506 +       if (file->f_flags & O_SYNC)
46507 +               return 1;
46508 +       /* Xen sets correct MTRR type on non-RAM for us. */
46509 +       return 0;
46510 +}
46511 +
46512 +/*
46513 + * This function reads the *physical* memory. The f_pos points directly to
46514 + * the memory location.
46515 + */
46516 +static ssize_t read_mem(struct file * file, char __user * buf,
46517 +                       size_t count, loff_t *ppos)
46518 +{
46519 +       unsigned long p = *ppos, ignored;
46520 +       ssize_t read = 0, sz;
46521 +       void __iomem *v;
46522 +
46523 +       while (count > 0) {
46524 +               /*
46525 +                * Handle first page in case it's not aligned
46526 +                */
46527 +               if (-p & (PAGE_SIZE - 1))
46528 +                       sz = -p & (PAGE_SIZE - 1);
46529 +               else
46530 +                       sz = PAGE_SIZE;
46531 +
46532 +               sz = min_t(unsigned long, sz, count);
46533 +
46534 +               if ((v = ioremap(p, sz)) == NULL) {
46535 +                       /*
46536 +                        * Some programs (e.g., dmidecode) groove off into weird RAM
46537 +                        * areas where no tables can possibly exist (because Xen will
46538 +                        * have stomped on them!). These programs get rather upset if
46539 +                        * we let them know that Xen failed their access, so we fake
46540 +                        * out a read of all zeroes. :-)
46541 +                        */
46542 +                       if (clear_user(buf, count))
46543 +                               return -EFAULT;
46544 +                       read += count;
46545 +                       break;
46546 +               }
46547 +
46548 +               ignored = copy_to_user(buf, v, sz);
46549 +               iounmap(v);
46550 +               if (ignored)
46551 +                       return -EFAULT;
46552 +               buf += sz;
46553 +               p += sz;
46554 +               count -= sz;
46555 +               read += sz;
46556 +       }
46557 +
46558 +       *ppos += read;
46559 +       return read;
46560 +}
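+
+/*
+ * Unlike the stock /dev/mem, each chunk is accessed through a fresh
+ * ioremap() of the machine address, and an ioremap() failure is
+ * reported as a successful read of zeroes (see the comment above)
+ * rather than -EFAULT.
+ */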
46561 +
46562 +static ssize_t write_mem(struct file * file, const char __user * buf, 
46563 +                        size_t count, loff_t *ppos)
46564 +{
46565 +       unsigned long p = *ppos, ignored;
46566 +       ssize_t written = 0, sz;
46567 +       void __iomem *v;
46568 +
46569 +       while (count > 0) {
46570 +               /*
46571 +                * Handle first page in case it's not aligned
46572 +                */
46573 +               if (-p & (PAGE_SIZE - 1))
46574 +                       sz = -p & (PAGE_SIZE - 1);
46575 +               else
46576 +                       sz = PAGE_SIZE;
46577 +
46578 +               sz = min_t(unsigned long, sz, count);
46579 +
46580 +               if ((v = ioremap(p, sz)) == NULL)
46581 +                       break;
46582 +
46583 +               ignored = copy_from_user(v, buf, sz);
46584 +               iounmap(v);
46585 +               if (ignored) {
46586 +                       written += sz - ignored;
46587 +                       if (written)
46588 +                               break;
46589 +                       return -EFAULT;
46590 +               }
46591 +               buf += sz;
46592 +               p += sz;
46593 +               count -= sz;
46594 +               written += sz;
46595 +       }
46596 +
46597 +       *ppos += written;
46598 +       return written;
46599 +}
46600 +
46601 +static int mmap_mem(struct file * file, struct vm_area_struct * vma)
46602 +{
46603 +       size_t size = vma->vm_end - vma->vm_start;
46604 +
46605 +       if (uncached_access(file))
46606 +               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
46607 +
46608 +       /* We want to return the real error code, not EAGAIN. */
46609 +       return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
46610 +                                     size, vma->vm_page_prot, DOMID_IO);
46611 +}
46612 +
46613 +/*
46614 + * The memory devices use the full 32/64 bits of the offset, and so we cannot
46615 + * check against negative addresses: they are ok. The return value in that
46616 + * case (0) is odd, though.
46617 + *
46618 + * Also note that seeking relative to the "end of file" isn't supported:
46619 + * it has no meaning, so it returns -EINVAL.
46620 + */
46621 +static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
46622 +{
46623 +       loff_t ret;
46624 +
46625 +       mutex_lock(&file->f_dentry->d_inode->i_mutex);
46626 +       switch (orig) {
46627 +               case 0:
46628 +                       file->f_pos = offset;
46629 +                       ret = file->f_pos;
46630 +                       force_successful_syscall_return();
46631 +                       break;
46632 +               case 1:
46633 +                       file->f_pos += offset;
46634 +                       ret = file->f_pos;
46635 +                       force_successful_syscall_return();
46636 +                       break;
46637 +               default:
46638 +                       ret = -EINVAL;
46639 +       }
46640 +       mutex_unlock(&file->f_dentry->d_inode->i_mutex);
46641 +       return ret;
46642 +}
46643 +
46644 +static int open_mem(struct inode * inode, struct file * filp)
46645 +{
46646 +       return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
46647 +}
46648 +
46649 +struct file_operations mem_fops = {
46650 +       .llseek         = memory_lseek,
46651 +       .read           = read_mem,
46652 +       .write          = write_mem,
46653 +       .mmap           = mmap_mem,
46654 +       .open           = open_mem,
46655 +};
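+
+/*
+ * mem_fops is deliberately non-static; presumably the generic
+ * drivers/char/mem.c picks it up as the /dev/mem operations when
+ * building for Xen. All access is gated by CAP_SYS_RAWIO in
+ * open_mem() above.
+ */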
46656 +
46657 +/*
46658 + * Local variables:
46659 + *  c-file-style: "linux"
46660 + *  indent-tabs-mode: t
46661 + *  c-indent-level: 8
46662 + *  c-basic-offset: 8
46663 + *  tab-width: 8
46664 + * End:
46665 + */
46666 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/console/Makefile linux-2.6.16/drivers/xen/console/Makefile
46667 --- linux-2.6.16.orig/drivers/xen/console/Makefile      1970-01-01 01:00:00.000000000 +0100
46668 +++ linux-2.6.16/drivers/xen/console/Makefile   2006-06-26 09:51:32.000000000 +0200
46669 @@ -0,0 +1,2 @@
46670 +
46671 +obj-y  := console.o xencons_ring.o
46672 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/console/console.c linux-2.6.16/drivers/xen/console/console.c
46673 --- linux-2.6.16.orig/drivers/xen/console/console.c     1970-01-01 01:00:00.000000000 +0100
46674 +++ linux-2.6.16/drivers/xen/console/console.c  2006-06-26 09:51:32.000000000 +0200
46675 @@ -0,0 +1,648 @@
46676 +/******************************************************************************
46677 + * console.c
46678 + * 
46679 + * Virtual console driver.
46680 + * 
46681 + * Copyright (c) 2002-2004, K A Fraser.
46682 + * 
46683 + * This program is free software; you can redistribute it and/or
46684 + * modify it under the terms of the GNU General Public License version 2
46685 + * as published by the Free Software Foundation; or, when distributed
46686 + * separately from the Linux kernel or incorporated into other
46687 + * software packages, subject to the following license:
46688 + * 
46689 + * Permission is hereby granted, free of charge, to any person obtaining a copy
46690 + * of this source file (the "Software"), to deal in the Software without
46691 + * restriction, including without limitation the rights to use, copy, modify,
46692 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
46693 + * and to permit persons to whom the Software is furnished to do so, subject to
46694 + * the following conditions:
46695 + * 
46696 + * The above copyright notice and this permission notice shall be included in
46697 + * all copies or substantial portions of the Software.
46698 + * 
46699 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
46700 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46701 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
46702 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46703 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
46704 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
46705 + * IN THE SOFTWARE.
46706 + */
46707 +
46708 +#include <linux/config.h>
46709 +#include <linux/version.h>
46710 +#include <linux/module.h>
46711 +#include <linux/errno.h>
46712 +#include <linux/signal.h>
46713 +#include <linux/sched.h>
46714 +#include <linux/interrupt.h>
46715 +#include <linux/tty.h>
46716 +#include <linux/tty_flip.h>
46717 +#include <linux/serial.h>
46718 +#include <linux/major.h>
46719 +#include <linux/ptrace.h>
46720 +#include <linux/ioport.h>
46721 +#include <linux/mm.h>
46722 +#include <linux/slab.h>
46723 +#include <linux/init.h>
46724 +#include <linux/console.h>
46725 +#include <linux/bootmem.h>
46726 +#include <linux/sysrq.h>
46727 +#include <asm/io.h>
46728 +#include <asm/irq.h>
46729 +#include <asm/uaccess.h>
46730 +#include <xen/interface/xen.h>
46731 +#include <xen/interface/event_channel.h>
46732 +#include <asm/hypervisor.h>
46733 +#include <xen/evtchn.h>
46734 +#include <xen/xencons.h>
46735 +
46736 +/*
46737 + * Modes:
46738 + *  'xencons=off'  [XC_OFF]:     Console is disabled.
46739 + *  'xencons=tty'  [XC_TTY]:     Console attached to '/dev/tty[0-9]+'.
46740 + *  'xencons=ttyS' [XC_SERIAL]:  Console attached to '/dev/ttyS[0-9]+'.
46741 + *                 [XC_DEFAULT]: DOM0 -> XC_SERIAL ; all others -> XC_TTY.
46742 + * 
46743 + * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses
46744 + * warnings from standard distro startup scripts.
46745 + */
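+/*
+ * For example, booting with "xencons=ttyS0" forces XC_SERIAL on ttyS0
+ * and "xencons=tty1" forces XC_TTY on tty1, while "xencons=off"
+ * disables the virtual console entirely.
+ */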
46746 +static enum { XC_OFF, XC_DEFAULT, XC_TTY, XC_SERIAL } xc_mode = XC_DEFAULT;
46747 +static int xc_num = -1;
46748 +
46749 +#ifdef CONFIG_MAGIC_SYSRQ
46750 +static unsigned long sysrq_requested;
46751 +extern int sysrq_enabled;
46752 +#endif
46753 +
46754 +static int __init xencons_setup(char *str)
46755 +{
46756 +       char *q;
46757 +       int n;
46758 +
46759 +       if (!strncmp(str, "ttyS", 4))
46760 +               xc_mode = XC_SERIAL;
46761 +       else if (!strncmp(str, "tty", 3))
46762 +               xc_mode = XC_TTY;
46763 +       else if (!strncmp(str, "off", 3))
46764 +               xc_mode = XC_OFF;
46765 +
46766 +       switch (xc_mode) {
46767 +       case XC_SERIAL:
46768 +               n = simple_strtol(str+4, &q, 10);
46769 +               if (q > (str + 4))
46770 +                       xc_num = n;
46771 +               break;
46772 +       case XC_TTY:
46773 +               n = simple_strtol(str+3, &q, 10);
46774 +               if (q > (str + 3))
46775 +                       xc_num = n;
46776 +               break;
46777 +       default:
46778 +               break;
46779 +       }
46780 +
46781 +       return 1;
46782 +}
46783 +__setup("xencons=", xencons_setup);
46784 +
46785 +/* The kernel and user-land drivers share a common transmit buffer. */
46786 +static unsigned int wbuf_size = 4096;
46787 +#define WBUF_MASK(_i) ((_i)&(wbuf_size-1))
46788 +static char *wbuf;
46789 +static unsigned int wc, wp; /* write_cons, write_prod */
46790 +
46791 +static int __init xencons_bufsz_setup(char *str)
46792 +{
46793 +       unsigned int goal;
46794 +       goal = simple_strtoul(str, NULL, 0);
46795 +       while (wbuf_size < goal)
46796 +               wbuf_size <<= 1;
46797 +       return 1;
46798 +}
46799 +__setup("xencons_bufsz=", xencons_bufsz_setup);
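+
+/*
+ * wbuf_size starts at 4096 and is only ever doubled, so it stays a
+ * power of two and WBUF_MASK() can reduce the free-running indices
+ * wc/wp to buffer offsets with a single AND.
+ */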
46800 +
46801 +/* This lock protects accesses to the common transmit buffer. */
46802 +static spinlock_t xencons_lock = SPIN_LOCK_UNLOCKED;
46803 +
46804 +/* Common transmit-kick routine. */
46805 +static void __xencons_tx_flush(void);
46806 +
46807 +static struct tty_driver *xencons_driver;
46808 +
46809 +/******************** Kernel console driver ********************************/
46810 +
46811 +static void kcons_write(
46812 +       struct console *c, const char *s, unsigned int count)
46813 +{
46814 +       int           i = 0;
46815 +       unsigned long flags;
46816 +
46817 +       spin_lock_irqsave(&xencons_lock, flags);
46818 +
46819 +       while (i < count) {
46820 +               for (; i < count; i++) {
46821 +                       if ((wp - wc) >= (wbuf_size - 1))
46822 +                               break;
46823 +                       if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n')
46824 +                               wbuf[WBUF_MASK(wp++)] = '\r';
46825 +               }
46826 +
46827 +               __xencons_tx_flush();
46828 +       }
46829 +
46830 +       spin_unlock_irqrestore(&xencons_lock, flags);
46831 +}
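+
+/*
+ * kcons_write() expands '\n' to '\n''\r' as it copies into the shared
+ * buffer; if the buffer fills mid-message it flushes and retries until
+ * everything has been queued, all under xencons_lock.
+ */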
46832 +
46833 +static void kcons_write_dom0(
46834 +       struct console *c, const char *s, unsigned int count)
46835 +{
46836 +       int rc;
46837 +
46838 +       while ((count > 0) &&
46839 +              ((rc = HYPERVISOR_console_io(
46840 +                       CONSOLEIO_write, count, (char *)s)) > 0)) {
46841 +               count -= rc;
46842 +               s += rc;
46843 +       }
46844 +}
46845 +
46846 +static struct tty_driver *kcons_device(struct console *c, int *index)
46847 +{
46848 +       *index = 0;
46849 +       return xencons_driver;
46850 +}
46851 +
46852 +static struct console kcons_info = {
46853 +       .device = kcons_device,
46854 +       .flags  = CON_PRINTBUFFER,
46855 +       .index  = -1,
46856 +};
46857 +
46858 +#define __RETCODE 0
46859 +static int __init xen_console_init(void)
46860 +{
46861 +       if (xen_init() < 0)
46862 +               return __RETCODE;
46863 +
46864 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
46865 +               if (xc_mode == XC_DEFAULT)
46866 +                       xc_mode = XC_SERIAL;
46867 +               kcons_info.write = kcons_write_dom0;
46868 +               if (xc_mode == XC_SERIAL)
46869 +                       kcons_info.flags |= CON_ENABLED;
46870 +       } else {
46871 +               if (xc_mode == XC_DEFAULT)
46872 +                       xc_mode = XC_TTY;
46873 +               kcons_info.write = kcons_write;
46874 +       }
46875 +
46876 +       switch (xc_mode) {
46877 +       case XC_SERIAL:
46878 +               strcpy(kcons_info.name, "ttyS");
46879 +               if (xc_num == -1)
46880 +                       xc_num = 0;
46881 +               break;
46882 +
46883 +       case XC_TTY:
46884 +               strcpy(kcons_info.name, "tty");
46885 +               if (xc_num == -1)
46886 +                       xc_num = 1;
46887 +               break;
46888 +
46889 +       default:
46890 +               return __RETCODE;
46891 +       }
46892 +
46893 +       wbuf = alloc_bootmem(wbuf_size);
46894 +
46895 +       register_console(&kcons_info);
46896 +
46897 +       return __RETCODE;
46898 +}
46899 +console_initcall(xen_console_init);
46900 +
46901 +/*** Useful function for console debugging -- goes straight to Xen. ***/
46902 +asmlinkage int xprintk(const char *fmt, ...)
46903 +{
46904 +       va_list args;
46905 +       int printk_len;
46906 +       static char printk_buf[1024];
46907 +
46908 +       /* Emit the output into the temporary buffer */
46909 +       va_start(args, fmt);
46910 +       printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args);
46911 +       va_end(args);
46912 +
46913 +       /* Send the processed output directly to Xen. */
46914 +       kcons_write_dom0(NULL, printk_buf, printk_len);
46915 +
46916 +       return 0;
46917 +}
46918 +
46919 +/*** Forcibly flush console data before dying. ***/
46920 +void xencons_force_flush(void)
46921 +{
46922 +       int sz;
46923 +
46924 +       /* Emergency console is synchronous, so there's nothing to flush. */
46925 +       if (xen_start_info->flags & SIF_INITDOMAIN)
46926 +               return;
46927 +
46928 +       /* Spin until console data is flushed through to the daemon. */
46929 +       while (wc != wp) {
46930 +               int sent = 0;
46931 +               if ((sz = wp - wc) == 0)
46932 +                       continue;
46933 +               sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
46934 +               if (sent > 0)
46935 +                       wc += sent;
46936 +       }
46937 +}
46938 +
46939 +
46940 +/******************** User-space console driver (/dev/console) ************/
46941 +
46942 +#define DRV(_d)         (_d)
46943 +#define TTY_INDEX(_tty) ((_tty)->index)
46944 +
46945 +static struct termios *xencons_termios[MAX_NR_CONSOLES];
46946 +static struct termios *xencons_termios_locked[MAX_NR_CONSOLES];
46947 +static struct tty_struct *xencons_tty;
46948 +static int xencons_priv_irq;
46949 +static char x_char;
46950 +
46951 +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs)
46952 +{
46953 +       int           i;
46954 +       unsigned long flags;
46955 +
46956 +       spin_lock_irqsave(&xencons_lock, flags);
46957 +       if (xencons_tty == NULL)
46958 +               goto out;
46959 +
46960 +       for (i = 0; i < len; i++) {
46961 +#ifdef CONFIG_MAGIC_SYSRQ
46962 +               if (sysrq_enabled) {
46963 +                       if (buf[i] == '\x0f') { /* ^O */
46964 +                               sysrq_requested = jiffies;
46965 +                               continue; /* don't print the sysrq key */
46966 +                       } else if (sysrq_requested) {
46967 +                               unsigned long sysrq_timeout =
46968 +                                       sysrq_requested + HZ*2;
46969 +                               sysrq_requested = 0;
46970 +                               if (time_before(jiffies, sysrq_timeout)) {
46971 +                                       spin_unlock_irqrestore(
46972 +                                               &xencons_lock, flags);
46973 +                                       handle_sysrq(
46974 +                                               buf[i], regs, xencons_tty);
46975 +                                       spin_lock_irqsave(
46976 +                                               &xencons_lock, flags);
46977 +                                       continue;
46978 +                               }
46979 +                       }
46980 +               }
46981 +#endif
46982 +               tty_insert_flip_char(xencons_tty, buf[i], 0);
46983 +       }
46984 +       tty_flip_buffer_push(xencons_tty);
46985 +
46986 + out:
46987 +       spin_unlock_irqrestore(&xencons_lock, flags);
46988 +}
46989 +
46990 +static void __xencons_tx_flush(void)
46991 +{
46992 +       int sent, sz, work_done = 0;
46993 +
46994 +       if (x_char) {
46995 +               if (xen_start_info->flags & SIF_INITDOMAIN)
46996 +                       kcons_write_dom0(NULL, &x_char, 1);
46997 +               else
46998 +                       while (x_char)
46999 +                               if (xencons_ring_send(&x_char, 1) == 1)
47000 +                                       break;
47001 +               x_char = 0;
47002 +               work_done = 1;
47003 +       }
47004 +
47005 +       while (wc != wp) {
47006 +               sz = wp - wc;
47007 +               if (sz > (wbuf_size - WBUF_MASK(wc)))
47008 +                       sz = wbuf_size - WBUF_MASK(wc);
47009 +               if (xen_start_info->flags & SIF_INITDOMAIN) {
47010 +                       kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz);
47011 +                       wc += sz;
47012 +               } else {
47013 +                       sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
47014 +                       if (sent == 0)
47015 +                               break;
47016 +                       wc += sent;
47017 +               }
47018 +               work_done = 1;
47019 +       }
47020 +
47021 +       if (work_done && (xencons_tty != NULL)) {
47022 +               wake_up_interruptible(&xencons_tty->write_wait);
47023 +               if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
47024 +                   (xencons_tty->ldisc.write_wakeup != NULL))
47025 +                       (xencons_tty->ldisc.write_wakeup)(xencons_tty);
47026 +       }
47027 +}
47028 +
47029 +void xencons_tx(void)
47030 +{
47031 +       unsigned long flags;
47032 +
47033 +       spin_lock_irqsave(&xencons_lock, flags);
47034 +       __xencons_tx_flush();
47035 +       spin_unlock_irqrestore(&xencons_lock, flags);
47036 +}
47037 +
47038 +/* Privileged receive callback and transmit kicker. */
47039 +static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id,
47040 +                                          struct pt_regs *regs)
47041 +{
47042 +       static char rbuf[16];
47043 +       int         l;
47044 +
47045 +       while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0)
47046 +               xencons_rx(rbuf, l, regs);
47047 +
47048 +       xencons_tx();
47049 +
47050 +       return IRQ_HANDLED;
47051 +}
47052 +
47053 +static int xencons_write_room(struct tty_struct *tty)
47054 +{
47055 +       return wbuf_size - (wp - wc);
47056 +}
47057 +
47058 +static int xencons_chars_in_buffer(struct tty_struct *tty)
47059 +{
47060 +       return wp - wc;
47061 +}
47062 +
47063 +static void xencons_send_xchar(struct tty_struct *tty, char ch)
47064 +{
47065 +       unsigned long flags;
47066 +
47067 +       if (TTY_INDEX(tty) != 0)
47068 +               return;
47069 +
47070 +       spin_lock_irqsave(&xencons_lock, flags);
47071 +       x_char = ch;
47072 +       __xencons_tx_flush();
47073 +       spin_unlock_irqrestore(&xencons_lock, flags);
47074 +}
47075 +
47076 +static void xencons_throttle(struct tty_struct *tty)
47077 +{
47078 +       if (TTY_INDEX(tty) != 0)
47079 +               return;
47080 +
47081 +       if (I_IXOFF(tty))
47082 +               xencons_send_xchar(tty, STOP_CHAR(tty));
47083 +}
47084 +
47085 +static void xencons_unthrottle(struct tty_struct *tty)
47086 +{
47087 +       if (TTY_INDEX(tty) != 0)
47088 +               return;
47089 +
47090 +       if (I_IXOFF(tty)) {
47091 +               if (x_char != 0)
47092 +                       x_char = 0;
47093 +               else
47094 +                       xencons_send_xchar(tty, START_CHAR(tty));
47095 +       }
47096 +}
47097 +
47098 +static void xencons_flush_buffer(struct tty_struct *tty)
47099 +{
47100 +       unsigned long flags;
47101 +
47102 +       if (TTY_INDEX(tty) != 0)
47103 +               return;
47104 +
47105 +       spin_lock_irqsave(&xencons_lock, flags);
47106 +       wc = wp = 0;
47107 +       spin_unlock_irqrestore(&xencons_lock, flags);
47108 +}
47109 +
47110 +static inline int __xencons_put_char(int ch)
47111 +{
47112 +       char _ch = (char)ch;
47113 +       if ((wp - wc) == wbuf_size)
47114 +               return 0;
47115 +       wbuf[WBUF_MASK(wp++)] = _ch;
47116 +       return 1;
47117 +}
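+
+/*
+ * wc and wp are free-running: "wp - wc" is the number of queued bytes
+ * even across wrap, so empty is wp == wc and full is a difference of
+ * wbuf_size, as tested above.
+ */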
47118 +
47119 +static int xencons_write(
47120 +       struct tty_struct *tty,
47121 +       const unsigned char *buf,
47122 +       int count)
47123 +{
47124 +       int i;
47125 +       unsigned long flags;
47126 +
47127 +       if (TTY_INDEX(tty) != 0)
47128 +               return count;
47129 +
47130 +       spin_lock_irqsave(&xencons_lock, flags);
47131 +
47132 +       for (i = 0; i < count; i++)
47133 +               if (!__xencons_put_char(buf[i]))
47134 +                       break;
47135 +
47136 +       if (i != 0)
47137 +               __xencons_tx_flush();
47138 +
47139 +       spin_unlock_irqrestore(&xencons_lock, flags);
47140 +
47141 +       return i;
47142 +}
47143 +
47144 +static void xencons_put_char(struct tty_struct *tty, u_char ch)
47145 +{
47146 +       unsigned long flags;
47147 +
47148 +       if (TTY_INDEX(tty) != 0)
47149 +               return;
47150 +
47151 +       spin_lock_irqsave(&xencons_lock, flags);
47152 +       (void)__xencons_put_char(ch);
47153 +       spin_unlock_irqrestore(&xencons_lock, flags);
47154 +}
47155 +
47156 +static void xencons_flush_chars(struct tty_struct *tty)
47157 +{
47158 +       unsigned long flags;
47159 +
47160 +       if (TTY_INDEX(tty) != 0)
47161 +               return;
47162 +
47163 +       spin_lock_irqsave(&xencons_lock, flags);
47164 +       __xencons_tx_flush();
47165 +       spin_unlock_irqrestore(&xencons_lock, flags);
47166 +}
47167 +
47168 +static void xencons_wait_until_sent(struct tty_struct *tty, int timeout)
47169 +{
47170 +       unsigned long orig_jiffies = jiffies;
47171 +
47172 +       if (TTY_INDEX(tty) != 0)
47173 +               return;
47174 +
47175 +       while (DRV(tty->driver)->chars_in_buffer(tty)) {
47176 +               set_current_state(TASK_INTERRUPTIBLE);
47177 +               schedule_timeout(1);
47178 +               if (signal_pending(current))
47179 +                       break;
47180 +               if (timeout && time_after(jiffies, orig_jiffies + timeout))
47181 +                       break;
47182 +       }
47183 +
47184 +       set_current_state(TASK_RUNNING);
47185 +}
47186 +
47187 +static int xencons_open(struct tty_struct *tty, struct file *filp)
47188 +{
47189 +       unsigned long flags;
47190 +
47191 +       if (TTY_INDEX(tty) != 0)
47192 +               return 0;
47193 +
47194 +       spin_lock_irqsave(&xencons_lock, flags);
47195 +       tty->driver_data = NULL;
47196 +       if (xencons_tty == NULL)
47197 +               xencons_tty = tty;
47198 +       __xencons_tx_flush();
47199 +       spin_unlock_irqrestore(&xencons_lock, flags);
47200 +
47201 +       return 0;
47202 +}
47203 +
47204 +static void xencons_close(struct tty_struct *tty, struct file *filp)
47205 +{
47206 +       unsigned long flags;
47207 +
47208 +       if (TTY_INDEX(tty) != 0)
47209 +               return;
47210 +
47211 +       if (tty->count == 1) {
47212 +               tty->closing = 1;
47213 +               tty_wait_until_sent(tty, 0);
47214 +               if (DRV(tty->driver)->flush_buffer != NULL)
47215 +                       DRV(tty->driver)->flush_buffer(tty);
47216 +               if (tty->ldisc.flush_buffer != NULL)
47217 +                       tty->ldisc.flush_buffer(tty);
47218 +               tty->closing = 0;
47219 +               spin_lock_irqsave(&xencons_lock, flags);
47220 +               xencons_tty = NULL;
47221 +               spin_unlock_irqrestore(&xencons_lock, flags);
47222 +       }
47223 +}
47224 +
47225 +static struct tty_operations xencons_ops = {
47226 +       .open = xencons_open,
47227 +       .close = xencons_close,
47228 +       .write = xencons_write,
47229 +       .write_room = xencons_write_room,
47230 +       .put_char = xencons_put_char,
47231 +       .flush_chars = xencons_flush_chars,
47232 +       .chars_in_buffer = xencons_chars_in_buffer,
47233 +       .send_xchar = xencons_send_xchar,
47234 +       .flush_buffer = xencons_flush_buffer,
47235 +       .throttle = xencons_throttle,
47236 +       .unthrottle = xencons_unthrottle,
47237 +       .wait_until_sent = xencons_wait_until_sent,
47238 +};
47239 +
47240 +static int __init xencons_init(void)
47241 +{
47242 +       int rc;
47243 +
47244 +       if (xen_init() < 0)
47245 +               return -ENODEV;
47246 +
47247 +       if (xc_mode == XC_OFF)
47248 +               return 0;
47249 +
47250 +       xencons_ring_init();
47251 +
47252 +       xencons_driver = alloc_tty_driver((xc_mode == XC_SERIAL) ?
47253 +                                         1 : MAX_NR_CONSOLES);
47254 +       if (xencons_driver == NULL)
47255 +               return -ENOMEM;
47256 +
47257 +       DRV(xencons_driver)->name            = "xencons";
47258 +       DRV(xencons_driver)->major           = TTY_MAJOR;
47259 +       DRV(xencons_driver)->type            = TTY_DRIVER_TYPE_SERIAL;
47260 +       DRV(xencons_driver)->subtype         = SERIAL_TYPE_NORMAL;
47261 +       DRV(xencons_driver)->init_termios    = tty_std_termios;
47262 +       DRV(xencons_driver)->flags           =
47263 +               TTY_DRIVER_REAL_RAW |
47264 +               TTY_DRIVER_RESET_TERMIOS |
47265 +               TTY_DRIVER_NO_DEVFS;
47266 +       DRV(xencons_driver)->termios         = xencons_termios;
47267 +       DRV(xencons_driver)->termios_locked  = xencons_termios_locked;
47268 +
47269 +       if (xc_mode == XC_SERIAL) {
47270 +               DRV(xencons_driver)->name        = "ttyS";
47271 +               DRV(xencons_driver)->minor_start = 64 + xc_num;
47272 +               DRV(xencons_driver)->name_base   = 0 + xc_num;
47273 +       } else {
47274 +               DRV(xencons_driver)->name        = "tty";
47275 +               DRV(xencons_driver)->minor_start = xc_num;
47276 +               DRV(xencons_driver)->name_base   = xc_num;
47277 +       }
47278 +
47279 +       tty_set_operations(xencons_driver, &xencons_ops);
47280 +
47281 +       if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) {
47282 +               printk(KERN_WARNING "Failed to register Xen virtual "
47283 +                      "console driver as '%s%d'\n",
47284 +                      DRV(xencons_driver)->name,
47285 +                      DRV(xencons_driver)->name_base);
47286 +               put_tty_driver(xencons_driver);
47287 +               xencons_driver = NULL;
47288 +               return rc;
47289 +       }
47290 +
47291 +       tty_register_device(xencons_driver, 0, NULL);
47292 +
47293 +       if (xen_start_info->flags & SIF_INITDOMAIN) {
47294 +               xencons_priv_irq = bind_virq_to_irqhandler(
47295 +                       VIRQ_CONSOLE,
47296 +                       0,
47297 +                       xencons_priv_interrupt,
47298 +                       0,
47299 +                       "console",
47300 +                       NULL);
47301 +               BUG_ON(xencons_priv_irq < 0);
47302 +       }
47303 +
47304 +       printk(KERN_INFO "Xen virtual console successfully installed as %s%d\n",
47305 +              DRV(xencons_driver)->name,
47306 +              DRV(xencons_driver)->name_base);
47307 +
47308 +       return 0;
47309 +}
47310 +
47311 +module_init(xencons_init);
47312 +
47313 +MODULE_LICENSE("Dual BSD/GPL");
47314 +
47315 +/*
47316 + * Local variables:
47317 + *  c-file-style: "linux"
47318 + *  indent-tabs-mode: t
47319 + *  c-indent-level: 8
47320 + *  c-basic-offset: 8
47321 + *  tab-width: 8
47322 + * End:
47323 + */
47324 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/console/xencons_ring.c linux-2.6.16/drivers/xen/console/xencons_ring.c
47325 --- linux-2.6.16.orig/drivers/xen/console/xencons_ring.c        1970-01-01 01:00:00.000000000 +0100
47326 +++ linux-2.6.16/drivers/xen/console/xencons_ring.c     2006-06-26 09:51:32.000000000 +0200
47327 @@ -0,0 +1,151 @@
47328 +/* 
47329 + * This program is free software; you can redistribute it and/or
47330 + * modify it under the terms of the GNU General Public License version 2
47331 + * as published by the Free Software Foundation; or, when distributed
47332 + * separately from the Linux kernel or incorporated into other
47333 + * software packages, subject to the following license:
47334 + * 
47335 + * Permission is hereby granted, free of charge, to any person obtaining a copy
47336 + * of this source file (the "Software"), to deal in the Software without
47337 + * restriction, including without limitation the rights to use, copy, modify,
47338 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
47339 + * and to permit persons to whom the Software is furnished to do so, subject to
47340 + * the following conditions:
47341 + * 
47342 + * The above copyright notice and this permission notice shall be included in
47343 + * all copies or substantial portions of the Software.
47344 + * 
47345 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
47346 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47347 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47348 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
47349 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
47350 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
47351 + * IN THE SOFTWARE.
47352 + */
47353 +
47354 +#include <linux/version.h>
47355 +#include <linux/module.h>
47356 +#include <linux/errno.h>
47357 +#include <linux/signal.h>
47358 +#include <linux/sched.h>
47359 +#include <linux/interrupt.h>
47360 +#include <linux/tty.h>
47361 +#include <linux/tty_flip.h>
47362 +#include <linux/serial.h>
47363 +#include <linux/major.h>
47364 +#include <linux/ptrace.h>
47365 +#include <linux/ioport.h>
47366 +#include <linux/mm.h>
47367 +#include <linux/slab.h>
47368 +
47369 +#include <asm/hypervisor.h>
47370 +#include <xen/evtchn.h>
47371 +#include <xen/xencons.h>
47372 +#include <linux/wait.h>
47373 +#include <linux/interrupt.h>
47374 +#include <linux/sched.h>
47375 +#include <linux/err.h>
47376 +#include <xen/interface/io/console.h>
47377 +
47378 +static int xencons_irq;
47379 +
47380 +static inline struct xencons_interface *xencons_interface(void)
47381 +{
47382 +       return mfn_to_virt(xen_start_info->console_mfn);
47383 +}
47384 +
47385 +static inline void notify_daemon(void)
47386 +{
47387 +       /* Use evtchn: this is called early, before irq is set up. */
47388 +       notify_remote_via_evtchn(xen_start_info->console_evtchn);
47389 +}
47390 +
47391 +int xencons_ring_send(const char *data, unsigned len)
47392 +{
47393 +       int sent = 0;
47394 +       struct xencons_interface *intf = xencons_interface();
47395 +       XENCONS_RING_IDX cons, prod;
47396 +
47397 +       cons = intf->out_cons;
47398 +       prod = intf->out_prod;
47399 +       mb();
47400 +       BUG_ON((prod - cons) > sizeof(intf->out));
47401 +
47402 +       while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
47403 +               intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
47404 +
47405 +       wmb();
47406 +       intf->out_prod = prod;
47407 +
47408 +       notify_daemon();
47409 +
47410 +       return sent;
47411 +}
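+
+/*
+ * Ring discipline: read out_cons/out_prod, mb() to order those reads
+ * against updates from the other end, copy as many bytes as fit, then
+ * wmb() so the data is globally visible before out_prod is advanced.
+ * The daemon is notified unconditionally, even when nothing was sent.
+ */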
47412 +
47413 +static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs)
47414 +{
47415 +       struct xencons_interface *intf = xencons_interface();
47416 +       XENCONS_RING_IDX cons, prod;
47417 +
47418 +       cons = intf->in_cons;
47419 +       prod = intf->in_prod;
47420 +       mb();
47421 +       BUG_ON((prod - cons) > sizeof(intf->in));
47422 +
47423 +       while (cons != prod) {
47424 +               xencons_rx(intf->in + MASK_XENCONS_IDX(cons, intf->in), 1, regs);
47425 +               cons++;
47426 +       }
47427 +
47428 +       mb();
47429 +       intf->in_cons = cons;
47430 +
47431 +       notify_daemon();
47432 +
47433 +       xencons_tx();
47434 +
47435 +       return IRQ_HANDLED;
47436 +}
47437 +
47438 +int xencons_ring_init(void)
47439 +{
47440 +       int err;
47441 +
47442 +       if (xencons_irq)
47443 +               unbind_from_irqhandler(xencons_irq, NULL);
47444 +       xencons_irq = 0;
47445 +
47446 +       if (!xen_start_info->console_evtchn)
47447 +               return 0;
47448 +
47449 +       err = bind_evtchn_to_irqhandler(
47450 +               xen_start_info->console_evtchn,
47451 +               handle_input, 0, "xencons", NULL);
47452 +       if (err <= 0) {
47453 +               printk(KERN_ERR "XEN console request irq failed %i\n", err);
47454 +               return err;
47455 +       }
47456 +
47457 +       xencons_irq = err;
47458 +
47459 +       /* In case we have in-flight data after save/restore... */
47460 +       notify_daemon();
47461 +
47462 +       return 0;
47463 +}
47464 +
47465 +void xencons_resume(void)
47466 +{
47467 +       (void)xencons_ring_init();
47468 +}
47469 +
47470 +/*
47471 + * Local variables:
47472 + *  c-file-style: "linux"
47473 + *  indent-tabs-mode: t
47474 + *  c-indent-level: 8
47475 + *  c-basic-offset: 8
47476 + *  tab-width: 8
47477 + * End:
47478 + */
47479 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/core/Makefile linux-2.6.16/drivers/xen/core/Makefile
47480 --- linux-2.6.16.orig/drivers/xen/core/Makefile 1970-01-01 01:00:00.000000000 +0100
47481 +++ linux-2.6.16/drivers/xen/core/Makefile      2006-06-26 09:51:32.000000000 +0200
47482 @@ -0,0 +1,11 @@
47483 +#
47484 +# Makefile for the linux kernel.
47485 +#
47486 +
47487 +obj-y   := evtchn.o reboot.o gnttab.o features.o
47488 +
47489 +obj-$(CONFIG_PROC_FS) += xen_proc.o
47490 +obj-$(CONFIG_NET)     += skbuff.o
47491 +obj-$(CONFIG_SMP)     += smpboot.o
47492 +obj-$(CONFIG_SYSFS)   += hypervisor_sysfs.o
47493 +obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o
47494 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/core/evtchn.c linux-2.6.16/drivers/xen/core/evtchn.c
47495 --- linux-2.6.16.orig/drivers/xen/core/evtchn.c 1970-01-01 01:00:00.000000000 +0100
47496 +++ linux-2.6.16/drivers/xen/core/evtchn.c      2006-06-26 09:51:32.000000000 +0200
47497 @@ -0,0 +1,863 @@
47498 +/******************************************************************************
47499 + * evtchn.c
47500 + * 
47501 + * Communication via Xen event channels.
47502 + * 
47503 + * Copyright (c) 2002-2005, K A Fraser
47504 + * 
47505 + * This program is free software; you can redistribute it and/or
47506 + * modify it under the terms of the GNU General Public License version 2
47507 + * as published by the Free Software Foundation; or, when distributed
47508 + * separately from the Linux kernel or incorporated into other
47509 + * software packages, subject to the following license:
47510 + * 
47511 + * Permission is hereby granted, free of charge, to any person obtaining a copy
47512 + * of this source file (the "Software"), to deal in the Software without
47513 + * restriction, including without limitation the rights to use, copy, modify,
47514 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
47515 + * and to permit persons to whom the Software is furnished to do so, subject to
47516 + * the following conditions:
47517 + * 
47518 + * The above copyright notice and this permission notice shall be included in
47519 + * all copies or substantial portions of the Software.
47520 + * 
47521 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
47522 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
47523 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47524 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
47525 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
47526 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
47527 + * IN THE SOFTWARE.
47528 + */
47529 +
47530 +#include <linux/config.h>
47531 +#include <linux/module.h>
47532 +#include <linux/irq.h>
47533 +#include <linux/interrupt.h>
47534 +#include <linux/sched.h>
47535 +#include <linux/kernel_stat.h>
47536 +#include <linux/version.h>
47537 +#include <asm/atomic.h>
47538 +#include <asm/system.h>
47539 +#include <asm/ptrace.h>
47540 +#include <asm/synch_bitops.h>
47541 +#include <xen/interface/event_channel.h>
47542 +#include <xen/interface/physdev.h>
47543 +#include <asm/hypervisor.h>
47544 +#include <xen/evtchn.h>
47545 +#include <linux/mc146818rtc.h> /* RTC_IRQ */
47546 +
47547 +/*
47548 + * This lock protects updates to the following mapping and reference-count
47549 + * arrays. The lock does not need to be acquired to read the mapping tables.
47550 + */
47551 +static DEFINE_SPINLOCK(irq_mapping_update_lock);
47552 +
47553 +/* IRQ <-> event-channel mappings. */
47554 +static int evtchn_to_irq[NR_EVENT_CHANNELS];
47555 +
47556 +/* Packed IRQ information: binding type, sub-type index, and event channel. */
47557 +static u32 irq_info[NR_IRQS];
47558 +
47559 +/* Binding types. */
47560 +enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
47561 +
47562 +/* Constructor for packed IRQ information. */
47563 +static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
47564 +{
47565 +       return ((type << 24) | (index << 16) | evtchn);
47566 +}
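+
+/*
+ * Packed layout: bits 24-31 hold the binding type, bits 16-23 the
+ * sub-type index (e.g. the VIRQ or IPI number) and bits 0-15 the
+ * event-channel port, matching the accessors below.
+ */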
47567 +
47568 +/* Convenient shorthand for packed representation of an unbound IRQ. */
47569 +#define IRQ_UNBOUND    mk_irq_info(IRQT_UNBOUND, 0, 0)
47570 +
47571 +/*
47572 + * Accessors for packed IRQ information.
47573 + */
47574 +
47575 +static inline unsigned int evtchn_from_irq(int irq)
47576 +{
47577 +       return (u16)(irq_info[irq]);
47578 +}
47579 +
47580 +static inline unsigned int index_from_irq(int irq)
47581 +{
47582 +       return (u8)(irq_info[irq] >> 16);
47583 +}
47584 +
47585 +static inline unsigned int type_from_irq(int irq)
47586 +{
47587 +       return (u8)(irq_info[irq] >> 24);
47588 +}
47589 +
47590 +/* IRQ <-> VIRQ mapping. */
47591 +DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]);
47592 +
47593 +/* IRQ <-> IPI mapping. */
47594 +#ifndef NR_IPIS
47595 +#define NR_IPIS 1
47596 +#endif
47597 +DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
47598 +
47599 +/* Reference counts for bindings to IRQs. */
47600 +static int irq_bindcount[NR_IRQS];
47601 +
47602 +/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
47603 +static unsigned long pirq_needs_unmask_notify[NR_PIRQS/sizeof(unsigned long)];
47604 +
47605 +#ifdef CONFIG_SMP
47606 +
47607 +static u8 cpu_evtchn[NR_EVENT_CHANNELS];
47608 +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
47609 +
47610 +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
47611 +                                          unsigned int idx)
47612 +{
47613 +       return (sh->evtchn_pending[idx] &
47614 +               cpu_evtchn_mask[cpu][idx] &
47615 +               ~sh->evtchn_mask[idx]);
47616 +}
47617 +
47618 +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
47619 +{
47620 +       clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
47621 +       set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
47622 +       cpu_evtchn[chn] = cpu;
47623 +}
47624 +
47625 +static void init_evtchn_cpu_bindings(void)
47626 +{
47627 +       /* By default all event channels notify CPU#0. */
47628 +       memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
47629 +       memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
47630 +}
47631 +
47632 +static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
47633 +{
47634 +       return cpu_evtchn[evtchn];
47635 +}
47636 +
47637 +#else
47638 +
47639 +static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh,
47640 +                                          unsigned int idx)
47641 +{
47642 +       return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]);
47643 +}
47644 +
47645 +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
47646 +{
47647 +}
47648 +
47649 +static void init_evtchn_cpu_bindings(void)
47650 +{
47651 +}
47652 +
47653 +static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
47654 +{
47655 +       return 0;
47656 +}
47657 +
47658 +#endif
47659 +
47660 +/* Upcall to generic IRQ layer. */
47661 +#ifdef CONFIG_X86
47662 +extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
47663 +#if defined (__i386__)
47664 +static inline void exit_idle(void) {}
47665 +#define IRQ_REG orig_eax
47666 +#elif defined (__x86_64__)
47667 +#include <asm/idle.h>
47668 +#define IRQ_REG orig_rax
47669 +#endif
47670 +#define do_IRQ(irq, regs) do {         \
47671 +       (regs)->IRQ_REG = ~(irq);       \
47672 +       do_IRQ((regs));                 \
47673 +} while (0)
47674 +#endif
47675 +
47676 +/* Xen will never allocate port zero for any purpose. */
47677 +#define VALID_EVTCHN(chn)      ((chn) != 0)
47678 +
47679 +/*
47680 + * Force a proper event-channel callback from Xen after clearing the
47681 + * callback mask. We do this in a very simple manner, by making a call
47682 + * down into Xen. The pending flag will be checked by Xen on return.
47683 + */
47684 +void force_evtchn_callback(void)
47685 +{
47686 +       (void)HYPERVISOR_xen_version(0, NULL);
47687 +}
47688 +EXPORT_SYMBOL_GPL(force_evtchn_callback);
47689 +
47690 +/* NB. Interrupts are disabled on entry. */
47691 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
47692 +{
47693 +       unsigned long  l1, l2;
47694 +       unsigned int   l1i, l2i, port;
47695 +       int            irq, cpu = smp_processor_id();
47696 +       shared_info_t *s = HYPERVISOR_shared_info;
47697 +       vcpu_info_t   *vcpu_info = &s->vcpu_info[cpu];
47698 +
47699 +       vcpu_info->evtchn_upcall_pending = 0;
47700 +
47701 +       /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
47702 +       l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
47703 +       while (l1 != 0) {
47704 +               l1i = __ffs(l1);
47705 +               l1 &= ~(1UL << l1i);
47706 +
47707 +               while ((l2 = active_evtchns(cpu, s, l1i)) != 0) {
47708 +                       l2i = __ffs(l2);
47709 +
47710 +                       port = (l1i * BITS_PER_LONG) + l2i;
47711 +                       if ((irq = evtchn_to_irq[port]) != -1)
47712 +                               do_IRQ(irq, regs);
47713 +                       else {
47714 +                               exit_idle();
47715 +                               evtchn_device_upcall(port);
47716 +                       }
47717 +               }
47718 +       }
47719 +}
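
The upcall above is the standard two-level bitmap walk: the per-VCPU selector word names which words of the shared pending array may hold active ports, and __ffs() picks off set bits one word at a time. A standalone model (not part of the patch; __builtin_ctzl stands in for __ffs, and the mask/pending interaction is omitted):

#include <stdio.h>

#define LONG_BITS (8 * sizeof(unsigned long))

static void scan(unsigned long l1, const unsigned long *pending)
{
	while (l1 != 0) {
		unsigned int l1i = __builtin_ctzl(l1);	/* lowest set bit */
		unsigned long l2 = pending[l1i];

		l1 &= ~(1UL << l1i);
		while (l2 != 0) {
			unsigned int l2i = __builtin_ctzl(l2);

			l2 &= ~(1UL << l2i);
			printf("port %lu\n",
			       (unsigned long)(l1i * LONG_BITS + l2i));
		}
	}
}

int main(void)
{
	/* Ports 0 and 2 in word 0; port LONG_BITS+8 in word 1. */
	unsigned long pending[2] = { 0x5UL, 0x100UL };

	scan(0x3UL, pending);	/* selector flags words 0 and 1 */
	return 0;
}

One difference from the model: the real loop re-evaluates active_evtchns() on every pass, so ports masked by a handler mid-scan are skipped.
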
47720 +
47721 +static int find_unbound_irq(void)
47722 +{
47723 +       int irq;
47724 +
47725 +       for (irq = 0; irq < NR_IRQS; irq++)
47726 +               if (irq_bindcount[irq] == 0)
47727 +                       break;
47728 +
47729 +       if (irq == NR_IRQS)
47730 +               panic("No available IRQ to bind to: increase NR_IRQS!\n");
47731 +
47732 +       return irq;
47733 +}
47734 +
47735 +static int bind_evtchn_to_irq(unsigned int evtchn)
47736 +{
47737 +       int irq;
47738 +
47739 +       spin_lock(&irq_mapping_update_lock);
47740 +
47741 +       if ((irq = evtchn_to_irq[evtchn]) == -1) {
47742 +               irq = find_unbound_irq();
47743 +               evtchn_to_irq[evtchn] = irq;
47744 +               irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
47745 +       }
47746 +
47747 +       irq_bindcount[irq]++;
47748 +
47749 +       spin_unlock(&irq_mapping_update_lock);
47750 +
47751 +       return irq;
47752 +}
47753 +
47754 +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
47755 +{
47756 +       evtchn_op_t op = { .cmd = EVTCHNOP_bind_virq };
47757 +       int evtchn, irq;
47758 +
47759 +       spin_lock(&irq_mapping_update_lock);
47760 +
47761 +       if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
47762 +               op.u.bind_virq.virq = virq;
47763 +               op.u.bind_virq.vcpu = cpu;
47764 +               BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
47765 +               evtchn = op.u.bind_virq.port;
47766 +
47767 +               irq = find_unbound_irq();
47768 +               evtchn_to_irq[evtchn] = irq;
47769 +               irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
47770 +
47771 +               per_cpu(virq_to_irq, cpu)[virq] = irq;
47772 +
47773 +               bind_evtchn_to_cpu(evtchn, cpu);
47774 +       }
47775 +
47776 +       irq_bindcount[irq]++;
47777 +
47778 +       spin_unlock(&irq_mapping_update_lock);
47779 +
47780 +       return irq;
47781 +}
47782 +
47783 +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
47784 +{
47785 +       evtchn_op_t op = { .cmd = EVTCHNOP_bind_ipi };
47786 +       int evtchn, irq;
47787 +
47788 +       spin_lock(&irq_mapping_update_lock);
47789 +
47790 +       if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) {
47791 +               op.u.bind_ipi.vcpu = cpu;
47792 +               BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
47793 +               evtchn = op.u.bind_ipi.port;
47794 +
47795 +               irq = find_unbound_irq();
47796 +               evtchn_to_irq[evtchn] = irq;
47797 +               irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
47798 +
47799 +               per_cpu(ipi_to_irq, cpu)[ipi] = irq;
47800 +
47801 +               bind_evtchn_to_cpu(evtchn, cpu);
47802 +       }
47803 +
47804 +       irq_bindcount[irq]++;
47805 +
47806 +       spin_unlock(&irq_mapping_update_lock);
47807 +
47808 +       return irq;
47809 +}
47810 +
47811 +static void unbind_from_irq(unsigned int irq)
47812 +{
47813 +       evtchn_op_t op = { .cmd = EVTCHNOP_close };
47814 +       int evtchn = evtchn_from_irq(irq);
47815 +
47816 +       spin_lock(&irq_mapping_update_lock);
47817 +
47818 +       if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) {
47819 +               op.u.close.port = evtchn;
47820 +               BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
47821 +
47822 +               switch (type_from_irq(irq)) {
47823 +               case IRQT_VIRQ:
47824 +                       per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
47825 +                               [index_from_irq(irq)] = -1;
47826 +                       break;
47827 +               case IRQT_IPI:
47828 +                       per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
47829 +                               [index_from_irq(irq)] = -1;
47830 +                       break;
47831 +               default:
47832 +                       break;
47833 +               }
47834 +
47835 +               /* Closed ports are implicitly re-bound to VCPU0. */
47836 +               bind_evtchn_to_cpu(evtchn, 0);
47837 +
47838 +               evtchn_to_irq[evtchn] = -1;
47839 +               irq_info[irq] = IRQ_UNBOUND;
47840 +       }
47841 +
47842 +       spin_unlock(&irq_mapping_update_lock);
47843 +}
47844 +
47845 +int bind_evtchn_to_irqhandler(
47846 +       unsigned int evtchn,
47847 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
47848 +       unsigned long irqflags,
47849 +       const char *devname,
47850 +       void *dev_id)
47851 +{
47852 +       unsigned int irq;
47853 +       int retval;
47854 +
47855 +       irq = bind_evtchn_to_irq(evtchn);
47856 +       retval = request_irq(irq, handler, irqflags, devname, dev_id);
47857 +       if (retval != 0) {
47858 +               unbind_from_irq(irq);
47859 +               return retval;
47860 +       }
47861 +
47862 +       return irq;
47863 +}
47864 +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
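
A usage sketch (not part of the patch): a frontend that has learned an event-channel port from its backend binds it to a handler and unbinds on teardown. The names here ("my_evtchn", "my_interrupt", "my-frontend") are illustrative. Note the return value is the Linux IRQ, which is needed later for unbind_from_irqhandler():

#include <linux/interrupt.h>
#include <xen/evtchn.h>

static irqreturn_t my_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	/* The evtchn was masked and cleared by ack_dynirq() before we run. */
	return IRQ_HANDLED;
}

static int my_bind(unsigned int my_evtchn, void *my_dev)
{
	int irq = bind_evtchn_to_irqhandler(my_evtchn, my_interrupt, 0,
					    "my-frontend", my_dev);
	if (irq < 0)
		return irq;	/* request_irq() failed; already unbound */

	/* ... later, on teardown: unbind_from_irqhandler(irq, my_dev); */
	return irq;
}
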
47865 +
47866 +int bind_virq_to_irqhandler(
47867 +       unsigned int virq,
47868 +       unsigned int cpu,
47869 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
47870 +       unsigned long irqflags,
47871 +       const char *devname,
47872 +       void *dev_id)
47873 +{
47874 +       unsigned int irq;
47875 +       int retval;
47876 +
47877 +       irq = bind_virq_to_irq(virq, cpu);
47878 +       retval = request_irq(irq, handler, irqflags, devname, dev_id);
47879 +       if (retval != 0) {
47880 +               unbind_from_irq(irq);
47881 +               return retval;
47882 +       }
47883 +
47884 +       return irq;
47885 +}
47886 +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
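
Similarly for virtual IRQs; a sketch (not part of the patch) binding VIRQ_DEBUG on VCPU 0. VIRQ_DEBUG comes from xen/interface/xen.h; the handler name is illustrative:

#include <linux/interrupt.h>
#include <xen/evtchn.h>
#include <xen/interface/xen.h>

static irqreturn_t my_debug_interrupt(int irq, void *dev_id,
				      struct pt_regs *regs)
{
	printk(KERN_INFO "VIRQ_DEBUG delivered\n");
	return IRQ_HANDLED;
}

static int __init my_virq_init(void)
{
	int irq = bind_virq_to_irqhandler(VIRQ_DEBUG, 0, my_debug_interrupt,
					  0, "debug-virq", NULL);
	return (irq < 0) ? irq : 0;
}
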
47887 +
47888 +int bind_ipi_to_irqhandler(
47889 +       unsigned int ipi,
47890 +       unsigned int cpu,
47891 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
47892 +       unsigned long irqflags,
47893 +       const char *devname,
47894 +       void *dev_id)
47895 +{
47896 +       unsigned int irq;
47897 +       int retval;
47898 +
47899 +       irq = bind_ipi_to_irq(ipi, cpu);
47900 +       retval = request_irq(irq, handler, irqflags, devname, dev_id);
47901 +       if (retval != 0) {
47902 +               unbind_from_irq(irq);
47903 +               return retval;
47904 +       }
47905 +
47906 +       return irq;
47907 +}
47908 +EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler);
47909 +
47910 +void unbind_from_irqhandler(unsigned int irq, void *dev_id)
47911 +{
47912 +       free_irq(irq, dev_id);
47913 +       unbind_from_irq(irq);
47914 +}
47915 +EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
47916 +
47917 +#ifdef CONFIG_SMP
47918 +static void do_nothing_function(void *ign)
47919 +{
47920 +}
47921 +#endif
47922 +
47923 +/* Rebind an evtchn so that it gets delivered to a specific cpu */
47924 +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
47925 +{
47926 +       evtchn_op_t op = { .cmd = EVTCHNOP_bind_vcpu };
47927 +       int evtchn;
47928 +
47929 +       spin_lock(&irq_mapping_update_lock);
47930 +
47931 +       evtchn = evtchn_from_irq(irq);
47932 +       if (!VALID_EVTCHN(evtchn)) {
47933 +               spin_unlock(&irq_mapping_update_lock);
47934 +               return;
47935 +       }
47936 +
47937 +       /* Send future instances of this interrupt to other vcpu. */
47938 +       op.u.bind_vcpu.port = evtchn;
47939 +       op.u.bind_vcpu.vcpu = tcpu;
47940 +
47941 +       /*
47942 +        * If this fails, it usually just means we're dealing with a VIRQ or
47943 +        * IPI channel, which are per-VCPU and don't need to be rebound. Ignore
47944 +        * the error, but skip the xenlinux-level rebind in that case.
47945 +        */
47946 +       if (HYPERVISOR_event_channel_op(&op) >= 0)
47947 +               bind_evtchn_to_cpu(evtchn, tcpu);
47948 +
47949 +       spin_unlock(&irq_mapping_update_lock);
47950 +
47951 +       /*
47952 +        * Now send the new target processor a NOP IPI. When this returns, it
47953 +        * will check for any pending interrupts, and so service any that got 
47954 +        * delivered to the wrong processor by mistake.
47955 +        * 
47956 +        * XXX: The only time this is called with interrupts disabled is from
47957 +        * the hotplug/hotunplug path. In that case, all cpus are stopped with 
47958 +        * interrupts disabled, and the missed interrupts will be picked up
47959 +        * when they start again. This is kind of a hack.
47960 +        */
47961 +       if (!irqs_disabled())
47962 +               smp_call_function(do_nothing_function, NULL, 0, 0);
47963 +}
47964 +
47965 +
47966 +static void set_affinity_irq(unsigned irq, cpumask_t dest)
47967 +{
47968 +       unsigned tcpu = first_cpu(dest);
47969 +       rebind_irq_to_cpu(irq, tcpu);
47970 +}
47971 +
47972 +/*
47973 + * Interface to generic handling in irq.c
47974 + */
47975 +
47976 +static unsigned int startup_dynirq(unsigned int irq)
47977 +{
47978 +       int evtchn = evtchn_from_irq(irq);
47979 +
47980 +       if (VALID_EVTCHN(evtchn))
47981 +               unmask_evtchn(evtchn);
47982 +       return 0;
47983 +}
47984 +
47985 +static void shutdown_dynirq(unsigned int irq)
47986 +{
47987 +       int evtchn = evtchn_from_irq(irq);
47988 +
47989 +       if (VALID_EVTCHN(evtchn))
47990 +               mask_evtchn(evtchn);
47991 +}
47992 +
47993 +static void enable_dynirq(unsigned int irq)
47994 +{
47995 +       int evtchn = evtchn_from_irq(irq);
47996 +
47997 +       if (VALID_EVTCHN(evtchn))
47998 +               unmask_evtchn(evtchn);
47999 +}
48000 +
48001 +static void disable_dynirq(unsigned int irq)
48002 +{
48003 +       int evtchn = evtchn_from_irq(irq);
48004 +
48005 +       if (VALID_EVTCHN(evtchn))
48006 +               mask_evtchn(evtchn);
48007 +}
48008 +
48009 +static void ack_dynirq(unsigned int irq)
48010 +{
48011 +       int evtchn = evtchn_from_irq(irq);
48012 +
48013 +       if (VALID_EVTCHN(evtchn)) {
48014 +               mask_evtchn(evtchn);
48015 +               clear_evtchn(evtchn);
48016 +       }
48017 +}
48018 +
48019 +static void end_dynirq(unsigned int irq)
48020 +{
48021 +       int evtchn = evtchn_from_irq(irq);
48022 +
48023 +       if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED))
48024 +               unmask_evtchn(evtchn);
48025 +}
48026 +
48027 +static struct hw_interrupt_type dynirq_type = {
48028 +       "Dynamic-irq",
48029 +       startup_dynirq,
48030 +       shutdown_dynirq,
48031 +       enable_dynirq,
48032 +       disable_dynirq,
48033 +       ack_dynirq,
48034 +       end_dynirq,
48035 +       set_affinity_irq
48036 +};
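
The initializer above is positional, matching 2.6.16's struct hw_interrupt_type field order (typename, startup, shutdown, enable, disable, ack, end, set_affinity). For readability, the designated-initializer equivalent would be:

static struct hw_interrupt_type dynirq_type = {
	.typename     = "Dynamic-irq",
	.startup      = startup_dynirq,
	.shutdown     = shutdown_dynirq,
	.enable       = enable_dynirq,
	.disable      = disable_dynirq,
	.ack          = ack_dynirq,
	.end          = end_dynirq,
	.set_affinity = set_affinity_irq,
};
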
48037 +
48038 +static inline void pirq_unmask_notify(int pirq)
48039 +{
48040 +       physdev_op_t op;
48041 +       if (unlikely(test_bit(pirq, &pirq_needs_unmask_notify[0]))) {
48042 +               op.cmd = PHYSDEVOP_IRQ_UNMASK_NOTIFY;
48043 +               (void)HYPERVISOR_physdev_op(&op);
48044 +       }
48045 +}
48046 +
48047 +static inline void pirq_query_unmask(int pirq)
48048 +{
48049 +       physdev_op_t op;
48050 +       op.cmd = PHYSDEVOP_IRQ_STATUS_QUERY;
48051 +       op.u.irq_status_query.irq = pirq;
48052 +       (void)HYPERVISOR_physdev_op(&op);
48053 +       clear_bit(pirq, &pirq_needs_unmask_notify[0]);
48054 +       if (op.u.irq_status_query.flags & PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY)
48055 +               set_bit(pirq, &pirq_needs_unmask_notify[0]);
48056 +}
48057 +
48058 +/*
48059 + * On startup, if there is no action associated with the IRQ then we are
48060 + * probing. In this case we should not share with others as it will confuse us.
48061 + */
48062 +#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
48063 +
48064 +static unsigned int startup_pirq(unsigned int irq)
48065 +{
48066 +       evtchn_op_t op = { .cmd = EVTCHNOP_bind_pirq };
48067 +       int evtchn = evtchn_from_irq(irq);
48068 +
48069 +       if (VALID_EVTCHN(evtchn))
48070 +               goto out;
48071 +
48072 +       op.u.bind_pirq.pirq  = irq;
48073 +       /* NB. We are happy to share unless we are probing. */
48074 +       op.u.bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
48075 +       if (HYPERVISOR_event_channel_op(&op) != 0) {
48076 +               if (!probing_irq(irq))
48077 +                       printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
48078 +                              irq);
48079 +               return 0;
48080 +       }
48081 +       evtchn = op.u.bind_pirq.port;
48082 +
48083 +       pirq_query_unmask(irq_to_pirq(irq));
48084 +
48085 +       bind_evtchn_to_cpu(evtchn, 0);
48086 +       evtchn_to_irq[evtchn] = irq;
48087 +       irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, evtchn);
48088 +
48089 + out:
48090 +       unmask_evtchn(evtchn);
48091 +       pirq_unmask_notify(irq_to_pirq(irq));
48092 +
48093 +       return 0;
48094 +}
48095 +
48096 +static void shutdown_pirq(unsigned int irq)
48097 +{
48098 +       evtchn_op_t op = { .cmd = EVTCHNOP_close };
48099 +       int evtchn = evtchn_from_irq(irq);
48100 +
48101 +       if (!VALID_EVTCHN(evtchn))
48102 +               return;
48103 +
48104 +       mask_evtchn(evtchn);
48105 +
48106 +       op.u.close.port = evtchn;
48107 +       BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
48108 +
48109 +       bind_evtchn_to_cpu(evtchn, 0);
48110 +       evtchn_to_irq[evtchn] = -1;
48111 +       irq_info[irq] = IRQ_UNBOUND;
48112 +}
48113 +
48114 +static void enable_pirq(unsigned int irq)
48115 +{
48116 +       int evtchn = evtchn_from_irq(irq);
48117 +
48118 +       if (VALID_EVTCHN(evtchn)) {
48119 +               unmask_evtchn(evtchn);
48120 +               pirq_unmask_notify(irq_to_pirq(irq));
48121 +       }
48122 +}
48123 +
48124 +static void disable_pirq(unsigned int irq)
48125 +{
48126 +       int evtchn = evtchn_from_irq(irq);
48127 +
48128 +       if (VALID_EVTCHN(evtchn))
48129 +               mask_evtchn(evtchn);
48130 +}
48131 +
48132 +static void ack_pirq(unsigned int irq)
48133 +{
48134 +       int evtchn = evtchn_from_irq(irq);
48135 +
48136 +       if (VALID_EVTCHN(evtchn)) {
48137 +               mask_evtchn(evtchn);
48138 +               clear_evtchn(evtchn);
48139 +       }
48140 +}
48141 +
48142 +static void end_pirq(unsigned int irq)
48143 +{
48144 +       int evtchn = evtchn_from_irq(irq);
48145 +
48146 +       if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED)) {
48147 +               unmask_evtchn(evtchn);
48148 +               pirq_unmask_notify(irq_to_pirq(irq));
48149 +       }
48150 +}
48151 +
48152 +static struct hw_interrupt_type pirq_type = {
48153 +       "Phys-irq",
48154 +       startup_pirq,
48155 +       shutdown_pirq,
48156 +       enable_pirq,
48157 +       disable_pirq,
48158 +       ack_pirq,
48159 +       end_pirq,
48160 +       set_affinity_irq
48161 +};
48162 +
48163 +void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
48164 +{
48165 +       int evtchn = evtchn_from_irq(i);
48166 +       shared_info_t *s = HYPERVISOR_shared_info;
48167 +       if (!VALID_EVTCHN(evtchn))
48168 +               return;
48169 +       BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0]));
48170 +       synch_set_bit(evtchn, &s->evtchn_pending[0]);
48171 +}
48172 +
48173 +void notify_remote_via_irq(int irq)
48174 +{
48175 +       int evtchn = evtchn_from_irq(irq);
48176 +
48177 +       if (VALID_EVTCHN(evtchn))
48178 +               notify_remote_via_evtchn(evtchn);
48179 +}
48180 +EXPORT_SYMBOL_GPL(notify_remote_via_irq);
48181 +
48182 +void mask_evtchn(int port)
48183 +{
48184 +       shared_info_t *s = HYPERVISOR_shared_info;
48185 +       synch_set_bit(port, &s->evtchn_mask[0]);
48186 +}
48187 +EXPORT_SYMBOL_GPL(mask_evtchn);
48188 +
48189 +void unmask_evtchn(int port)
48190 +{
48191 +       shared_info_t *s = HYPERVISOR_shared_info;
48192 +       unsigned int cpu = smp_processor_id();
48193 +       vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
48194 +
48195 +       /* Slow path (hypercall) if this is a non-local port. */
48196 +       if (unlikely(cpu != cpu_from_evtchn(port))) {
48197 +               evtchn_op_t op = { .cmd = EVTCHNOP_unmask,
48198 +                                  .u.unmask.port = port };
48199 +               (void)HYPERVISOR_event_channel_op(&op);
48200 +               return;
48201 +       }
48202 +
48203 +       synch_clear_bit(port, &s->evtchn_mask[0]);
48204 +
48205 +       /*
48206 +        * The following is basically the equivalent of 'hw_resend_irq'. Just
48207 +        * like a real IO-APIC we 'lose the interrupt edge' if the channel is
48208 +        * masked.
48209 +        */
48210 +       if (synch_test_bit(port, &s->evtchn_pending[0]) &&
48211 +           !synch_test_and_set_bit(port / BITS_PER_LONG,
48212 +                                   &vcpu_info->evtchn_pending_sel)) {
48213 +               vcpu_info->evtchn_upcall_pending = 1;
48214 +               if (!vcpu_info->evtchn_upcall_mask)
48215 +                       force_evtchn_callback();
48216 +       }
48217 +}
48218 +EXPORT_SYMBOL_GPL(unmask_evtchn);
48219 +
48220 +void irq_resume(void)
48221 +{
48222 +       evtchn_op_t op;
48223 +       int         cpu, pirq, virq, ipi, irq, evtchn;
48224 +
48225 +       init_evtchn_cpu_bindings();
48226 +
48227 +       /* New event-channel space is not 'live' yet. */
48228 +       for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
48229 +               mask_evtchn(evtchn);
48230 +
48231 +       /* Check that no PIRQs are still bound. */
48232 +       for (pirq = 0; pirq < NR_PIRQS; pirq++)
48233 +               BUG_ON(irq_info[pirq_to_irq(pirq)] != IRQ_UNBOUND);
48234 +
48235 +       /* Secondary CPUs must have no VIRQ or IPI bindings. */
48236 +       for (cpu = 1; cpu < NR_CPUS; cpu++) {
48237 +               for (virq = 0; virq < NR_VIRQS; virq++)
48238 +                       BUG_ON(per_cpu(virq_to_irq, cpu)[virq] != -1);
48239 +               for (ipi = 0; ipi < NR_IPIS; ipi++)
48240 +                       BUG_ON(per_cpu(ipi_to_irq, cpu)[ipi] != -1);
48241 +       }
48242 +
48243 +       /* No IRQ <-> event-channel mappings. */
48244 +       for (irq = 0; irq < NR_IRQS; irq++)
48245 +               irq_info[irq] &= ~0xFFFF; /* zap event-channel binding */
48246 +       for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
48247 +               evtchn_to_irq[evtchn] = -1;
48248 +
48249 +       /* Primary CPU: rebind VIRQs automatically. */
48250 +       for (virq = 0; virq < NR_VIRQS; virq++) {
48251 +               if ((irq = per_cpu(virq_to_irq, 0)[virq]) == -1)
48252 +                       continue;
48253 +
48254 +               BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0));
48255 +
48256 +               /* Get a new binding from Xen. */
48257 +               memset(&op, 0, sizeof(op));
48258 +               op.cmd              = EVTCHNOP_bind_virq;
48259 +               op.u.bind_virq.virq = virq;
48260 +               op.u.bind_virq.vcpu = 0;
48261 +               BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
48262 +               evtchn = op.u.bind_virq.port;
48263 +
48264 +               /* Record the new mapping. */
48265 +               evtchn_to_irq[evtchn] = irq;
48266 +               irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
48267 +
48268 +               /* Ready for use. */
48269 +               unmask_evtchn(evtchn);
48270 +       }
48271 +
48272 +       /* Primary CPU: rebind IPIs automatically. */
48273 +       for (ipi = 0; ipi < NR_IPIS; ipi++) {
48274 +               if ((irq = per_cpu(ipi_to_irq, 0)[ipi]) == -1)
48275 +                       continue;
48276 +
48277 +               BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0));
48278 +
48279 +               /* Get a new binding from Xen. */
48280 +               memset(&op, 0, sizeof(op));
48281 +               op.cmd = EVTCHNOP_bind_ipi;
48282 +               op.u.bind_ipi.vcpu = 0;
48283 +               BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
48284 +               evtchn = op.u.bind_ipi.port;
48285 +
48286 +               /* Record the new mapping. */
48287 +               evtchn_to_irq[evtchn] = irq;
48288 +               irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
48289 +
48290 +               /* Ready for use. */
48291 +               unmask_evtchn(evtchn);
48292 +       }
48293 +}
48294 +
48295 +void __init init_IRQ(void)
48296 +{
48297 +       int i;
48298 +       int cpu;
48299 +
48300 +       irq_ctx_init(0);
48301 +
48302 +       spin_lock_init(&irq_mapping_update_lock);
48303 +
48304 +       init_evtchn_cpu_bindings();
48305 +
48306 +       /* No VIRQ or IPI bindings. */
48307 +       for (cpu = 0; cpu < NR_CPUS; cpu++) {
48308 +               for (i = 0; i < NR_VIRQS; i++)
48309 +                       per_cpu(virq_to_irq, cpu)[i] = -1;
48310 +               for (i = 0; i < NR_IPIS; i++)
48311 +                       per_cpu(ipi_to_irq, cpu)[i] = -1;
48312 +       }
48313 +
48314 +       /* No event-channel -> IRQ mappings. */
48315 +       for (i = 0; i < NR_EVENT_CHANNELS; i++) {
48316 +               evtchn_to_irq[i] = -1;
48317 +               mask_evtchn(i); /* No event channels are 'live' right now. */
48318 +       }
48319 +
48320 +       /* No IRQ -> event-channel mappings. */
48321 +       for (i = 0; i < NR_IRQS; i++)
48322 +               irq_info[i] = IRQ_UNBOUND;
48323 +
48324 +       /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
48325 +       for (i = 0; i < NR_DYNIRQS; i++) {
48326 +               irq_bindcount[dynirq_to_irq(i)] = 0;
48327 +
48328 +               irq_desc[dynirq_to_irq(i)].status  = IRQ_DISABLED;
48329 +               irq_desc[dynirq_to_irq(i)].action  = NULL;
48330 +               irq_desc[dynirq_to_irq(i)].depth   = 1;
48331 +               irq_desc[dynirq_to_irq(i)].handler = &dynirq_type;
48332 +       }
48333 +
48334 +       /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
48335 +       for (i = 0; i < NR_PIRQS; i++) {
48336 +               irq_bindcount[pirq_to_irq(i)] = 1;
48337 +
48338 +#ifdef RTC_IRQ
48339 +               /* If not domain 0, force our RTC driver to fail its probe. */
48340 +               if ((i == RTC_IRQ) &&
48341 +                   !(xen_start_info->flags & SIF_INITDOMAIN))
48342 +                       continue;
48343 +#endif
48344 +
48345 +               irq_desc[pirq_to_irq(i)].status  = IRQ_DISABLED;
48346 +               irq_desc[pirq_to_irq(i)].action  = NULL;
48347 +               irq_desc[pirq_to_irq(i)].depth   = 1;
48348 +               irq_desc[pirq_to_irq(i)].handler = &pirq_type;
48349 +       }
48350 +}
48351 +
48352 +/*
48353 + * Local variables:
48354 + *  c-file-style: "linux"
48355 + *  indent-tabs-mode: t
48356 + *  c-indent-level: 8
48357 + *  c-basic-offset: 8
48358 + *  tab-width: 8
48359 + * End:
48360 + */
48361 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/core/features.c linux-2.6.16/drivers/xen/core/features.c
48362 --- linux-2.6.16.orig/drivers/xen/core/features.c       1970-01-01 01:00:00.000000000 +0100
48363 +++ linux-2.6.16/drivers/xen/core/features.c    2006-06-26 09:51:32.000000000 +0200
48364 @@ -0,0 +1,29 @@
48365 +/******************************************************************************
48366 + * features.c
48367 + *
48368 + * Xen feature flags.
48369 + *
48370 + * Copyright (c) 2006, Ian Campbell, XenSource Inc.
48371 + */
48372 +#include <linux/types.h>
48373 +#include <linux/cache.h>
48374 +#include <linux/module.h>
48375 +#include <asm/hypervisor.h>
48376 +#include <xen/features.h>
48377 +
48378 +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
48379 +EXPORT_SYMBOL_GPL(xen_features);
48380 +
48381 +void setup_xen_features(void)
48382 +{
48383 +       xen_feature_info_t fi;
48384 +       int i, j;
48385 +
48386 +       for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
48387 +               fi.submap_idx = i;
48388 +               if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
48389 +                       break;
48390 +               for (j = 0; j < 32; j++)
48391 +                       xen_features[i*32+j] = !!(fi.submap & (1U << j));
48392 +       }
48393 +}
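
A usage sketch (not part of the patch): after setup_xen_features() has run, individual flags are tested cheaply with the xen_feature() accessor from <xen/features.h>, just as the suspend path in reboot.c below does:

#include <xen/features.h>

static int __init my_feature_check(void)
{
	if (xen_feature(XENFEAT_auto_translated_physmap))
		printk(KERN_INFO "running with auto-translated physmap\n");
	return 0;
}
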
48394 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/core/gnttab.c linux-2.6.16/drivers/xen/core/gnttab.c
48395 --- linux-2.6.16.orig/drivers/xen/core/gnttab.c 1970-01-01 01:00:00.000000000 +0100
48396 +++ linux-2.6.16/drivers/xen/core/gnttab.c      2006-06-26 09:51:32.000000000 +0200
48397 @@ -0,0 +1,464 @@
48398 +/******************************************************************************
48399 + * gnttab.c
48400 + * 
48401 + * Granting foreign access to our memory reservation.
48402 + * 
48403 + * Copyright (c) 2005, Christopher Clark
48404 + * Copyright (c) 2004-2005, K A Fraser
48405 + * 
48406 + * This program is free software; you can redistribute it and/or
48407 + * modify it under the terms of the GNU General Public License version 2
48408 + * as published by the Free Software Foundation; or, when distributed
48409 + * separately from the Linux kernel or incorporated into other
48410 + * software packages, subject to the following license:
48411 + * 
48412 + * Permission is hereby granted, free of charge, to any person obtaining a copy
48413 + * of this source file (the "Software"), to deal in the Software without
48414 + * restriction, including without limitation the rights to use, copy, modify,
48415 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
48416 + * and to permit persons to whom the Software is furnished to do so, subject to
48417 + * the following conditions:
48418 + * 
48419 + * The above copyright notice and this permission notice shall be included in
48420 + * all copies or substantial portions of the Software.
48421 + * 
48422 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
48423 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
48424 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
48425 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
48426 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
48427 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
48428 + * IN THE SOFTWARE.
48429 + */
48430 +
48431 +#include <linux/config.h>
48432 +#include <linux/module.h>
48433 +#include <linux/sched.h>
48434 +#include <linux/mm.h>
48435 +#include <linux/vmalloc.h>
48436 +#include <asm/pgtable.h>
48437 +#include <xen/interface/xen.h>
48438 +#include <asm/fixmap.h>
48439 +#include <asm/uaccess.h>
48440 +#include <xen/gnttab.h>
48441 +#include <asm/synch_bitops.h>
48442 +
48443 +#if 1
48444 +#define ASSERT(_p) do {                                                      \
48445 +       if (!(_p)) { printk(KERN_ALERT "Assertion '%s': line %d, file %s\n", \
48446 +       #_p, __LINE__, __FILE__); *(int *)0 = 0; } } while (0)
48447 +#else
48448 +#define ASSERT(_p) ((void)0)
48449 +#endif
48450 +
48451 +#define WPRINTK(fmt, args...)                          \
48452 +       printk(KERN_WARNING "xen_grant: " fmt, ##args)
48453 +
48454 +
48455 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
48456 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
48457 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
48458 +EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
48459 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
48460 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
48461 +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
48462 +EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
48463 +EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
48464 +EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
48465 +EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
48466 +EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
48467 +EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
48468 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
48469 +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
48470 +
48471 +/* External tools reserve first few grant table entries. */
48472 +#define NR_RESERVED_ENTRIES 8
48473 +
48474 +#define NR_GRANT_ENTRIES (NR_GRANT_FRAMES * PAGE_SIZE / sizeof(grant_entry_t))
48475 +#define GNTTAB_LIST_END (NR_GRANT_ENTRIES + 1)
48476 +
48477 +static grant_ref_t gnttab_list[NR_GRANT_ENTRIES];
48478 +static int gnttab_free_count;
48479 +static grant_ref_t gnttab_free_head;
48480 +static spinlock_t gnttab_list_lock = SPIN_LOCK_UNLOCKED;
48481 +
48482 +static grant_entry_t *shared = NULL;
48483 +
48484 +static struct gnttab_free_callback *gnttab_free_callback_list = NULL;
48485 +
48486 +static int
48487 +get_free_entries(int count)
48488 +{
48489 +       unsigned long flags;
48490 +       int ref;
48491 +       grant_ref_t head;
48492 +       spin_lock_irqsave(&gnttab_list_lock, flags);
48493 +       if (gnttab_free_count < count) {
48494 +               spin_unlock_irqrestore(&gnttab_list_lock, flags);
48495 +               return -1;
48496 +       }
48497 +       ref = head = gnttab_free_head;
48498 +       gnttab_free_count -= count;
48499 +       while (count-- > 1)
48500 +               head = gnttab_list[head];
48501 +       gnttab_free_head = gnttab_list[head];
48502 +       gnttab_list[head] = GNTTAB_LIST_END;
48503 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
48504 +       return ref;
48505 +}
48506 +
48507 +#define get_free_entry() get_free_entries(1)
48508 +
48509 +static void
48510 +do_free_callbacks(void)
48511 +{
48512 +       struct gnttab_free_callback *callback, *next;
48513 +
48514 +       callback = gnttab_free_callback_list;
48515 +       gnttab_free_callback_list = NULL;
48516 +
48517 +       while (callback != NULL) {
48518 +               next = callback->next;
48519 +               if (gnttab_free_count >= callback->count) {
48520 +                       callback->next = NULL;
48521 +                       callback->fn(callback->arg);
48522 +               } else {
48523 +                       callback->next = gnttab_free_callback_list;
48524 +                       gnttab_free_callback_list = callback;
48525 +               }
48526 +               callback = next;
48527 +       }
48528 +}
48529 +
48530 +static inline void
48531 +check_free_callbacks(void)
48532 +{
48533 +       if (unlikely(gnttab_free_callback_list))
48534 +               do_free_callbacks();
48535 +}
48536 +
48537 +static void
48538 +put_free_entry(grant_ref_t ref)
48539 +{
48540 +       unsigned long flags;
48541 +       spin_lock_irqsave(&gnttab_list_lock, flags);
48542 +       gnttab_list[ref] = gnttab_free_head;
48543 +       gnttab_free_head = ref;
48544 +       gnttab_free_count++;
48545 +       check_free_callbacks();
48546 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
48547 +}
48548 +
48549 +/*
48550 + * Public grant-issuing interface functions
48551 + */
48552 +
48553 +int
48554 +gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly)
48555 +{
48556 +       int ref;
48557 +
48558 +       if (unlikely((ref = get_free_entry()) == -1))
48559 +               return -ENOSPC;
48560 +
48561 +       shared[ref].frame = frame;
48562 +       shared[ref].domid = domid;
48563 +       wmb();
48564 +       shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
48565 +
48566 +       return ref;
48567 +}
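
A usage sketch (not part of the patch): grant a peer domain ("my_otherend_id", normally learned from xenstore) read-only access to one freshly allocated page, then revoke it. virt_to_mfn() is the pseudo-physical-to-machine translation from the Xen arch headers:

#include <linux/mm.h>
#include <xen/gnttab.h>

static int my_share_page(domid_t my_otherend_id)
{
	unsigned long page = get_zeroed_page(GFP_KERNEL);
	int ref;

	if (page == 0)
		return -ENOMEM;

	ref = gnttab_grant_foreign_access(my_otherend_id,
					  virt_to_mfn(page), 1 /* read-only */);
	if (ref < 0) {
		free_page(page);
		return ref;	/* -ENOSPC: grant table full */
	}

	/* ... advertise ref to the other end (e.g. via xenstore), do I/O ... */

	/* Revoke; gnttab_end_foreign_access() also frees the page for us. */
	gnttab_end_foreign_access(ref, 1, page);
	return 0;
}
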
48568 +
48569 +void
48570 +gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
48571 +                               unsigned long frame, int readonly)
48572 +{
48573 +       shared[ref].frame = frame;
48574 +       shared[ref].domid = domid;
48575 +       wmb();
48576 +       shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
48577 +}
48578 +
48579 +
48580 +int
48581 +gnttab_query_foreign_access(grant_ref_t ref)
48582 +{
48583 +       u16 nflags;
48584 +
48585 +       nflags = shared[ref].flags;
48586 +
48587 +       return (nflags & (GTF_reading|GTF_writing));
48588 +}
48589 +
48590 +int
48591 +gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
48592 +{
48593 +       u16 flags, nflags;
48594 +
48595 +       nflags = shared[ref].flags;
48596 +       do {
48597 +               if ((flags = nflags) & (GTF_reading|GTF_writing)) {
48598 +                       printk(KERN_ALERT "WARNING: g.e. still in use!\n");
48599 +                       return 0;
48600 +               }
48601 +       } while ((nflags = synch_cmpxchg(&shared[ref].flags, flags, 0)) !=
48602 +                flags);
48603 +
48604 +       return 1;
48605 +}
48606 +
48607 +void
48608 +gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page)
48609 +{
48610 +       if (gnttab_end_foreign_access_ref(ref, readonly)) {
48611 +               put_free_entry(ref);
48612 +               if (page != 0) {
48613 +                       free_page(page);
48614 +               }
48615 +       } else {
48616 +               /* XXX This needs to be fixed so that the ref and page are
48617 +                  placed on a list to be freed up later. */
48618 +               printk(KERN_WARNING
48619 +                      "WARNING: leaking g.e. and page still in use!\n");
48620 +       }
48621 +}
48622 +
48623 +int
48624 +gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
48625 +{
48626 +       int ref;
48627 +
48628 +       if (unlikely((ref = get_free_entry()) == -1))
48629 +               return -ENOSPC;
48630 +       gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
48631 +
48632 +       return ref;
48633 +}
48634 +
48635 +void
48636 +gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
48637 +                                 unsigned long pfn)
48638 +{
48639 +       shared[ref].frame = pfn;
48640 +       shared[ref].domid = domid;
48641 +       wmb();
48642 +       shared[ref].flags = GTF_accept_transfer;
48643 +}
48644 +
48645 +unsigned long
48646 +gnttab_end_foreign_transfer_ref(grant_ref_t ref)
48647 +{
48648 +       unsigned long frame;
48649 +       u16           flags;
48650 +
48651 +       /*
48652 +        * If the transfer has not yet started, try to reclaim the grant
48653 +        * reference and return failure (== 0).
48654 +        */
48655 +       while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
48656 +               if (synch_cmpxchg(&shared[ref].flags, flags, 0) == flags)
48657 +                       return 0;
48658 +               cpu_relax();
48659 +       }
48660 +
48661 +       /* If a transfer is in progress then wait until it is completed. */
48662 +       while (!(flags & GTF_transfer_completed)) {
48663 +               flags = shared[ref].flags;
48664 +               cpu_relax();
48665 +       }
48666 +
48667 +       /* Read the frame number /after/ reading completion status. */
48668 +       rmb();
48669 +       frame = shared[ref].frame;
48670 +       BUG_ON(frame == 0);
48671 +
48672 +       return frame;
48673 +}
48674 +
48675 +unsigned long
48676 +gnttab_end_foreign_transfer(grant_ref_t ref)
48677 +{
48678 +       unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
48679 +       put_free_entry(ref);
48680 +       return frame;
48681 +}
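
A usage sketch (not part of the patch) of the transfer path (used by the page-flipping receive mode of the network frontend): offer a pfn, let the remote domain transfer a frame into it, and collect the new machine frame. A return of 0 means the remote side never committed:

#include <xen/gnttab.h>

static unsigned long my_accept_transfer(domid_t my_otherend_id,
					unsigned long pfn)
{
	int ref = gnttab_grant_foreign_transfer(my_otherend_id, pfn);

	if (ref < 0)
		return 0;

	/* ... notify the other end and wait for it to transfer a frame ... */

	/* Spins until a committed transfer completes; also frees the ref. */
	return gnttab_end_foreign_transfer(ref);
}
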
48682 +
48683 +void
48684 +gnttab_free_grant_reference(grant_ref_t ref)
48685 +{
48686 +
48687 +       put_free_entry(ref);
48688 +}
48689 +
48690 +void
48691 +gnttab_free_grant_references(grant_ref_t head)
48692 +{
48693 +       grant_ref_t ref;
48694 +       unsigned long flags;
48695 +       int count = 1;
48696 +       if (head == GNTTAB_LIST_END)
48697 +               return;
48698 +       spin_lock_irqsave(&gnttab_list_lock, flags);
48699 +       ref = head;
48700 +       while (gnttab_list[ref] != GNTTAB_LIST_END) {
48701 +               ref = gnttab_list[ref];
48702 +               count++;
48703 +       }
48704 +       gnttab_list[ref] = gnttab_free_head;
48705 +       gnttab_free_head = head;
48706 +       gnttab_free_count += count;
48707 +       check_free_callbacks();
48708 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
48709 +}
48710 +
48711 +int
48712 +gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
48713 +{
48714 +       int h = get_free_entries(count);
48715 +
48716 +       if (h == -1)
48717 +               return -ENOSPC;
48718 +
48719 +       *head = h;
48720 +
48721 +       return 0;
48722 +}
48723 +
48724 +int
48725 +gnttab_claim_grant_reference(grant_ref_t *private_head)
48726 +{
48727 +       grant_ref_t g = *private_head;
48728 +       if (unlikely(g == GNTTAB_LIST_END))
48729 +               return -ENOSPC;
48730 +       *private_head = gnttab_list[g];
48731 +       return g;
48732 +}
48733 +
48734 +void
48735 +gnttab_release_grant_reference(grant_ref_t *private_head, grant_ref_t  release)
48736 +{
48737 +       gnttab_list[release] = *private_head;
48738 +       *private_head = release;
48739 +}
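
A usage sketch (not part of the patch): reserve a batch of references up front and hand them out per request, as the block and network frontends do. The batch either succeeds or fails as a unit with -ENOSPC:

#include <xen/gnttab.h>

static grant_ref_t my_ref_head;

static int my_setup(void)
{
	return gnttab_alloc_grant_references(16, &my_ref_head);
}

static int my_grant_one(domid_t my_otherend_id, unsigned long mfn)
{
	int ref = gnttab_claim_grant_reference(&my_ref_head);

	if (ref < 0)
		return ref;	/* private batch exhausted */
	gnttab_grant_foreign_access_ref(ref, my_otherend_id, mfn, 0);
	return ref;
}

static void my_ungrant_one(grant_ref_t ref)
{
	if (gnttab_end_foreign_access_ref(ref, 0))
		gnttab_release_grant_reference(&my_ref_head, ref);
}
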
48740 +
48741 +void
48742 +gnttab_request_free_callback(struct gnttab_free_callback *callback,
48743 +                            void (*fn)(void *), void *arg, u16 count)
48744 +{
48745 +       unsigned long flags;
48746 +       spin_lock_irqsave(&gnttab_list_lock, flags);
48747 +       if (callback->next)
48748 +               goto out;
48749 +       callback->fn = fn;
48750 +       callback->arg = arg;
48751 +       callback->count = count;
48752 +       callback->next = gnttab_free_callback_list;
48753 +       gnttab_free_callback_list = callback;
48754 +       check_free_callbacks();
48755 + out:
48756 +       spin_unlock_irqrestore(&gnttab_list_lock, flags);
48757 +}
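
A usage sketch (not part of the patch): when a claim fails, ask to be called back once enough references are free. Note that do_free_callbacks() invokes the callback under gnttab_list_lock with interrupts off, so it must not sleep; drivers typically just reschedule their queue:

#include <xen/gnttab.h>

static struct gnttab_free_callback my_callback;

static void my_refs_available(void *arg)
{
	/* e.g. kick a tasklet or wake the transmit path; must not sleep */
}

static void my_wait_for_refs(void)
{
	gnttab_request_free_callback(&my_callback, my_refs_available,
				     NULL, 16 /* wake when 16 refs free */);
}
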
48758 +
48759 +#ifndef __ia64__
48760 +static int map_pte_fn(pte_t *pte, struct page *pmd_page,
48761 +                     unsigned long addr, void *data)
48762 +{
48763 +       unsigned long **frames = (unsigned long **)data;
48764 +
48765 +       set_pte_at(&init_mm, addr, pte, pfn_pte_ma((*frames)[0], PAGE_KERNEL));
48766 +       (*frames)++;
48767 +       return 0;
48768 +}
48769 +
48770 +static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
48771 +                     unsigned long addr, void *data)
48772 +{
48773 +
48774 +       set_pte_at(&init_mm, addr, pte, __pte(0));
48775 +       return 0;
48776 +}
48777 +#endif
48778 +
48779 +int
48780 +gnttab_resume(void)
48781 +{
48782 +       gnttab_setup_table_t setup;
48783 +       unsigned long frames[NR_GRANT_FRAMES];
48784 +       int rc;
48785 +#ifndef __ia64__
48786 +       void *pframes = frames;
48787 +       struct vm_struct *area;
48788 +#endif
48789 +
48790 +       setup.dom        = DOMID_SELF;
48791 +       setup.nr_frames  = NR_GRANT_FRAMES;
48792 +       setup.frame_list = frames;
48793 +
48794 +       rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
48795 +       if (rc == -ENOSYS)
48796 +               return -ENOSYS;
48797 +
48798 +       BUG_ON(rc || setup.status);
48799 +
48800 +#ifndef __ia64__
48801 +       if (shared == NULL) {
48802 +               area = get_vm_area(PAGE_SIZE * NR_GRANT_FRAMES, VM_IOREMAP);
48803 +               BUG_ON(area == NULL);
48804 +               shared = area->addr;
48805 +       }
48806 +       rc = apply_to_page_range(&init_mm, (unsigned long)shared,
48807 +                                PAGE_SIZE * NR_GRANT_FRAMES,
48808 +                                map_pte_fn, &pframes);
48809 +       BUG_ON(rc);
48810 +#else
48811 +       shared = __va(frames[0] << PAGE_SHIFT);
48812 +       printk("grant table at %p\n", shared);
48813 +#endif
48814 +
48815 +       return 0;
48816 +}
48817 +
48818 +int
48819 +gnttab_suspend(void)
48820 +{
48821 +
48822 +#ifndef __ia64__
48823 +       apply_to_page_range(&init_mm, (unsigned long)shared,
48824 +                           PAGE_SIZE * NR_GRANT_FRAMES,
48825 +                           unmap_pte_fn, NULL);
48826 +#endif
48827 +
48828 +       return 0;
48829 +}
48830 +
48831 +static int __init
48832 +gnttab_init(void)
48833 +{
48834 +       int i;
48835 +
48836 +       if (xen_init() < 0)
48837 +               return -ENODEV;
48838 +
48839 +       if (gnttab_resume() < 0)
48840 +               return -ENODEV;
48841 +
48842 +       for (i = NR_RESERVED_ENTRIES; i < NR_GRANT_ENTRIES; i++)
48843 +               gnttab_list[i] = i + 1;
48844 +       gnttab_free_count = NR_GRANT_ENTRIES - NR_RESERVED_ENTRIES;
48845 +       gnttab_free_head  = NR_RESERVED_ENTRIES;
48846 +
48847 +       printk("Grant table initialized\n");
48848 +       return 0;
48849 +}
48850 +
48851 +core_initcall(gnttab_init);
48852 +
48853 +/*
48854 + * Local variables:
48855 + *  c-file-style: "linux"
48856 + *  indent-tabs-mode: t
48857 + *  c-indent-level: 8
48858 + *  c-basic-offset: 8
48859 + *  tab-width: 8
48860 + * End:
48861 + */
48862 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/core/hypervisor_sysfs.c linux-2.6.16/drivers/xen/core/hypervisor_sysfs.c
48863 --- linux-2.6.16.orig/drivers/xen/core/hypervisor_sysfs.c       1970-01-01 01:00:00.000000000 +0100
48864 +++ linux-2.6.16/drivers/xen/core/hypervisor_sysfs.c    2006-06-26 09:51:32.000000000 +0200
48865 @@ -0,0 +1,57 @@
48866 +/*
48867 + *  copyright (c) 2006 IBM Corporation
48868 + *  Authored by: Mike D. Day <ncmike@us.ibm.com>
48869 + *
48870 + *  This program is free software; you can redistribute it and/or modify
48871 + *  it under the terms of the GNU General Public License version 2 as
48872 + *  published by the Free Software Foundation.
48873 + */
48874 +
48875 +#include <linux/config.h>
48876 +#include <linux/kernel.h>
48877 +#include <linux/module.h>
48878 +#include <linux/kobject.h>
48879 +#include <xen/hypervisor_sysfs.h>
48880 +
48881 +decl_subsys(hypervisor, NULL, NULL);
48882 +
48883 +static ssize_t hyp_sysfs_show(struct kobject *kobj,
48884 +                             struct attribute *attr,
48885 +                             char *buffer)
48886 +{
48887 +       struct hyp_sysfs_attr *hyp_attr;
48888 +       hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
48889 +       if (hyp_attr->show)
48890 +               return hyp_attr->show(hyp_attr, buffer);
48891 +       return 0;
48892 +}
48893 +
48894 +static ssize_t hyp_sysfs_store(struct kobject *kobj,
48895 +                              struct attribute *attr,
48896 +                              const char *buffer,
48897 +                              size_t len)
48898 +{
48899 +       struct hyp_sysfs_attr *hyp_attr;
48900 +       hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
48901 +       if (hyp_attr->store)
48902 +               return hyp_attr->store(hyp_attr, buffer, len);
48903 +       return 0;
48904 +}
48905 +
48906 +struct sysfs_ops hyp_sysfs_ops = {
48907 +       .show = hyp_sysfs_show,
48908 +       .store = hyp_sysfs_store,
48909 +};
48910 +
48911 +static struct kobj_type hyp_sysfs_kobj_type = {
48912 +       .sysfs_ops = &hyp_sysfs_ops,
48913 +};
48914 +
48915 +static int __init hypervisor_subsys_init(void)
48916 +{
48917 +       hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type;
48918 +       return subsystem_register(&hypervisor_subsys);
48919 +}
48920 +
48921 +device_initcall(hypervisor_subsys_init);
48922 +EXPORT_SYMBOL_GPL(hypervisor_subsys);
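
A usage sketch (not part of the patch): hanging a read-only file off /sys/hypervisor via the subsystem registered above. Field names follow struct hyp_sysfs_attr as dereferenced by hyp_sysfs_show(); the attribute name is illustrative:

#include <linux/sysfs.h>
#include <xen/hypervisor_sysfs.h>

static ssize_t my_type_show(struct hyp_sysfs_attr *attr, char *buffer)
{
	return sprintf(buffer, "xen\n");
}

static struct hyp_sysfs_attr my_type_attr = {
	.attr = { .name = "my_type", .mode = 0444 },
	.show = my_type_show,
};

static int __init my_hyp_sysfs_init(void)
{
	return sysfs_create_file(&hypervisor_subsys.kset.kobj,
				 &my_type_attr.attr);
}
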
48923 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/core/reboot.c linux-2.6.16/drivers/xen/core/reboot.c
48924 --- linux-2.6.16.orig/drivers/xen/core/reboot.c 1970-01-01 01:00:00.000000000 +0100
48925 +++ linux-2.6.16/drivers/xen/core/reboot.c      2006-06-26 09:51:32.000000000 +0200
48926 @@ -0,0 +1,381 @@
48927 +#define __KERNEL_SYSCALLS__
48928 +#include <linux/version.h>
48929 +#include <linux/kernel.h>
48930 +#include <linux/mm.h>
48931 +#include <linux/unistd.h>
48932 +#include <linux/module.h>
48933 +#include <linux/reboot.h>
48934 +#include <linux/sysrq.h>
48935 +#include <linux/stringify.h>
48936 +#include <asm/irq.h>
48937 +#include <asm/mmu_context.h>
48938 +#include <xen/evtchn.h>
48939 +#include <asm/hypervisor.h>
48940 +#include <xen/interface/dom0_ops.h>
48941 +#include <xen/xenbus.h>
48942 +#include <linux/cpu.h>
48943 +#include <linux/kthread.h>
48944 +#include <xen/gnttab.h>
48945 +#include <xen/xencons.h>
48946 +
48947 +#if defined(__i386__) || defined(__x86_64__)
48948 +/*
48949 + * Power off function, if any
48950 + */
48951 +void (*pm_power_off)(void);
48952 +EXPORT_SYMBOL(pm_power_off);
48953 +#endif
48954 +
48955 +extern void ctrl_alt_del(void);
48956 +
48957 +#define SHUTDOWN_INVALID  -1
48958 +#define SHUTDOWN_POWEROFF  0
48959 +#define SHUTDOWN_SUSPEND   2
48960 +/* Code 1 is reboot, which we trigger via ctrl_alt_del(). Code 3 is
48961 + * SHUTDOWN_CRASH: a domain can only report a crash, not be told to crash!
48962 + * HALT is the same as POWEROFF, as far as we're concerned.  The tools use
48963 + * the distinction when we return the reason code to them.
48964 + */
48965 +#define SHUTDOWN_HALT      4
48966 +
48967 +void machine_emergency_restart(void)
48968 +{
48969 +       /* We really want to get pending console data out before we die. */
48970 +       xencons_force_flush();
48971 +       HYPERVISOR_shutdown(SHUTDOWN_reboot);
48972 +}
48973 +
48974 +void machine_restart(char * __unused)
48975 +{
48976 +       machine_emergency_restart();
48977 +}
48978 +
48979 +void machine_halt(void)
48980 +{
48981 +       machine_power_off();
48982 +}
48983 +
48984 +void machine_power_off(void)
48985 +{
48986 +       /* We really want to get pending console data out before we die. */
48987 +       xencons_force_flush();
48988 +#if defined(__i386__) || defined(__x86_64__)
48989 +       if (pm_power_off)
48990 +               pm_power_off();
48991 +#endif
48992 +       HYPERVISOR_shutdown(SHUTDOWN_poweroff);
48993 +}
48994 +
48995 +int reboot_thru_bios = 0;      /* for dmi_scan.c */
48996 +EXPORT_SYMBOL(machine_restart);
48997 +EXPORT_SYMBOL(machine_halt);
48998 +EXPORT_SYMBOL(machine_power_off);
48999 +
49000 +
49001 +/******************************************************************************
49002 + * Stop/pickle callback handling.
49003 + */
49004 +
49005 +/* Ignore multiple shutdown requests. */
49006 +static int shutting_down = SHUTDOWN_INVALID;
49007 +static void __shutdown_handler(void *unused);
49008 +static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
49009 +
49010 +#ifdef CONFIG_SMP
49011 +int  smp_suspend(void);
49012 +void smp_resume(void);
49013 +#else
49014 +#define smp_suspend()  (0)
49015 +#define smp_resume()   ((void)0)
49016 +#endif
49017 +
49018 +/* Ensure we run on the idle task page tables so that we will
49019 +   switch page tables before running user space. This is needed
49020 +   on architectures with separate kernel and user page tables
49021 +   because the user page table pointer is not saved/restored. */
49022 +static void switch_idle_mm(void)
49023 +{
49024 +       struct mm_struct *mm = current->active_mm;
49025 +
49026 +       if (mm == &init_mm)
49027 +               return;
49028 +
49029 +       atomic_inc(&init_mm.mm_count);
49030 +       switch_mm(mm, &init_mm, current);
49031 +       current->active_mm = &init_mm;
49032 +       mmdrop(mm);
49033 +}
49034 +
49035 +static int __do_suspend(void *ignore)
49036 +{
49037 +       int i, j, k, fpp, err;
49038 +
49039 +       extern unsigned long max_pfn;
49040 +       extern unsigned long *pfn_to_mfn_frame_list_list;
49041 +       extern unsigned long *pfn_to_mfn_frame_list[];
49042 +
49043 +       extern void time_resume(void);
49044 +
49045 +       BUG_ON(smp_processor_id() != 0);
49046 +       BUG_ON(in_interrupt());
49047 +
49048 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
49049 +               printk(KERN_WARNING "Cannot suspend in "
49050 +                      "auto_translated_physmap mode.\n");
49051 +               return -EOPNOTSUPP;
49052 +       }
49053 +
49054 +       err = smp_suspend();
49055 +       if (err)
49056 +               return err;
49057 +
49058 +       xenbus_suspend();
49059 +
49060 +       preempt_disable();
49061 +
49062 +#ifdef __i386__
49063 +       kmem_cache_shrink(pgd_cache);
49064 +#endif
49065 +       mm_pin_all();
49066 +
49067 +       __cli();
49068 +       preempt_enable();
49069 +
49070 +       gnttab_suspend();
49071 +
49072 +       HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
49073 +       clear_fixmap(FIX_SHARED_INFO);
49074 +
49075 +       xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
49076 +       xen_start_info->console_mfn = mfn_to_pfn(xen_start_info->console_mfn);
49077 +
49078 +       /*
49079 +        * We'll stop somewhere inside this hypercall. When it returns,
49080 +        * we'll start resuming after the restore.
49081 +        */
49082 +       HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
49083 +
49084 +       shutting_down = SHUTDOWN_INVALID;
49085 +
49086 +       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
49087 +
49088 +       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
49089 +
49090 +       memset(empty_zero_page, 0, PAGE_SIZE);
49091 +
49092 +       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
49093 +               virt_to_mfn(pfn_to_mfn_frame_list_list);
49094 +
49095 +       fpp = PAGE_SIZE/sizeof(unsigned long);
49096 +       for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
49097 +               if ((j % fpp) == 0) {
49098 +                       k++;
49099 +                       pfn_to_mfn_frame_list_list[k] =
49100 +                               virt_to_mfn(pfn_to_mfn_frame_list[k]);
49101 +                       j = 0;
49102 +               }
49103 +               pfn_to_mfn_frame_list[k][j] =
49104 +                       virt_to_mfn(&phys_to_machine_mapping[i]);
49105 +       }
49106 +       HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
49107 +
49108 +       gnttab_resume();
49109 +
49110 +       irq_resume();
49111 +
49112 +       time_resume();
49113 +
49114 +       switch_idle_mm();
49115 +
49116 +       __sti();
49117 +
49118 +       xencons_resume();
49119 +
49120 +       xenbus_resume();
49121 +
49122 +       smp_resume();
49123 +
49124 +       return err;
49125 +}
49126 +
49127 +static int shutdown_process(void *__unused)
49128 +{
49129 +       static char *envp[] = { "HOME=/", "TERM=linux",
49130 +                               "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
49131 +       static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
49132 +
49133 +       extern asmlinkage long sys_reboot(int magic1, int magic2,
49134 +                                         unsigned int cmd, void *arg);
49135 +
49136 +       if ((shutting_down == SHUTDOWN_POWEROFF) ||
49137 +           (shutting_down == SHUTDOWN_HALT)) {
49138 +               if (execve("/sbin/poweroff", poweroff_argv, envp) < 0) {
49139 +                       sys_reboot(LINUX_REBOOT_MAGIC1,
49140 +                                  LINUX_REBOOT_MAGIC2,
49141 +                                  LINUX_REBOOT_CMD_POWER_OFF,
49142 +                                  NULL);
49143 +               }
49144 +       }
49145 +
49146 +       shutting_down = SHUTDOWN_INVALID; /* could try again */
49147 +
49148 +       return 0;
49149 +}
49150 +
49151 +static int kthread_create_on_cpu(int (*f)(void *arg),
49152 +                                void *arg,
49153 +                                const char *name,
49154 +                                int cpu)
49155 +{
49156 +       struct task_struct *p;
49157 +       p = kthread_create(f, arg, name);
49158 +       if (IS_ERR(p))
49159 +               return PTR_ERR(p);
49160 +       kthread_bind(p, cpu);
49161 +       wake_up_process(p);
49162 +       return 0;
49163 +}
49164 +
49165 +static void __shutdown_handler(void *unused)
49166 +{
49167 +       int err;
49168 +
49169 +       if (shutting_down != SHUTDOWN_SUSPEND)
49170 +               err = kernel_thread(shutdown_process, NULL,
49171 +                                   CLONE_FS | CLONE_FILES);
49172 +       else
49173 +               err = kthread_create_on_cpu(__do_suspend, NULL, "suspend", 0);
49174 +
49175 +       if (err < 0) {
49176 +               printk(KERN_WARNING "Error creating shutdown process (%d): "
49177 +                      "retrying...\n", -err);
49178 +               schedule_delayed_work(&shutdown_work, HZ/2);
49179 +       }
49180 +}
49181 +
49182 +static void shutdown_handler(struct xenbus_watch *watch,
49183 +                            const char **vec, unsigned int len)
49184 +{
49185 +       char *str;
49186 +       xenbus_transaction_t xbt;
49187 +       int err;
49188 +
49189 +       if (shutting_down != SHUTDOWN_INVALID)
49190 +               return;
49191 +
49192 + again:
49193 +       err = xenbus_transaction_start(&xbt);
49194 +       if (err)
49195 +               return;
49196 +       str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
49197 +       /* Ignore read errors and empty reads. */
49198 +       if (XENBUS_IS_ERR_READ(str)) {
49199 +               xenbus_transaction_end(xbt, 1);
49200 +               return;
49201 +       }
49202 +
49203 +       xenbus_write(xbt, "control", "shutdown", "");
49204 +
49205 +       err = xenbus_transaction_end(xbt, 0);
49206 +       if (err == -EAGAIN) {
49207 +               kfree(str);
49208 +               goto again;
49209 +       }
49210 +
49211 +       if (strcmp(str, "poweroff") == 0)
49212 +               shutting_down = SHUTDOWN_POWEROFF;
49213 +       else if (strcmp(str, "reboot") == 0)
49214 +               ctrl_alt_del();
49215 +       else if (strcmp(str, "suspend") == 0)
49216 +               shutting_down = SHUTDOWN_SUSPEND;
49217 +       else if (strcmp(str, "halt") == 0)
49218 +               shutting_down = SHUTDOWN_HALT;
49219 +       else {
49220 +               printk(KERN_WARNING "Ignoring shutdown request: %s\n", str);
49221 +               shutting_down = SHUTDOWN_INVALID;
49222 +       }
49223 +
49224 +       if (shutting_down != SHUTDOWN_INVALID)
49225 +               schedule_work(&shutdown_work);
49226 +
49227 +       kfree(str);
49228 +}
49229 +
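+/*
+ * Watch callback for "control/sysrq": read a single key character, clear
+ * the node in the same transaction, then feed the key to handle_sysrq()
+ * when CONFIG_MAGIC_SYSRQ is enabled.
+ */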
49230 +static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
49231 +                         unsigned int len)
49232 +{
49233 +       char sysrq_key = '\0';
49234 +       xenbus_transaction_t xbt;
49235 +       int err;
49236 +
49237 + again:
49238 +       err = xenbus_transaction_start(&xbt);
49239 +       if (err)
49240 +               return;
49241 +       if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
49242 +               printk(KERN_ERR "Unable to read sysrq code in "
49243 +                      "control/sysrq\n");
49244 +               xenbus_transaction_end(xbt, 1);
49245 +               return;
49246 +       }
49247 +
49248 +       if (sysrq_key != '\0')
49249 +               xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
49250 +
49251 +       err = xenbus_transaction_end(xbt, 0);
49252 +       if (err == -EAGAIN)
49253 +               goto again;
49254 +
49255 +#ifdef CONFIG_MAGIC_SYSRQ
49256 +       if (sysrq_key != '\0')
49257 +               handle_sysrq(sysrq_key, NULL, NULL);
49258 +#endif
49259 +}
49260 +
49261 +static struct xenbus_watch shutdown_watch = {
49262 +       .node = "control/shutdown",
49263 +       .callback = shutdown_handler
49264 +};
49265 +
49266 +static struct xenbus_watch sysrq_watch = {
49267 +       .node = "control/sysrq",
49268 +       .callback = sysrq_handler
49269 +};
49270 +
49271 +static int setup_shutdown_watcher(struct notifier_block *notifier,
49272 +                                  unsigned long event,
49273 +                                  void *data)
49274 +{
49275 +       int err;
49276 +
49277 +       err = register_xenbus_watch(&shutdown_watch);
49278 +       if (err)
49279 +               printk(KERN_ERR "Failed to set shutdown watcher\n");
49280 +
49281 +       err = register_xenbus_watch(&sysrq_watch);
49282 +       if (err)
49283 +               printk(KERN_ERR "Failed to set sysrq watcher\n");
49284 +
49285 +       return NOTIFY_DONE;
49286 +}
49287 +
49288 +static int __init setup_shutdown_event(void)
49289 +{
49290 +       static struct notifier_block xenstore_notifier = {
49291 +               .notifier_call = setup_shutdown_watcher
49292 +       };
49293 +       register_xenstore_notifier(&xenstore_notifier);
49294 +       return 0;
49295 +}
49296 +
49297 +subsys_initcall(setup_shutdown_event);
49298 +
49299 +/*
49300 + * Local variables:
49301 + *  c-file-style: "linux"
49302 + *  indent-tabs-mode: t
49303 + *  c-indent-level: 8
49304 + *  c-basic-offset: 8
49305 + *  tab-width: 8
49306 + * End:
49307 + */
49308 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/core/skbuff.c linux-2.6.16/drivers/xen/core/skbuff.c
49309 --- linux-2.6.16.orig/drivers/xen/core/skbuff.c 1970-01-01 01:00:00.000000000 +0100
49310 +++ linux-2.6.16/drivers/xen/core/skbuff.c      2006-06-26 09:51:32.000000000 +0200
49311 @@ -0,0 +1,144 @@
49312 +
49313 +#include <linux/config.h>
49314 +#include <linux/module.h>
49315 +#include <linux/version.h>
49316 +#include <linux/kernel.h>
49317 +#include <linux/sched.h>
49318 +#include <linux/slab.h>
49319 +#include <linux/netdevice.h>
49320 +#include <linux/inetdevice.h>
49321 +#include <linux/etherdevice.h>
49322 +#include <linux/skbuff.h>
49323 +#include <linux/init.h>
49324 +#include <asm/io.h>
49325 +#include <asm/page.h>
49326 +#include <asm/hypervisor.h>
49327 +
49328 +/* Referenced in netback.c. */
49329 +/*static*/ kmem_cache_t *skbuff_cachep;
49330 +EXPORT_SYMBOL(skbuff_cachep);
49331 +
49332 +#define MAX_SKBUFF_ORDER 4
49333 +static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1];
49334 +
49335 +static struct {
49336 +       int size;
49337 +       kmem_cache_t *cachep;
49338 +} skbuff_small[] = { { 512, NULL }, { 2048, NULL } };
49339 +
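+/*
+ * Aligned requests (including struct skb_shared_info) that fit the largest
+ * "small" cache are served from a fixed-size cache; larger requests come
+ * from a per-order cache whose objects are made machine-contiguous by
+ * skbuff_ctor() below.
+ */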
49340 +struct sk_buff *__alloc_skb(unsigned int length, gfp_t gfp_mask,
49341 +                           int fclone)
49342 +{
49343 +       int order, i;
49344 +       kmem_cache_t *cachep;
49345 +
49346 +       length = SKB_DATA_ALIGN(length) + sizeof(struct skb_shared_info);
49347 +
49348 +       if (length <= skbuff_small[ARRAY_SIZE(skbuff_small)-1].size) {
49349 +               for (i = 0; skbuff_small[i].size < length; i++)
49350 +                       continue;
49351 +               cachep = skbuff_small[i].cachep;
49352 +       } else {
49353 +               order = get_order(length);
49354 +               if (order > MAX_SKBUFF_ORDER) {
49355 +                       printk(KERN_ALERT "Attempt to allocate order %d "
49356 +                              "skbuff. Increase MAX_SKBUFF_ORDER.\n", order);
49357 +                       return NULL;
49358 +               }
49359 +               cachep = skbuff_order_cachep[order];
49360 +       }
49361 +
49362 +       length -= sizeof(struct skb_shared_info);
49363 +
49364 +       return alloc_skb_from_cache(cachep, length, gfp_mask, fclone);
49365 +}
49366 +
49367 +struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask)
49368 +{
49369 +       struct sk_buff *skb;
49370 +       int order;
49371 +
49372 +       length = SKB_DATA_ALIGN(length + 16);
49373 +       order = get_order(length + sizeof(struct skb_shared_info));
49374 +       if (order > MAX_SKBUFF_ORDER) {
49375 +               printk(KERN_ALERT "Attempt to allocate order %d skbuff. "
49376 +                      "Increase MAX_SKBUFF_ORDER.\n", order);
49377 +               return NULL;
49378 +       }
49379 +
49380 +       skb = alloc_skb_from_cache(
49381 +               skbuff_order_cachep[order], length, gfp_mask, 0);
49382 +       if (skb != NULL)
49383 +               skb_reserve(skb, 16);
49384 +
49385 +       return skb;
49386 +}
49387 +
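+/*
+ * Constructor/destructor pair for the per-order caches: multi-page objects
+ * are exchanged for machine-contiguous memory where possible (otherwise
+ * the IOMMU path is the fallback) and scrubbed before first use; the
+ * destructor hands the contiguous region back.
+ */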
49388 +static void skbuff_ctor(void *buf, kmem_cache_t *cachep, unsigned long unused)
49389 +{
49390 +       int order = 0;
49391 +
49392 +       while (skbuff_order_cachep[order] != cachep)
49393 +               order++;
49394 +
49395 +       /* Do our best to allocate contiguous memory but fall back to IOMMU. */
49396 +       if (order != 0)
49397 +               (void)xen_create_contiguous_region(
49398 +                       (unsigned long)buf, order, 0);
49399 +
49400 +       scrub_pages(buf, 1 << order);
49401 +}
49402 +
49403 +static void skbuff_dtor(void *buf, kmem_cache_t *cachep, unsigned long unused)
49404 +{
49405 +       int order = 0;
49406 +
49407 +       while (skbuff_order_cachep[order] != cachep)
49408 +               order++;
49409 +
49410 +       if (order != 0)
49411 +               xen_destroy_contiguous_region((unsigned long)buf, order);
49412 +}
49413 +
49414 +static int __init skbuff_init(void)
49415 +{
49416 +       static char name[MAX_SKBUFF_ORDER + 1][20];
49417 +       static char small_name[ARRAY_SIZE(skbuff_small)][20];
49418 +       unsigned long size;
49419 +       int i, order;
49420 +
49421 +       for (i = 0; i < ARRAY_SIZE(skbuff_small); i++) {
49422 +               size = skbuff_small[i].size;
49423 +               sprintf(small_name[i], "xen-skb-%lu", size);
49424 +               /*
49425 +                * No ctor/dtor: objects do not span page boundaries, and they
49426 +                * are only used on transmit path so no need for scrubbing.
49427 +                */
49428 +               skbuff_small[i].cachep = kmem_cache_create(
49429 +                       small_name[i], size, size, 0, NULL, NULL);
49430 +       }
49431 +
49432 +       for (order = 0; order <= MAX_SKBUFF_ORDER; order++) {
49433 +               size = PAGE_SIZE << order;
49434 +               sprintf(name[order], "xen-skb-%lu", size);
49435 +               skbuff_order_cachep[order] = kmem_cache_create(
49436 +                       name[order], size, size, 0, skbuff_ctor, skbuff_dtor);
49437 +       }
49438 +
49439 +       skbuff_cachep = skbuff_order_cachep[0];
49440 +
49441 +       return 0;
49442 +}
49443 +core_initcall(skbuff_init);
49444 +
49445 +EXPORT_SYMBOL(__dev_alloc_skb);
49446 +
49447 +/*
49448 + * Local variables:
49449 + *  c-file-style: "linux"
49450 + *  indent-tabs-mode: t
49451 + *  c-indent-level: 8
49452 + *  c-basic-offset: 8
49453 + *  tab-width: 8
49454 + * End:
49455 + */
49456 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/core/smpboot.c linux-2.6.16/drivers/xen/core/smpboot.c
49457 --- linux-2.6.16.orig/drivers/xen/core/smpboot.c        1970-01-01 01:00:00.000000000 +0100
49458 +++ linux-2.6.16/drivers/xen/core/smpboot.c     2006-06-26 09:51:32.000000000 +0200
49459 @@ -0,0 +1,581 @@
49460 +/*
49461 + *     Xen SMP booting functions
49462 + *
49463 + *     See arch/i386/kernel/smpboot.c for copyright and credits for derived
49464 + *     portions of this file.
49465 + */
49466 +
49467 +#include <linux/module.h>
49468 +#include <linux/config.h>
49469 +#include <linux/init.h>
49470 +#include <linux/kernel.h>
49471 +#include <linux/mm.h>
49472 +#include <linux/sched.h>
49473 +#include <linux/kernel_stat.h>
49474 +#include <linux/smp_lock.h>
49475 +#include <linux/irq.h>
49476 +#include <linux/bootmem.h>
49477 +#include <linux/notifier.h>
49478 +#include <linux/cpu.h>
49479 +#include <linux/percpu.h>
49480 +#include <asm/desc.h>
49481 +#include <asm/arch_hooks.h>
49482 +#include <asm/pgalloc.h>
49483 +#include <xen/evtchn.h>
49484 +#include <xen/interface/vcpu.h>
49485 +#include <xen/xenbus.h>
49486 +
49487 +#ifdef CONFIG_SMP_ALTERNATIVES
49488 +#include <asm/smp_alt.h>
49489 +#endif
49490 +
49491 +extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
49492 +extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
49493 +
49494 +extern void local_setup_timer(unsigned int cpu);
49495 +extern void local_teardown_timer(unsigned int cpu);
49496 +
49497 +extern void hypervisor_callback(void);
49498 +extern void failsafe_callback(void);
49499 +extern void system_call(void);
49500 +extern void smp_trap_init(trap_info_t *);
49501 +
49502 +/* Number of siblings per CPU package */
49503 +int smp_num_siblings = 1;
49504 +int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
49505 +EXPORT_SYMBOL(phys_proc_id);
49506 +int cpu_core_id[NR_CPUS]; /* Core ID of each logical CPU */
49507 +EXPORT_SYMBOL(cpu_core_id);
49508 +
49509 +cpumask_t cpu_online_map;
49510 +EXPORT_SYMBOL(cpu_online_map);
49511 +cpumask_t cpu_possible_map;
49512 +EXPORT_SYMBOL(cpu_possible_map);
49513 +
49514 +struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
49515 +EXPORT_SYMBOL(cpu_data);
49516 +
49517 +#ifdef CONFIG_HOTPLUG_CPU
49518 +DEFINE_PER_CPU(int, cpu_state) = { 0 };
49519 +#endif
49520 +
49521 +static DEFINE_PER_CPU(int, resched_irq);
49522 +static DEFINE_PER_CPU(int, callfunc_irq);
49523 +static char resched_name[NR_CPUS][15];
49524 +static char callfunc_name[NR_CPUS][15];
49525 +
49526 +u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
49527 +
49528 +void *xquad_portio;
49529 +
49530 +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
49531 +cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
49532 +EXPORT_SYMBOL(cpu_core_map);
49533 +
49534 +#if defined(__i386__)
49535 +u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff };
49536 +EXPORT_SYMBOL(x86_cpu_to_apicid);
49537 +#elif !defined(CONFIG_X86_IO_APIC)
49538 +unsigned int maxcpus = NR_CPUS;
49539 +#endif
49540 +
49541 +/*
49542 + * Set of CPUs that remote admin software will allow us to bring online.
49543 + * Notified to us via xenbus.
49544 + */
49545 +static cpumask_t xenbus_allowed_cpumask;
49546 +
49547 +/* Set of CPUs that local admin will allow us to bring online. */
49548 +static cpumask_t local_allowed_cpumask = CPU_MASK_ALL;
49549 +
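+/*
+ * Probe Xen for the number of configured VCPUs: VCPUOP_is_up returns
+ * -ENOENT once the VCPU id exceeds the domain's allocation, so every id
+ * up to the first such failure is marked possible.
+ */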
49550 +void __init prefill_possible_map(void)
49551 +{
49552 +       int i, rc;
49553 +
49554 +       if (!cpus_empty(cpu_possible_map))
49555 +               return;
49556 +
49557 +       for (i = 0; i < NR_CPUS; i++) {
49558 +               rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
49559 +               if (rc == -ENOENT)
49560 +                       break;
49561 +               cpu_set(i, cpu_possible_map);
49562 +       }
49563 +}
49564 +
49565 +void __init smp_alloc_memory(void)
49566 +{
49567 +}
49568 +
49569 +static void xen_smp_intr_init(unsigned int cpu)
49570 +{
49571 +       sprintf(resched_name[cpu], "resched%d", cpu);
49572 +       per_cpu(resched_irq, cpu) =
49573 +               bind_ipi_to_irqhandler(
49574 +                       RESCHEDULE_VECTOR,
49575 +                       cpu,
49576 +                       smp_reschedule_interrupt,
49577 +                       SA_INTERRUPT,
49578 +                       resched_name[cpu],
49579 +                       NULL);
49580 +       BUG_ON(per_cpu(resched_irq, cpu) < 0);
49581 +
49582 +       sprintf(callfunc_name[cpu], "callfunc%d", cpu);
49583 +       per_cpu(callfunc_irq, cpu) =
49584 +               bind_ipi_to_irqhandler(
49585 +                       CALL_FUNCTION_VECTOR,
49586 +                       cpu,
49587 +                       smp_call_function_interrupt,
49588 +                       SA_INTERRUPT,
49589 +                       callfunc_name[cpu],
49590 +                       NULL);
49591 +       BUG_ON(per_cpu(callfunc_irq, cpu) < 0);
49592 +
49593 +       if (cpu != 0)
49594 +               local_setup_timer(cpu);
49595 +}
49596 +
49597 +#ifdef CONFIG_HOTPLUG_CPU
49598 +static void xen_smp_intr_exit(unsigned int cpu)
49599 +{
49600 +       if (cpu != 0)
49601 +               local_teardown_timer(cpu);
49602 +
49603 +       unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
49604 +       unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
49605 +}
49606 +#endif
49607 +
49608 +static void cpu_bringup(void)
49609 +{
49610 +       cpu_init();
49611 +       touch_softlockup_watchdog();
49612 +       preempt_disable();
49613 +       local_irq_enable();
49614 +       cpu_idle();
49615 +}
49616 +
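+/*
+ * Build the initial register and descriptor-table state for a secondary
+ * VCPU (entry point cpu_bringup, per-CPU GDT, kernel stack and page
+ * tables) and register it with Xen via VCPUOP_initialise.
+ */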
49617 +static void vcpu_prepare(int vcpu)
49618 +{
49619 +       vcpu_guest_context_t ctxt;
49620 +       struct task_struct *idle = idle_task(vcpu);
49621 +#ifdef __x86_64__
49622 +       struct desc_ptr *gdt_descr = &cpu_gdt_descr[vcpu];
49623 +#else
49624 +       struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, vcpu);
49625 +#endif
49626 +
49627 +       if (vcpu == 0)
49628 +               return;
49629 +
49630 +       memset(&ctxt, 0, sizeof(ctxt));
49631 +
49632 +       ctxt.flags = VGCF_IN_KERNEL;
49633 +       ctxt.user_regs.ds = __USER_DS;
49634 +       ctxt.user_regs.es = __USER_DS;
49635 +       ctxt.user_regs.fs = 0;
49636 +       ctxt.user_regs.gs = 0;
49637 +       ctxt.user_regs.ss = __KERNEL_DS;
49638 +       ctxt.user_regs.eip = (unsigned long)cpu_bringup;
49639 +       ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
49640 +
49641 +       memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
49642 +
49643 +       smp_trap_init(ctxt.trap_ctxt);
49644 +
49645 +       ctxt.ldt_ents = 0;
49646 +
49647 +       ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
49648 +       ctxt.gdt_ents      = gdt_descr->size / 8;
49649 +
49650 +#ifdef __i386__
49651 +       ctxt.user_regs.cs = __KERNEL_CS;
49652 +       ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
49653 +
49654 +       ctxt.kernel_ss = __KERNEL_DS;
49655 +       ctxt.kernel_sp = idle->thread.esp0;
49656 +
49657 +       ctxt.event_callback_cs     = __KERNEL_CS;
49658 +       ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
49659 +       ctxt.failsafe_callback_cs  = __KERNEL_CS;
49660 +       ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
49661 +
49662 +       ctxt.ctrlreg[3] = virt_to_mfn(swapper_pg_dir) << PAGE_SHIFT;
49663 +#else /* __x86_64__ */
49664 +       ctxt.user_regs.cs = __KERNEL_CS;
49665 +       ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
49666 +
49667 +       ctxt.kernel_ss = __KERNEL_DS;
49668 +       ctxt.kernel_sp = idle->thread.rsp0;
49669 +
49670 +       ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
49671 +       ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
49672 +       ctxt.syscall_callback_eip  = (unsigned long)system_call;
49673 +
49674 +       ctxt.ctrlreg[3] = virt_to_mfn(init_level4_pgt) << PAGE_SHIFT;
49675 +
49676 +       ctxt.gs_base_kernel = (unsigned long)(cpu_pda(vcpu));
49677 +#endif
49678 +
49679 +       BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, vcpu, &ctxt));
49680 +}
49681 +
49682 +void __init smp_prepare_cpus(unsigned int max_cpus)
49683 +{
49684 +       int cpu;
49685 +       struct task_struct *idle;
49686 +#ifdef __x86_64__
49687 +       struct desc_ptr *gdt_descr;
49688 +#else
49689 +       struct Xgt_desc_struct *gdt_descr;
49690 +#endif
49691 +
49692 +       cpu_data[0] = boot_cpu_data;
49693 +
49694 +       cpu_2_logical_apicid[0] = 0;
49695 +       x86_cpu_to_apicid[0] = 0;
49696 +
49697 +       current_thread_info()->cpu = 0;
49698 +       cpu_sibling_map[0] = cpumask_of_cpu(0);
49699 +       cpu_core_map[0]    = cpumask_of_cpu(0);
49700 +
49701 +       xen_smp_intr_init(0);
49702 +
49703 +       for_each_cpu_mask (cpu, cpu_possible_map) {
49704 +               if (cpu == 0)
49705 +                       continue;
49706 +
49707 +#ifdef __x86_64__
49708 +               gdt_descr = &cpu_gdt_descr[cpu];
49709 +#else
49710 +               gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
49711 +#endif
49712 +               gdt_descr->address = get_zeroed_page(GFP_KERNEL);
49713 +               if (unlikely(!gdt_descr->address)) {
49714 +                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
49715 +                       continue;
49716 +               }
49717 +               gdt_descr->size = GDT_SIZE;
49718 +               memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
49719 +               make_page_readonly(
49720 +                       (void *)gdt_descr->address,
49721 +                       XENFEAT_writable_descriptor_tables);
49722 +
49723 +               cpu_data[cpu] = boot_cpu_data;
49724 +               cpu_2_logical_apicid[cpu] = cpu;
49725 +               x86_cpu_to_apicid[cpu] = cpu;
49726 +
49727 +               idle = fork_idle(cpu);
49728 +               if (IS_ERR(idle))
49729 +                       panic("failed fork for CPU %d", cpu);
49730 +
49731 +#ifdef __x86_64__
49732 +               cpu_pda(cpu)->pcurrent = idle;
49733 +               cpu_pda(cpu)->cpunumber = cpu;
49734 +               clear_ti_thread_flag(idle->thread_info, TIF_FORK);
49735 +#endif
49736 +
49737 +               irq_ctx_init(cpu);
49738 +
49739 +#ifdef CONFIG_HOTPLUG_CPU
49740 +               if (xen_start_info->flags & SIF_INITDOMAIN)
49741 +                       cpu_set(cpu, cpu_present_map);
49742 +#else
49743 +               cpu_set(cpu, cpu_present_map);
49744 +#endif
49745 +
49746 +               vcpu_prepare(cpu);
49747 +       }
49748 +
49749 +       xenbus_allowed_cpumask = cpu_present_map;
49750 +
49751 +       /* Currently, Xen gives no dynamic NUMA/HT info. */
49752 +       for (cpu = 1; cpu < NR_CPUS; cpu++) {
49753 +               cpu_sibling_map[cpu] = cpumask_of_cpu(cpu);
49754 +               cpu_core_map[cpu]    = cpumask_of_cpu(cpu);
49755 +       }
49756 +
49757 +#ifdef CONFIG_X86_IO_APIC
49758 +       /*
49759 +        * Here we can be sure that there is an IO-APIC in the system. Let's
49760 +        * go and set it up:
49761 +        */
49762 +       if (!skip_ioapic_setup && nr_ioapics)
49763 +               setup_IO_APIC();
49764 +#endif
49765 +}
49766 +
49767 +void __devinit smp_prepare_boot_cpu(void)
49768 +{
49769 +       prefill_possible_map();
49770 +       cpu_present_map  = cpumask_of_cpu(0);
49771 +       cpu_online_map   = cpumask_of_cpu(0);
49772 +}
49773 +
49774 +static int local_cpu_hotplug_request(void)
49775 +{
49776 +       /*
49777 +        * We assume a CPU hotplug request comes from local admin if it is made
49778 +        * via a userspace process (i.e., one with a real mm_struct).
49779 +        */
49780 +       return (current->mm != NULL);
49781 +}
49782 +
49783 +#ifdef CONFIG_HOTPLUG_CPU
49784 +
49785 +/*
49786 + * Initialize cpu_present_map late to skip SMP boot code in init/main.c.
49787 + * But do it early enough to catch critical for_each_present_cpu() loops
49788 + * in i386-specific code.
49789 + */
49790 +static int __init initialize_cpu_present_map(void)
49791 +{
49792 +       cpu_present_map = cpu_possible_map;
49793 +       return 0;
49794 +}
49795 +core_initcall(initialize_cpu_present_map);
49796 +
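+/*
+ * React to a "cpu/%d/availability" change in xenstore: "online" adds the
+ * CPU to the xenbus-allowed mask and brings it up; "offline" removes it
+ * and takes it down.
+ */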
49797 +static void vcpu_hotplug(unsigned int cpu)
49798 +{
49799 +       int err;
49800 +       char dir[32], state[32];
49801 +
49802 +       if ((cpu >= NR_CPUS) || !cpu_possible(cpu))
49803 +               return;
49804 +
49805 +       sprintf(dir, "cpu/%d", cpu);
49806 +       err = xenbus_scanf(XBT_NULL, dir, "availability", "%s", state);
49807 +       if (err != 1) {
49808 +               printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
49809 +               return;
49810 +       }
49811 +
49812 +       if (strcmp(state, "online") == 0) {
49813 +               cpu_set(cpu, xenbus_allowed_cpumask);
49814 +               (void)cpu_up(cpu);
49815 +       } else if (strcmp(state, "offline") == 0) {
49816 +               cpu_clear(cpu, xenbus_allowed_cpumask);
49817 +               (void)cpu_down(cpu);
49818 +       } else {
49819 +               printk(KERN_ERR "XENBUS: unknown state (%s) on CPU%d\n",
49820 +                      state, cpu);
49821 +       }
49822 +}
49823 +
49824 +static void handle_vcpu_hotplug_event(
49825 +       struct xenbus_watch *watch, const char **vec, unsigned int len)
49826 +{
49827 +       int cpu;
49828 +       char *cpustr;
49829 +       const char *node = vec[XS_WATCH_PATH];
49830 +
49831 +       if ((cpustr = strstr(node, "cpu/")) != NULL) {
49832 +               sscanf(cpustr, "cpu/%d", &cpu);
49833 +               vcpu_hotplug(cpu);
49834 +       }
49835 +}
49836 +
49837 +static int smpboot_cpu_notify(struct notifier_block *notifier,
49838 +                             unsigned long action, void *hcpu)
49839 +{
49840 +       int cpu = (long)hcpu;
49841 +
49842 +       /*
49843 +        * We do this in a callback notifier rather than __cpu_disable()
49844 +        * because local_cpu_hotplug_request() does not work in the latter
49845 +        * as it's always executed from within a stopmachine kthread.
49846 +        */
49847 +       if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request())
49848 +               cpu_clear(cpu, local_allowed_cpumask);
49849 +
49850 +       return NOTIFY_OK;
49851 +}
49852 +
49853 +static int setup_cpu_watcher(struct notifier_block *notifier,
49854 +                             unsigned long event, void *data)
49855 +{
49856 +       int i;
49857 +
49858 +       static struct xenbus_watch cpu_watch = {
49859 +               .node = "cpu",
49860 +               .callback = handle_vcpu_hotplug_event,
49861 +               .flags = XBWF_new_thread };
49862 +       (void)register_xenbus_watch(&cpu_watch);
49863 +
49864 +       if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
49865 +               for_each_cpu(i)
49866 +                       vcpu_hotplug(i);
49867 +               printk(KERN_INFO "Brought up %ld CPUs\n",
49868 +                      (long)num_online_cpus());
49869 +       }
49870 +
49871 +       return NOTIFY_DONE;
49872 +}
49873 +
49874 +static int __init setup_vcpu_hotplug_event(void)
49875 +{
49876 +       static struct notifier_block hotplug_cpu = {
49877 +               .notifier_call = smpboot_cpu_notify };
49878 +       static struct notifier_block xsn_cpu = {
49879 +               .notifier_call = setup_cpu_watcher };
49880 +
49881 +       register_cpu_notifier(&hotplug_cpu);
49882 +       register_xenstore_notifier(&xsn_cpu);
49883 +
49884 +       return 0;
49885 +}
49886 +
49887 +arch_initcall(setup_vcpu_hotplug_event);
49888 +
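+/*
+ * Suspend path: take every CPU except CPU0 offline. The hotplug lock is
+ * dropped around cpu_down(), which acquires it itself; if a CPU refuses
+ * to go down, the already-offlined CPUs are brought back via
+ * vcpu_hotplug() before the error is returned.
+ */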
49889 +int smp_suspend(void)
49890 +{
49891 +       int i, err;
49892 +
49893 +       lock_cpu_hotplug();
49894 +
49895 +       /*
49896 +        * Take all other CPUs offline. We hold the hotplug mutex to
49897 +        * avoid other processes bringing up CPUs under our feet.
49898 +        */
49899 +       while (num_online_cpus() > 1) {
49900 +               unlock_cpu_hotplug();
49901 +               for_each_online_cpu(i) {
49902 +                       if (i == 0)
49903 +                               continue;
49904 +                       err = cpu_down(i);
49905 +                       if (err) {
49906 +                               printk(KERN_CRIT "Failed to take all CPUs "
49907 +                                      "down: %d.\n", err);
49908 +                               for_each_cpu(i)
49909 +                                       vcpu_hotplug(i);
49910 +                               return err;
49911 +                       }
49912 +               }
49913 +               lock_cpu_hotplug();
49914 +       }
49915 +
49916 +       return 0;
49917 +}
49918 +
49919 +void smp_resume(void)
49920 +{
49921 +       int i;
49922 +
49923 +       for_each_cpu(i)
49924 +               vcpu_prepare(i);
49925 +
49926 +       unlock_cpu_hotplug();
49927 +
49928 +       for_each_cpu(i)
49929 +               vcpu_hotplug(i);
49930 +}
49931 +
49932 +int __cpu_disable(void)
49933 +{
49934 +       cpumask_t map = cpu_online_map;
49935 +       int cpu = smp_processor_id();
49936 +
49937 +       if (cpu == 0)
49938 +               return -EBUSY;
49939 +
49940 +       cpu_clear(cpu, map);
49941 +       fixup_irqs(map);
49942 +       cpu_clear(cpu, cpu_online_map);
49943 +
49944 +       return 0;
49945 +}
49946 +
49947 +void __cpu_die(unsigned int cpu)
49948 +{
49949 +       while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
49950 +               current->state = TASK_UNINTERRUPTIBLE;
49951 +               schedule_timeout(HZ/10);
49952 +       }
49953 +
49954 +       xen_smp_intr_exit(cpu);
49955 +
49956 +#ifdef CONFIG_SMP_ALTERNATIVES
49957 +       if (num_online_cpus() == 1)
49958 +               unprepare_for_smp();
49959 +#endif
49960 +}
49961 +
49962 +#else /* !CONFIG_HOTPLUG_CPU */
49963 +
49964 +int smp_suspend(void)
49965 +{
49966 +       if (num_online_cpus() > 1) {
49967 +               printk(KERN_WARNING "Can't suspend SMP guests "
49968 +                      "without CONFIG_HOTPLUG_CPU\n");
49969 +               return -EOPNOTSUPP;
49970 +       }
49971 +       return 0;
49972 +}
49973 +
49974 +void smp_resume(void)
49975 +{
49976 +}
49977 +
49978 +int __cpu_disable(void)
49979 +{
49980 +       return -ENOSYS;
49981 +}
49982 +
49983 +void __cpu_die(unsigned int cpu)
49984 +{
49985 +       BUG();
49986 +}
49987 +
49988 +#endif /* CONFIG_HOTPLUG_CPU */
49989 +
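+/*
+ * A CPU may be brought online only if both the local admin mask and the
+ * xenbus-provided mask allow it. A local (userspace-initiated) request
+ * re-arms the local mask but can still be vetoed by the remote admin.
+ */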
49990 +int __devinit __cpu_up(unsigned int cpu)
49991 +{
49992 +       int rc;
49993 +
49994 +       if (local_cpu_hotplug_request()) {
49995 +               cpu_set(cpu, local_allowed_cpumask);
49996 +               if (!cpu_isset(cpu, xenbus_allowed_cpumask)) {
49997 +                       printk(KERN_WARNING "%s: attempt to bring up CPU %u "
49998 +                              "disallowed by remote admin.\n", __FUNCTION__, cpu);
49999 +                       return -EBUSY;
50000 +               }
50001 +       } else if (!cpu_isset(cpu, local_allowed_cpumask) ||
50002 +                  !cpu_isset(cpu, xenbus_allowed_cpumask)) {
50003 +               return -EBUSY;
50004 +       }
50005 +
50006 +#ifdef CONFIG_SMP_ALTERNATIVES
50007 +       if (num_online_cpus() == 1)
50008 +               prepare_for_smp();
50009 +#endif
50010 +
50011 +       xen_smp_intr_init(cpu);
50012 +       cpu_set(cpu, cpu_online_map);
50013 +
50014 +       rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
50015 +       if (rc != 0)
50016 +               BUG();
50017 +
50018 +       return 0;
50019 +}
50020 +
50021 +void __init smp_cpus_done(unsigned int max_cpus)
50022 +{
50023 +}
50024 +
50025 +#ifndef CONFIG_X86_LOCAL_APIC
50026 +int setup_profiling_timer(unsigned int multiplier)
50027 +{
50028 +       return -EINVAL;
50029 +}
50030 +#endif
50031 +
50032 +/*
50033 + * Local variables:
50034 + *  c-file-style: "linux"
50035 + *  indent-tabs-mode: t
50036 + *  c-indent-level: 8
50037 + *  c-basic-offset: 8
50038 + *  tab-width: 8
50039 + * End:
50040 + */
50041 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/core/xen_proc.c linux-2.6.16/drivers/xen/core/xen_proc.c
50042 --- linux-2.6.16.orig/drivers/xen/core/xen_proc.c       1970-01-01 01:00:00.000000000 +0100
50043 +++ linux-2.6.16/drivers/xen/core/xen_proc.c    2006-06-26 09:51:32.000000000 +0200
50044 @@ -0,0 +1,29 @@
50045 +
50046 +#include <linux/config.h>
50047 +#include <linux/proc_fs.h>
50048 +#include <xen/xen_proc.h>
50049 +
50050 +static struct proc_dir_entry *xen_base;
50051 +
50052 +struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
50053 +{
50054 +       if (xen_base == NULL)
50055 +               if ((xen_base = proc_mkdir("xen", &proc_root)) == NULL)
50056 +                       panic("Couldn't create /proc/xen");
50057 +       return create_proc_entry(name, mode, xen_base);
50058 +}
50059 +
50060 +void remove_xen_proc_entry(const char *name)
50061 +{
50062 +       remove_proc_entry(name, xen_base);
50063 +}
50064 +
50065 +/*
50066 + * Local variables:
50067 + *  c-file-style: "linux"
50068 + *  indent-tabs-mode: t
50069 + *  c-indent-level: 8
50070 + *  c-basic-offset: 8
50071 + *  tab-width: 8
50072 + * End:
50073 + */
50074 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/core/xen_sysfs.c linux-2.6.16/drivers/xen/core/xen_sysfs.c
50075 --- linux-2.6.16.orig/drivers/xen/core/xen_sysfs.c      1970-01-01 01:00:00.000000000 +0100
50076 +++ linux-2.6.16/drivers/xen/core/xen_sysfs.c   2006-06-26 09:51:32.000000000 +0200
50077 @@ -0,0 +1,311 @@
50078 +/*
50079 + *  copyright (c) 2006 IBM Corporation
50080 + *  Authored by: Mike D. Day <ncmike@us.ibm.com>
50081 + *
50082 + *  This program is free software; you can redistribute it and/or modify
50083 + *  it under the terms of the GNU General Public License version 2 as
50084 + *  published by the Free Software Foundation.
50085 + */
50086 +
50087 +#include <linux/config.h>
50088 +#include <linux/kernel.h>
50089 +#include <linux/module.h>
50090 +#include <linux/init.h>
50091 +#include <asm/hypervisor.h>
50092 +#include <xen/features.h>
50093 +#include <xen/hypervisor_sysfs.h>
50094 +
50095 +MODULE_LICENSE("GPL");
50096 +MODULE_AUTHOR("Mike D. Day <ncmike@us.ibm.com>");
50097 +
50098 +static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
50099 +{
50100 +       return sprintf(buffer, "xen\n");
50101 +}
50102 +
50103 +HYPERVISOR_ATTR_RO(type);
50104 +
50105 +static int __init xen_sysfs_type_init(void)
50106 +{
50107 +       return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
50108 +}
50109 +
50110 +static void xen_sysfs_type_destroy(void)
50111 +{
50112 +       sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
50113 +}
50114 +
50115 +/* xen version attributes */
50116 +static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
50117 +{
50118 +       int version = HYPERVISOR_xen_version(XENVER_version, NULL);
50119 +       if (version)
50120 +               return sprintf(buffer, "%d\n", version >> 16);
50121 +       return -ENODEV;
50122 +}
50123 +
50124 +HYPERVISOR_ATTR_RO(major);
50125 +
50126 +static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
50127 +{
50128 +       int version = HYPERVISOR_xen_version(XENVER_version, NULL);
50129 +       if (version)
50130 +               return sprintf(buffer, "%d\n", version & 0xff);
50131 +       return -ENODEV;
50132 +}
50133 +
50134 +HYPERVISOR_ATTR_RO(minor);
50135 +
50136 +static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
50137 +{
50138 +       int ret;
50139 +       char *extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
50140 +       if (extra) {
50141 +               ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
50142 +               if (!ret)
50143 +                       return sprintf(buffer, "%s\n", extra);
50144 +               kfree(extra);
50145 +       } else
50146 +               ret = -ENOMEM;
50147 +       return ret;
50148 +}
50149 +
50150 +HYPERVISOR_ATTR_RO(extra);
50151 +
50152 +static struct attribute *version_attrs[] = {
50153 +       &major_attr.attr,
50154 +       &minor_attr.attr,
50155 +       &extra_attr.attr,
50156 +       NULL
50157 +};
50158 +
50159 +static struct attribute_group version_group = {
50160 +       .name = "version",
50161 +       .attrs = version_attrs,
50162 +};
50163 +
50164 +static int __init xen_sysfs_version_init(void)
50165 +{
50166 +       return sysfs_create_group(&hypervisor_subsys.kset.kobj, &version_group);
50167 +}
50168 +
50169 +static void xen_sysfs_version_destroy(void)
50170 +{
50171 +       sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group);
50172 +}
50173 +
50174 +/* xen compilation attributes */
50175 +
50176 +static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
50177 +{
50178 +       int ret;
50179 +       struct xen_compile_info *info =
50180 +           kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
50181 +       if (info) {
50182 +               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
50183 +               if (!ret)
50184 +                       ret = sprintf(buffer, "%s\n", info->compiler);
50185 +               kfree(info);
50186 +       } else
50187 +               ret = -ENOMEM;
50188 +
50189 +       return ret;
50190 +}
50191 +
50192 +HYPERVISOR_ATTR_RO(compiler);
50193 +
50194 +static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
50195 +{
50196 +       int ret;
50197 +       struct xen_compile_info *info;
50198 +
50199 +       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
50200 +       if (info) {
50201 +               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
50202 +               if (!ret)
50203 +                       ret = sprintf(buffer, "%s\n", info->compile_by);
50204 +               kfree(info);
50205 +       } else
50206 +               ret = -ENOMEM;
50207 +       return ret;
50208 +}
50209 +
50210 +HYPERVISOR_ATTR_RO(compiled_by);
50211 +
50212 +static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
50213 +{
50214 +       int ret;
50215 +       struct xen_compile_info *info;
50216 +
50217 +       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
50218 +       if (info) {
50219 +               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
50220 +               if (!ret)
50221 +                       ret = sprintf(buffer, "%s\n", info->compile_date);
50222 +               kfree(info);
50223 +       } else
50224 +               ret = -ENOMEM;
50225 +       return ret;
50226 +}
50227 +
50228 +HYPERVISOR_ATTR_RO(compile_date);
50229 +
50230 +static struct attribute *xen_compile_attrs[] = {
50231 +       &compiler_attr.attr,
50232 +       &compiled_by_attr.attr,
50233 +       &compile_date_attr.attr,
50234 +       NULL
50235 +};
50236 +
50237 +static struct attribute_group xen_compilation_group = {
50238 +       .name = "compilation",
50239 +       .attrs = xen_compile_attrs,
50240 +};
50241 +
50242 +static int __init xen_compilation_init(void)
50243 +{
50244 +       return sysfs_create_group(&hypervisor_subsys.kset.kobj,
50245 +                                 &xen_compilation_group);
50246 +}
50247 +
50248 +static void xen_compilation_destroy(void)
50249 +{
50250 +       sysfs_remove_group(&hypervisor_subsys.kset.kobj,
50251 +                          &xen_compilation_group);
50252 +}
50253 +
50254 +/* xen properties info */
50255 +
50256 +static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
50257 +{
50258 +       int ret;
50259 +       char *caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
50260 +       if (caps) {
50261 +               ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
50262 +               if (!ret)
50263 +                       ret = sprintf(buffer, "%s\n", caps);
50264 +               kfree(caps);
50265 +       } else
50266 +               ret = -ENOMEM;
50267 +       return ret;
50268 +}
50269 +
50270 +HYPERVISOR_ATTR_RO(capabilities);
50271 +
50272 +static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
50273 +{
50274 +       int ret;
50275 +       char *cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
50276 +       if (cset) {
50277 +               ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
50278 +               if (!ret)
50279 +                       ret = sprintf(buffer, "%s\n", cset);
50280 +               kfree(cset);
50281 +       } else
50282 +               ret = -ENOMEM;
50283 +       return ret;
50284 +}
50285 +
50286 +HYPERVISOR_ATTR_RO(changeset);
50287 +
50288 +static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
50289 +{
50290 +       int ret;
50291 +       struct xen_platform_parameters *parms =
50292 +           kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
50293 +       if (parms) {
50294 +               ret = HYPERVISOR_xen_version(XENVER_platform_parameters, parms);
50295 +               if (!ret)
50296 +                       ret = sprintf(buffer, "%lx\n", parms->virt_start);
50297 +               kfree(parms);
50298 +       } else
50299 +               ret = -ENOMEM;
50300 +       return ret;
50301 +}
50302 +
50303 +HYPERVISOR_ATTR_RO(virtual_start);
50304 +
50305 +/* eventually there will be several more features to export */
50306 +static ssize_t xen_feature_show(int index, char *buffer)
50307 +{
50308 +       int ret;
50309 +
50310 +       struct xen_feature_info *info =
50311 +           kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL);
50312 +       if (info) {
50313 +               info->submap_idx = index;
50314 +               ret = HYPERVISOR_xen_version(XENVER_get_features, info);
50315 +               if (!ret)
50316 +                       ret = sprintf(buffer, "%d\n", info->submap);
50317 +               kfree(info);
50318 +       } else
50319 +               ret = -ENOMEM;
50320 +       return ret;
50321 +}
50322 +
50323 +static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
50324 +{
50325 +       return xen_feature_show(XENFEAT_writable_page_tables, buffer);
50326 +}
50327 +
50328 +HYPERVISOR_ATTR_RO(writable_pt);
50329 +
50330 +static struct attribute *xen_properties_attrs[] = {
50331 +       &capabilities_attr.attr,
50332 +       &changeset_attr.attr,
50333 +       &virtual_start_attr.attr,
50334 +       &writable_pt_attr.attr,
50335 +       NULL
50336 +};
50337 +
50338 +static struct attribute_group xen_properties_group = {
50339 +       .name = "properties",
50340 +       .attrs = xen_properties_attrs,
50341 +};
50342 +
50343 +static int __init xen_properties_init(void)
50344 +{
50345 +       return sysfs_create_group(&hypervisor_subsys.kset.kobj,
50346 +                                 &xen_properties_group);
50347 +}
50348 +
50349 +static void xen_properties_destroy(void)
50350 +{
50351 +       sysfs_remove_group(&hypervisor_subsys.kset.kobj, &xen_properties_group);
50352 +}
50353 +
50354 +static int __init hyper_sysfs_init(void)
50355 +{
50356 +       int ret = xen_sysfs_type_init();
50357 +       if (ret)
50358 +               goto out;
50359 +       ret = xen_sysfs_version_init();
50360 +       if (ret)
50361 +               goto version_out;
50362 +       ret = xen_compilation_init();
50363 +       if (ret)
50364 +               goto comp_out;
50365 +       ret = xen_properties_init();
50366 +       if (!ret)
50367 +               goto out;
50368 +
50369 +       xen_compilation_destroy();
50370 +comp_out:
50371 +       xen_sysfs_version_destroy();
50372 +version_out:
50373 +       xen_sysfs_type_destroy();
50374 +out:
50375 +       return ret;
50376 +}
50377 +
50378 +static void hyper_sysfs_exit(void)
50379 +{
50380 +       xen_properties_destroy();
50381 +       xen_compilation_destroy();
50382 +       xen_sysfs_version_destroy();
50383 +       xen_sysfs_type_destroy();
50384 +
50385 +}
50386 +
50387 +module_init(hyper_sysfs_init);
50388 +module_exit(hyper_sysfs_exit);
50389 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/evtchn/Makefile linux-2.6.16/drivers/xen/evtchn/Makefile
50390 --- linux-2.6.16.orig/drivers/xen/evtchn/Makefile       1970-01-01 01:00:00.000000000 +0100
50391 +++ linux-2.6.16/drivers/xen/evtchn/Makefile    2006-06-26 09:51:32.000000000 +0200
50392 @@ -0,0 +1,2 @@
50393 +
50394 +obj-y  := evtchn.o
50395 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/evtchn/evtchn.c linux-2.6.16/drivers/xen/evtchn/evtchn.c
50396 --- linux-2.6.16.orig/drivers/xen/evtchn/evtchn.c       1970-01-01 01:00:00.000000000 +0100
50397 +++ linux-2.6.16/drivers/xen/evtchn/evtchn.c    2006-06-26 09:51:32.000000000 +0200
50398 @@ -0,0 +1,464 @@
50399 +/******************************************************************************
50400 + * evtchn.c
50401 + * 
50402 + * Driver for receiving and demuxing event-channel signals.
50403 + * 
50404 + * Copyright (c) 2004-2005, K A Fraser
50405 + * Multi-process extensions Copyright (c) 2004, Steven Smith
50406 + * 
50407 + * This program is free software; you can redistribute it and/or
50408 + * modify it under the terms of the GNU General Public License version 2
50409 + * as published by the Free Software Foundation; or, when distributed
50410 + * separately from the Linux kernel or incorporated into other
50411 + * software packages, subject to the following license:
50412 + * 
50413 + * Permission is hereby granted, free of charge, to any person obtaining a copy
50414 + * of this source file (the "Software"), to deal in the Software without
50415 + * restriction, including without limitation the rights to use, copy, modify,
50416 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
50417 + * and to permit persons to whom the Software is furnished to do so, subject to
50418 + * the following conditions:
50419 + * 
50420 + * The above copyright notice and this permission notice shall be included in
50421 + * all copies or substantial portions of the Software.
50422 + * 
50423 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50424 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50425 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
50426 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50427 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50428 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
50429 + * IN THE SOFTWARE.
50430 + */
50431 +
50432 +#include <linux/config.h>
50433 +#include <linux/module.h>
50434 +#include <linux/kernel.h>
50435 +#include <linux/sched.h>
50436 +#include <linux/slab.h>
50437 +#include <linux/string.h>
50438 +#include <linux/errno.h>
50439 +#include <linux/fs.h>
50441 +#include <linux/miscdevice.h>
50442 +#include <linux/major.h>
50443 +#include <linux/proc_fs.h>
50444 +#include <linux/stat.h>
50445 +#include <linux/poll.h>
50446 +#include <linux/irq.h>
50447 +#include <linux/init.h>
50448 +#include <linux/gfp.h>
50449 +#include <xen/evtchn.h>
50450 +#include <xen/public/evtchn.h>
50451 +
50452 +struct per_user_data {
50453 +       /* Notification ring, accessed via /dev/xen/evtchn. */
50454 +#define EVTCHN_RING_SIZE     (PAGE_SIZE / sizeof(evtchn_port_t))
50455 +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
50456 +       evtchn_port_t *ring;
50457 +       unsigned int ring_cons, ring_prod, ring_overflow;
50458 +
50459 +       /* Processes wait on this queue when ring is empty. */
50460 +       wait_queue_head_t evtchn_wait;
50461 +       struct fasync_struct *evtchn_async_queue;
50462 +};
50463 +
50464 +/* Who's bound to each port? */
50465 +static struct per_user_data *port_user[NR_EVENT_CHANNELS];
50466 +static spinlock_t port_user_lock;
50467 +
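+/*
+ * Interrupt-time upcall: mask and clear the port, then queue it on the
+ * bound process's notification ring. Readers are woken only on the
+ * empty-to-non-empty transition; a full ring latches ring_overflow.
+ */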
50468 +void evtchn_device_upcall(int port)
50469 +{
50470 +       struct per_user_data *u;
50471 +
50472 +       spin_lock(&port_user_lock);
50473 +
50474 +       mask_evtchn(port);
50475 +       clear_evtchn(port);
50476 +
50477 +       if ((u = port_user[port]) != NULL) {
50478 +               if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
50479 +                       u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
50480 +                       if (u->ring_cons == u->ring_prod++) {
50481 +                               wake_up_interruptible(&u->evtchn_wait);
50482 +                               kill_fasync(&u->evtchn_async_queue,
50483 +                                           SIGIO, POLL_IN);
50484 +                       }
50485 +               } else {
50486 +                       u->ring_overflow = 1;
50487 +               }
50488 +       }
50489 +
50490 +       spin_unlock(&port_user_lock);
50491 +}
50492 +
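+/*
+ * read() returns a whole number of evtchn_port_t values. The ring is a
+ * power-of-two circular buffer, so a pending range may wrap; it is then
+ * copied to userspace as two chunks split at the ring boundary.
+ */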
50493 +static ssize_t evtchn_read(struct file *file, char __user *buf,
50494 +                           size_t count, loff_t *ppos)
50495 +{
50496 +       int rc;
50497 +       unsigned int c, p, bytes1 = 0, bytes2 = 0;
50498 +       struct per_user_data *u = file->private_data;
50499 +
50500 +       /* Whole number of ports. */
50501 +       count &= ~(sizeof(evtchn_port_t)-1);
50502 +
50503 +       if (count == 0)
50504 +               return 0;
50505 +
50506 +       if (count > PAGE_SIZE)
50507 +               count = PAGE_SIZE;
50508 +
50509 +       for (;;) {
50510 +               if (u->ring_overflow)
50511 +                       return -EFBIG;
50512 +
50513 +               if ((c = u->ring_cons) != (p = u->ring_prod))
50514 +                       break;
50515 +
50516 +               if (file->f_flags & O_NONBLOCK)
50517 +                       return -EAGAIN;
50518 +
50519 +               rc = wait_event_interruptible(
50520 +                       u->evtchn_wait, u->ring_cons != u->ring_prod);
50521 +               if (rc)
50522 +                       return rc;
50523 +       }
50524 +
50525 +       /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
50526 +       if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
50527 +               bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
50528 +                       sizeof(evtchn_port_t);
50529 +               bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
50530 +       } else {
50531 +               bytes1 = (p - c) * sizeof(evtchn_port_t);
50532 +               bytes2 = 0;
50533 +       }
50534 +
50535 +       /* Truncate chunks according to caller's maximum byte count. */
50536 +       if (bytes1 > count) {
50537 +               bytes1 = count;
50538 +               bytes2 = 0;
50539 +       } else if ((bytes1 + bytes2) > count) {
50540 +               bytes2 = count - bytes1;
50541 +       }
50542 +
50543 +       if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
50544 +           ((bytes2 != 0) &&
50545 +            copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
50546 +               return -EFAULT;
50547 +
50548 +       u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
50549 +
50550 +       return bytes1 + bytes2;
50551 +}
50552 +
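+/*
+ * write() takes a list of port numbers and unmasks each port that is
+ * valid and bound to this file, re-enabling event delivery after the
+ * process has handled the previous notification.
+ */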
50553 +static ssize_t evtchn_write(struct file *file, const char __user *buf,
50554 +                            size_t count, loff_t *ppos)
50555 +{
50556 +       int  rc, i;
50557 +       evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
50558 +       struct per_user_data *u = file->private_data;
50559 +
50560 +       if (kbuf == NULL)
50561 +               return -ENOMEM;
50562 +
50563 +       /* Whole number of ports. */
50564 +       count &= ~(sizeof(evtchn_port_t)-1);
50565 +
50566 +       if (count == 0) {
50567 +               rc = 0;
50568 +               goto out;
50569 +       }
50570 +
50571 +       if (count > PAGE_SIZE)
50572 +               count = PAGE_SIZE;
50573 +
50574 +       if (copy_from_user(kbuf, buf, count) != 0) {
50575 +               rc = -EFAULT;
50576 +               goto out;
50577 +       }
50578 +
50579 +       spin_lock_irq(&port_user_lock);
50580 +       for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
50581 +               if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
50582 +                       unmask_evtchn(kbuf[i]);
50583 +       spin_unlock_irq(&port_user_lock);
50584 +
50585 +       rc = count;
50586 +
50587 + out:
50588 +       free_page((unsigned long)kbuf);
50589 +       return rc;
50590 +}
50591 +
50592 +static void evtchn_bind_to_user(struct per_user_data *u, int port)
50593 +{
50594 +       spin_lock_irq(&port_user_lock);
50595 +       BUG_ON(port_user[port] != NULL);
50596 +       port_user[port] = u;
50597 +       unmask_evtchn(port);
50598 +       spin_unlock_irq(&port_user_lock);
50599 +}
50600 +
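+/*
+ * The BIND_* commands allocate or bind an event channel via EVTCHNOP_*
+ * hypercalls and associate the resulting port with this file; UNBIND
+ * closes the port, NOTIFY signals the remote end, and RESET empties the
+ * ring and clears the overflow flag.
+ */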
50601 +static int evtchn_ioctl(struct inode *inode, struct file *file,
50602 +                        unsigned int cmd, unsigned long arg)
50603 +{
50604 +       int rc;
50605 +       struct per_user_data *u = file->private_data;
50606 +       void __user *uarg = (void __user *) arg;
50607 +       evtchn_op_t op = { 0 };
50608 +
50609 +       switch (cmd) {
50610 +       case IOCTL_EVTCHN_BIND_VIRQ: {
50611 +               struct ioctl_evtchn_bind_virq bind;
50612 +
50613 +               rc = -EFAULT;
50614 +               if (copy_from_user(&bind, uarg, sizeof(bind)))
50615 +                       break;
50616 +
50617 +               op.cmd = EVTCHNOP_bind_virq;
50618 +               op.u.bind_virq.virq = bind.virq;
50619 +               op.u.bind_virq.vcpu = 0;
50620 +               rc = HYPERVISOR_event_channel_op(&op);
50621 +               if (rc != 0)
50622 +                       break;
50623 +
50624 +               rc = op.u.bind_virq.port;
50625 +               evtchn_bind_to_user(u, rc);
50626 +               break;
50627 +       }
50628 +
50629 +       case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
50630 +               struct ioctl_evtchn_bind_interdomain bind;
50631 +
50632 +               rc = -EFAULT;
50633 +               if (copy_from_user(&bind, uarg, sizeof(bind)))
50634 +                       break;
50635 +
50636 +               op.cmd = EVTCHNOP_bind_interdomain;
50637 +               op.u.bind_interdomain.remote_dom  = bind.remote_domain;
50638 +               op.u.bind_interdomain.remote_port = bind.remote_port;
50639 +               rc = HYPERVISOR_event_channel_op(&op);
50640 +               if (rc != 0)
50641 +                       break;
50642 +
50643 +               rc = op.u.bind_interdomain.local_port;
50644 +               evtchn_bind_to_user(u, rc);
50645 +               break;
50646 +       }
50647 +
50648 +       case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
50649 +               struct ioctl_evtchn_bind_unbound_port bind;
50650 +
50651 +               rc = -EFAULT;
50652 +               if (copy_from_user(&bind, uarg, sizeof(bind)))
50653 +                       break;
50654 +
50655 +               op.cmd = EVTCHNOP_alloc_unbound;
50656 +               op.u.alloc_unbound.dom        = DOMID_SELF;
50657 +               op.u.alloc_unbound.remote_dom = bind.remote_domain;
50658 +               rc = HYPERVISOR_event_channel_op(&op);
50659 +               if (rc != 0)
50660 +                       break;
50661 +
50662 +               rc = op.u.alloc_unbound.port;
50663 +               evtchn_bind_to_user(u, rc);
50664 +               break;
50665 +       }
50666 +
50667 +       case IOCTL_EVTCHN_UNBIND: {
50668 +               struct ioctl_evtchn_unbind unbind;
50669 +               int ret;
50670 +
50671 +               rc = -EFAULT;
50672 +               if (copy_from_user(&unbind, uarg, sizeof(unbind)))
50673 +                       break;
50674 +
50675 +               rc = -EINVAL;
50676 +               if (unbind.port >= NR_EVENT_CHANNELS)
50677 +                       break;
50678 +
50679 +               spin_lock_irq(&port_user_lock);
50680 +
50681 +               rc = -ENOTCONN;
50682 +               if (port_user[unbind.port] != u) {
50683 +                       spin_unlock_irq(&port_user_lock);
50684 +                       break;
50685 +               }
50686 +
50687 +               port_user[unbind.port] = NULL;
50688 +               mask_evtchn(unbind.port);
50689 +
50690 +               spin_unlock_irq(&port_user_lock);
50691 +
50692 +               op.cmd = EVTCHNOP_close;
50693 +               op.u.close.port = unbind.port;
50694 +               ret = HYPERVISOR_event_channel_op(&op);
50695 +               BUG_ON(ret);
50696 +
50697 +               rc = 0;
50698 +               break;
50699 +       }
50700 +
50701 +       case IOCTL_EVTCHN_NOTIFY: {
50702 +               struct ioctl_evtchn_notify notify;
50703 +
50704 +               rc = -EFAULT;
50705 +               if (copy_from_user(&notify, uarg, sizeof(notify)))
50706 +                       break;
50707 +
50708 +               if (notify.port >= NR_EVENT_CHANNELS) {
50709 +                       rc = -EINVAL;
50710 +               } else if (port_user[notify.port] != u) {
50711 +                       rc = -ENOTCONN;
50712 +               } else {
50713 +                       notify_remote_via_evtchn(notify.port);
50714 +                       rc = 0;
50715 +               }
50716 +               break;
50717 +       }
50718 +
50719 +       case IOCTL_EVTCHN_RESET: {
50720 +               /* Initialise the ring to empty. Clear errors. */
50721 +               spin_lock_irq(&port_user_lock);
50722 +               u->ring_cons = u->ring_prod = u->ring_overflow = 0;
50723 +               spin_unlock_irq(&port_user_lock);
50724 +               rc = 0;
50725 +               break;
50726 +       }
50727 +
50728 +       default:
50729 +               rc = -ENOSYS;
50730 +               break;
50731 +       }
50732 +
50733 +       return rc;
50734 +}
50735 +
50736 +static unsigned int evtchn_poll(struct file *file, poll_table *wait)
50737 +{
50738 +       unsigned int mask = POLLOUT | POLLWRNORM;
50739 +       struct per_user_data *u = file->private_data;
50740 +
50741 +       poll_wait(file, &u->evtchn_wait, wait);
50742 +       if (u->ring_cons != u->ring_prod)
50743 +               mask |= POLLIN | POLLRDNORM;
50744 +       if (u->ring_overflow)
50745 +               mask = POLLERR;
50746 +       return mask;
50747 +}
50748 +
50749 +static int evtchn_fasync(int fd, struct file *filp, int on)
50750 +{
50751 +       struct per_user_data *u = filp->private_data;
50752 +       return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
50753 +}
50754 +
50755 +static int evtchn_open(struct inode *inode, struct file *filp)
50756 +{
50757 +       struct per_user_data *u;
50758 +
50759 +       if ((u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL)
50760 +               return -ENOMEM;
50761 +
50762 +       memset(u, 0, sizeof(*u));
50763 +       init_waitqueue_head(&u->evtchn_wait);
50764 +
50765 +       u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
50766 +       if (u->ring == NULL) {
50767 +               kfree(u);
50768 +               return -ENOMEM;
50769 +       }
50770 +
50771 +       filp->private_data = u;
50772 +
50773 +       return 0;
50774 +}
50775 +
50776 +static int evtchn_release(struct inode *inode, struct file *filp)
50777 +{
50778 +       int i;
50779 +       struct per_user_data *u = filp->private_data;
50780 +       evtchn_op_t op = { 0 };
50781 +
50782 +       spin_lock_irq(&port_user_lock);
50783 +
50784 +       free_page((unsigned long)u->ring);
50785 +
50786 +       for (i = 0; i < NR_EVENT_CHANNELS; i++) {
50787 +               int ret;
50788 +               if (port_user[i] != u)
50789 +                       continue;
50790 +
50791 +               port_user[i] = NULL;
50792 +               mask_evtchn(i);
50793 +
50794 +               op.cmd = EVTCHNOP_close;
50795 +               op.u.close.port = i;
50796 +               ret = HYPERVISOR_event_channel_op(&op);
50797 +               BUG_ON(ret);
50798 +       }
50799 +
50800 +       spin_unlock_irq(&port_user_lock);
50801 +
50802 +       kfree(u);
50803 +
50804 +       return 0;
50805 +}
50806 +
50807 +static struct file_operations evtchn_fops = {
50808 +       .owner   = THIS_MODULE,
50809 +       .read    = evtchn_read,
50810 +       .write   = evtchn_write,
50811 +       .ioctl   = evtchn_ioctl,
50812 +       .poll    = evtchn_poll,
50813 +       .fasync  = evtchn_fasync,
50814 +       .open    = evtchn_open,
50815 +       .release = evtchn_release,
50816 +};
50817 +
50818 +static struct miscdevice evtchn_miscdev = {
50819 +       .minor        = EVTCHN_MINOR,
50820 +       .name         = "evtchn",
50821 +       .fops         = &evtchn_fops,
50822 +       .devfs_name   = "misc/evtchn",
50823 +};
50824 +
50825 +static int __init evtchn_init(void)
50826 +{
50827 +       int err;
50828 +
50829 +       spin_lock_init(&port_user_lock);
50830 +       memset(port_user, 0, sizeof(port_user));
50831 +
50832 +       /* Create '/dev/misc/evtchn'. */
50833 +       err = misc_register(&evtchn_miscdev);
50834 +       if (err != 0) {
50835 +               printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
50836 +               return err;
50837 +       }
50838 +
50839 +       printk(KERN_INFO "Event-channel device installed.\n");
50840 +
50841 +       return 0;
50842 +}
50843 +
50844 +static void evtchn_cleanup(void)
50845 +{
50846 +       misc_deregister(&evtchn_miscdev);
50847 +}
50848 +
50849 +module_init(evtchn_init);
50850 +module_exit(evtchn_cleanup);
50851 +
50852 +MODULE_LICENSE("Dual BSD/GPL");
50853 +
50854 +/*
50855 + * Local variables:
50856 + *  c-file-style: "linux"
50857 + *  indent-tabs-mode: t
50858 + *  c-indent-level: 8
50859 + *  c-basic-offset: 8
50860 + *  tab-width: 8
50861 + * End:
50862 + */
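
For illustration, a minimal user-space sketch of driving the event-channel device defined above: poll() blocks until a bound port fires (evtchn_poll), read() drains the pending port numbers from the ring, and writing the numbers back is assumed to re-enable the ports, per the driver's write handler (not shown in this excerpt). A port must already have been bound through the device's other ioctls, and the user-space width of evtchn_port_t is an assumption; treat this as a sketch, not a reference client.

#include <fcntl.h>
#include <poll.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

typedef uint32_t evtchn_port_t;	/* assumed width; match the kernel header */

int main(void)
{
	evtchn_port_t ports[8];
	struct pollfd pfd;
	ssize_t n, i;

	pfd.fd = open("/dev/misc/evtchn", O_RDWR);
	if (pfd.fd < 0) {
		perror("open /dev/misc/evtchn");
		return 1;
	}
	pfd.events = POLLIN;

	/* Block until at least one bound port has a pending event. */
	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
		n = read(pfd.fd, ports, sizeof(ports));
		for (i = 0; i < n / (ssize_t)sizeof(ports[0]); i++)
			printf("event on port %u\n", (unsigned)ports[i]);
		if (n > 0)	/* assumed: writing ports back unmasks them */
			write(pfd.fd, ports, (size_t)n);
	}

	close(pfd.fd);
	return 0;
}
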
50863 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/net_driver_util.c linux-2.6.16/drivers/xen/net_driver_util.c
50864 --- linux-2.6.16.orig/drivers/xen/net_driver_util.c     1970-01-01 01:00:00.000000000 +0100
50865 +++ linux-2.6.16/drivers/xen/net_driver_util.c  2006-06-26 09:51:32.000000000 +0200
50866 @@ -0,0 +1,68 @@
50867 +/*****************************************************************************
50868 + *
50869 + * Utility functions for Xen network devices.
50870 + *
50871 + * Copyright (c) 2005 XenSource Ltd.
50872 + * 
50873 + * This program is free software; you can redistribute it and/or
50874 + * modify it under the terms of the GNU General Public License version 2
50875 + * as published by the Free Software Foundation; or, when distributed
50876 + * separately from the Linux kernel or incorporated into other
50877 + * software packages, subject to the following license:
50878 + * 
50879 + * Permission is hereby granted, free of charge, to any person obtaining a
50880 + * copy of this source file (the "Software"), to deal in the Software without
50881 + * restriction, including without limitation the rights to use, copy, modify,
50882 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
50883 + * and to permit persons to whom the Software is furnished to do so, subject
50884 + * to the following conditions:
50885 + * 
50886 + * The above copyright notice and this permission notice shall be included in
50887 + * all copies or substantial portions of the Software.
50888 + * 
50889 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50890 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50891 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
50892 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50893 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50894 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
50895 + * DEALINGS IN THE SOFTWARE.
50896 + */
50897 +
50898 +#include <linux/if_ether.h>
50899 +#include <linux/err.h>
50900 +#include <linux/module.h>
50901 +#include <xen/net_driver_util.h>
50902 +
50903 +
50904 +int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
50905 +{
50906 +       char *s;
50907 +       int i;
50908 +       char *e;
50909 +       char *macstr = xenbus_read(XBT_NULL, dev->nodename, "mac", NULL);
50910 +       if (IS_ERR(macstr))
50911 +               return PTR_ERR(macstr);
50912 +       s = macstr;
50913 +       for (i = 0; i < ETH_ALEN; i++) {
50914 +               mac[i] = simple_strtoul(s, &e, 16);
50915 +               if (s == e || (e[0] != ':' && e[0] != 0)) {
50916 +                       kfree(macstr);
50917 +                       return -ENOENT;
50918 +               }
50919 +               s = &e[1];
50920 +       }
50921 +       kfree(macstr);
50922 +       return 0;
50923 +}
50924 +EXPORT_SYMBOL_GPL(xen_net_read_mac);
50925 +
50926 +/*
50927 + * Local variables:
50928 + *  c-file-style: "linux"
50929 + *  indent-tabs-mode: t
50930 + *  c-indent-level: 8
50931 + *  c-basic-offset: 8
50932 + *  tab-width: 8
50933 + * End:
50934 + */
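
The 'mac' node read by xen_net_read_mac() above holds a colon-separated string such as '00:16:3e:4a:5b:6c' (00:16:3e being the OUI conventionally assigned to Xen guest interfaces). The same parse as a stand-alone user-space sketch, with the C library's strtoul() standing in for the kernel's simple_strtoul():

#include <stdlib.h>

#define ETH_ALEN 6

/* Illustrative only; mirrors the loop in xen_net_read_mac(). */
static int parse_mac(const char *s, unsigned char mac[ETH_ALEN])
{
	char *e;
	int i;

	for (i = 0; i < ETH_ALEN; i++) {
		mac[i] = (unsigned char)strtoul(s, &e, 16);
		/* Each byte must end in ':', or NUL after the last one. */
		if (s == e || (*e != ':' && *e != '\0'))
			return -1;
		s = e + 1;
	}
	return 0;
}
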
50935 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/netback/Makefile linux-2.6.16/drivers/xen/netback/Makefile
50936 --- linux-2.6.16.orig/drivers/xen/netback/Makefile      1970-01-01 01:00:00.000000000 +0100
50937 +++ linux-2.6.16/drivers/xen/netback/Makefile   2006-06-26 09:51:32.000000000 +0200
50938 @@ -0,0 +1,5 @@
50939 +obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
50940 +obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
50941 +
50942 +netbk-y   := netback.o xenbus.o interface.o
50943 +netloop-y := loopback.o
50944 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/netback/common.h linux-2.6.16/drivers/xen/netback/common.h
50945 --- linux-2.6.16.orig/drivers/xen/netback/common.h      1970-01-01 01:00:00.000000000 +0100
50946 +++ linux-2.6.16/drivers/xen/netback/common.h   2006-06-26 09:51:32.000000000 +0200
50947 @@ -0,0 +1,133 @@
50948 +/******************************************************************************
50949 + * arch/xen/drivers/netif/backend/common.h
50950 + * 
50951 + * This program is free software; you can redistribute it and/or
50952 + * modify it under the terms of the GNU General Public License version 2
50953 + * as published by the Free Software Foundation; or, when distributed
50954 + * separately from the Linux kernel or incorporated into other
50955 + * software packages, subject to the following license:
50956 + * 
50957 + * Permission is hereby granted, free of charge, to any person obtaining a copy
50958 + * of this source file (the "Software"), to deal in the Software without
50959 + * restriction, including without limitation the rights to use, copy, modify,
50960 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
50961 + * and to permit persons to whom the Software is furnished to do so, subject to
50962 + * the following conditions:
50963 + * 
50964 + * The above copyright notice and this permission notice shall be included in
50965 + * all copies or substantial portions of the Software.
50966 + * 
50967 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50968 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
50969 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
50970 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50971 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
50972 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
50973 + * IN THE SOFTWARE.
50974 + */
50975 +
50976 +#ifndef __NETIF__BACKEND__COMMON_H__
50977 +#define __NETIF__BACKEND__COMMON_H__
50978 +
50979 +#include <linux/config.h>
50980 +#include <linux/version.h>
50981 +#include <linux/module.h>
50982 +#include <linux/interrupt.h>
50983 +#include <linux/slab.h>
50984 +#include <linux/ip.h>
50985 +#include <linux/in.h>
50986 +#include <linux/netdevice.h>
50987 +#include <linux/etherdevice.h>
50988 +#include <xen/evtchn.h>
50989 +#include <xen/interface/io/netif.h>
50990 +#include <asm/io.h>
50991 +#include <asm/pgalloc.h>
50992 +#include <xen/interface/grant_table.h>
50993 +#include <xen/gnttab.h>
50994 +#include <xen/driver_util.h>
50995 +
50996 +#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
50997 +                                    __FILE__ , __LINE__ , ## _a )
50998 +#define IPRINTK(fmt, args...) \
50999 +    printk(KERN_INFO "xen_net: " fmt, ##args)
51000 +#define WPRINTK(fmt, args...) \
51001 +    printk(KERN_WARNING "xen_net: " fmt, ##args)
51002 +
51003 +typedef struct netif_st {
51004 +       /* Unique identifier for this interface. */
51005 +       domid_t          domid;
51006 +       unsigned int     handle;
51007 +
51008 +       u8               fe_dev_addr[6];
51009 +
51010 +       /* Physical parameters of the comms window. */
51011 +       grant_handle_t   tx_shmem_handle;
51012 +       grant_ref_t      tx_shmem_ref; 
51013 +       grant_handle_t   rx_shmem_handle;
51014 +       grant_ref_t      rx_shmem_ref; 
51015 +       unsigned int     evtchn;
51016 +       unsigned int     irq;
51017 +
51018 +       /* The shared rings and indexes. */
51019 +       netif_tx_back_ring_t tx;
51020 +       netif_rx_back_ring_t rx;
51021 +       struct vm_struct *tx_comms_area;
51022 +       struct vm_struct *rx_comms_area;
51023 +
51024 +       /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
51025 +       RING_IDX rx_req_cons_peek;
51026 +
51027 +       /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
51028 +       unsigned long   credit_bytes;
51029 +       unsigned long   credit_usec;
51030 +       unsigned long   remaining_credit;
51031 +       struct timer_list credit_timeout;
51032 +
51033 +       /* Miscellaneous private stuff. */
51034 +       enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
51035 +       int active;
51036 +       struct list_head list;  /* scheduling list */
51037 +       atomic_t         refcnt;
51038 +       struct net_device *dev;
51039 +       struct net_device_stats stats;
51040 +
51041 +       struct work_struct free_work;
51042 +} netif_t;
51043 +
51044 +#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
51045 +#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
51046 +
51047 +void netif_disconnect(netif_t *netif);
51048 +
51049 +netif_t *alloc_netif(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]);
51050 +void free_netif(netif_t *netif);
51051 +int netif_map(netif_t *netif, unsigned long tx_ring_ref,
51052 +             unsigned long rx_ring_ref, unsigned int evtchn);
51053 +
51054 +#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
51055 +#define netif_put(_b)                                          \
51056 +       do {                                                    \
51057 +               if ( atomic_dec_and_test(&(_b)->refcnt) )       \
51058 +                       free_netif(_b);                         \
51059 +       } while (0)
51060 +
51061 +void netif_xenbus_init(void);
51062 +
51063 +void netif_schedule_work(netif_t *netif);
51064 +void netif_deschedule_work(netif_t *netif);
51065 +
51066 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
51067 +struct net_device_stats *netif_be_get_stats(struct net_device *dev);
51068 +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
51069 +
51070 +#endif /* __NETIF__BACKEND__COMMON_H__ */
51071 +
51072 +/*
51073 + * Local variables:
51074 + *  c-file-style: "linux"
51075 + *  indent-tabs-mode: t
51076 + *  c-indent-level: 8
51077 + *  c-basic-offset: 8
51078 + *  tab-width: 8
51079 + * End:
51080 + */
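
The netif_get()/netif_put() macros above encode the interface's lifetime rule: any code handing a netif_t to asynchronous context takes a reference first, and dropping the last reference calls free_netif(), which (as interface.c below shows) defers the real teardown to a workqueue. A hypothetical caller, shown only to illustrate the idiom:

/* Illustrative only; not part of the patch. */
static void example_async_user(netif_t *netif)
{
	netif_get(netif);	/* pin the interface across the async work */

	/* ... queue a tasklet or timer that dereferences netif ... */

	netif_put(netif);	/* last ref: free_netif() schedules teardown */
}
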
51081 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/netback/interface.c linux-2.6.16/drivers/xen/netback/interface.c
51082 --- linux-2.6.16.orig/drivers/xen/netback/interface.c   1970-01-01 01:00:00.000000000 +0100
51083 +++ linux-2.6.16/drivers/xen/netback/interface.c        2006-06-26 09:51:32.000000000 +0200
51084 @@ -0,0 +1,334 @@
51085 +/******************************************************************************
51086 + * arch/xen/drivers/netif/backend/interface.c
51087 + * 
51088 + * Network-device interface management.
51089 + * 
51090 + * Copyright (c) 2004-2005, Keir Fraser
51091 + * 
51092 + * This program is free software; you can redistribute it and/or
51093 + * modify it under the terms of the GNU General Public License version 2
51094 + * as published by the Free Software Foundation; or, when distributed
51095 + * separately from the Linux kernel or incorporated into other
51096 + * software packages, subject to the following license:
51097 + * 
51098 + * Permission is hereby granted, free of charge, to any person obtaining a copy
51099 + * of this source file (the "Software"), to deal in the Software without
51100 + * restriction, including without limitation the rights to use, copy, modify,
51101 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
51102 + * and to permit persons to whom the Software is furnished to do so, subject to
51103 + * the following conditions:
51104 + * 
51105 + * The above copyright notice and this permission notice shall be included in
51106 + * all copies or substantial portions of the Software.
51107 + * 
51108 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51109 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51110 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51111 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51112 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
51113 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51114 + * IN THE SOFTWARE.
51115 + */
51116 +
51117 +#include "common.h"
51118 +#include <linux/ethtool.h>
51119 +#include <linux/rtnetlink.h>
51120 +
51121 +static void __netif_up(netif_t *netif)
51122 +{
51123 +       struct net_device *dev = netif->dev;
51124 +       spin_lock_bh(&dev->xmit_lock);
51125 +       netif->active = 1;
51126 +       spin_unlock_bh(&dev->xmit_lock);
51127 +       enable_irq(netif->irq);
51128 +       netif_schedule_work(netif);
51129 +}
51130 +
51131 +static void __netif_down(netif_t *netif)
51132 +{
51133 +       struct net_device *dev = netif->dev;
51134 +       disable_irq(netif->irq);
51135 +       spin_lock_bh(&dev->xmit_lock);
51136 +       netif->active = 0;
51137 +       spin_unlock_bh(&dev->xmit_lock);
51138 +       netif_deschedule_work(netif);
51139 +}
51140 +
51141 +static int net_open(struct net_device *dev)
51142 +{
51143 +       netif_t *netif = netdev_priv(dev);
51144 +       if (netif->status == CONNECTED)
51145 +               __netif_up(netif);
51146 +       netif_start_queue(dev);
51147 +       return 0;
51148 +}
51149 +
51150 +static int net_close(struct net_device *dev)
51151 +{
51152 +       netif_t *netif = netdev_priv(dev);
51153 +       netif_stop_queue(dev);
51154 +       if (netif->status == CONNECTED)
51155 +               __netif_down(netif);
51156 +       return 0;
51157 +}
51158 +
51159 +static struct ethtool_ops network_ethtool_ops =
51160 +{
51161 +       .get_tx_csum = ethtool_op_get_tx_csum,
51162 +       .set_tx_csum = ethtool_op_set_tx_csum,
51163 +};
51164 +
51165 +netif_t *alloc_netif(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN])
51166 +{
51167 +       int err = 0, i;
51168 +       struct net_device *dev;
51169 +       netif_t *netif;
51170 +       char name[IFNAMSIZ] = {};
51171 +
51172 +       snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
51173 +       dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
51174 +       if (dev == NULL) {
51175 +               DPRINTK("Could not create netif: out of memory\n");
51176 +               return ERR_PTR(-ENOMEM);
51177 +       }
51178 +
51179 +       netif = netdev_priv(dev);
51180 +       memset(netif, 0, sizeof(*netif));
51181 +       netif->domid  = domid;
51182 +       netif->handle = handle;
51183 +       netif->status = DISCONNECTED;
51184 +       atomic_set(&netif->refcnt, 0);
51185 +       netif->dev = dev;
51186 +
51187 +       netif->credit_bytes = netif->remaining_credit = ~0UL;
51188 +       netif->credit_usec  = 0UL;
51189 +       init_timer(&netif->credit_timeout);
51190 +
51191 +       dev->hard_start_xmit = netif_be_start_xmit;
51192 +       dev->get_stats       = netif_be_get_stats;
51193 +       dev->open            = net_open;
51194 +       dev->stop            = net_close;
51195 +       dev->features        = NETIF_F_IP_CSUM;
51196 +
51197 +       SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
51198 +
51199 +       /* Disable queuing. */
51200 +       dev->tx_queue_len = 0;
51201 +
51202 +       for (i = 0; i < ETH_ALEN; i++)
51203 +               if (be_mac[i] != 0)
51204 +                       break;
51205 +       if (i == ETH_ALEN) {
51206 +               /*
51207 +                * Initialise a dummy MAC address. We choose the numerically
51208 +                * largest non-broadcast address to prevent the address getting
51209 +                * stolen by an Ethernet bridge for STP purposes.
51210 +                * (FE:FF:FF:FF:FF:FF)
51211 +                */
51212 +               memset(dev->dev_addr, 0xFF, ETH_ALEN);
51213 +               dev->dev_addr[0] &= ~0x01;
51214 +       } else
51215 +               memcpy(dev->dev_addr, be_mac, ETH_ALEN);
51216 +
51217 +       rtnl_lock();
51218 +       err = register_netdevice(dev);
51219 +       rtnl_unlock();
51220 +       if (err) {
51221 +               DPRINTK("Could not register new net device %s: err=%d\n",
51222 +                       dev->name, err);
51223 +               free_netdev(dev);
51224 +               return ERR_PTR(err);
51225 +       }
51226 +
51227 +       DPRINTK("Successfully created netif\n");
51228 +       return netif;
51229 +}
51230 +
51231 +static int map_frontend_pages(
51232 +       netif_t *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
51233 +{
51234 +       struct gnttab_map_grant_ref op;
51235 +       int ret;
51236 +
51237 +       op.host_addr = (unsigned long)netif->tx_comms_area->addr;
51238 +       op.flags     = GNTMAP_host_map;
51239 +       op.ref       = tx_ring_ref;
51240 +       op.dom       = netif->domid;
51241 +
51242 +       lock_vm_area(netif->tx_comms_area);
51243 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
51244 +       unlock_vm_area(netif->tx_comms_area);
51245 +       BUG_ON(ret);
51246 +
51247 +       if (op.status) { 
51248 +               DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
51249 +               return op.status;
51250 +       }
51251 +
51252 +       netif->tx_shmem_ref    = tx_ring_ref;
51253 +       netif->tx_shmem_handle = op.handle;
51254 +
51255 +       op.host_addr = (unsigned long)netif->rx_comms_area->addr;
51256 +       op.flags     = GNTMAP_host_map;
51257 +       op.ref       = rx_ring_ref;
51258 +       op.dom       = netif->domid;
51259 +
51260 +       lock_vm_area(netif->rx_comms_area);
51261 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
51262 +       unlock_vm_area(netif->rx_comms_area);
51263 +       BUG_ON(ret);
51264 +
51265 +       if (op.status) {
51266 +               DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
51267 +               return op.status;
51268 +       }
51269 +
51270 +       netif->rx_shmem_ref    = rx_ring_ref;
51271 +       netif->rx_shmem_handle = op.handle;
51272 +
51273 +       return 0;
51274 +}
51275 +
51276 +static void unmap_frontend_pages(netif_t *netif)
51277 +{
51278 +       struct gnttab_unmap_grant_ref op;
51279 +       int ret;
51280 +
51281 +       op.host_addr    = (unsigned long)netif->tx_comms_area->addr;
51282 +       op.handle       = netif->tx_shmem_handle;
51283 +       op.dev_bus_addr = 0;
51284 +
51285 +       lock_vm_area(netif->tx_comms_area);
51286 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
51287 +       unlock_vm_area(netif->tx_comms_area);
51288 +       BUG_ON(ret);
51289 +
51290 +       op.host_addr    = (unsigned long)netif->rx_comms_area->addr;
51291 +       op.handle       = netif->rx_shmem_handle;
51292 +       op.dev_bus_addr = 0;
51293 +
51294 +       lock_vm_area(netif->rx_comms_area);
51295 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
51296 +       unlock_vm_area(netif->rx_comms_area);
51297 +       BUG_ON(ret);
51298 +}
51299 +
51300 +int netif_map(netif_t *netif, unsigned long tx_ring_ref,
51301 +             unsigned long rx_ring_ref, unsigned int evtchn)
51302 +{
51303 +       int err = -ENOMEM;
51304 +       netif_tx_sring_t *txs;
51305 +       netif_rx_sring_t *rxs;
51306 +       evtchn_op_t op = {
51307 +               .cmd = EVTCHNOP_bind_interdomain,
51308 +               .u.bind_interdomain.remote_dom = netif->domid,
51309 +               .u.bind_interdomain.remote_port = evtchn };
51310 +
51311 +       /* Already connected? */
51312 +       if (netif->irq)
51313 +               return 0;
51314 +
51315 +       netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
51316 +       if (netif->tx_comms_area == NULL)
51317 +               return -ENOMEM;
51318 +       netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
51319 +       if (netif->rx_comms_area == NULL)
51320 +               goto err_rx;
51321 +
51322 +       err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
51323 +       if (err)
51324 +               goto err_map;
51325 +
51326 +       err = HYPERVISOR_event_channel_op(&op);
51327 +       if (err)
51328 +               goto err_hypervisor;
51329 +
51330 +       netif->evtchn = op.u.bind_interdomain.local_port;
51331 +
51332 +       netif->irq = bind_evtchn_to_irqhandler(
51333 +               netif->evtchn, netif_be_int, 0, netif->dev->name, netif);
51334 +       disable_irq(netif->irq);
51335 +
51336 +       txs = (netif_tx_sring_t *)netif->tx_comms_area->addr;
51337 +       BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
51338 +
51339 +       rxs = (netif_rx_sring_t *)
51340 +               ((char *)netif->rx_comms_area->addr);
51341 +       BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
51342 +
51343 +       netif->rx_req_cons_peek = 0;
51344 +
51345 +       netif_get(netif);
51346 +       wmb(); /* Other CPUs see new state before interface is started. */
51347 +
51348 +       rtnl_lock();
51349 +       netif->status = CONNECTED;
51350 +       wmb();
51351 +       if (netif_running(netif->dev))
51352 +               __netif_up(netif);
51353 +       rtnl_unlock();
51354 +
51355 +       return 0;
51356 +err_hypervisor:
51357 +       unmap_frontend_pages(netif);
51358 +err_map:
51359 +       free_vm_area(netif->rx_comms_area);
51360 +err_rx:
51361 +       free_vm_area(netif->tx_comms_area);
51362 +       return err;
51363 +}
51364 +
51365 +static void free_netif_callback(void *arg)
51366 +{
51367 +       netif_t *netif = (netif_t *)arg;
51368 +
51369 +       if (netif->irq)
51370 +               unbind_from_irqhandler(netif->irq, netif);
51371 +
51372 +       unregister_netdev(netif->dev);
51373 +
51374 +       if (netif->tx.sring) {
51375 +               unmap_frontend_pages(netif);
51376 +               free_vm_area(netif->tx_comms_area);
51377 +               free_vm_area(netif->rx_comms_area);
51378 +       }
51379 +
51380 +       free_netdev(netif->dev);
51381 +}
51382 +
51383 +void free_netif(netif_t *netif)
51384 +{
51385 +       INIT_WORK(&netif->free_work, free_netif_callback, (void *)netif);
51386 +       schedule_work(&netif->free_work);
51387 +}
51388 +
51389 +void netif_disconnect(netif_t *netif)
51390 +{
51391 +       switch (netif->status) {
51392 +       case CONNECTED:
51393 +               rtnl_lock();
51394 +               netif->status = DISCONNECTING;
51395 +               wmb();
51396 +               if (netif_running(netif->dev))
51397 +                       __netif_down(netif);
51398 +               rtnl_unlock();
51399 +               netif_put(netif);
51400 +               break;
51401 +       case DISCONNECTED:
51402 +               BUG_ON(atomic_read(&netif->refcnt) != 0);
51403 +               free_netif(netif);
51404 +               break;
51405 +       default:
51406 +               BUG();
51407 +       }
51408 +}
51409 +
51410 +/*
51411 + * Local variables:
51412 + *  c-file-style: "linux"
51413 + *  indent-tabs-mode: t
51414 + *  c-indent-level: 8
51415 + *  c-basic-offset: 8
51416 + *  tab-width: 8
51417 + * End:
51418 + */
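
The grant-table idiom that map_frontend_pages() and unmap_frontend_pages() above apply twice, distilled to a single page. This sketch is assembled only from calls the patch already uses (lock_vm_area(), HYPERVISOR_grant_table_op(), GNTMAP_host_map) and is illustrative rather than additional patch code:

/* Illustrative only; the one-page version of map_frontend_pages(). */
static int map_one_grant(struct vm_struct *area, grant_ref_t ref,
			 domid_t domid, grant_handle_t *handle)
{
	struct gnttab_map_grant_ref op = {
		.host_addr = (unsigned long)area->addr,
		.flags     = GNTMAP_host_map,
		.ref       = ref,
		.dom       = domid,
	};
	int ret;

	lock_vm_area(area);
	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
	unlock_vm_area(area);
	BUG_ON(ret);		/* the hypercall itself must not fail */

	if (op.status)		/* per-op grant-table status code */
		return op.status;

	*handle = op.handle;	/* kept for the matching unmap */
	return 0;
}
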
51419 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/netback/loopback.c linux-2.6.16/drivers/xen/netback/loopback.c
51420 --- linux-2.6.16.orig/drivers/xen/netback/loopback.c    1970-01-01 01:00:00.000000000 +0100
51421 +++ linux-2.6.16/drivers/xen/netback/loopback.c 2006-06-26 09:51:32.000000000 +0200
51422 @@ -0,0 +1,254 @@
51423 +/******************************************************************************
51424 + * netback/loopback.c
51425 + * 
51426 + * A two-interface loopback device to emulate a local netfront-netback
51427 + * connection. This ensures that local packet delivery looks identical
51428 + * to inter-domain delivery. Most importantly, packets delivered locally
51429 + * originating from other domains will get *copied* when they traverse this
51430 + * driver. This prevents unbounded delays in socket-buffer queues from
51431 + * causing the netback driver to "seize up".
51432 + * 
51433 + * This driver creates a symmetric pair of loopback interfaces with names
51434 + * vif0.0 and veth0. The intention is that 'vif0.0' is bound to an Ethernet
51435 + * bridge, just like a proper netback interface, while a local IP interface
51436 + * is configured on 'veth0'.
51437 + * 
51438 + * As with a real netback interface, vif0.0 is configured with a suitable
51439 + * dummy MAC address. No default is provided for veth0: a reasonable strategy
51440 + * is to transfer eth0's MAC address to veth0, and give eth0 a dummy address
51441 + * (to avoid confusing the Etherbridge).
51442 + * 
51443 + * Copyright (c) 2005 K A Fraser
51444 + * 
51445 + * This program is free software; you can redistribute it and/or
51446 + * modify it under the terms of the GNU General Public License version 2
51447 + * as published by the Free Software Foundation; or, when distributed
51448 + * separately from the Linux kernel or incorporated into other
51449 + * software packages, subject to the following license:
51450 + * 
51451 + * Permission is hereby granted, free of charge, to any person obtaining a copy
51452 + * of this source file (the "Software"), to deal in the Software without
51453 + * restriction, including without limitation the rights to use, copy, modify,
51454 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
51455 + * and to permit persons to whom the Software is furnished to do so, subject to
51456 + * the following conditions:
51457 + * 
51458 + * The above copyright notice and this permission notice shall be included in
51459 + * all copies or substantial portions of the Software.
51460 + * 
51461 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51462 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51463 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51464 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51465 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
51466 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51467 + * IN THE SOFTWARE.
51468 + */
51469 +
51470 +#include <linux/config.h>
51471 +#include <linux/module.h>
51472 +#include <linux/netdevice.h>
51473 +#include <linux/inetdevice.h>
51474 +#include <linux/etherdevice.h>
51475 +#include <linux/skbuff.h>
51476 +#include <linux/ethtool.h>
51477 +#include <net/dst.h>
51478 +
51479 +static int nloopbacks = 8;
51480 +module_param(nloopbacks, int, 0);
51481 +MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create");
51482 +
51483 +struct net_private {
51484 +       struct net_device *loopback_dev;
51485 +       struct net_device_stats stats;
51486 +};
51487 +
51488 +static int loopback_open(struct net_device *dev)
51489 +{
51490 +       struct net_private *np = netdev_priv(dev);
51491 +       memset(&np->stats, 0, sizeof(np->stats));
51492 +       netif_start_queue(dev);
51493 +       return 0;
51494 +}
51495 +
51496 +static int loopback_close(struct net_device *dev)
51497 +{
51498 +       netif_stop_queue(dev);
51499 +       return 0;
51500 +}
51501 +
51502 +static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev)
51503 +{
51504 +       struct net_private *np = netdev_priv(dev);
51505 +
51506 +       dst_release(skb->dst);
51507 +       skb->dst = NULL;
51508 +
51509 +       skb_orphan(skb);
51510 +
51511 +       np->stats.tx_bytes += skb->len;
51512 +       np->stats.tx_packets++;
51513 +
51514 +       /* Switch to loopback context. */
51515 +       dev = np->loopback_dev;
51516 +       np  = netdev_priv(dev);
51517 +
51518 +       np->stats.rx_bytes += skb->len;
51519 +       np->stats.rx_packets++;
51520 +
51521 +       if (skb->ip_summed == CHECKSUM_HW) {
51522 +               /* Defer checksum calculation. */
51523 +               skb->proto_csum_blank = 1;
51524 +               /* Must be a local packet: assert its integrity. */
51525 +               skb->proto_data_valid = 1;
51526 +       }
51527 +
51528 +       skb->ip_summed = skb->proto_data_valid ?
51529 +               CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
51530 +
51531 +       skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
51532 +       skb->protocol = eth_type_trans(skb, dev);
51533 +       skb->dev      = dev;
51534 +       dev->last_rx  = jiffies;
51535 +       netif_rx(skb);
51536 +
51537 +       return 0;
51538 +}
51539 +
51540 +static struct net_device_stats *loopback_get_stats(struct net_device *dev)
51541 +{
51542 +       struct net_private *np = netdev_priv(dev);
51543 +       return &np->stats;
51544 +}
51545 +
51546 +static struct ethtool_ops network_ethtool_ops =
51547 +{
51548 +       .get_tx_csum = ethtool_op_get_tx_csum,
51549 +       .set_tx_csum = ethtool_op_set_tx_csum,
51550 +};
51551 +
51552 +static void loopback_construct(struct net_device *dev, struct net_device *lo)
51553 +{
51554 +       struct net_private *np = netdev_priv(dev);
51555 +
51556 +       np->loopback_dev     = lo;
51557 +
51558 +       dev->open            = loopback_open;
51559 +       dev->stop            = loopback_close;
51560 +       dev->hard_start_xmit = loopback_start_xmit;
51561 +       dev->get_stats       = loopback_get_stats;
51562 +
51563 +       dev->tx_queue_len    = 0;
51564 +
51565 +       dev->features        = (NETIF_F_HIGHDMA |
51566 +                               NETIF_F_LLTX |
51567 +                               NETIF_F_IP_CSUM);
51568 +
51569 +       SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
51570 +
51571 +       /*
51572 +        * We do not set a jumbo MTU on the interface. Otherwise the network
51573 +        * stack will try to send large packets that will get dropped by the
51574 +        * Ethernet bridge (unless the physical Ethernet interface is
51575 +        * configured to transfer jumbo packets). If a larger MTU is desired
51576 +        * then the system administrator can specify it using the 'ifconfig'
51577 +        * command.
51578 +        */
51579 +       /*dev->mtu             = 16*1024;*/
51580 +}
51581 +
51582 +static int __init make_loopback(int i)
51583 +{
51584 +       struct net_device *dev1, *dev2;
51585 +       char dev_name[IFNAMSIZ];
51586 +       int err = -ENOMEM;
51587 +
51588 +       sprintf(dev_name, "vif0.%d", i);
51589 +       dev1 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
51590 +       if (!dev1)
51591 +               return err;
51592 +
51593 +       sprintf(dev_name, "veth%d", i);
51594 +       dev2 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup);
51595 +       if (!dev2)
51596 +               goto fail_netdev2;
51597 +
51598 +       loopback_construct(dev1, dev2);
51599 +       loopback_construct(dev2, dev1);
51600 +
51601 +       /*
51602 +        * Initialise a dummy MAC address for the 'dummy backend' interface. We
51603 +        * choose the numerically largest non-broadcast address to prevent the
51604 +        * address getting stolen by an Ethernet bridge for STP purposes.
51605 +        */
51606 +       memset(dev1->dev_addr, 0xFF, ETH_ALEN);
51607 +       dev1->dev_addr[0] &= ~0x01;
51608 +
51609 +       if ((err = register_netdev(dev1)) != 0)
51610 +               goto fail;
51611 +
51612 +       if ((err = register_netdev(dev2)) != 0) {
51613 +               unregister_netdev(dev1);
51614 +               goto fail;
51615 +       }
51616 +
51617 +       return 0;
51618 +
51619 + fail:
51620 +       free_netdev(dev2);
51621 + fail_netdev2:
51622 +       free_netdev(dev1);
51623 +       return err;
51624 +}
51625 +
51626 +static void __init clean_loopback(int i)
51627 +{
51628 +       struct net_device *dev1, *dev2;
51629 +       char dev_name[IFNAMSIZ];
51630 +
51631 +       sprintf(dev_name, "vif0.%d", i);
51632 +       dev1 = dev_get_by_name(dev_name);
51633 +       sprintf(dev_name, "veth%d", i);
51634 +       dev2 = dev_get_by_name(dev_name);
51635 +       if (dev1 && dev2) {
51636 +               unregister_netdev(dev2);
51637 +               unregister_netdev(dev1);
51638 +               free_netdev(dev2);
51639 +               free_netdev(dev1);
51640 +       }
51641 +}
51642 +
51643 +static int __init loopback_init(void)
51644 +{
51645 +       int i, err = 0;
51646 +
51647 +       for (i = 0; i < nloopbacks; i++)
51648 +               if ((err = make_loopback(i)) != 0)
51649 +                       break;
51650 +
51651 +       return err;
51652 +}
51653 +
51654 +module_init(loopback_init);
51655 +
51656 +static void __exit loopback_exit(void)
51657 +{
51658 +       int i;
51659 +
51660 +       for (i = nloopbacks; i-- > 0; )
51661 +               clean_loopback(i);
51662 +}
51663 +
51664 +module_exit(loopback_exit);
51665 +
51666 +MODULE_LICENSE("Dual BSD/GPL");
51667 +
51668 +/*
51669 + * Local variables:
51670 + *  c-file-style: "linux"
51671 + *  indent-tabs-mode: t
51672 + *  c-indent-level: 8
51673 + *  c-basic-offset: 8
51674 + *  tab-width: 8
51675 + * End:
51676 + */
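
In use, the number of vif0.N/vethN pairs is chosen at module load time via the nloopbacks parameter (for example, 'modprobe netloop nloopbacks=4', assuming the driver is built as the separate netloop module named in the Makefile earlier in this patch). The MAC shuffle suggested in the header comment, moving eth0's address onto veth0 and giving eth0 a dummy one, is then performed with the ordinary interface-configuration tools.
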
51677 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/netback/netback.c linux-2.6.16/drivers/xen/netback/netback.c
51678 --- linux-2.6.16.orig/drivers/xen/netback/netback.c     1970-01-01 01:00:00.000000000 +0100
51679 +++ linux-2.6.16/drivers/xen/netback/netback.c  2006-06-26 09:51:32.000000000 +0200
51680 @@ -0,0 +1,868 @@
51681 +/******************************************************************************
51682 + * drivers/xen/netback/netback.c
51683 + * 
51684 + * Back-end of the driver for virtual network devices. This portion of the
51685 + * driver exports a 'unified' network-device interface that can be accessed
51686 + * by any operating system that implements a compatible front end. A 
51687 + * reference front-end implementation can be found in:
51688 + *  drivers/xen/netfront/netfront.c
51689 + * 
51690 + * Copyright (c) 2002-2005, K A Fraser
51691 + * 
51692 + * This program is free software; you can redistribute it and/or
51693 + * modify it under the terms of the GNU General Public License version 2
51694 + * as published by the Free Software Foundation; or, when distributed
51695 + * separately from the Linux kernel or incorporated into other
51696 + * software packages, subject to the following license:
51697 + * 
51698 + * Permission is hereby granted, free of charge, to any person obtaining a copy
51699 + * of this source file (the "Software"), to deal in the Software without
51700 + * restriction, including without limitation the rights to use, copy, modify,
51701 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
51702 + * and to permit persons to whom the Software is furnished to do so, subject to
51703 + * the following conditions:
51704 + * 
51705 + * The above copyright notice and this permission notice shall be included in
51706 + * all copies or substantial portions of the Software.
51707 + * 
51708 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
51709 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51710 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
51711 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51712 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
51713 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
51714 + * IN THE SOFTWARE.
51715 + */
51716 +
51717 +#include "common.h"
51718 +#include <xen/balloon.h>
51719 +#include <xen/interface/memory.h>
51720 +
51721 +/*#define NETBE_DEBUG_INTERRUPT*/
51722 +
51723 +static void netif_idx_release(u16 pending_idx);
51724 +static void netif_page_release(struct page *page);
51725 +static void make_tx_response(netif_t *netif, 
51726 +                             u16      id,
51727 +                             s8       st);
51728 +static int  make_rx_response(netif_t *netif, 
51729 +                             u16      id, 
51730 +                             s8       st,
51731 +                             u16      offset,
51732 +                             u16      size,
51733 +                             u16      flags);
51734 +
51735 +static void net_tx_action(unsigned long unused);
51736 +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
51737 +
51738 +static void net_rx_action(unsigned long unused);
51739 +static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
51740 +
51741 +static struct timer_list net_timer;
51742 +
51743 +#define MAX_PENDING_REQS 256
51744 +
51745 +static struct sk_buff_head rx_queue;
51746 +static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1];
51747 +static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
51748 +static gnttab_transfer_t grant_rx_op[NET_RX_RING_SIZE];
51749 +static unsigned char rx_notify[NR_IRQS];
51750 +
51751 +static unsigned long mmap_vstart;
51752 +#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
51753 +
51754 +#define PKT_PROT_LEN 64
51755 +
51756 +static struct {
51757 +       netif_tx_request_t req;
51758 +       netif_t *netif;
51759 +} pending_tx_info[MAX_PENDING_REQS];
51760 +static u16 pending_ring[MAX_PENDING_REQS];
51761 +typedef unsigned int PEND_RING_IDX;
51762 +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
51763 +static PEND_RING_IDX pending_prod, pending_cons;
51764 +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
51765 +
51766 +/* Freed TX SKBs get batched on this ring before return to pending_ring. */
51767 +static u16 dealloc_ring[MAX_PENDING_REQS];
51768 +static PEND_RING_IDX dealloc_prod, dealloc_cons;
51769 +
51770 +static struct sk_buff_head tx_queue;
51771 +
51772 +static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
51773 +static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
51774 +static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
51775 +
51776 +static struct list_head net_schedule_list;
51777 +static spinlock_t net_schedule_list_lock;
51778 +
51779 +#define MAX_MFN_ALLOC 64
51780 +static unsigned long mfn_list[MAX_MFN_ALLOC];
51781 +static unsigned int alloc_index = 0;
51782 +static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED;
51783 +
51784 +static unsigned long alloc_mfn(void)
51785 +{
51786 +       unsigned long mfn = 0, flags;
51787 +       struct xen_memory_reservation reservation = {
51788 +               .extent_start = mfn_list,
51789 +               .nr_extents   = MAX_MFN_ALLOC,
51790 +               .extent_order = 0,
51791 +               .domid        = DOMID_SELF
51792 +       };
51793 +       spin_lock_irqsave(&mfn_lock, flags);
51794 +       if ( unlikely(alloc_index == 0) )
51795 +               alloc_index = HYPERVISOR_memory_op(
51796 +                       XENMEM_increase_reservation, &reservation);
51797 +       if ( alloc_index != 0 )
51798 +               mfn = mfn_list[--alloc_index];
51799 +       spin_unlock_irqrestore(&mfn_lock, flags);
51800 +       return mfn;
51801 +}
51802 +
51803 +static inline void maybe_schedule_tx_action(void)
51804 +{
51805 +       smp_mb();
51806 +       if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
51807 +           !list_empty(&net_schedule_list))
51808 +               tasklet_schedule(&net_tx_tasklet);
51809 +}
51810 +
51811 +/*
51812 + * A gross way of confirming the origin of an skb data page. The slab
51813 + * allocator abuses a field in the page struct to cache the kmem_cache_t ptr.
51814 + */
51815 +static inline int is_xen_skb(struct sk_buff *skb)
51816 +{
51817 +       extern kmem_cache_t *skbuff_cachep;
51818 +       kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next;
51819 +       return (cp == skbuff_cachep);
51820 +}
51821 +
51822 +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
51823 +{
51824 +       netif_t *netif = netdev_priv(dev);
51825 +
51826 +       BUG_ON(skb->dev != dev);
51827 +
51828 +       /* Drop the packet if the target domain has no receive buffers. */
51829 +       if (!netif->active || 
51830 +           (netif->rx_req_cons_peek == netif->rx.sring->req_prod) ||
51831 +           ((netif->rx_req_cons_peek - netif->rx.rsp_prod_pvt) ==
51832 +            NET_RX_RING_SIZE))
51833 +               goto drop;
51834 +
51835 +       /*
51836 +        * We do not copy the packet unless:
51837 +        *  1. The data is shared; or
51838 +        *  2. The data is not allocated from our special cache.
51839 +        * NB. We also couldn't cope with fragmented packets, but we won't get
51840 +        *     any because we do not advertise the NETIF_F_SG feature.
51841 +        */
51842 +       if (skb_shared(skb) || skb_cloned(skb) || !is_xen_skb(skb)) {
51843 +               int hlen = skb->data - skb->head;
51844 +               int ret;
51845 +               struct sk_buff *nskb = dev_alloc_skb(hlen + skb->len);
51846 +               if ( unlikely(nskb == NULL) )
51847 +                       goto drop;
51848 +               skb_reserve(nskb, hlen);
51849 +               __skb_put(nskb, skb->len);
51850 +               ret = skb_copy_bits(skb, -hlen, nskb->data - hlen,
51851 +                                    skb->len + hlen);
51852 +               BUG_ON(ret);
51853 +               nskb->dev = skb->dev;
51854 +               nskb->proto_data_valid = skb->proto_data_valid;
51855 +               dev_kfree_skb(skb);
51856 +               skb = nskb;
51857 +       }
51858 +
51859 +       netif->rx_req_cons_peek++;
51860 +       netif_get(netif);
51861 +
51862 +       skb_queue_tail(&rx_queue, skb);
51863 +       tasklet_schedule(&net_rx_tasklet);
51864 +
51865 +       return 0;
51866 +
51867 + drop:
51868 +       netif->stats.tx_dropped++;
51869 +       dev_kfree_skb(skb);
51870 +       return 0;
51871 +}
51872 +
51873 +#if 0
51874 +static void xen_network_done_notify(void)
51875 +{
51876 +       static struct net_device *eth0_dev = NULL;
51877 +       if (unlikely(eth0_dev == NULL))
51878 +               eth0_dev = __dev_get_by_name("eth0");
51879 +       netif_rx_schedule(eth0_dev);
51880 +}
51881 +/* 
51882 + * Add the following to a NAPI driver's poll() function (Tigon3 is an example):
51883 + *  if ( xen_network_done() )
51884 + *      tg3_enable_ints(tp); 
51885 + */
51886 +int xen_network_done(void)
51887 +{
51888 +       return skb_queue_empty(&rx_queue);
51889 +}
51890 +#endif
51891 +
51892 +static void net_rx_action(unsigned long unused)
51893 +{
51894 +       netif_t *netif = NULL; 
51895 +       s8 status;
51896 +       u16 size, id, irq, flags;
51897 +       multicall_entry_t *mcl;
51898 +       mmu_update_t *mmu;
51899 +       gnttab_transfer_t *gop;
51900 +       unsigned long vdata, old_mfn, new_mfn;
51901 +       struct sk_buff_head rxq;
51902 +       struct sk_buff *skb;
51903 +       u16 notify_list[NET_RX_RING_SIZE];
51904 +       int notify_nr = 0;
51905 +       int ret;
51906 +
51907 +       skb_queue_head_init(&rxq);
51908 +
51909 +       mcl = rx_mcl;
51910 +       mmu = rx_mmu;
51911 +       gop = grant_rx_op;
51912 +
51913 +       while ((skb = skb_dequeue(&rx_queue)) != NULL) {
51914 +               netif   = netdev_priv(skb->dev);
51915 +               vdata   = (unsigned long)skb->data;
51916 +               old_mfn = virt_to_mfn(vdata);
51917 +
51918 +               /* Memory squeeze? Back off for an arbitrary period (1s here). */
51919 +               if ((new_mfn = alloc_mfn()) == 0) {
51920 +                       if ( net_ratelimit() )
51921 +                               WPRINTK("Memory squeeze in netback driver.\n");
51922 +                       mod_timer(&net_timer, jiffies + HZ);
51923 +                       skb_queue_head(&rx_queue, skb);
51924 +                       break;
51925 +               }
51926 +               /*
51927 +                * Set the new P2M table entry before reassigning the old data
51928 +                * page. Heed the comment in pgtable-2level.h:pte_page(). :-)
51929 +                */
51930 +               set_phys_to_machine(__pa(skb->data) >> PAGE_SHIFT, new_mfn);
51931 +
51932 +               MULTI_update_va_mapping(mcl, vdata,
51933 +                                       pfn_pte_ma(new_mfn, PAGE_KERNEL), 0);
51934 +               mcl++;
51935 +
51936 +               gop->mfn = old_mfn;
51937 +               gop->domid = netif->domid;
51938 +               gop->ref = RING_GET_REQUEST(
51939 +                       &netif->rx, netif->rx.req_cons)->gref;
51940 +               netif->rx.req_cons++;
51941 +               gop++;
51942 +
51943 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
51944 +                       mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
51945 +                               MMU_MACHPHYS_UPDATE;
51946 +                       mmu->val = __pa(vdata) >> PAGE_SHIFT;
51947 +                       mmu++;
51948 +               }
51949 +
51950 +               __skb_queue_tail(&rxq, skb);
51951 +
51952 +               /* Filled the batch queue? */
51953 +               if ((gop - grant_rx_op) == ARRAY_SIZE(grant_rx_op))
51954 +                       break;
51955 +       }
51956 +
51957 +       if (mcl == rx_mcl)
51958 +               return;
51959 +
51960 +       mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
51961 +
51962 +       if (mmu - rx_mmu) {
51963 +               mcl->op = __HYPERVISOR_mmu_update;
51964 +               mcl->args[0] = (unsigned long)rx_mmu;
51965 +               mcl->args[1] = mmu - rx_mmu;
51966 +               mcl->args[2] = 0;
51967 +               mcl->args[3] = DOMID_SELF;
51968 +               mcl++;
51969 +       }
51970 +
51971 +       ret = HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
51972 +       BUG_ON(ret != 0);
51973 +
51974 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_transfer, grant_rx_op, 
51975 +                                       gop - grant_rx_op);
51976 +       BUG_ON(ret != 0);
51977 +
51978 +       mcl = rx_mcl;
51979 +       gop = grant_rx_op;
51980 +       while ((skb = __skb_dequeue(&rxq)) != NULL) {
51981 +               netif   = netdev_priv(skb->dev);
51982 +               size    = skb->tail - skb->data;
51983 +
51984 +               /* Rederive the machine addresses. */
51985 +               new_mfn = mcl->args[1] >> PAGE_SHIFT;
51986 +               old_mfn = gop->mfn;
51987 +               atomic_set(&(skb_shinfo(skb)->dataref), 1);
51988 +               skb_shinfo(skb)->nr_frags = 0;
51989 +               skb_shinfo(skb)->frag_list = NULL;
51990 +
51991 +               netif->stats.tx_bytes += size;
51992 +               netif->stats.tx_packets++;
51993 +
51994 +               /* The update_va_mapping() must not fail. */
51995 +               BUG_ON(mcl->result != 0);
51996 +
51997 +               /* Check the reassignment error code. */
51998 +               status = NETIF_RSP_OKAY;
51999 +               if (gop->status != 0) { 
52000 +                       DPRINTK("Bad status %d from grant transfer to DOM%u\n",
52001 +                               gop->status, netif->domid);
52002 +                       /*
52003 +                        * Page no longer belongs to us unless GNTST_bad_page,
52004 +                        * but that should be a fatal error anyway.
52005 +                        */
52006 +                       BUG_ON(gop->status == GNTST_bad_page);
52007 +                       status = NETIF_RSP_ERROR; 
52008 +               }
52009 +               irq = netif->irq;
52010 +               id = RING_GET_REQUEST(&netif->rx, netif->rx.rsp_prod_pvt)->id;
52011 +               flags = 0;
52012 +               if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
52013 +                       flags |= NETRXF_csum_blank | NETRXF_data_validated;
52014 +               else if (skb->proto_data_valid) /* remote but checksummed? */
52015 +                       flags |= NETRXF_data_validated;
52016 +               if (make_rx_response(netif, id, status,
52017 +                                    (unsigned long)skb->data & ~PAGE_MASK,
52018 +                                    size, flags) &&
52019 +                   (rx_notify[irq] == 0)) {
52020 +                       rx_notify[irq] = 1;
52021 +                       notify_list[notify_nr++] = irq;
52022 +               }
52023 +
52024 +               netif_put(netif);
52025 +               dev_kfree_skb(skb);
52026 +               mcl++;
52027 +               gop++;
52028 +       }
52029 +
52030 +       while (notify_nr != 0) {
52031 +               irq = notify_list[--notify_nr];
52032 +               rx_notify[irq] = 0;
52033 +               notify_remote_via_irq(irq);
52034 +       }
52035 +
52036 +       /* More work to do? */
52037 +       if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
52038 +               tasklet_schedule(&net_rx_tasklet);
52039 +#if 0
52040 +       else
52041 +               xen_network_done_notify();
52042 +#endif
52043 +}
52044 +
52045 +static void net_alarm(unsigned long unused)
52046 +{
52047 +       tasklet_schedule(&net_rx_tasklet);
52048 +}
52049 +
52050 +struct net_device_stats *netif_be_get_stats(struct net_device *dev)
52051 +{
52052 +       netif_t *netif = netdev_priv(dev);
52053 +       return &netif->stats;
52054 +}
52055 +
52056 +static int __on_net_schedule_list(netif_t *netif)
52057 +{
52058 +       return netif->list.next != NULL;
52059 +}
52060 +
52061 +static void remove_from_net_schedule_list(netif_t *netif)
52062 +{
52063 +       spin_lock_irq(&net_schedule_list_lock);
52064 +       if (likely(__on_net_schedule_list(netif))) {
52065 +               list_del(&netif->list);
52066 +               netif->list.next = NULL;
52067 +               netif_put(netif);
52068 +       }
52069 +       spin_unlock_irq(&net_schedule_list_lock);
52070 +}
52071 +
52072 +static void add_to_net_schedule_list_tail(netif_t *netif)
52073 +{
52074 +       if (__on_net_schedule_list(netif))
52075 +               return;
52076 +
52077 +       spin_lock_irq(&net_schedule_list_lock);
52078 +       if (!__on_net_schedule_list(netif) && netif->active) {
52079 +               list_add_tail(&netif->list, &net_schedule_list);
52080 +               netif_get(netif);
52081 +       }
52082 +       spin_unlock_irq(&net_schedule_list_lock);
52083 +}
52084 +
52085 +/*
52086 + * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
52087 + * If this driver is pipelining transmit requests then we can be very
52088 + * aggressive in avoiding new-packet notifications -- frontend only needs to
52089 + * send a notification if there are no outstanding unreceived responses.
52090 + * If we may be buffer transmit buffers for any reason then we must be rather
52091 + * If we may be buffering transmit buffers for any reason then we must be rather
52092 + */
52093 +void netif_schedule_work(netif_t *netif)
52094 +{
52095 +       int more_to_do;
52096 +
52097 +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
52098 +       more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
52099 +#else
52100 +       RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
52101 +#endif
52102 +
52103 +       if (more_to_do) {
52104 +               add_to_net_schedule_list_tail(netif);
52105 +               maybe_schedule_tx_action();
52106 +       }
52107 +}
52108 +
52109 +void netif_deschedule_work(netif_t *netif)
52110 +{
52111 +       remove_from_net_schedule_list(netif);
52112 +}
52113 +
52114 +
52115 +static void tx_credit_callback(unsigned long data)
52116 +{
52117 +       netif_t *netif = (netif_t *)data;
52118 +       netif->remaining_credit = netif->credit_bytes;
52119 +       netif_schedule_work(netif);
52120 +}
52121 +
52122 +static inline void net_tx_action_dealloc(void)
52123 +{
52124 +       gnttab_unmap_grant_ref_t *gop;
52125 +       u16 pending_idx;
52126 +       PEND_RING_IDX dc, dp;
52127 +       netif_t *netif;
52128 +       int ret;
52129 +
52130 +       dc = dealloc_cons;
52131 +       dp = dealloc_prod;
52132 +
52133 +       /*
52134 +        * Free up any grants we have finished using.
52135 +        */
52136 +       gop = tx_unmap_ops;
52137 +       while (dc != dp) {
52138 +               pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
52139 +               gop->host_addr    = MMAP_VADDR(pending_idx);
52140 +               gop->dev_bus_addr = 0;
52141 +               gop->handle       = grant_tx_handle[pending_idx];
52142 +               gop++;
52143 +       }
52144 +       ret = HYPERVISOR_grant_table_op(
52145 +               GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
52146 +       BUG_ON(ret);
52147 +
52148 +       while (dealloc_cons != dp) {
52149 +               pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
52150 +
52151 +               netif = pending_tx_info[pending_idx].netif;
52152 +
52153 +               make_tx_response(netif, pending_tx_info[pending_idx].req.id, 
52154 +                                NETIF_RSP_OKAY);
52155 +
52156 +               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
52157 +
52158 +               netif_put(netif);
52159 +       }
52160 +}
52161 +
52162 +/* Called after netfront has transmitted */
52163 +static void net_tx_action(unsigned long unused)
52164 +{
52165 +       struct list_head *ent;
52166 +       struct sk_buff *skb;
52167 +       netif_t *netif;
52168 +       netif_tx_request_t txreq;
52169 +       u16 pending_idx;
52170 +       RING_IDX i;
52171 +       gnttab_map_grant_ref_t *mop;
52172 +       unsigned int data_len;
52173 +       int ret, work_to_do;
52174 +
52175 +       if (dealloc_cons != dealloc_prod)
52176 +               net_tx_action_dealloc();
52177 +
52178 +       mop = tx_map_ops;
52179 +       while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
52180 +               !list_empty(&net_schedule_list)) {
52181 +               /* Get a netif from the list with work to do. */
52182 +               ent = net_schedule_list.next;
52183 +               netif = list_entry(ent, netif_t, list);
52184 +               netif_get(netif);
52185 +               remove_from_net_schedule_list(netif);
52186 +
52187 +               RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
52188 +               if (!work_to_do) {
52189 +                       netif_put(netif);
52190 +                       continue;
52191 +               }
52192 +
52193 +               i = netif->tx.req_cons;
52194 +               rmb(); /* Ensure that we see the request before we copy it. */
52195 +               memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
52196 +               /* Credit-based scheduling. */
52197 +               if (txreq.size > netif->remaining_credit) {
52198 +                       unsigned long now = jiffies;
52199 +                       unsigned long next_credit = 
52200 +                               netif->credit_timeout.expires +
52201 +                               msecs_to_jiffies(netif->credit_usec / 1000);
52202 +
52203 +                       /* Timer could already be pending in rare cases. */
52204 +                       if (timer_pending(&netif->credit_timeout))
52205 +                               break;
52206 +
52207 +                       /* Passed the point where we can replenish credit? */
52208 +                       if (time_after_eq(now, next_credit)) {
52209 +                               netif->credit_timeout.expires = now;
52210 +                               netif->remaining_credit = netif->credit_bytes;
52211 +                       }
52212 +
52213 +                       /* Still too big to send right now? Set a callback. */
52214 +                       if (txreq.size > netif->remaining_credit) {
52215 +                               netif->remaining_credit = 0;
52216 +                               netif->credit_timeout.data     =
52217 +                                       (unsigned long)netif;
52218 +                               netif->credit_timeout.function =
52219 +                                       tx_credit_callback;
52220 +                               __mod_timer(&netif->credit_timeout,
52221 +                                           next_credit);
52222 +                               break;
52223 +                       }
52224 +               }
52225 +               netif->remaining_credit -= txreq.size;
52226 +
52227 +               netif->tx.req_cons++;
52228 +
52229 +               netif_schedule_work(netif);
52230 +
52231 +               if (unlikely(txreq.size < ETH_HLEN) || 
52232 +                   unlikely(txreq.size > ETH_FRAME_LEN)) {
52233 +                       DPRINTK("Bad packet size: %d\n", txreq.size);
52234 +                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
52235 +                       netif_put(netif);
52236 +                       continue; 
52237 +               }
52238 +
52239 +               /* The payload must not cross a page boundary, since it cannot be fragmented. */
52240 +               if (unlikely((txreq.offset + txreq.size) >= PAGE_SIZE)) {
52241 +                       DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", 
52242 +                               txreq.offset, txreq.size, 
52243 +                               (txreq.offset &~PAGE_MASK) + txreq.size);
52244 +                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
52245 +                       netif_put(netif);
52246 +                       continue;
52247 +               }
52248 +
52249 +               pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
52250 +
52251 +               data_len = (txreq.size > PKT_PROT_LEN) ?
52252 +                       PKT_PROT_LEN : txreq.size;
52253 +
52254 +               skb = alloc_skb(data_len+16, GFP_ATOMIC);
52255 +               if (unlikely(skb == NULL)) {
52256 +                       DPRINTK("Can't allocate a skb in net_tx_action.\n");
52257 +                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
52258 +                       netif_put(netif);
52259 +                       break;
52260 +               }
52261 +
52262 +               /* Packets passed to netif_rx() must have some headroom. */
52263 +               skb_reserve(skb, 16);
52264 +
52265 +               mop->host_addr = MMAP_VADDR(pending_idx);
52266 +               mop->dom       = netif->domid;
52267 +               mop->ref       = txreq.gref;
52268 +               mop->flags     = GNTMAP_host_map | GNTMAP_readonly;
52269 +               mop++;
52270 +
52271 +               memcpy(&pending_tx_info[pending_idx].req,
52272 +                      &txreq, sizeof(txreq));
52273 +               pending_tx_info[pending_idx].netif = netif;
52274 +               *((u16 *)skb->data) = pending_idx;
52275 +
52276 +               __skb_queue_tail(&tx_queue, skb);
52277 +
52278 +               pending_cons++;
52279 +
52280 +               if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
52281 +                       break;
52282 +       }
52283 +
52284 +       if (mop == tx_map_ops)
52285 +               return;
52286 +
52287 +       ret = HYPERVISOR_grant_table_op(
52288 +               GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
52289 +       BUG_ON(ret);
52290 +
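+       /*
+        * All grant maps were issued in one batched hypercall above.
+        * Walk the queued skbs in the same order and pair each with
+        * its map result.
+        */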
52291 +       mop = tx_map_ops;
52292 +       while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
52293 +               pending_idx = *((u16 *)skb->data);
52294 +               netif       = pending_tx_info[pending_idx].netif;
52295 +               memcpy(&txreq, &pending_tx_info[pending_idx].req,
52296 +                      sizeof(txreq));
52297 +
52298 +               /* Check the remap error code. */
52299 +               if (unlikely(mop->status)) {
52300 +                       printk(KERN_ALERT "#### netback grant map failed\n");
52301 +                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
52302 +                       netif_put(netif);
52303 +                       kfree_skb(skb);
52304 +                       mop++;
52305 +                       pending_ring[MASK_PEND_IDX(pending_prod++)] =
52306 +                               pending_idx;
52307 +                       continue;
52308 +               }
52309 +               set_phys_to_machine(
52310 +                       __pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT,
52311 +                       FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
52312 +               grant_tx_handle[pending_idx] = mop->handle;
52313 +
52314 +               data_len = (txreq.size > PKT_PROT_LEN) ?
52315 +                       PKT_PROT_LEN : txreq.size;
52316 +
52317 +               __skb_put(skb, data_len);
52318 +               memcpy(skb->data, 
52319 +                      (void *)(MMAP_VADDR(pending_idx)|txreq.offset),
52320 +                      data_len);
52321 +               if (data_len < txreq.size) {
52322 +                       /* Append the packet payload as a fragment. */
52323 +                       skb_shinfo(skb)->frags[0].page        = 
52324 +                               virt_to_page(MMAP_VADDR(pending_idx));
52325 +                       skb_shinfo(skb)->frags[0].size        =
52326 +                               txreq.size - data_len;
52327 +                       skb_shinfo(skb)->frags[0].page_offset = 
52328 +                               txreq.offset + data_len;
52329 +                       skb_shinfo(skb)->nr_frags = 1;
52330 +               } else {
52331 +                       /* Schedule a response immediately. */
52332 +                       netif_idx_release(pending_idx);
52333 +               }
52334 +
52335 +               skb->data_len  = txreq.size - data_len;
52336 +               skb->len      += skb->data_len;
52337 +
52338 +               skb->dev      = netif->dev;
52339 +               skb->protocol = eth_type_trans(skb, skb->dev);
52340 +
52341 +               /*
52342 +                * Old frontends do not assert data_validated but we
52343 +                * can infer it from csum_blank so test both flags.
52344 +                */
52345 +               if (txreq.flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
52346 +                       skb->ip_summed = CHECKSUM_UNNECESSARY;
52347 +                       skb->proto_data_valid = 1;
52348 +               } else {
52349 +                       skb->ip_summed = CHECKSUM_NONE;
52350 +                       skb->proto_data_valid = 0;
52351 +               }
52352 +               skb->proto_csum_blank = !!(txreq.flags & NETTXF_csum_blank);
52353 +
52354 +               netif->stats.rx_bytes += txreq.size;
52355 +               netif->stats.rx_packets++;
52356 +
52357 +               netif_rx(skb);
52358 +               netif->dev->last_rx = jiffies;
52359 +
52360 +               mop++;
52361 +       }
52362 +}
52363 +
52364 +static void netif_idx_release(u16 pending_idx)
52365 +{
52366 +       static spinlock_t _lock = SPIN_LOCK_UNLOCKED;
52367 +       unsigned long flags;
52368 +
52369 +       spin_lock_irqsave(&_lock, flags);
52370 +       dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
52371 +       spin_unlock_irqrestore(&_lock, flags);
52372 +
52373 +       tasklet_schedule(&net_tx_tasklet);
52374 +}
52375 +
52376 +static void netif_page_release(struct page *page)
52377 +{
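+       /*
+        * PageForeign destructor, installed in netback_init(): it runs
+        * when the network stack drops the last reference to one of our
+        * mapped pages, at which point the slot can be deallocated.
+        */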
52378 +       u16 pending_idx = page - virt_to_page(mmap_vstart);
52379 +
52380 +       /* Ready for next use. */
52381 +       set_page_count(page, 1);
52382 +
52383 +       netif_idx_release(pending_idx);
52384 +}
52385 +
52386 +irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
52387 +{
52388 +       netif_t *netif = dev_id;
52389 +       add_to_net_schedule_list_tail(netif);
52390 +       maybe_schedule_tx_action();
52391 +       return IRQ_HANDLED;
52392 +}
52393 +
52394 +static void make_tx_response(netif_t *netif, 
52395 +                             u16      id,
52396 +                             s8       st)
52397 +{
52398 +       RING_IDX i = netif->tx.rsp_prod_pvt;
52399 +       netif_tx_response_t *resp;
52400 +       int notify;
52401 +
52402 +       resp = RING_GET_RESPONSE(&netif->tx, i);
52403 +       resp->id     = id;
52404 +       resp->status = st;
52405 +
52406 +       netif->tx.rsp_prod_pvt = ++i;
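+       /*
+        * Publish the response and raise an event only if the frontend
+        * is actually waiting for one, saving an interrupt per packet
+        * in the common case.
+        */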
52407 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
52408 +       if (notify)
52409 +               notify_remote_via_irq(netif->irq);
52410 +
52411 +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
52412 +       if (i == netif->tx.req_cons) {
52413 +               int more_to_do;
52414 +               RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
52415 +               if (more_to_do)
52416 +                       add_to_net_schedule_list_tail(netif);
52417 +       }
52418 +#endif
52419 +}
52420 +
52421 +static int make_rx_response(netif_t *netif, 
52422 +                            u16      id, 
52423 +                            s8       st,
52424 +                            u16      offset,
52425 +                            u16      size,
52426 +                            u16      flags)
52427 +{
52428 +       RING_IDX i = netif->rx.rsp_prod_pvt;
52429 +       netif_rx_response_t *resp;
52430 +       int notify;
52431 +
52432 +       resp = RING_GET_RESPONSE(&netif->rx, i);
52433 +       resp->offset     = offset;
52434 +       resp->flags      = flags;
52435 +       resp->id         = id;
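+       /* status carries the packet size, or a negative error code. */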
52436 +       resp->status     = (s16)size;
52437 +       if (st < 0)
52438 +               resp->status = (s16)st;
52439 +
52440 +       netif->rx.rsp_prod_pvt = ++i;
52441 +       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, notify);
52442 +
52443 +       return notify;
52444 +}
52445 +
52446 +#ifdef NETBE_DEBUG_INTERRUPT
52447 +static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
52448 +{
52449 +       struct list_head *ent;
52450 +       netif_t *netif;
52451 +       int i = 0;
52452 +
52453 +       printk(KERN_ALERT "netif_schedule_list:\n");
52454 +       spin_lock_irq(&net_schedule_list_lock);
52455 +
52456 +       list_for_each (ent, &net_schedule_list) {
52457 +               netif = list_entry(ent, netif_t, list);
52458 +               printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
52459 +                      "rx_resp_prod=%08x\n",
52460 +                      i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
52461 +               printk(KERN_ALERT "   tx_req_cons=%08x tx_resp_prod=%08x)\n",
52462 +                      netif->tx.req_cons, netif->tx.rsp_prod_pvt);
52463 +               printk(KERN_ALERT "   shared(rx_req_prod=%08x "
52464 +                      "rx_resp_prod=%08x\n",
52465 +                      netif->rx.sring->req_prod, netif->rx.sring->rsp_prod);
52466 +               printk(KERN_ALERT "   rx_event=%08x tx_req_prod=%08x\n",
52467 +                      netif->rx.sring->rsp_event, netif->tx.sring->req_prod);
52468 +               printk(KERN_ALERT "   tx_resp_prod=%08x, tx_event=%08x)\n",
52469 +                      netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event);
52470 +               i++;
52471 +       }
52472 +
52473 +       spin_unlock_irq(&net_schedule_list_lock);
52474 +       printk(KERN_ALERT " ** End of netif_schedule_list **\n");
52475 +
52476 +       return IRQ_HANDLED;
52477 +}
52478 +#endif
52479 +
52480 +static int __init netback_init(void)
52481 +{
52482 +       int i;
52483 +       struct page *page;
52484 +
52485 +       /* We can increase reservation by this much in net_rx_action(). */
52486 +       balloon_update_driver_allowance(NET_RX_RING_SIZE);
52487 +
52488 +       skb_queue_head_init(&rx_queue);
52489 +       skb_queue_head_init(&tx_queue);
52490 +
52491 +       init_timer(&net_timer);
52492 +       net_timer.data = 0;
52493 +       net_timer.function = net_alarm;
52494 +
52495 +       page = balloon_alloc_empty_page_range(MAX_PENDING_REQS);
52496 +       BUG_ON(page == NULL);
52497 +       mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
52498 +
52499 +       for (i = 0; i < MAX_PENDING_REQS; i++) {
52500 +               page = virt_to_page(MMAP_VADDR(i));
52501 +               set_page_count(page, 1);
52502 +               SetPageForeign(page, netif_page_release);
52503 +       }
52504 +
52505 +       pending_cons = 0;
52506 +       pending_prod = MAX_PENDING_REQS;
52507 +       for (i = 0; i < MAX_PENDING_REQS; i++)
52508 +               pending_ring[i] = i;
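+       /*
+        * The pending ring now holds every slot index, with
+        * pending_prod - pending_cons == MAX_PENDING_REQS free entries.
+        */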
52509 +
52510 +       spin_lock_init(&net_schedule_list_lock);
52511 +       INIT_LIST_HEAD(&net_schedule_list);
52512 +
52513 +       netif_xenbus_init();
52514 +
52515 +#ifdef NETBE_DEBUG_INTERRUPT
52516 +       (void)bind_virq_to_irqhandler(
52517 +               VIRQ_DEBUG,
52518 +               0,
52519 +               netif_be_dbg,
52520 +               SA_SHIRQ, 
52521 +               "net-be-dbg",
52522 +               &netif_be_dbg);
52523 +#endif
52524 +
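+       /*
+        * Foreign mappings may outlive any particular interface, so the
+        * module cannot be safely unloaded; netback_cleanup() below
+        * simply BUG()s for the same reason.
+        */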
52525 +       __unsafe(THIS_MODULE);
52526 +
52527 +       return 0;
52528 +}
52529 +
52530 +static void netback_cleanup(void)
52531 +{
52532 +       BUG();
52533 +}
52534 +
52535 +module_init(netback_init);
52536 +module_exit(netback_cleanup);
52537 +
52538 +MODULE_LICENSE("Dual BSD/GPL");
52539 +
52540 +/*
52541 + * Local variables:
52542 + *  c-file-style: "linux"
52543 + *  indent-tabs-mode: t
52544 + *  c-indent-level: 8
52545 + *  c-basic-offset: 8
52546 + *  tab-width: 8
52547 + * End:
52548 + */
52549 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/netback/xenbus.c linux-2.6.16/drivers/xen/netback/xenbus.c
52550 --- linux-2.6.16.orig/drivers/xen/netback/xenbus.c      1970-01-01 01:00:00.000000000 +0100
52551 +++ linux-2.6.16/drivers/xen/netback/xenbus.c   2006-06-26 09:51:32.000000000 +0200
52552 @@ -0,0 +1,366 @@
52553 +/*  Xenbus code for netif backend
52554 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
52555 +    Copyright (C) 2005 XenSource Ltd
52556 +
52557 +    This program is free software; you can redistribute it and/or modify
52558 +    it under the terms of the GNU General Public License as published by
52559 +    the Free Software Foundation; either version 2 of the License, or
52560 +    (at your option) any later version.
52561 +
52562 +    This program is distributed in the hope that it will be useful,
52563 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
52564 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
52565 +    GNU General Public License for more details.
52566 +
52567 +    You should have received a copy of the GNU General Public License
52568 +    along with this program; if not, write to the Free Software
52569 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
52570 +*/
52571 +
52572 +
52573 +#include <stdarg.h>
52574 +#include <linux/module.h>
52575 +#include <xen/xenbus.h>
52576 +#include <xen/net_driver_util.h>
52577 +#include "common.h"
52578 +
52579 +
52580 +#if 0
52581 +#undef DPRINTK
52582 +#define DPRINTK(fmt, args...) \
52583 +    printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
52584 +#endif
52585 +
52586 +
52587 +struct backend_info
52588 +{
52589 +       struct xenbus_device *dev;
52590 +       netif_t *netif;
52591 +       struct xenbus_watch backend_watch;
52592 +       XenbusState frontend_state;
52593 +};
52594 +
52595 +
52596 +static int connect_rings(struct backend_info *);
52597 +static void connect(struct backend_info *);
52598 +static void maybe_connect(struct backend_info *);
52599 +static void backend_changed(struct xenbus_watch *, const char **,
52600 +                           unsigned int);
52601 +
52602 +
52603 +static int netback_remove(struct xenbus_device *dev)
52604 +{
52605 +       struct backend_info *be = dev->data;
52606 +
52607 +       if (be->backend_watch.node) {
52608 +               unregister_xenbus_watch(&be->backend_watch);
52609 +               kfree(be->backend_watch.node);
52610 +               be->backend_watch.node = NULL;
52611 +       }
52612 +       if (be->netif) {
52613 +               netif_disconnect(be->netif);
52614 +               be->netif = NULL;
52615 +       }
52616 +       kfree(be);
52617 +       dev->data = NULL;
52618 +       return 0;
52619 +}
52620 +
52621 +
52622 +/**
52623 + * Entry point to this code when a new device is created.  Allocate the basic
52624 + * structures, and watch the store waiting for the hotplug scripts to tell us
52625 + * the device's handle.  Switch to InitWait.
52626 + */
52627 +static int netback_probe(struct xenbus_device *dev,
52628 +                        const struct xenbus_device_id *id)
52629 +{
52630 +       int err;
52631 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
52632 +                                         GFP_KERNEL);
52633 +       if (!be) {
52634 +               xenbus_dev_fatal(dev, -ENOMEM,
52635 +                                "allocating backend structure");
52636 +               return -ENOMEM;
52637 +       }
52638 +
52639 +       be->dev = dev;
52640 +       dev->data = be;
52641 +
52642 +       err = xenbus_watch_path2(dev, dev->nodename, "handle",
52643 +                                &be->backend_watch, backend_changed);
52644 +       if (err)
52645 +               goto fail;
52646 +
52647 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
52648 +       if (err)
52649 +               goto fail;
52651 +
52652 +       return 0;
52653 +
52654 +fail:
52655 +       DPRINTK("failed");
52656 +       netback_remove(dev);
52657 +       return err;
52658 +}
52659 +
52660 +
52661 +/**
52662 + * Handle the creation of the hotplug script environment.  We add the script
52663 + * and vif variables to the environment, for the benefit of the vif-* hotplug
52664 + * scripts.
52665 + */
52666 +static int netback_uevent(struct xenbus_device *xdev, char **envp,
52667 +                         int num_envp, char *buffer, int buffer_size)
52668 +{
52669 +       struct backend_info *be = xdev->data;
52670 +       netif_t *netif = be->netif;
52671 +       int i = 0, length = 0;
52672 +       char *val;
52673 +
52674 +       DPRINTK("netback_uevent");
52675 +
52676 +       val = xenbus_read(XBT_NULL, xdev->nodename, "script", NULL);
52677 +       if (IS_ERR(val)) {
52678 +               int err = PTR_ERR(val);
52679 +               xenbus_dev_fatal(xdev, err, "reading script");
52680 +               return err;
52681 +       }
52682 +
52683 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size,
52684 +                      &length, "script=%s", val);
52685 +       kfree(val);
52687 +
52688 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
52689 +                      "vif=%s", netif->dev->name);
52690 +
52691 +       envp[i] = NULL;
52692 +
52693 +       return 0;
52694 +}
52695 +
52696 +
52697 +/**
52698 + * Callback received when the hotplug scripts have placed the handle node.
52699 + * Read it, and create a netif structure.  If the frontend is ready, connect.
52700 + */
52701 +static void backend_changed(struct xenbus_watch *watch,
52702 +                           const char **vec, unsigned int len)
52703 +{
52704 +       int err;
52705 +       long handle;
52706 +       struct backend_info *be
52707 +               = container_of(watch, struct backend_info, backend_watch);
52708 +       struct xenbus_device *dev = be->dev;
52709 +
52710 +       DPRINTK("");
52711 +
52712 +       err = xenbus_scanf(XBT_NULL, dev->nodename, "handle", "%li", &handle);
52713 +       if (XENBUS_EXIST_ERR(err)) {
52714 +               /* Since this watch will fire once immediately after it is
52715 +                  registered, we expect this.  Ignore it, and wait for the
52716 +                  hotplug scripts. */
52717 +               return;
52718 +       }
52719 +       if (err != 1) {
52720 +               xenbus_dev_fatal(dev, err, "reading handle");
52721 +               return;
52722 +       }
52723 +
52724 +       if (be->netif == NULL) {
52725 +               u8 be_mac[ETH_ALEN] = { 0, 0, 0, 0, 0, 0 };
52726 +
52727 +               be->netif = alloc_netif(dev->otherend_id, handle, be_mac);
52728 +               if (IS_ERR(be->netif)) {
52729 +                       err = PTR_ERR(be->netif);
52730 +                       be->netif = NULL;
52731 +                       xenbus_dev_fatal(dev, err, "creating interface");
52732 +                       return;
52733 +               }
52734 +
52735 +               kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
52736 +
52737 +               maybe_connect(be);
52738 +       }
52739 +}
52740 +
52741 +
52742 +/**
52743 + * Callback received when the frontend's state changes.
52744 + */
52745 +static void frontend_changed(struct xenbus_device *dev,
52746 +                            XenbusState frontend_state)
52747 +{
52748 +       struct backend_info *be = dev->data;
52749 +
52750 +       DPRINTK("");
52751 +
52752 +       be->frontend_state = frontend_state;
52753 +
52754 +       switch (frontend_state) {
52755 +       case XenbusStateInitialising:
52756 +       case XenbusStateInitialised:
52757 +               break;
52758 +
52759 +       case XenbusStateConnected:
52760 +               maybe_connect(be);
52761 +               break;
52762 +
52763 +       case XenbusStateClosing:
52764 +               xenbus_switch_state(dev, XenbusStateClosing);
52765 +               break;
52766 +
52767 +       case XenbusStateClosed:
52768 +               if (be->netif != NULL)
52769 +                       kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
52770 +               device_unregister(&dev->dev);
52771 +               break;
52772 +
52773 +       case XenbusStateUnknown:
52774 +       case XenbusStateInitWait:
52775 +       default:
52776 +               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
52777 +                                frontend_state);
52778 +               break;
52779 +       }
52780 +}
52781 +
52782 +
52783 +/* ** Connection ** */
52784 +
52785 +
52786 +static void maybe_connect(struct backend_info *be)
52787 +{
52788 +       if (be->netif && (be->frontend_state == XenbusStateConnected))
52789 +               connect(be);
52790 +}
52791 +
52792 +static void xen_net_read_rate(struct xenbus_device *dev,
52793 +                             unsigned long *bytes, unsigned long *usec)
52794 +{
52795 +       char *s, *e;
52796 +       unsigned long b, u;
52797 +       char *ratestr;
52798 +
52799 +       /* Default to unlimited bandwidth. */
52800 +       *bytes = ~0UL;
52801 +       *usec = 0;
52802 +
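+       /*
+        * The xenstore "rate" node has the form "<bytes>,<usec>": the
+        * interface may transmit <bytes> octets per <usec> window. For
+        * example, "10000000,1000000" would allow roughly 10 MB/s.
+        */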
52803 +       ratestr = xenbus_read(XBT_NULL, dev->nodename, "rate", NULL);
52804 +       if (IS_ERR(ratestr))
52805 +               return;
52806 +
52807 +       s = ratestr;
52808 +       b = simple_strtoul(s, &e, 10);
52809 +       if ((s == e) || (*e != ','))
52810 +               goto fail;
52811 +
52812 +       s = e + 1;
52813 +       u = simple_strtoul(s, &e, 10);
52814 +       if ((s == e) || (*e != '\0'))
52815 +               goto fail;
52816 +
52817 +       *bytes = b;
52818 +       *usec = u;
52819 +
52820 +       kfree(ratestr);
52821 +       return;
52822 +
52823 + fail:
52824 +       WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
52825 +       kfree(ratestr);
52826 +}
52827 +
52828 +
52829 +static void connect(struct backend_info *be)
52830 +{
52831 +       int err;
52832 +       struct xenbus_device *dev = be->dev;
52833 +
52834 +       err = connect_rings(be);
52835 +       if (err)
52836 +               return;
52837 +
52838 +       err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
52839 +       if (err) {
52840 +               xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
52841 +               return;
52842 +       }
52843 +
52844 +       xen_net_read_rate(dev, &be->netif->credit_bytes,
52845 +                         &be->netif->credit_usec);
52846 +       be->netif->remaining_credit = be->netif->credit_bytes;
52847 +
52848 +       xenbus_switch_state(dev, XenbusStateConnected);
52849 +}
52850 +
52851 +
52852 +static int connect_rings(struct backend_info *be)
52853 +{
52854 +       struct xenbus_device *dev = be->dev;
52855 +       unsigned long tx_ring_ref, rx_ring_ref;
52856 +       unsigned int evtchn;
52857 +       int err;
52858 +
52859 +       DPRINTK("");
52860 +
52861 +       err = xenbus_gather(XBT_NULL, dev->otherend,
52862 +                           "tx-ring-ref", "%lu", &tx_ring_ref,
52863 +                           "rx-ring-ref", "%lu", &rx_ring_ref,
52864 +                           "event-channel", "%u", &evtchn, NULL);
52865 +       if (err) {
52866 +               xenbus_dev_fatal(dev, err,
52867 +                                "reading %s/ring-ref and event-channel",
52868 +                                dev->otherend);
52869 +               return err;
52870 +       }
52871 +
52872 +       /* Map the shared frame, irq etc. */
52873 +       err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
52874 +       if (err) {
52875 +               xenbus_dev_fatal(dev, err,
52876 +                                "mapping shared-frames %lu/%lu port %u",
52877 +                                tx_ring_ref, rx_ring_ref, evtchn);
52878 +               return err;
52879 +       }
52880 +       return 0;
52881 +}
52882 +
52883 +
52884 +/* ** Driver Registration ** */
52885 +
52886 +
52887 +static struct xenbus_device_id netback_ids[] = {
52888 +       { "vif" },
52889 +       { "" }
52890 +};
52891 +
52892 +
52893 +static struct xenbus_driver netback = {
52894 +       .name = "vif",
52895 +       .owner = THIS_MODULE,
52896 +       .ids = netback_ids,
52897 +       .probe = netback_probe,
52898 +       .remove = netback_remove,
52899 +       .uevent = netback_uevent,
52900 +       .otherend_changed = frontend_changed,
52901 +};
52902 +
52903 +
52904 +void netif_xenbus_init(void)
52905 +{
52906 +       xenbus_register_backend(&netback);
52907 +}
52908 +
52909 +
52910 +/*
52911 + * Local variables:
52912 + *  c-file-style: "linux"
52913 + *  indent-tabs-mode: t
52914 + *  c-indent-level: 8
52915 + *  c-basic-offset: 8
52916 + *  tab-width: 8
52917 + * End:
52918 + */
52919 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/netfront/Makefile linux-2.6.16/drivers/xen/netfront/Makefile
52920 --- linux-2.6.16.orig/drivers/xen/netfront/Makefile     1970-01-01 01:00:00.000000000 +0100
52921 +++ linux-2.6.16/drivers/xen/netfront/Makefile  2006-06-26 09:51:32.000000000 +0200
52922 @@ -0,0 +1,4 @@
52923 +
52924 +obj-$(CONFIG_XEN_NETDEV_FRONTEND)      := xennet.o
52925 +
52926 +xennet-objs := netfront.o
52927 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/netfront/netfront.c linux-2.6.16/drivers/xen/netfront/netfront.c
52928 --- linux-2.6.16.orig/drivers/xen/netfront/netfront.c   1970-01-01 01:00:00.000000000 +0100
52929 +++ linux-2.6.16/drivers/xen/netfront/netfront.c        2006-06-26 09:51:32.000000000 +0200
52930 @@ -0,0 +1,1524 @@
52931 +/******************************************************************************
52932 + * Virtual network driver for conversing with remote driver backends.
52933 + * 
52934 + * Copyright (c) 2002-2005, K A Fraser
52935 + * Copyright (c) 2005, XenSource Ltd
52936 + * 
52937 + * This program is free software; you can redistribute it and/or
52938 + * modify it under the terms of the GNU General Public License version 2
52939 + * as published by the Free Software Foundation; or, when distributed
52940 + * separately from the Linux kernel or incorporated into other
52941 + * software packages, subject to the following license:
52942 + * 
52943 + * Permission is hereby granted, free of charge, to any person obtaining a copy
52944 + * of this source file (the "Software"), to deal in the Software without
52945 + * restriction, including without limitation the rights to use, copy, modify,
52946 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
52947 + * and to permit persons to whom the Software is furnished to do so, subject to
52948 + * the following conditions:
52949 + * 
52950 + * The above copyright notice and this permission notice shall be included in
52951 + * all copies or substantial portions of the Software.
52952 + * 
52953 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
52954 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
52955 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52956 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
52957 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
52958 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
52959 + * IN THE SOFTWARE.
52960 + */
52961 +
52962 +#include <linux/config.h>
52963 +#include <linux/module.h>
52964 +#include <linux/version.h>
52965 +#include <linux/kernel.h>
52966 +#include <linux/sched.h>
52967 +#include <linux/slab.h>
52968 +#include <linux/string.h>
52969 +#include <linux/errno.h>
52970 +#include <linux/netdevice.h>
52971 +#include <linux/inetdevice.h>
52972 +#include <linux/etherdevice.h>
52973 +#include <linux/skbuff.h>
52974 +#include <linux/init.h>
52975 +#include <linux/bitops.h>
52976 +#include <linux/proc_fs.h>
52977 +#include <linux/ethtool.h>
52978 +#include <linux/in.h>
52979 +#include <net/sock.h>
52980 +#include <net/pkt_sched.h>
52981 +#include <net/arp.h>
52982 +#include <net/route.h>
52983 +#include <asm/io.h>
52984 +#include <asm/uaccess.h>
52985 +#include <xen/evtchn.h>
52986 +#include <xen/xenbus.h>
52987 +#include <xen/interface/io/netif.h>
52988 +#include <xen/interface/memory.h>
52989 +#include <xen/balloon.h>
52990 +#include <asm/page.h>
52991 +#include <asm/uaccess.h>
52992 +#include <xen/interface/grant_table.h>
52993 +#include <xen/gnttab.h>
52994 +#include <xen/net_driver_util.h>
52995 +
52996 +#define GRANT_INVALID_REF      0
52997 +
52998 +#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
52999 +#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
53000 +
53001 +static inline void init_skb_shinfo(struct sk_buff *skb)
53002 +{
53003 +       atomic_set(&(skb_shinfo(skb)->dataref), 1);
53004 +       skb_shinfo(skb)->nr_frags = 0;
53005 +       skb_shinfo(skb)->frag_list = NULL;
53006 +}
53007 +
53008 +struct netfront_info
53009 +{
53010 +       struct list_head list;
53011 +       struct net_device *netdev;
53012 +
53013 +       struct net_device_stats stats;
53014 +       unsigned int tx_full;
53015 +
53016 +       netif_tx_front_ring_t tx;
53017 +       netif_rx_front_ring_t rx;
53018 +
53019 +       spinlock_t   tx_lock;
53020 +       spinlock_t   rx_lock;
53021 +
53022 +       unsigned int handle;
53023 +       unsigned int evtchn, irq;
53024 +
53025 +       /* What is the status of our connection to the remote backend? */
53026 +#define BEST_CLOSED       0
53027 +#define BEST_DISCONNECTED 1
53028 +#define BEST_CONNECTED    2
53029 +       unsigned int backend_state;
53030 +
53031 +       /* Is this interface open or closed (down or up)? */
53032 +#define UST_CLOSED        0
53033 +#define UST_OPEN          1
53034 +       unsigned int user_state;
53035 +
53036 +       /* Receive-ring batched refills. */
53037 +#define RX_MIN_TARGET 8
53038 +#define RX_DFL_MIN_TARGET 64
53039 +#define RX_MAX_TARGET NET_RX_RING_SIZE
53040 +       int rx_min_target, rx_max_target, rx_target;
53041 +       struct sk_buff_head rx_batch;
53042 +
53043 +       struct timer_list rx_refill_timer;
53044 +
53045 +       /*
53046 +        * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
53047 +        * array is an index into a chain of free entries.
53048 +        */
53049 +       struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
53050 +       struct sk_buff *rx_skbs[NET_RX_RING_SIZE+1];
53051 +
53052 +       grant_ref_t gref_tx_head;
53053 +       grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
53054 +       grant_ref_t gref_rx_head;
53055 +       grant_ref_t grant_rx_ref[NET_RX_RING_SIZE + 1];
53056 +
53057 +       struct xenbus_device *xbdev;
53058 +       int tx_ring_ref;
53059 +       int rx_ring_ref;
53060 +       u8 mac[ETH_ALEN];
53061 +
53062 +       unsigned long rx_pfn_array[NET_RX_RING_SIZE];
53063 +       multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1];
53064 +       mmu_update_t rx_mmu[NET_RX_RING_SIZE];
53065 +};
53066 +
53067 +/*
53068 + * Helpers for acquiring and freeing slots in the {tx,rx}_skbs[] freelists.
53069 + */
53070 +
53071 +static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id)
53072 +{
53073 +       list[id] = list[0];
53074 +       list[0]  = (void *)(unsigned long)id;
53075 +}
53076 +
53077 +static inline unsigned short get_id_from_freelist(struct sk_buff **list)
53078 +{
53079 +       unsigned int id = (unsigned int)(unsigned long)list[0];
53080 +       list[0] = list[id];
53081 +       return id;
53082 +}
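+/*
+ * The freelist is threaded through the skb pointer array itself:
+ * list[0] holds the index of the first free slot, and each free slot
+ * stores the index of the next. Freeing id 5, say, sets list[5] to
+ * the old head and makes list[0] equal 5.
+ */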
53083 +
53084 +#ifdef DEBUG
53085 +static char *be_state_name[] = {
53086 +       [BEST_CLOSED]       = "closed",
53087 +       [BEST_DISCONNECTED] = "disconnected",
53088 +       [BEST_CONNECTED]    = "connected",
53089 +};
53090 +#endif
53091 +
53092 +#define DPRINTK(fmt, args...) pr_debug("netfront (%s:%d) " fmt, \
53093 +                                       __FUNCTION__, __LINE__, ##args)
53094 +#define IPRINTK(fmt, args...)                          \
53095 +       printk(KERN_INFO "netfront: " fmt, ##args)
53096 +#define WPRINTK(fmt, args...)                          \
53097 +       printk(KERN_WARNING "netfront: " fmt, ##args)
53098 +
53099 +
53100 +static int talk_to_backend(struct xenbus_device *, struct netfront_info *);
53101 +static int setup_device(struct xenbus_device *, struct netfront_info *);
53102 +static int create_netdev(int, struct xenbus_device *, struct net_device **);
53103 +
53104 +static void netfront_closing(struct xenbus_device *);
53105 +
53106 +static void end_access(int, void *);
53107 +static void netif_disconnect_backend(struct netfront_info *);
53108 +static void close_netdev(struct netfront_info *);
53109 +static void netif_free(struct netfront_info *);
53110 +
53111 +static void show_device(struct netfront_info *);
53112 +
53113 +static void network_connect(struct net_device *);
53114 +static void network_tx_buf_gc(struct net_device *);
53115 +static void network_alloc_rx_buffers(struct net_device *);
53116 +static int send_fake_arp(struct net_device *);
53117 +
53118 +static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs);
53119 +
53120 +#ifdef CONFIG_PROC_FS
53121 +static int xennet_proc_init(void);
53122 +static int xennet_proc_addif(struct net_device *dev);
53123 +static void xennet_proc_delif(struct net_device *dev);
53124 +#else
53125 +#define xennet_proc_init()   (0)
53126 +#define xennet_proc_addif(d) (0)
53127 +#define xennet_proc_delif(d) ((void)0)
53128 +#endif
53129 +
53130 +
53131 +/**
53132 + * Entry point to this code when a new device is created.  Allocate the basic
53133 + * structures and the ring buffers for communication with the backend, and
53134 + * inform the backend of the appropriate details for those.  Switch to
53135 + * Connected state.
53136 + */
53137 +static int netfront_probe(struct xenbus_device *dev,
53138 +                         const struct xenbus_device_id *id)
53139 +{
53140 +       int err;
53141 +       struct net_device *netdev;
53142 +       struct netfront_info *info;
53143 +       unsigned int handle;
53144 +
53145 +       err = xenbus_scanf(XBT_NULL, dev->nodename, "handle", "%u", &handle);
53146 +       if (err != 1) {
53147 +               xenbus_dev_fatal(dev, err, "reading handle");
53148 +               return err;
53149 +       }
53150 +
53151 +       err = create_netdev(handle, dev, &netdev);
53152 +       if (err) {
53153 +               xenbus_dev_fatal(dev, err, "creating netdev");
53154 +               return err;
53155 +       }
53156 +
53157 +       info = netdev_priv(netdev);
53158 +       dev->data = info;
53159 +
53160 +       err = talk_to_backend(dev, info);
53161 +       if (err) {
53162 +               kfree(info);
53163 +               dev->data = NULL;
53164 +               return err;
53165 +       }
53166 +
53167 +       return 0;
53168 +}
53169 +
53170 +
53171 +/**
53172 + * We are reconnecting to the backend, due to a suspend/resume, or a backend
53173 + * driver restart.  We tear down our netif structure and recreate it, but
53174 + * leave the device-layer structures intact so that this is transparent to the
53175 + * rest of the kernel.
53176 + */
53177 +static int netfront_resume(struct xenbus_device *dev)
53178 +{
53179 +       struct netfront_info *info = dev->data;
53180 +
53181 +       DPRINTK("%s\n", dev->nodename);
53182 +
53183 +       netif_disconnect_backend(info);
53184 +       return talk_to_backend(dev, info);
53185 +}
53186 +
53187 +
53188 +/* Common code used when first setting up, and when resuming. */
53189 +static int talk_to_backend(struct xenbus_device *dev,
53190 +                          struct netfront_info *info)
53191 +{
53192 +       const char *message;
53193 +       xenbus_transaction_t xbt;
53194 +       int err;
53195 +
53196 +       err = xen_net_read_mac(dev, info->mac);
53197 +       if (err) {
53198 +               xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
53199 +               goto out;
53200 +       }
53201 +
53202 +       /* Create shared ring, alloc event channel. */
53203 +       err = setup_device(dev, info);
53204 +       if (err)
53205 +               goto out;
53206 +
53207 +again:
53208 +       err = xenbus_transaction_start(&xbt);
53209 +       if (err) {
53210 +               xenbus_dev_fatal(dev, err, "starting transaction");
53211 +               goto destroy_ring;
53212 +       }
53213 +
53214 +       err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u",
53215 +                           info->tx_ring_ref);
53216 +       if (err) {
53217 +               message = "writing tx ring-ref";
53218 +               goto abort_transaction;
53219 +       }
53220 +       err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u",
53221 +                           info->rx_ring_ref);
53222 +       if (err) {
53223 +               message = "writing rx ring-ref";
53224 +               goto abort_transaction;
53225 +       }
53226 +       err = xenbus_printf(xbt, dev->nodename,
53227 +                           "event-channel", "%u", info->evtchn);
53228 +       if (err) {
53229 +               message = "writing event-channel";
53230 +               goto abort_transaction;
53231 +       }
53232 +
53233 +       err = xenbus_printf(xbt, dev->nodename,
53234 +                           "state", "%d", XenbusStateConnected);
53235 +       if (err) {
53236 +               message = "writing frontend XenbusStateConnected";
53237 +               goto abort_transaction;
53238 +       }
53239 +
53240 +       err = xenbus_transaction_end(xbt, 0);
53241 +       if (err) {
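+               /* -EAGAIN means the transaction raced with another
+                  xenstore update; restart it from the top. */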
53242 +               if (err == -EAGAIN)
53243 +                       goto again;
53244 +               xenbus_dev_fatal(dev, err, "completing transaction");
53245 +               goto destroy_ring;
53246 +       }
53247 +
53248 +       return 0;
53249 +
53250 + abort_transaction:
53251 +       xenbus_transaction_end(xbt, 1);
53252 +       xenbus_dev_fatal(dev, err, "%s", message);
53253 + destroy_ring:
53254 +       netif_free(info);
53255 + out:
53256 +       return err;
53257 +}
53258 +
53259 +
53260 +static int setup_device(struct xenbus_device *dev, struct netfront_info *info)
53261 +{
53262 +       netif_tx_sring_t *txs;
53263 +       netif_rx_sring_t *rxs;
53264 +       int err;
53265 +       struct net_device *netdev = info->netdev;
53266 +
53267 +       info->tx_ring_ref = GRANT_INVALID_REF;
53268 +       info->rx_ring_ref = GRANT_INVALID_REF;
53269 +       info->rx.sring = NULL;
53270 +       info->tx.sring = NULL;
53271 +       info->irq = 0;
53272 +
53273 +       txs = (netif_tx_sring_t *)__get_free_page(GFP_KERNEL);
53274 +       if (!txs) {
53275 +               err = -ENOMEM;
53276 +               xenbus_dev_fatal(dev, err, "allocating tx ring page");
53277 +               goto fail;
53278 +       }
53279 +       rxs = (netif_rx_sring_t *)__get_free_page(GFP_KERNEL);
53280 +       if (!rxs) {
53281 +               err = -ENOMEM;
53282 +               xenbus_dev_fatal(dev, err, "allocating rx ring page");
53283 +               goto fail;
53284 +       }
53285 +       memset(txs, 0, PAGE_SIZE);
53286 +       memset(rxs, 0, PAGE_SIZE);
53287 +       info->backend_state = BEST_DISCONNECTED;
53288 +
53289 +       SHARED_RING_INIT(txs);
53290 +       FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
53291 +
53292 +       SHARED_RING_INIT(rxs);
53293 +       FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
53294 +
53295 +       err = xenbus_grant_ring(dev, virt_to_mfn(txs));
53296 +       if (err < 0)
53297 +               goto fail;
53298 +       info->tx_ring_ref = err;
53299 +
53300 +       err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
53301 +       if (err < 0)
53302 +               goto fail;
53303 +       info->rx_ring_ref = err;
53304 +
53305 +       err = xenbus_alloc_evtchn(dev, &info->evtchn);
53306 +       if (err)
53307 +               goto fail;
53308 +
53309 +       memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
53310 +       network_connect(netdev);
53311 +       info->irq = bind_evtchn_to_irqhandler(
53312 +               info->evtchn, netif_int, SA_SAMPLE_RANDOM, netdev->name,
53313 +               netdev);
53314 +       (void)send_fake_arp(netdev);
53315 +       show_device(info);
53316 +
53317 +       return 0;
53318 +
53319 + fail:
53320 +       netif_free(info);
53321 +       return err;
53322 +}
53323 +
53324 +
53325 +/**
53326 + * Callback received when the backend's state changes.
53327 + */
53328 +static void backend_changed(struct xenbus_device *dev,
53329 +                           XenbusState backend_state)
53330 +{
53331 +       DPRINTK("\n");
53332 +
53333 +       switch (backend_state) {
53334 +       case XenbusStateInitialising:
53335 +       case XenbusStateInitWait:
53336 +       case XenbusStateInitialised:
53337 +       case XenbusStateConnected:
53338 +       case XenbusStateUnknown:
53339 +       case XenbusStateClosed:
53340 +               break;
53341 +
53342 +       case XenbusStateClosing:
53343 +               netfront_closing(dev);
53344 +               break;
53345 +       }
53346 +}
53347 +
53348 +
53349 +/** Send a packet on a net device to encourage switches to learn the
53350 + * MAC address. We do so by sending a gratuitous (fake) ARP reply.
53351 + *
53352 + * @param dev device
53353 + * @return 0 on success, error code otherwise
53354 + */
53355 +static int send_fake_arp(struct net_device *dev)
53356 +{
53357 +       struct sk_buff *skb;
53358 +       u32             src_ip, dst_ip;
53359 +
53360 +       dst_ip = INADDR_BROADCAST;
53361 +       src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
53362 +
53363 +       /* No IP? Then nothing to do. */
53364 +       if (src_ip == 0)
53365 +               return 0;
53366 +
53367 +       skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
53368 +                        dst_ip, dev, src_ip,
53369 +                        /*dst_hw*/ NULL, /*src_hw*/ NULL,
53370 +                        /*target_hw*/ dev->dev_addr);
53371 +       if (skb == NULL)
53372 +               return -ENOMEM;
53373 +
53374 +       return dev_queue_xmit(skb);
53375 +}
53376 +
53377 +
53378 +static int network_open(struct net_device *dev)
53379 +{
53380 +       struct netfront_info *np = netdev_priv(dev);
53381 +
53382 +       memset(&np->stats, 0, sizeof(np->stats));
53383 +
53384 +       np->user_state = UST_OPEN;
53385 +
53386 +       network_alloc_rx_buffers(dev);
53387 +       np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
53388 +
53389 +       netif_start_queue(dev);
53390 +
53391 +       return 0;
53392 +}
53393 +
53394 +static void network_tx_buf_gc(struct net_device *dev)
53395 +{
53396 +       RING_IDX i, prod;
53397 +       unsigned short id;
53398 +       struct netfront_info *np = netdev_priv(dev);
53399 +       struct sk_buff *skb;
53400 +
53401 +       if (np->backend_state != BEST_CONNECTED)
53402 +               return;
53403 +
53404 +       do {
53405 +               prod = np->tx.sring->rsp_prod;
53406 +               rmb(); /* Ensure we see responses up to 'prod'. */
53407 +
53408 +               for (i = np->tx.rsp_cons; i != prod; i++) {
53409 +                       id  = RING_GET_RESPONSE(&np->tx, i)->id;
53410 +                       skb = np->tx_skbs[id];
53411 +                       if (unlikely(gnttab_query_foreign_access(
53412 +                               np->grant_tx_ref[id]) != 0)) {
53413 +                               printk(KERN_ALERT "network_tx_buf_gc: warning "
53414 +                                      "-- grant still in use by backend "
53415 +                                      "domain.\n");
53416 +                               goto out;
53417 +                       }
53418 +                       gnttab_end_foreign_access_ref(
53419 +                               np->grant_tx_ref[id], GNTMAP_readonly);
53420 +                       gnttab_release_grant_reference(
53421 +                               &np->gref_tx_head, np->grant_tx_ref[id]);
53422 +                       np->grant_tx_ref[id] = GRANT_INVALID_REF;
53423 +                       add_id_to_freelist(np->tx_skbs, id);
53424 +                       dev_kfree_skb_irq(skb);
53425 +               }
53426 +
53427 +               np->tx.rsp_cons = prod;
53428 +
53429 +               /*
53430 +                * Set a new event, then check for race with update of tx_cons.
53431 +                * Note that it is essential to schedule a callback, no matter
53432 +                * how few buffers are pending. Even if there is space in the
53433 +                * transmit ring, higher layers may be blocked because too much
53434 +                * data is outstanding: in such cases notification from Xen is
53435 +                * likely to be the only kick that we'll get.
53436 +                */
53437 +               np->tx.sring->rsp_event =
53438 +                       prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
53439 +               mb();
53440 +       } while (prod != np->tx.sring->rsp_prod);
53441 +
53442 + out:
53443 +       if (np->tx_full &&
53444 +           ((np->tx.sring->req_prod - prod) < NET_TX_RING_SIZE)) {
53445 +               np->tx_full = 0;
53446 +               if (np->user_state == UST_OPEN)
53447 +                       netif_wake_queue(dev);
53448 +       }
53449 +}
53450 +
53451 +
53452 +static void rx_refill_timeout(unsigned long data)
53453 +{
53454 +       struct net_device *dev = (struct net_device *)data;
53455 +       netif_rx_schedule(dev);
53456 +}
53457 +
53458 +
53459 +static void network_alloc_rx_buffers(struct net_device *dev)
53460 +{
53461 +       unsigned short id;
53462 +       struct netfront_info *np = netdev_priv(dev);
53463 +       struct sk_buff *skb;
53464 +       int i, batch_target;
53465 +       RING_IDX req_prod = np->rx.req_prod_pvt;
53466 +       struct xen_memory_reservation reservation;
53467 +       grant_ref_t ref;
53468 +
53469 +       if (unlikely(np->backend_state != BEST_CONNECTED))
53470 +               return;
53471 +
53472 +       /*
53473 +        * Allocate skbuffs greedily, even though we batch updates to the
53474 +        * receive ring. This creates a less bursty demand on the memory
53475 +        * allocator, so should reduce the chance of failed allocation requests
53476 +        * both for ourself and for other kernel subsystems.
53477 +        */
53478 +       batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
53479 +       for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
53480 +               /*
53481 +                * Subtract dev_alloc_skb headroom (16 bytes) and shared info
53482 +                * tailroom then round down to SKB_DATA_ALIGN boundary.
53483 +                */
53484 +               skb = __dev_alloc_skb(
53485 +                       ((PAGE_SIZE - sizeof(struct skb_shared_info)) &
53486 +                        (-SKB_DATA_ALIGN(1))) - 16,
53487 +                       GFP_ATOMIC|__GFP_NOWARN);
53488 +               if (skb == NULL) {
53489 +                       /* Any skbuffs queued for refill? Force them out. */
53490 +                       if (i != 0)
53491 +                               goto refill;
53492 +                       /* Could not allocate any skbuffs. Try again later. */
53493 +                       mod_timer(&np->rx_refill_timer,
53494 +                                 jiffies + (HZ/10));
53495 +                       return;
53496 +               }
53497 +               __skb_queue_tail(&np->rx_batch, skb);
53498 +       }
53499 +
53500 +       /* Is the batch large enough to be worthwhile? */
53501 +       if (i < (np->rx_target/2))
53502 +               return;
53503 +
53504 +       /* Adjust our fill target if we risked running out of buffers. */
53505 +       if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
53506 +           ((np->rx_target *= 2) > np->rx_max_target))
53507 +               np->rx_target = np->rx_max_target;
53508 +
53509 + refill:
53510 +       for (i = 0; ; i++) {
53511 +               if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
53512 +                       break;
53513 +
53514 +               skb->dev = dev;
53515 +
53516 +               id = get_id_from_freelist(np->rx_skbs);
53517 +
53518 +               np->rx_skbs[id] = skb;
53519 +
53520 +               RING_GET_REQUEST(&np->rx, req_prod + i)->id = id;
53521 +               ref = gnttab_claim_grant_reference(&np->gref_rx_head);
53522 +               BUG_ON((signed short)ref < 0);
53523 +               np->grant_rx_ref[id] = ref;
53524 +               gnttab_grant_foreign_transfer_ref(ref,
53525 +                                                 np->xbdev->otherend_id,
53526 +                                                 __pa(skb->head) >> PAGE_SHIFT);
53527 +               RING_GET_REQUEST(&np->rx, req_prod + i)->gref = ref;
53528 +               np->rx_pfn_array[i] = virt_to_mfn(skb->head);
53529 +
53530 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
53531 +                       /* Remove this page before passing back to Xen. */
53532 +                       set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
53533 +                                           INVALID_P2M_ENTRY);
53534 +                       MULTI_update_va_mapping(np->rx_mcl+i,
53535 +                                               (unsigned long)skb->head,
53536 +                                               __pte(0), 0);
53537 +               }
53538 +       }
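+       /*
+        * The pages just queued are given away to Xen: the backend will
+        * later transfer packet-filled frames back in their place, which
+        * is why the balloon allowance is adjusted below.
+        */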
53539 +
53540 +       /* Tell the balloon driver what is going on. */
53541 +       balloon_update_driver_allowance(i);
53542 +
53543 +       reservation.extent_start = np->rx_pfn_array;
53544 +       reservation.nr_extents   = i;
53545 +       reservation.extent_order = 0;
53546 +       reservation.address_bits = 0;
53547 +       reservation.domid        = DOMID_SELF;
53548 +
53549 +       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
53550 +               /* After all PTEs have been zapped, flush the TLB. */
53551 +               np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
53552 +                       UVMF_TLB_FLUSH|UVMF_ALL;
53553 +
53554 +               /* Give away a batch of pages. */
53555 +               np->rx_mcl[i].op = __HYPERVISOR_memory_op;
53556 +               np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
53557 +               np->rx_mcl[i].args[1] = (unsigned long)&reservation;
53558 +
53559 +               /* Zap PTEs and give away pages in one big multicall. */
53560 +               (void)HYPERVISOR_multicall(np->rx_mcl, i+1);
53561 +
53562 +               /* Check return status of HYPERVISOR_memory_op(). */
53563 +               if (unlikely(np->rx_mcl[i].result != i))
53564 +                       panic("Unable to reduce memory reservation\n");
53565 +       } else if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
53566 +                                       &reservation) != i)
53567 +               panic("Unable to reduce memory reservation\n");
53569 +
53570 +       /* Above is a suitable barrier to ensure backend will see requests. */
53571 +       np->rx.req_prod_pvt = req_prod + i;
53572 +       RING_PUSH_REQUESTS(&np->rx);
53573 +}
53574 +
53575 +
53576 +static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
53577 +{
53578 +       unsigned short id;
53579 +       struct netfront_info *np = netdev_priv(dev);
53580 +       netif_tx_request_t *tx;
53581 +       RING_IDX i;
53582 +       grant_ref_t ref;
53583 +       unsigned long mfn;
53584 +       int notify;
53585 +
53586 +       if (unlikely(np->tx_full)) {
53587 +               printk(KERN_ALERT "%s: full queue wasn't stopped!\n",
53588 +                      dev->name);
53589 +               netif_stop_queue(dev);
53590 +               goto drop;
53591 +       }
53592 +
53593 +       if (unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
53594 +                    PAGE_SIZE)) {
53595 +               struct sk_buff *nskb;
53596 +               nskb = __dev_alloc_skb(skb->len, GFP_ATOMIC|__GFP_NOWARN);
53597 +               if (unlikely(nskb == NULL))
53598 +                       goto drop;
53599 +               skb_put(nskb, skb->len);
53600 +               memcpy(nskb->data, skb->data, skb->len);
53601 +               nskb->dev = skb->dev;
53602 +               dev_kfree_skb(skb);
53603 +               skb = nskb;
53604 +       }
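+       /*
+        * A transmit request carries a single grant reference, so the
+        * payload must sit within one page; skbs that straddle a page
+        * boundary are linearised into a fresh buffer above.
+        */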
53605 +
53606 +       spin_lock_irq(&np->tx_lock);
53607 +
53608 +       if (np->backend_state != BEST_CONNECTED) {
53609 +               spin_unlock_irq(&np->tx_lock);
53610 +               goto drop;
53611 +       }
53612 +
53613 +       i = np->tx.req_prod_pvt;
53614 +
53615 +       id = get_id_from_freelist(np->tx_skbs);
53616 +       np->tx_skbs[id] = skb;
53617 +
53618 +       tx = RING_GET_REQUEST(&np->tx, i);
53619 +
53620 +       tx->id   = id;
53621 +       ref = gnttab_claim_grant_reference(&np->gref_tx_head);
53622 +       BUG_ON((signed short)ref < 0);
53623 +       mfn = virt_to_mfn(skb->data);
53624 +       gnttab_grant_foreign_access_ref(
53625 +               ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
53626 +       tx->gref = np->grant_tx_ref[id] = ref;
53627 +       tx->offset = (unsigned long)skb->data & ~PAGE_MASK;
53628 +       tx->size = skb->len;
53629 +
53630 +       tx->flags = 0;
53631 +       if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
53632 +               tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
53633 +       if (skb->proto_data_valid) /* remote but checksummed? */
53634 +               tx->flags |= NETTXF_data_validated;
53635 +
53636 +       np->tx.req_prod_pvt = i + 1;
53637 +       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
53638 +       if (notify)
53639 +               notify_remote_via_irq(np->irq);
53640 +
53641 +       network_tx_buf_gc(dev);
53642 +
53643 +       if (RING_FULL(&np->tx)) {
53644 +               np->tx_full = 1;
53645 +               netif_stop_queue(dev);
53646 +       }
53647 +
53648 +       spin_unlock_irq(&np->tx_lock);
53649 +
53650 +       np->stats.tx_bytes += skb->len;
53651 +       np->stats.tx_packets++;
53652 +
53653 +       return 0;
53654 +
53655 + drop:
53656 +       np->stats.tx_dropped++;
53657 +       dev_kfree_skb(skb);
53658 +       return 0;
53659 +}
53660 +
53661 +static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
53662 +{
53663 +       struct net_device *dev = dev_id;
53664 +       struct netfront_info *np = netdev_priv(dev);
53665 +       unsigned long flags;
53666 +
53667 +       spin_lock_irqsave(&np->tx_lock, flags);
53668 +       network_tx_buf_gc(dev);
53669 +       spin_unlock_irqrestore(&np->tx_lock, flags);
53670 +
53671 +       if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx) &&
53672 +           (np->user_state == UST_OPEN))
53673 +               netif_rx_schedule(dev);
53674 +
53675 +       return IRQ_HANDLED;
53676 +}
53677 +
53678 +
53679 +static int netif_poll(struct net_device *dev, int *pbudget)
53680 +{
53681 +       struct netfront_info *np = netdev_priv(dev);
53682 +       struct sk_buff *skb, *nskb;
53683 +       netif_rx_response_t *rx;
53684 +       RING_IDX i, rp;
53685 +       mmu_update_t *mmu = np->rx_mmu;
53686 +       multicall_entry_t *mcl = np->rx_mcl;
53687 +       int work_done, budget, more_to_do = 1;
53688 +       struct sk_buff_head rxq;
53689 +       unsigned long flags;
53690 +       unsigned long mfn;
53691 +       grant_ref_t ref;
53692 +
53693 +       spin_lock(&np->rx_lock);
53694 +
53695 +       if (np->backend_state != BEST_CONNECTED) {
53696 +               spin_unlock(&np->rx_lock);
53697 +               return 0;
53698 +       }
53699 +
53700 +       skb_queue_head_init(&rxq);
53701 +
53702 +       if ((budget = *pbudget) > dev->quota)
53703 +               budget = dev->quota;
53704 +       rp = np->rx.sring->rsp_prod;
53705 +       rmb(); /* Ensure we see queued responses up to 'rp'. */
53706 +
53707 +       for (i = np->rx.rsp_cons, work_done = 0;
53708 +            (i != rp) && (work_done < budget);
53709 +            i++, work_done++) {
53710 +               rx = RING_GET_RESPONSE(&np->rx, i);
53711 +
53712 +               /*
53713 +                * This definitely indicates a bug, either in this driver or
53714 +                * in the backend driver. In future this should flag the bad
53715 +                * situation to the system controller to reboot the backend.
53716 +                */
53717 +               if ((ref = np->grant_rx_ref[rx->id]) == GRANT_INVALID_REF) {
53718 +                       WPRINTK("Bad rx response id %d.\n", rx->id);
53719 +                       work_done--;
53720 +                       continue;
53721 +               }
53722 +
53723 +               /* Memory pressure, insufficient buffer headroom, ... */
53724 +               if ((mfn = gnttab_end_foreign_transfer_ref(ref)) == 0) {
53725 +                       if (net_ratelimit())
53726 +                               WPRINTK("Unfulfilled rx req (id=%d, st=%d).\n",
53727 +                                       rx->id, rx->status);
53728 +                       RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id =
53729 +                               rx->id;
53730 +                       RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref =
53731 +                               ref;
53732 +                       np->rx.req_prod_pvt++;
53733 +                       RING_PUSH_REQUESTS(&np->rx);
53734 +                       work_done--;
53735 +                       continue;
53736 +               }
53737 +
53738 +               gnttab_release_grant_reference(&np->gref_rx_head, ref);
53739 +               np->grant_rx_ref[rx->id] = GRANT_INVALID_REF;
53740 +
53741 +               skb = np->rx_skbs[rx->id];
53742 +               add_id_to_freelist(np->rx_skbs, rx->id);
53743 +
53744 +               /* NB. We handle skb overflow later. */
53745 +               skb->data = skb->head + rx->offset;
53746 +               skb->len  = rx->status;
53747 +               skb->tail = skb->data + skb->len;
53748 +
53749 +               /*
53750 +                * Old backends do not assert data_validated but we
53751 +                * can infer it from csum_blank so test both flags.
53752 +                */
53753 +               if (rx->flags & (NETRXF_data_validated|NETRXF_csum_blank)) {
53754 +                       skb->ip_summed = CHECKSUM_UNNECESSARY;
53755 +                       skb->proto_data_valid = 1;
53756 +               } else {
53757 +                       skb->ip_summed = CHECKSUM_NONE;
53758 +                       skb->proto_data_valid = 0;
53759 +               }
53760 +               skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank);
53761 +
53762 +               np->stats.rx_packets++;
53763 +               np->stats.rx_bytes += rx->status;
53764 +
53765 +               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
53766 +                       /* Remap the page. */
53767 +                       MULTI_update_va_mapping(mcl, (unsigned long)skb->head,
53768 +                                               pfn_pte_ma(mfn, PAGE_KERNEL),
53769 +                                               0);
53770 +                       mcl++;
53771 +                       mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
53772 +                               | MMU_MACHPHYS_UPDATE;
53773 +                       mmu->val = __pa(skb->head) >> PAGE_SHIFT;
53774 +                       mmu++;
53775 +
53776 +                       set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
53777 +                                           mfn);
53778 +               }
53779 +
53780 +               __skb_queue_tail(&rxq, skb);
53781 +       }
53782 +
53783 +       /* Some pages are no longer absent... */
53784 +       balloon_update_driver_allowance(-work_done);
53785 +
53786 +       /* Do all the remapping work, and M2P updates, in one big hypercall. */
53787 +       if (likely((mcl - np->rx_mcl) != 0)) {
53788 +               mcl->op = __HYPERVISOR_mmu_update;
53789 +               mcl->args[0] = (unsigned long)np->rx_mmu;
53790 +               mcl->args[1] = mmu - np->rx_mmu;
53791 +               mcl->args[2] = 0;
53792 +               mcl->args[3] = DOMID_SELF;
53793 +               mcl++;
53794 +               (void)HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
53795 +       }
53796 +
53797 +       while ((skb = __skb_dequeue(&rxq)) != NULL) {
53798 +               if (skb->len > (dev->mtu + ETH_HLEN + 4)) {
53799 +                       if (net_ratelimit())
53800 +                               printk(KERN_INFO "Received packet too big for "
53801 +                                      "MTU (%d > %d)\n",
53802 +                                      skb->len - ETH_HLEN - 4, dev->mtu);
53803 +                       skb->len  = 0;
53804 +                       skb->tail = skb->data;
53805 +                       init_skb_shinfo(skb);
53806 +                       dev_kfree_skb(skb);
53807 +                       continue;
53808 +               }
53809 +
53810 +               /*
53811 +                * Enough room in skbuff for the data we were passed? Also,
53812 +                * Linux expects at least 16 bytes headroom in each rx buffer.
53813 +                */
53814 +               if (unlikely(skb->tail > skb->end) ||
53815 +                   unlikely((skb->data - skb->head) < 16)) {
53816 +                       if (net_ratelimit()) {
53817 +                               if (skb->tail > skb->end)
53818 +                                       printk(KERN_INFO "Received packet "
53819 +                                              "is %zd bytes beyond tail.\n",
53820 +                                              skb->tail - skb->end);
53821 +                               else
53822 +                                       printk(KERN_INFO "Received packet "
53823 +                                              "is %zd bytes before head.\n",
53824 +                                              16 - (skb->data - skb->head));
53825 +                       }
53826 +
53827 +                       nskb = __dev_alloc_skb(skb->len + 2,
53828 +                                              GFP_ATOMIC|__GFP_NOWARN);
53829 +                       if (nskb != NULL) {
53830 +                               skb_reserve(nskb, 2);
53831 +                               skb_put(nskb, skb->len);
53832 +                               memcpy(nskb->data, skb->data, skb->len);
53833 +                               nskb->dev = skb->dev;
53834 +                               nskb->ip_summed = skb->ip_summed;
53835 +                       }
53836 +
53837 +                       /* Reinitialise and then destroy the old skbuff. */
53838 +                       skb->len  = 0;
53839 +                       skb->tail = skb->data;
53840 +                       init_skb_shinfo(skb);
53841 +                       dev_kfree_skb(skb);
53842 +
53843 +                       /* Switch old for new, if we copied the buffer. */
53844 +                       if ((skb = nskb) == NULL)
53845 +                               continue;
53846 +               }
53847 +
53848 +               /* Set the shinfo area, which is hidden behind the data. */
53849 +               init_skb_shinfo(skb);
53850 +               /* Ethernet work: Delayed to here as it peeks the header. */
53851 +               skb->protocol = eth_type_trans(skb, dev);
53852 +
53853 +               /* Pass it up. */
53854 +               netif_receive_skb(skb);
53855 +               dev->last_rx = jiffies;
53856 +       }
53857 +
53858 +       np->rx.rsp_cons = i;
53859 +
53860 +       /* If we get a callback with very few responses, reduce fill target. */
53861 +       /* NB. Note exponential increase, linear decrease. */
53862 +       if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
53863 +            ((3*np->rx_target) / 4)) &&
53864 +           (--np->rx_target < np->rx_min_target))
53865 +               np->rx_target = np->rx_min_target;
53866 +
53867 +       network_alloc_rx_buffers(dev);
53868 +
53869 +       *pbudget   -= work_done;
53870 +       dev->quota -= work_done;
53871 +
53872 +       if (work_done < budget) {
53873 +               local_irq_save(flags);
53874 +
53875 +               RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
53876 +               if (!more_to_do)
53877 +                       __netif_rx_complete(dev);
53878 +
53879 +               local_irq_restore(flags);
53880 +       }
53881 +
53882 +       spin_unlock(&np->rx_lock);
53883 +
53884 +       return more_to_do;
53885 +}
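
netif_poll() above is the consumer half of the same protocol. The invariant to preserve is ordering: snapshot the shared producer index rsp_prod once, issue rmb() so response payloads are read no earlier than the index that advertises them, and only then walk rsp_cons forward. A minimal consumer sketch under the same assumptions (illustrative, not part of the patch):

        /* Consumer-side sketch of the response loop. */
        static void example_rx_consume(struct netfront_info *np)
        {
                RING_IDX rp = np->rx.sring->rsp_prod;  /* shared producer index */
                RING_IDX i;

                rmb();          /* don't read responses before reading rsp_prod */

                for (i = np->rx.rsp_cons; i != rp; i++) {
                        netif_rx_response_t *rsp = RING_GET_RESPONSE(&np->rx, i);
                        /* ... process rsp->id, rsp->status, rsp->flags ... */
                }
                np->rx.rsp_cons = i;   /* everything up to rp is now consumed */
        }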
53886 +
53887 +
53888 +static int network_close(struct net_device *dev)
53889 +{
53890 +       struct netfront_info *np = netdev_priv(dev);
53891 +       np->user_state = UST_CLOSED;
53892 +       netif_stop_queue(np->netdev);
53893 +       return 0;
53894 +}
53895 +
53896 +
53897 +static struct net_device_stats *network_get_stats(struct net_device *dev)
53898 +{
53899 +       struct netfront_info *np = netdev_priv(dev);
53900 +       return &np->stats;
53901 +}
53902 +
53903 +static void network_connect(struct net_device *dev)
53904 +{
53905 +       struct netfront_info *np;
53906 +       int i, requeue_idx;
53907 +       netif_tx_request_t *tx;
53908 +       struct sk_buff *skb;
53909 +
53910 +       np = netdev_priv(dev);
53911 +       spin_lock_irq(&np->tx_lock);
53912 +       spin_lock(&np->rx_lock);
53913 +
53914 +       /* Recovery procedure: */
53915 +
53916 +       /* Step 1: Reinitialise variables. */
53917 +       np->tx_full = 0;
53918 +
53919 +       /*
53920 +        * Step 2: Rebuild the RX and TX ring contents.
53921 +        * NB. We could just free the queued TX packets now but we hope
53922 +        * that sending them out might do some good.  We have to rebuild
53923 +        * the RX ring because some of our pages are currently flipped out
53924 +        * so we can't just free the RX skbs.
53925 +        * NB2. Freelist index entries are always going to be less than
53926 +        * __PAGE_OFFSET, whereas pointers to skbs will always be equal to or
53927 +        * greater than __PAGE_OFFSET: we use this property to distinguish
53928 +        * them (see the sketch after this function).
53929 +        */
53930 +
53931 +       /*
53932 +        * Rebuild the TX buffer freelist and the TX ring itself.
53933 +        * NB. This reorders packets.  We could keep more private state
53934 +        * to avoid this but maybe it doesn't matter so much given the
53935 +        * interface has been down.
53936 +        */
53937 +       for (requeue_idx = 0, i = 1; i <= NET_TX_RING_SIZE; i++) {
53938 +               if ((unsigned long)np->tx_skbs[i] < __PAGE_OFFSET)
53939 +                       continue;
53940 +
53941 +               skb = np->tx_skbs[i];
53942 +
53943 +               tx = RING_GET_REQUEST(&np->tx, requeue_idx);
53944 +               requeue_idx++;
53945 +
53946 +               tx->id = i;
53947 +               gnttab_grant_foreign_access_ref(
53948 +                       np->grant_tx_ref[i], np->xbdev->otherend_id,
53949 +                       virt_to_mfn(np->tx_skbs[i]->data),
53950 +                       GNTMAP_readonly);
53951 +               tx->gref = np->grant_tx_ref[i];
53952 +               tx->offset = (unsigned long)skb->data & ~PAGE_MASK;
53953 +               tx->size = skb->len;
53954 +               tx->flags = 0;
53955 +               if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
53956 +                       tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
53957 +               if (skb->proto_data_valid) /* remote but checksummed? */
53958 +                       tx->flags |= NETTXF_data_validated;
53959 +
53960 +               np->stats.tx_bytes += skb->len;
53961 +               np->stats.tx_packets++;
53962 +       }
53963 +
53964 +       np->tx.req_prod_pvt = requeue_idx;
53965 +       RING_PUSH_REQUESTS(&np->tx);
53966 +
53967 +       /* Rebuild the RX buffer freelist and the RX ring itself. */
53968 +       for (requeue_idx = 0, i = 1; i <= NET_RX_RING_SIZE; i++) {
53969 +               if ((unsigned long)np->rx_skbs[i] < __PAGE_OFFSET)
53970 +                       continue;
53971 +               gnttab_grant_foreign_transfer_ref(
53972 +                       np->grant_rx_ref[i], np->xbdev->otherend_id,
53973 +                       __pa(np->rx_skbs[i]->data) >> PAGE_SHIFT);
53974 +               RING_GET_REQUEST(&np->rx, requeue_idx)->gref =
53975 +                       np->grant_rx_ref[i];
53976 +               RING_GET_REQUEST(&np->rx, requeue_idx)->id = i;
53977 +               requeue_idx++;
53978 +       }
53979 +
53980 +       np->rx.req_prod_pvt = requeue_idx;
53981 +       RING_PUSH_REQUESTS(&np->rx);
53982 +
53983 +       /*
53984 +        * Step 3: All public and private state should now be sane.  Get
53985 +        * ready to start sending and receiving packets and give the driver
53986 +        * domain a kick because we've probably just requeued some
53987 +        * packets.
53988 +        */
53989 +       np->backend_state = BEST_CONNECTED;
53990 +       notify_remote_via_irq(np->irq);
53991 +       network_tx_buf_gc(dev);
53992 +
53993 +       if (np->user_state == UST_OPEN)
53994 +               netif_start_queue(dev);
53995 +
53996 +       spin_unlock(&np->rx_lock);
53997 +       spin_unlock_irq(&np->tx_lock);
53998 +}
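
The NB2 comment above relies on a compact freelist encoding: a slot in tx_skbs[]/rx_skbs[] holds either a small integer (the index of the next free slot, always below __PAGE_OFFSET) or a real skb pointer (always at or above __PAGE_OFFSET, the start of the kernel's direct mapping). A sketch of the two helpers this implies, matching the get_id_from_freelist()/add_id_to_freelist() calls used above (illustrative; the actual helpers live elsewhere in this patch):

        /* Slot 0 is the list head; free slots chain to each other by index. */
        static inline void example_add_id_to_freelist(struct sk_buff **list,
                                                      unsigned int id)
        {
                list[id] = list[0];                     /* old head follows us */
                list[0]  = (void *)(unsigned long)id;   /* we are the new head */
        }

        static inline unsigned int example_get_id_from_freelist(struct sk_buff **list)
        {
                unsigned int id = (unsigned int)(unsigned long)list[0];
                list[0] = list[id];                     /* pop the head entry  */
                return id;
        }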
53999 +
54000 +static void show_device(struct netfront_info *np)
54001 +{
54002 +#ifdef DEBUG
54003 +       if (np) {
54004 +               IPRINTK("<vif handle=%u %s(%s) evtchn=%u tx=%p rx=%p>\n",
54005 +                       np->handle,
54006 +                       be_state_name[np->backend_state],
54007 +                       np->user_state ? "open" : "closed",
54008 +                       np->evtchn,
54009 +                       &np->tx,
54010 +                       &np->rx);
54011 +       } else
54012 +               IPRINTK("<vif NULL>\n");
54013 +#endif
54014 +}
54015 +
54016 +static void netif_uninit(struct net_device *dev)
54017 +{
54018 +       struct netfront_info *np = netdev_priv(dev);
54019 +       gnttab_free_grant_references(np->gref_tx_head);
54020 +       gnttab_free_grant_references(np->gref_rx_head);
54021 +}
54022 +
54023 +static struct ethtool_ops network_ethtool_ops =
54024 +{
54025 +       .get_tx_csum = ethtool_op_get_tx_csum,
54026 +       .set_tx_csum = ethtool_op_set_tx_csum,
54027 +};
54028 +
54029 +/** Create a network device.
54030 + * @param handle device handle
54031 + * @param val return parameter for created device
54032 + * @return 0 on success, error code otherwise
54033 + */
54034 +static int create_netdev(int handle, struct xenbus_device *dev,
54035 +                        struct net_device **val)
54036 +{
54037 +       int i, err = 0;
54038 +       struct net_device *netdev = NULL;
54039 +       struct netfront_info *np = NULL;
54040 +
54041 +       if ((netdev = alloc_etherdev(sizeof(struct netfront_info))) == NULL) {
54042 +               printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
54043 +                      __FUNCTION__);
54044 +               err = -ENOMEM;
54045 +               goto exit;
54046 +       }
54047 +
54048 +       np                = netdev_priv(netdev);
54049 +       np->backend_state = BEST_CLOSED;
54050 +       np->user_state    = UST_CLOSED;
54051 +       np->handle        = handle;
54052 +       np->xbdev         = dev;
54053 +
54054 +       spin_lock_init(&np->tx_lock);
54055 +       spin_lock_init(&np->rx_lock);
54056 +
54057 +       skb_queue_head_init(&np->rx_batch);
54058 +       np->rx_target     = RX_DFL_MIN_TARGET;
54059 +       np->rx_min_target = RX_DFL_MIN_TARGET;
54060 +       np->rx_max_target = RX_MAX_TARGET;
54061 +
54062 +       init_timer(&np->rx_refill_timer);
54063 +       np->rx_refill_timer.data = (unsigned long)netdev;
54064 +       np->rx_refill_timer.function = rx_refill_timeout;
54065 +
54066 +       /* Initialise {tx,rx}_skbs as a free chain containing every entry. */
54067 +       for (i = 0; i <= NET_TX_RING_SIZE; i++) {
54068 +               np->tx_skbs[i] = (void *)((unsigned long) i+1);
54069 +               np->grant_tx_ref[i] = GRANT_INVALID_REF;
54070 +       }
54071 +
54072 +       for (i = 0; i <= NET_RX_RING_SIZE; i++) {
54073 +               np->rx_skbs[i] = (void *)((unsigned long) i+1);
54074 +               np->grant_rx_ref[i] = GRANT_INVALID_REF;
54075 +       }
54076 +
54077 +       /* A grant for every tx ring slot */
54078 +       if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
54079 +                                         &np->gref_tx_head) < 0) {
54080 +               printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
54081 +               err = -ENOMEM;
54082 +               goto exit;
54083 +       }
54084 +       /* A grant for every rx ring slot */
54085 +       if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
54086 +                                         &np->gref_rx_head) < 0) {
54087 +               printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
54088 +               gnttab_free_grant_references(np->gref_tx_head);
54089 +               err = -ENOMEM;
54090 +               goto exit;
54091 +       }
54092 +
54093 +       netdev->open            = network_open;
54094 +       netdev->hard_start_xmit = network_start_xmit;
54095 +       netdev->stop            = network_close;
54096 +       netdev->get_stats       = network_get_stats;
54097 +       netdev->poll            = netif_poll;
54098 +       netdev->uninit          = netif_uninit;
54099 +       netdev->weight          = 64;
54100 +       netdev->features        = NETIF_F_IP_CSUM;
54101 +
54102 +       SET_ETHTOOL_OPS(netdev, &network_ethtool_ops);
54103 +       SET_MODULE_OWNER(netdev);
54104 +       SET_NETDEV_DEV(netdev, &dev->dev);
54105 +
54106 +       if ((err = register_netdev(netdev)) != 0) {
54107 +               printk(KERN_WARNING "%s> register_netdev err=%d\n",
54108 +                      __FUNCTION__, err);
54109 +               goto exit_free_grefs;
54110 +       }
54111 +
54112 +       if ((err = xennet_proc_addif(netdev)) != 0) {
54113 +               unregister_netdev(netdev);
54114 +               goto exit_free_grefs;
54115 +       }
54116 +
54117 +       np->netdev = netdev;
54118 +
54119 + exit:
54120 +       if (err != 0 && netdev != NULL)
54121 +               free_netdev(netdev);
54122 +       else if (val != NULL)
54123 +               *val = netdev;
54124 +       return err;
54125 +
54126 + exit_free_grefs:
54127 +       gnttab_free_grant_references(np->gref_tx_head);
54128 +       gnttab_free_grant_references(np->gref_rx_head);
54129 +       goto exit;
54130 +}
54131 +
54132 +/*
54133 + * We use this notifier to send out a fake ARP reply to reset switches and
54134 + * router ARP caches when an IP interface is brought up on a VIF.
54135 + */
54136 +static int
54137 +inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr)
54138 +{
54139 +       struct in_ifaddr  *ifa = (struct in_ifaddr *)ptr;
54140 +       struct net_device *dev = ifa->ifa_dev->dev;
54141 +
54142 +       /* UP event and is it one of our devices? */
54143 +       if (event == NETDEV_UP && dev->open == network_open)
54144 +               (void)send_fake_arp(dev);
54145 +
54146 +       return NOTIFY_DONE;
54147 +}
54148 +
54149 +
54150 +/* ** Close down ** */
54151 +
54152 +
54153 +/**
54154 + * Handle the change of state of the backend to Closing.  We must delete our
54155 + * device-layer structures now, to ensure that writes are flushed through to
54156 + * the backend.  Once this is done, we can switch to Closed in
54157 + * acknowledgement.
54158 + */
54159 +static void netfront_closing(struct xenbus_device *dev)
54160 +{
54161 +       struct netfront_info *info = dev->data;
54162 +
54163 +       DPRINTK("netfront_closing: %s removed\n", dev->nodename);
54164 +
54165 +       close_netdev(info);
54166 +
54167 +       xenbus_switch_state(dev, XenbusStateClosed);
54168 +}
54169 +
54170 +
54171 +static int netfront_remove(struct xenbus_device *dev)
54172 +{
54173 +       struct netfront_info *info = dev->data;
54174 +
54175 +       DPRINTK("%s\n", dev->nodename);
54176 +
54177 +       netif_disconnect_backend(info);
54178 +       free_netdev(info->netdev);
54179 +
54180 +       return 0;
54181 +}
54182 +
54183 +
54184 +static void close_netdev(struct netfront_info *info)
54185 +{
54186 +       spin_lock_irq(&info->netdev->xmit_lock);
54187 +       netif_stop_queue(info->netdev);
54188 +       spin_unlock_irq(&info->netdev->xmit_lock);
54189 +
54190 +#ifdef CONFIG_PROC_FS
54191 +       xennet_proc_delif(info->netdev);
54192 +#endif
54193 +
54194 +       del_timer_sync(&info->rx_refill_timer);
54195 +
54196 +       unregister_netdev(info->netdev);
54197 +}
54198 +
54199 +
54200 +static void netif_disconnect_backend(struct netfront_info *info)
54201 +{
54202 +       /* Stop old i/f to prevent errors whilst we rebuild the state. */
54203 +       spin_lock_irq(&info->tx_lock);
54204 +       spin_lock(&info->rx_lock);
54205 +       info->backend_state = BEST_DISCONNECTED;
54206 +       spin_unlock(&info->rx_lock);
54207 +       spin_unlock_irq(&info->tx_lock);
54208 +
54209 +       if (info->irq)
54210 +               unbind_from_irqhandler(info->irq, info->netdev);
54211 +       info->evtchn = info->irq = 0;
54212 +
54213 +       end_access(info->tx_ring_ref, info->tx.sring);
54214 +       end_access(info->rx_ring_ref, info->rx.sring);
54215 +       info->tx_ring_ref = GRANT_INVALID_REF;
54216 +       info->rx_ring_ref = GRANT_INVALID_REF;
54217 +       info->tx.sring = NULL;
54218 +       info->rx.sring = NULL;
54219 +}
54220 +
54221 +
54222 +static void netif_free(struct netfront_info *info)
54223 +{
54224 +       close_netdev(info);
54225 +       netif_disconnect_backend(info);
54226 +       free_netdev(info->netdev);
54227 +}
54228 +
54229 +
54230 +static void end_access(int ref, void *page)
54231 +{
54232 +       if (ref != GRANT_INVALID_REF)
54233 +               gnttab_end_foreign_access(ref, 0, (unsigned long)page);
54234 +}
54235 +
54236 +
54237 +/* ** Driver registration ** */
54238 +
54239 +
54240 +static struct xenbus_device_id netfront_ids[] = {
54241 +       { "vif" },
54242 +       { "" }
54243 +};
54244 +
54245 +
54246 +static struct xenbus_driver netfront = {
54247 +       .name = "vif",
54248 +       .owner = THIS_MODULE,
54249 +       .ids = netfront_ids,
54250 +       .probe = netfront_probe,
54251 +       .remove = netfront_remove,
54252 +       .resume = netfront_resume,
54253 +       .otherend_changed = backend_changed,
54254 +};
54255 +
54256 +
54257 +static struct notifier_block notifier_inetdev = {
54258 +       .notifier_call  = inetdev_notify,
54259 +       .next           = NULL,
54260 +       .priority       = 0
54261 +};
54262 +
54263 +static int __init netif_init(void)
54264 +{
54265 +       int err = 0;
54266 +
54267 +       if (xen_start_info->flags & SIF_INITDOMAIN)
54268 +               return 0;
54269 +
54270 +       if ((err = xennet_proc_init()) != 0)
54271 +               return err;
54272 +
54273 +       IPRINTK("Initialising virtual ethernet driver.\n");
54274 +
54275 +       (void)register_inetaddr_notifier(&notifier_inetdev);
54276 +
54277 +       return xenbus_register_frontend(&netfront);
54278 +}
54279 +module_init(netif_init);
54280 +
54281 +
54282 +static void netif_exit(void)
54283 +{
54284 +       unregister_inetaddr_notifier(&notifier_inetdev);
54285 +
54286 +       return xenbus_unregister_driver(&netfront);
54287 +}
54288 +module_exit(netif_exit);
54289 +
54290 +MODULE_LICENSE("Dual BSD/GPL");
54291 +
54292 +
54293 +/* ** /proc **/
54294 +
54295 +
54296 +#ifdef CONFIG_PROC_FS
54297 +
54298 +#define TARGET_MIN 0UL
54299 +#define TARGET_MAX 1UL
54300 +#define TARGET_CUR 2UL
54301 +
54302 +static int xennet_proc_read(
54303 +       char *page, char **start, off_t off, int count, int *eof, void *data)
54304 +{
54305 +       struct net_device *dev =
54306 +               (struct net_device *)((unsigned long)data & ~3UL);
54307 +       struct netfront_info *np = netdev_priv(dev);
54308 +       int len = 0, which_target = (long)data & 3;
54309 +
54310 +       switch (which_target) {
54311 +       case TARGET_MIN:
54312 +               len = sprintf(page, "%d\n", np->rx_min_target);
54313 +               break;
54314 +       case TARGET_MAX:
54315 +               len = sprintf(page, "%d\n", np->rx_max_target);
54316 +               break;
54317 +       case TARGET_CUR:
54318 +               len = sprintf(page, "%d\n", np->rx_target);
54319 +               break;
54320 +       }
54321 +
54322 +       *eof = 1;
54323 +       return len;
54324 +}
54325 +
54326 +static int xennet_proc_write(
54327 +       struct file *file, const char __user *buffer,
54328 +       unsigned long count, void *data)
54329 +{
54330 +       struct net_device *dev =
54331 +               (struct net_device *)((unsigned long)data & ~3UL);
54332 +       struct netfront_info *np = netdev_priv(dev);
54333 +       int which_target = (long)data & 3;
54334 +       char string[64];
54335 +       long target;
54336 +
54337 +       if (!capable(CAP_SYS_ADMIN))
54338 +               return -EPERM;
54339 +
54340 +       if (count <= 1)
54341 +               return -EBADMSG; /* runt */
54342 +       if (count >= sizeof(string))
54343 +               return -EFBIG;   /* too long */
54344 +
54345 +       if (copy_from_user(string, buffer, count))
54346 +               return -EFAULT;
54347 +       string[count] = '\0';
54348 +
54349 +       target = simple_strtol(string, NULL, 10);
54350 +       if (target < RX_MIN_TARGET)
54351 +               target = RX_MIN_TARGET;
54352 +       if (target > RX_MAX_TARGET)
54353 +               target = RX_MAX_TARGET;
54354 +
54355 +       spin_lock(&np->rx_lock);
54356 +
54357 +       switch (which_target) {
54358 +       case TARGET_MIN:
54359 +               if (target > np->rx_max_target)
54360 +                       np->rx_max_target = target;
54361 +               np->rx_min_target = target;
54362 +               if (target > np->rx_target)
54363 +                       np->rx_target = target;
54364 +               break;
54365 +       case TARGET_MAX:
54366 +               if (target < np->rx_min_target)
54367 +                       np->rx_min_target = target;
54368 +               np->rx_max_target = target;
54369 +               if (target < np->rx_target)
54370 +                       np->rx_target = target;
54371 +               break;
54372 +       case TARGET_CUR:
54373 +               break;
54374 +       }
54375 +
54376 +       network_alloc_rx_buffers(dev);
54377 +
54378 +       spin_unlock(&np->rx_lock);
54379 +
54380 +       return count;
54381 +}
54382 +
54383 +static int xennet_proc_init(void)
54384 +{
54385 +       if (proc_mkdir("xen/net", NULL) == NULL)
54386 +               return -ENOMEM;
54387 +       return 0;
54388 +}
54389 +
54390 +static int xennet_proc_addif(struct net_device *dev)
54391 +{
54392 +       struct proc_dir_entry *dir, *min, *max, *cur;
54393 +       char name[64];
54394 +
54395 +       sprintf(name, "xen/net/%s", dev->name);
54396 +
54397 +       dir = proc_mkdir(name, NULL);
54398 +       if (!dir)
54399 +               goto nomem;
54400 +
54401 +       min = create_proc_entry("rxbuf_min", 0644, dir);
54402 +       max = create_proc_entry("rxbuf_max", 0644, dir);
54403 +       cur = create_proc_entry("rxbuf_cur", 0444, dir);
54404 +       if (!min || !max || !cur)
54405 +               goto nomem;
54406 +
54407 +       min->read_proc  = xennet_proc_read;
54408 +       min->write_proc = xennet_proc_write;
54409 +       min->data       = (void *)((unsigned long)dev | TARGET_MIN);
54410 +
54411 +       max->read_proc  = xennet_proc_read;
54412 +       max->write_proc = xennet_proc_write;
54413 +       max->data       = (void *)((unsigned long)dev | TARGET_MAX);
54414 +
54415 +       cur->read_proc  = xennet_proc_read;
54416 +       cur->write_proc = xennet_proc_write;
54417 +       cur->data       = (void *)((unsigned long)dev | TARGET_CUR);
54418 +
54419 +       return 0;
54420 +
54421 + nomem:
54422 +       xennet_proc_delif(dev);
54423 +       return -ENOMEM;
54424 +}
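
All three proc files above share one read and one write handler by tagging the low two bits of the net_device pointer with TARGET_MIN/TARGET_MAX/TARGET_CUR; this is safe because the device structure is at least 4-byte aligned, so those bits are otherwise always zero. From userspace the knobs would then be driven with, e.g., `echo 64 > /proc/xen/net/eth0/rxbuf_min` (device name assumed). A sketch of the tag round trip (illustrative, not part of the patch):

        static void example_tag_roundtrip(struct net_device *dev)
        {
                /* Pack: borrow the two alignment bits for the selector. */
                void *data = (void *)((unsigned long)dev | TARGET_MAX);

                /* Unpack: mask the tag off to recover the pointer ... */
                struct net_device *back =
                        (struct net_device *)((unsigned long)data & ~3UL);
                /* ... and keep the tag to select min/max/cur behaviour. */
                int which = (long)data & 3;             /* == TARGET_MAX here */

                (void)back;
                (void)which;
        }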
54425 +
54426 +static void xennet_proc_delif(struct net_device *dev)
54427 +{
54428 +       char name[64];
54429 +
54430 +       sprintf(name, "xen/net/%s/rxbuf_min", dev->name);
54431 +       remove_proc_entry(name, NULL);
54432 +
54433 +       sprintf(name, "xen/net/%s/rxbuf_max", dev->name);
54434 +       remove_proc_entry(name, NULL);
54435 +
54436 +       sprintf(name, "xen/net/%s/rxbuf_cur", dev->name);
54437 +       remove_proc_entry(name, NULL);
54438 +
54439 +       sprintf(name, "xen/net/%s", dev->name);
54440 +       remove_proc_entry(name, NULL);
54441 +}
54442 +
54443 +#endif
54444 +
54445 +
54446 +/*
54447 + * Local variables:
54448 + *  c-file-style: "linux"
54449 + *  indent-tabs-mode: t
54450 + *  c-indent-level: 8
54451 + *  c-basic-offset: 8
54452 + *  tab-width: 8
54453 + * End:
54454 + */
54455 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pciback/Makefile linux-2.6.16/drivers/xen/pciback/Makefile
54456 --- linux-2.6.16.orig/drivers/xen/pciback/Makefile      1970-01-01 01:00:00.000000000 +0100
54457 +++ linux-2.6.16/drivers/xen/pciback/Makefile   2006-06-26 09:51:32.000000000 +0200
54458 @@ -0,0 +1,10 @@
54459 +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback.o
54460 +
54461 +pciback-y := pci_stub.o pciback_ops.o xenbus.o
54462 +pciback-y += conf_space.o conf_space_header.o
54463 +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
54464 +pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
54465 +
54466 +ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
54467 +EXTRA_CFLAGS += -DDEBUG
54468 +endif
54469 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pciback/conf_space.c linux-2.6.16/drivers/xen/pciback/conf_space.c
54470 --- linux-2.6.16.orig/drivers/xen/pciback/conf_space.c  1970-01-01 01:00:00.000000000 +0100
54471 +++ linux-2.6.16/drivers/xen/pciback/conf_space.c       2006-06-26 09:51:32.000000000 +0200
54472 @@ -0,0 +1,345 @@
54473 +/*
54474 + * PCI Backend - Functions for creating a virtual configuration space for
54475 + *               exported PCI Devices.
54476 + *               It's dangerous to allow PCI Driver Domains to change their
54477 + *               device's resources (memory, i/o ports, interrupts). We need to
54478 + *               restrict changes to certain PCI Configuration registers:
54479 + *               BARs, INTERRUPT_PIN, most registers in the header...
54480 + *
54481 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
54482 + */
54483 +
54484 +#include <linux/kernel.h>
54485 +#include <linux/pci.h>
54486 +#include "pciback.h"
54487 +#include "conf_space.h"
54488 +
54489 +static int permissive = 0;
54490 +module_param(permissive, bool, 0644);
54491 +
54492 +#define DEFINE_PCI_CONFIG(op,size,type)                                \
54493 +int pciback_##op##_config_##size                                       \
54494 +(struct pci_dev *dev, int offset, type value, void *data)              \
54495 +{                                                                      \
54496 +       return pci_##op##_config_##size(dev, offset, value);            \
54497 +}
54498 +
54499 +DEFINE_PCI_CONFIG(read, byte, u8 *)
54500 +DEFINE_PCI_CONFIG(read, word, u16 *)
54501 +DEFINE_PCI_CONFIG(read, dword, u32 *)
54502 +
54503 +DEFINE_PCI_CONFIG(write, byte, u8)
54504 +DEFINE_PCI_CONFIG(write, word, u16)
54505 +DEFINE_PCI_CONFIG(write, dword, u32)
54506 +
54507 +static int conf_space_read(struct pci_dev *dev,
54508 +                          struct config_field_entry *entry, int offset,
54509 +                          u32 * value)
54510 +{
54511 +       int ret = 0;
54512 +       struct config_field *field = entry->field;
54513 +
54514 +       *value = 0;
54515 +
54516 +       switch (field->size) {
54517 +       case 1:
54518 +               if (field->u.b.read)
54519 +                       ret = field->u.b.read(dev, offset, (u8 *) value,
54520 +                                             entry->data);
54521 +               break;
54522 +       case 2:
54523 +               if (field->u.w.read)
54524 +                       ret = field->u.w.read(dev, offset, (u16 *) value,
54525 +                                             entry->data);
54526 +               break;
54527 +       case 4:
54528 +               if (field->u.dw.read)
54529 +                       ret = field->u.dw.read(dev, offset, value, entry->data);
54530 +               break;
54531 +       }
54532 +       return ret;
54533 +}
54534 +
54535 +static int conf_space_write(struct pci_dev *dev,
54536 +                           struct config_field_entry *entry, int offset,
54537 +                           u32 value)
54538 +{
54539 +       int ret = 0;
54540 +       struct config_field *field = entry->field;
54541 +
54542 +       switch (field->size) {
54543 +       case 1:
54544 +               if (field->u.b.write)
54545 +                       ret = field->u.b.write(dev, offset, (u8) value,
54546 +                                              entry->data);
54547 +               break;
54548 +       case 2:
54549 +               if (field->u.w.write)
54550 +                       ret = field->u.w.write(dev, offset, (u16) value,
54551 +                                              entry->data);
54552 +               break;
54553 +       case 4:
54554 +               if (field->u.dw.write)
54555 +                       ret = field->u.dw.write(dev, offset, value,
54556 +                                               entry->data);
54557 +               break;
54558 +       }
54559 +       return ret;
54560 +}
54561 +
54562 +static inline u32 get_mask(int size)
54563 +{
54564 +       if (size == 1)
54565 +               return 0xff;
54566 +       else if (size == 2)
54567 +               return 0xffff;
54568 +       else
54569 +               return 0xffffffff;
54570 +}
54571 +
54572 +static inline int valid_request(int offset, int size)
54573 +{
54574 +       /* Validate request (no un-aligned requests) */
54575 +       if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
54576 +               return 1;
54577 +       return 0;
54578 +}
54579 +
54580 +static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
54581 +                             int offset)
54582 +{
54583 +       if (offset >= 0) {
54584 +               new_val_mask <<= (offset * 8);
54585 +               new_val <<= (offset * 8);
54586 +       } else {
54587 +               new_val_mask >>= (offset * -8);
54588 +               new_val >>= (offset * -8);
54589 +       }
54590 +       val = (val & ~new_val_mask) | (new_val & new_val_mask);
54591 +
54592 +       return val;
54593 +}
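
merge_value() splices a narrower write into a 32-bit window: a positive offset shifts the new value (and its mask) up to where the field sits inside the window, a negative offset shifts it down. A worked example, assuming a 16-bit write of 0xbeef landing in the top half of an existing dword (illustrative arithmetic, not part of the patch):

        /*
         *   merge_value(0x11223344, 0xbeef, get_mask(2), 2)
         *     new_val_mask = 0xffff << 16 = 0xffff0000
         *     new_val      = 0xbeef << 16 = 0xbeef0000
         *     result       = (0x11223344 & ~0xffff0000) | 0xbeef0000
         *                  = 0xbeef3344
         */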
54594 +
54595 +static int pcibios_err_to_errno(int err)
54596 +{
54597 +       switch (err) {
54598 +       case PCIBIOS_SUCCESSFUL:
54599 +               return XEN_PCI_ERR_success;
54600 +       case PCIBIOS_DEVICE_NOT_FOUND:
54601 +               return XEN_PCI_ERR_dev_not_found;
54602 +       case PCIBIOS_BAD_REGISTER_NUMBER:
54603 +               return XEN_PCI_ERR_invalid_offset;
54604 +       case PCIBIOS_FUNC_NOT_SUPPORTED:
54605 +               return XEN_PCI_ERR_not_implemented;
54606 +       case PCIBIOS_SET_FAILED:
54607 +               return XEN_PCI_ERR_access_denied;
54608 +       }
54609 +       return err;
54610 +}
54611 +
54612 +int pciback_config_read(struct pci_dev *dev, int offset, int size,
54613 +                       u32 * ret_val)
54614 +{
54615 +       int err = 0;
54616 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
54617 +       struct config_field_entry *cfg_entry;
54618 +       struct config_field *field;
54619 +       int req_start, req_end, field_start, field_end;
54620 +       /* if read fails for any reason, return 0 (as if device didn't respond) */
54621 +       u32 value = 0, tmp_val;
54622 +
54623 +       if (unlikely(verbose_request))
54624 +               printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
54625 +                      pci_name(dev), size, offset);
54626 +
54627 +       if (!valid_request(offset, size)) {
54628 +               err = XEN_PCI_ERR_invalid_offset;
54629 +               goto out;
54630 +       }
54631 +
54632 +       /* Get the real value first, then modify as appropriate */
54633 +       switch (size) {
54634 +       case 1:
54635 +               err = pci_read_config_byte(dev, offset, (u8 *) & value);
54636 +               break;
54637 +       case 2:
54638 +               err = pci_read_config_word(dev, offset, (u16 *) & value);
54639 +               break;
54640 +       case 4:
54641 +               err = pci_read_config_dword(dev, offset, &value);
54642 +               break;
54643 +       }
54644 +
54645 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
54646 +               field = cfg_entry->field;
54647 +
54648 +               req_start = offset;
54649 +               req_end = offset + size;
54650 +               field_start = field->offset;
54651 +               field_end = field->offset + field->size;
54652 +
54653 +               if ((req_start >= field_start && req_start < field_end)
54654 +                   || (req_end > field_start && req_end <= field_end)) {
54655 +                       err = conf_space_read(dev, cfg_entry, field_start,
54656 +                                             &tmp_val);
54657 +                       if (err)
54658 +                               goto out;
54659 +
54660 +                       value = merge_value(value, tmp_val,
54661 +                                           get_mask(field->size),
54662 +                                           field_start - req_start);
54663 +               }
54664 +       }
54665 +
54666 +      out:
54667 +       if (unlikely(verbose_request))
54668 +               printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
54669 +                      pci_name(dev), size, offset, value);
54670 +
54671 +       *ret_val = value;
54672 +       return pcibios_err_to_errno(err);
54673 +}
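
The intersection test in the loop above checks whether the request starts inside a field or ends inside one. Note that a request which strictly contains a smaller field (say, a dword read at 0x3c covering a hypothetical byte field at 0x3d) satisfies neither clause; the canonical half-open interval test below also covers that case (an observation and sketch by way of illustration, not a change to the patch):

        /* [req_start,req_end) and [field_start,field_end) intersect iff: */
        static inline int example_ranges_overlap(int req_start, int req_end,
                                                 int field_start, int field_end)
        {
                return req_start < field_end && req_end > field_start;
        }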
54674 +
54675 +int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
54676 +{
54677 +       int err = 0, handled = 0;
54678 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
54679 +       struct config_field_entry *cfg_entry;
54680 +       struct config_field *field;
54681 +       u32 tmp_val;
54682 +       int req_start, req_end, field_start, field_end;
54683 +
54684 +       if (unlikely(verbose_request))
54685 +               printk(KERN_DEBUG
54686 +                      "pciback: %s: write request %d bytes at 0x%x = %x\n",
54687 +                      pci_name(dev), size, offset, value);
54688 +
54689 +       if (!valid_request(offset, size))
54690 +               return XEN_PCI_ERR_invalid_offset;
54691 +
54692 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
54693 +               field = cfg_entry->field;
54694 +
54695 +               req_start = offset;
54696 +               req_end = offset + size;
54697 +               field_start = field->offset;
54698 +               field_end = field->offset + field->size;
54699 +
54700 +               if ((req_start >= field_start && req_start < field_end)
54701 +                   || (req_end > field_start && req_end <= field_end)) {
54702 +                       tmp_val = 0;
54703 +
54704 +                       err = pciback_config_read(dev, field_start,
54705 +                                                 field->size, &tmp_val);
54706 +                       if (err)
54707 +                               break;
54708 +
54709 +                       tmp_val = merge_value(tmp_val, value, get_mask(size),
54710 +                                             req_start - field_start);
54711 +
54712 +                       err = conf_space_write(dev, cfg_entry, field_start,
54713 +                                              tmp_val);
54714 +                       handled = 1;
54715 +               }
54716 +       }
54717 +
54718 +       if (!handled && !err && permissive) {
54719 +               switch (size) {
54720 +               case 1:
54721 +                       err = pci_write_config_byte(dev, offset, (u8)value);
54722 +                       break;
54723 +               case 2:
54724 +                       err = pci_write_config_word(dev, offset, (u16)value);
54725 +                       break;
54726 +               case 4:
54727 +                       err = pci_write_config_dword(dev, offset, (u32)value);
54728 +                       break;
54729 +               }
54730 +       }
54731 +
54732 +       return pcibios_err_to_errno(err);
54733 +}
54734 +
54735 +void pciback_config_reset(struct pci_dev *dev)
54736 +{
54737 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
54738 +       struct config_field_entry *cfg_entry;
54739 +       struct config_field *field;
54740 +
54741 +       list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
54742 +               field = cfg_entry->field;
54743 +
54744 +               if (field->reset)
54745 +                       field->reset(dev, field->offset, cfg_entry->data);
54746 +       }
54747 +}
54748 +
54749 +void pciback_config_free(struct pci_dev *dev)
54750 +{
54751 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
54752 +       struct config_field_entry *cfg_entry, *t;
54753 +       struct config_field *field;
54754 +
54755 +       list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
54756 +               list_del(&cfg_entry->list);
54757 +
54758 +               field = cfg_entry->field;
54759 +
54760 +               if (field->release)
54761 +                       field->release(dev, field->offset, cfg_entry->data);
54762 +
54763 +               kfree(cfg_entry);
54764 +       }
54765 +}
54766 +
54767 +int pciback_config_add_field(struct pci_dev *dev, struct config_field *field)
54768 +{
54769 +       int err = 0;
54770 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
54771 +       struct config_field_entry *cfg_entry;
54772 +       void *tmp;
54773 +
54774 +       cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
54775 +       if (!cfg_entry) {
54776 +               err = -ENOMEM;
54777 +               goto out;
54778 +       }
54779 +
54780 +       cfg_entry->data = NULL;
54781 +       cfg_entry->field = field;
54782 +
54783 +       if (field->init) {
54784 +               tmp = field->init(dev, field->offset);
54785 +
54786 +               if (IS_ERR(tmp)) {
54787 +                       err = PTR_ERR(tmp);
54788 +                       goto out;
54789 +               }
54790 +
54791 +               cfg_entry->data = tmp;
54792 +       }
54793 +
54794 +       list_add_tail(&cfg_entry->list, &dev_data->config_fields);
54795 +
54796 +      out:
54797 +       if (err)
54798 +               kfree(cfg_entry);
54799 +
54800 +       return err;
54801 +}
54802 +
54803 +/* This sets up the device's virtual configuration space to keep track of
54804 + * certain registers (like the base address registers (BARs)) so that we can
54805 + * keep the client from manipulating them directly.
54806 + */
54807 +int pciback_config_init(struct pci_dev *dev)
54808 +{
54809 +       int err = 0;
54810 +       struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
54811 +
54812 +       INIT_LIST_HEAD(&dev_data->config_fields);
54813 +
54814 +       err = pciback_config_header_add_fields(dev);
54815 +
54816 +       return err;
54817 +}
54818 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pciback/conf_space.h linux-2.6.16/drivers/xen/pciback/conf_space.h
54819 --- linux-2.6.16.orig/drivers/xen/pciback/conf_space.h  1970-01-01 01:00:00.000000000 +0100
54820 +++ linux-2.6.16/drivers/xen/pciback/conf_space.h       2006-06-26 09:51:32.000000000 +0200
54821 @@ -0,0 +1,97 @@
54822 +/*
54823 + * PCI Backend - Common data structures for overriding the configuration space
54824 + *
54825 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
54826 + */
54827 +
54828 +#ifndef __XEN_PCIBACK_CONF_SPACE_H__
54829 +#define __XEN_PCIBACK_CONF_SPACE_H__
54830 +
54831 +#include <linux/list.h>
54832 +
54833 +typedef void *(*conf_field_init) (struct pci_dev * dev, int offset);
54834 +typedef void (*conf_field_reset) (struct pci_dev * dev, int offset, void *data);
54835 +typedef void (*conf_field_free) (struct pci_dev * dev, int offset, void *data);
54836 +
54837 +typedef int (*conf_dword_write) (struct pci_dev * dev, int offset, u32 value,
54838 +                                void *data);
54839 +typedef int (*conf_word_write) (struct pci_dev * dev, int offset, u16 value,
54840 +                               void *data);
54841 +typedef int (*conf_byte_write) (struct pci_dev * dev, int offset, u8 value,
54842 +                               void *data);
54843 +typedef int (*conf_dword_read) (struct pci_dev * dev, int offset, u32 * value,
54844 +                               void *data);
54845 +typedef int (*conf_word_read) (struct pci_dev * dev, int offset, u16 * value,
54846 +                              void *data);
54847 +typedef int (*conf_byte_read) (struct pci_dev * dev, int offset, u8 * value,
54848 +                              void *data);
54849 +
54850 +/* These are the fields within the configuration space which we
54851 + * are interested in intercepting reads/writes to and changing their
54852 + * values.
54853 + */
54854 +struct config_field {
54855 +       unsigned int     offset;
54856 +       unsigned int     size;
54857 +       conf_field_init  init;
54858 +       conf_field_reset reset;
54859 +       conf_field_free  release;
54860 +       union {
54861 +               struct {
54862 +                       conf_dword_write write;
54863 +                       conf_dword_read read;
54864 +               } dw;
54865 +               struct {
54866 +                       conf_word_write write;
54867 +                       conf_word_read read;
54868 +               } w;
54869 +               struct {
54870 +                       conf_byte_write write;
54871 +                       conf_byte_read read;
54872 +               } b;
54873 +       } u;
54874 +};
54875 +
54876 +struct config_field_entry {
54877 +       struct list_head list;
54878 +       struct config_field *field;
54879 +       void *data;
54880 +};
54881 +
54882 +/* Add fields to a device - the add_fields helper expects a pointer to the
54883 + * first entry in an array whose end is marked by an entry with size==0.
54884 + */
54885 +int pciback_config_add_field(struct pci_dev *dev, struct config_field *field);
54886 +static inline int pciback_config_add_fields(struct pci_dev *dev,
54887 +                                           struct config_field *field)
54888 +{
54889 +       int i, err = 0;
54890 +       for (i = 0; field[i].size != 0; i++) {
54891 +               err = pciback_config_add_field(dev, &field[i]);
54892 +               if (err)
54893 +                       break;
54894 +       }
54895 +       return err;
54896 +}
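
A field table for the helper above is therefore a sentinel-terminated array, registered in one call; the header_common/header_0 tables added later in this patch are real instances. A minimal hypothetical example (PCI_VENDOR_ID chosen arbitrarily for illustration, not part of the patch):

        static struct config_field example_fields[] = {
                {
                 .offset   = PCI_VENDOR_ID,
                 .size     = 2,
                 .u.w.read = pciback_read_config_word,  /* pass-through read */
                 },
                {
                 .size = 0,                     /* sentinel: end of table */
                 },
        };

        /* err = pciback_config_add_fields(dev, example_fields); */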
54897 +
54898 +/* Initializers which add fields to the virtual configuration space
54899 + * ** We could add initializers to allow a guest domain to touch
54900 + * the capability lists (for power management, the AGP bridge, etc.)
54901 + */
54902 +int pciback_config_header_add_fields(struct pci_dev *dev);
54903 +
54904 +/* Read/Write the real configuration space */
54905 +int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 * value,
54906 +                            void *data);
54907 +int pciback_read_config_word(struct pci_dev *dev, int offset, u16 * value,
54908 +                            void *data);
54909 +int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 * value,
54910 +                             void *data);
54911 +int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
54912 +                             void *data);
54913 +int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
54914 +                             void *data);
54915 +int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
54916 +                              void *data);
54917 +
54918 +#endif                         /* __XEN_PCIBACK_CONF_SPACE_H__ */
54919 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pciback/conf_space_header.c linux-2.6.16/drivers/xen/pciback/conf_space_header.c
54920 --- linux-2.6.16.orig/drivers/xen/pciback/conf_space_header.c   1970-01-01 01:00:00.000000000 +0100
54921 +++ linux-2.6.16/drivers/xen/pciback/conf_space_header.c        2006-06-26 09:51:32.000000000 +0200
54922 @@ -0,0 +1,267 @@
54923 +/*
54924 + * PCI Backend - Handles the virtual fields in the configuration space headers.
54925 + *
54926 + * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
54927 + */
54928 +
54929 +#include <linux/kernel.h>
54930 +#include <linux/pci.h>
54931 +#include "pciback.h"
54932 +#include "conf_space.h"
54933 +
54934 +struct pci_bar_info {
54935 +       u32 val;
54936 +       u32 len_val;
54937 +       int which;
54938 +};
54939 +
54940 +#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
54941 +#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
54942 +
54943 +static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
54944 +{
54945 +       if (!dev->is_enabled && is_enable_cmd(value)) {
54946 +               if (unlikely(verbose_request))
54947 +                       printk(KERN_DEBUG "pciback: %s: enable\n",
54948 +                              pci_name(dev));
54949 +               pci_enable_device(dev);
54950 +       } else if (dev->is_enabled && !is_enable_cmd(value)) {
54951 +               if (unlikely(verbose_request))
54952 +                       printk(KERN_DEBUG "pciback: %s: disable\n",
54953 +                              pci_name(dev));
54954 +               pci_disable_device(dev);
54955 +       }
54956 +
54957 +       if (!dev->is_busmaster && is_master_cmd(value)) {
54958 +               if (unlikely(verbose_request))
54959 +                       printk(KERN_DEBUG "pciback: %s: set bus master\n",
54960 +                              pci_name(dev));
54961 +               pci_set_master(dev);
54962 +       }
54963 +
54964 +       if (value & PCI_COMMAND_INVALIDATE) {
54965 +               if (unlikely(verbose_request))
54966 +                       printk(KERN_DEBUG
54967 +                              "pciback: %s: enable memory-write-invalidate\n",
54968 +                              pci_name(dev));
54969 +               pci_set_mwi(dev);
54970 +       }
54971 +
54972 +       return pci_write_config_word(dev, offset, value);
54973 +}
54974 +
54975 +static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
54976 +{
54977 +       struct pci_bar_info *bar = data;
54978 +
54979 +       if (unlikely(!bar)) {
54980 +               printk(KERN_WARNING "pciback: driver data not found for %s\n",
54981 +                      pci_name(dev));
54982 +               return XEN_PCI_ERR_op_failed;
54983 +       }
54984 +
54985 +       /* A write to obtain the length must happen as a 32-bit write.
54986 +        * This does not (yet) support writing individual bytes.
54987 +        */
54988 +       if (value == ~PCI_ROM_ADDRESS_ENABLE)
54989 +               bar->which = 1;
54990 +       else
54991 +               bar->which = 0;
54992 +
54993 +       /* Do we need to support enabling/disabling the rom address here? */
54994 +
54995 +       return 0;
54996 +}
54997 +
54998 +/* For the BARs, only allow writes which write ~0 or
54999 + * the correct resource information
55000 + * (Needed for when the driver probes the resource usage)
55001 + */
55002 +static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
55003 +{
55004 +       struct pci_bar_info *bar = data;
55005 +
55006 +       if (unlikely(!bar)) {
55007 +               printk(KERN_WARNING "pciback: driver data not found for %s\n",
55008 +                      pci_name(dev));
55009 +               return XEN_PCI_ERR_op_failed;
55010 +       }
55011 +
55012 +       /* A write to obtain the length must happen as a 32-bit write.
55013 +        * This does not (yet) support writing individual bytes.
55014 +        */
55015 +       if (value == ~0)
55016 +               bar->which = 1;
55017 +       else
55018 +               bar->which = 0;
55019 +
55020 +       return 0;
55021 +}
55022 +
55023 +static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data)
55024 +{
55025 +       struct pci_bar_info *bar = data;
55026 +
55027 +       if (unlikely(!bar)) {
55028 +               printk(KERN_WARNING "pciback: driver data not found for %s\n",
55029 +                      pci_name(dev));
55030 +               return XEN_PCI_ERR_op_failed;
55031 +       }
55032 +
55033 +       *value = bar->which ? bar->len_val : bar->val;
55034 +
55035 +       return 0;
55036 +}
55037 +
55038 +static inline void read_dev_bar(struct pci_dev *dev,
55039 +                               struct pci_bar_info *bar_info, int offset,
55040 +                               u32 len_mask)
55041 +{
55042 +       pci_read_config_dword(dev, offset, &bar_info->val);
55043 +       pci_write_config_dword(dev, offset, len_mask);
55044 +       pci_read_config_dword(dev, offset, &bar_info->len_val);
55045 +       pci_write_config_dword(dev, offset, bar_info->val);
55046 +}
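
read_dev_bar() performs the standard PCI BAR sizing sequence: save the BAR, write all-ones (or all-ones minus the enable bit for the ROM), read back the mask, then restore the saved value. Decoding the size from the captured mask looks like this for a 32-bit memory BAR (a sketch assuming PCI_BASE_ADDRESS_MEM_MASK from the kernel's PCI headers; not part of the patch):

        static u32 example_mem_bar_size(u32 len_val)
        {
                /* Low four bits of a memory BAR are flags, not address bits. */
                u32 mask = len_val & PCI_BASE_ADDRESS_MEM_MASK;

                /* Size is the two's complement of the read-back address mask,
                 * e.g. len_val 0xffff0000 -> 64 KiB. */
                return mask ? ~mask + 1 : 0;
        }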
55047 +
55048 +static void *bar_init(struct pci_dev *dev, int offset)
55049 +{
55050 +       struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
55051 +
55052 +       if (!bar)
55053 +               return ERR_PTR(-ENOMEM);
55054 +
55055 +       read_dev_bar(dev, bar, offset, ~0);
55056 +       bar->which = 0;
55057 +
55058 +       return bar;
55059 +}
55060 +
55061 +static void *rom_init(struct pci_dev *dev, int offset)
55062 +{
55063 +       struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
55064 +
55065 +       if (!bar)
55066 +               return ERR_PTR(-ENOMEM);
55067 +
55068 +       read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
55069 +       bar->which = 0;
55070 +
55071 +       return bar;
55072 +}
55073 +
55074 +static void bar_reset(struct pci_dev *dev, int offset, void *data)
55075 +{
55076 +       struct pci_bar_info *bar = data;
55077 +
55078 +       bar->which = 0;
55079 +}
55080 +
55081 +static void bar_release(struct pci_dev *dev, int offset, void *data)
55082 +{
55083 +       kfree(data);
55084 +}
55085 +
55086 +static int interrupt_read(struct pci_dev *dev, int offset, u8 * value,
55087 +                         void *data)
55088 +{
55089 +       *value = (u8) dev->irq;
55090 +
55091 +       return 0;
55092 +}
55093 +
55094 +struct config_field header_common[] = {
55095 +       {
55096 +        .offset    = PCI_COMMAND,
55097 +        .size      = 2,
55098 +        .u.w.read  = pciback_read_config_word,
55099 +        .u.w.write = command_write,
55100 +        },
55101 +       {
55102 +        .offset    = PCI_INTERRUPT_LINE,
55103 +        .size      = 1,
55104 +        .u.b.read  = interrupt_read,
55105 +        .u.b.write = NULL,
55106 +        },
55107 +       {
55108 +        /* Any side effects of letting driver domain control cache line? */
55109 +        .offset    = PCI_CACHE_LINE_SIZE,
55110 +        .size      = 1,
55111 +        .u.b.read  = pciback_read_config_byte,
55112 +        .u.b.write = pciback_write_config_byte,
55113 +        },
55114 +       {
55115 +        .size = 0,
55116 +        },
55117 +};
55118 +
55119 +#define CFG_FIELD_BAR(reg_offset)                      \
55120 +       {                                               \
55121 +        .offset     = reg_offset,                      \
55122 +        .size       = 4,                               \
55123 +        .init       = bar_init,                        \
55124 +        .reset      = bar_reset,                       \
55125 +        .release    = bar_release,                     \
55126 +        .u.dw.read  = bar_read,                        \
55127 +        .u.dw.write = bar_write,                       \
55128 +        }
55129 +
55130 +#define CFG_FIELD_ROM(reg_offset)                      \
55131 +       {                                               \
55132 +        .offset     = reg_offset,                      \
55133 +        .size       = 4,                               \
55134 +        .init       = rom_init,                        \
55135 +        .reset      = bar_reset,                       \
55136 +        .release    = bar_release,                     \
55137 +        .u.dw.read  = bar_read,                        \
55138 +        .u.dw.write = rom_write,                       \
55139 +        }
55140 +
55141 +struct config_field header_0[] = {
55142 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
55143 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
55144 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
55145 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
55146 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
55147 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
55148 +       CFG_FIELD_ROM(PCI_ROM_ADDRESS),
55149 +       {
55150 +        .size = 0,
55151 +        },
55152 +};
55153 +
55154 +struct config_field header_1[] = {
55155 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
55156 +       CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
55157 +       CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
55158 +       {
55159 +        .size = 0,
55160 +        },
55161 +};
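+/* header_0 describes a type 0 (normal device) header: six 32-bit BARs
+ * plus the expansion ROM at PCI_ROM_ADDRESS.  header_1 describes a type 1
+ * (PCI-to-PCI bridge) header, which has only two BARs and keeps its ROM
+ * BAR at the different offset PCI_ROM_ADDRESS1.
+ */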
55162 +
55163 +int pciback_config_header_add_fields(struct pci_dev *dev)
55164 +{
55165 +       int err;
55166 +
55167 +       err = pciback_config_add_fields(dev, header_common);
55168 +       if (err)
55169 +               goto out;
55170 +
55171 +       switch (dev->hdr_type) {
55172 +       case PCI_HEADER_TYPE_NORMAL:
55173 +               err = pciback_config_add_fields(dev, header_0);
55174 +               break;
55175 +
55176 +       case PCI_HEADER_TYPE_BRIDGE:
55177 +               err = pciback_config_add_fields(dev, header_1);
55178 +               break;
55179 +
55180 +       default:
55181 +               err = -EINVAL;
55182 +               printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
55183 +                      pci_name(dev), dev->hdr_type);
55184 +               break;
55185 +       }
55186 +
55187 +      out:
55188 +       return err;
55189 +}
55190 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pciback/passthrough.c linux-2.6.16/drivers/xen/pciback/passthrough.c
55191 --- linux-2.6.16.orig/drivers/xen/pciback/passthrough.c 1970-01-01 01:00:00.000000000 +0100
55192 +++ linux-2.6.16/drivers/xen/pciback/passthrough.c      2006-06-26 09:51:32.000000000 +0200
55193 @@ -0,0 +1,157 @@
55194 +/*
55195 + * PCI Backend - Provides restricted access to the real PCI bus topology
55196 + *               to the frontend
55197 + *
55198 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
55199 + */
55200 +
55201 +#include <linux/list.h>
55202 +#include <linux/pci.h>
55203 +#include <linux/spinlock.h>
55204 +#include "pciback.h"
55205 +
55206 +struct passthrough_dev_data {
55207 +       /* Access to dev_list must be protected by lock */
55208 +       struct list_head dev_list;
55209 +       spinlock_t lock;
55210 +};
55211 +
55212 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
55213 +                                   unsigned int domain, unsigned int bus,
55214 +                                   unsigned int devfn)
55215 +{
55216 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
55217 +       struct pci_dev_entry *dev_entry;
55218 +       struct pci_dev *dev = NULL;
55219 +       unsigned long flags;
55220 +
55221 +       spin_lock_irqsave(&dev_data->lock, flags);
55222 +
55223 +       list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
55224 +               if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
55225 +                   && bus == (unsigned int)dev_entry->dev->bus->number
55226 +                   && devfn == dev_entry->dev->devfn) {
55227 +                       dev = dev_entry->dev;
55228 +                       break;
55229 +               }
55230 +       }
55231 +
55232 +       spin_unlock_irqrestore(&dev_data->lock, flags);
55233 +
55234 +       return dev;
55235 +}
55236 +
55237 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
55238 +{
55239 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
55240 +       struct pci_dev_entry *dev_entry;
55241 +       unsigned long flags;
55242 +
55243 +       dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
55244 +       if (!dev_entry)
55245 +               return -ENOMEM;
55246 +       dev_entry->dev = dev;
55247 +
55248 +       spin_lock_irqsave(&dev_data->lock, flags);
55249 +       list_add_tail(&dev_entry->list, &dev_data->dev_list);
55250 +       spin_unlock_irqrestore(&dev_data->lock, flags);
55251 +
55252 +       return 0;
55253 +}
55254 +
55255 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
55256 +{
55257 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
55258 +       struct pci_dev_entry *dev_entry, *t;
55259 +       struct pci_dev *found_dev = NULL;
55260 +       unsigned long flags;
55261 +
55262 +       spin_lock_irqsave(&dev_data->lock, flags);
55263 +
55264 +       list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
55265 +               if (dev_entry->dev == dev) {
55266 +                       list_del(&dev_entry->list);
55267 +                       found_dev = dev_entry->dev;
55268 +                       kfree(dev_entry);
55269 +               }
55270 +       }
55271 +
55272 +       spin_unlock_irqrestore(&dev_data->lock, flags);
55273 +
55274 +       if (found_dev)
55275 +               pcistub_put_pci_dev(found_dev);
55276 +}
55277 +
55278 +int pciback_init_devices(struct pciback_device *pdev)
55279 +{
55280 +       struct passthrough_dev_data *dev_data;
55281 +
55282 +       dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
55283 +       if (!dev_data)
55284 +               return -ENOMEM;
55285 +
55286 +       spin_lock_init(&dev_data->lock);
55287 +
55288 +       INIT_LIST_HEAD(&dev_data->dev_list);
55289 +
55290 +       pdev->pci_dev_data = dev_data;
55291 +
55292 +       return 0;
55293 +}
55294 +
55295 +int pciback_publish_pci_roots(struct pciback_device *pdev,
55296 +                             publish_pci_root_cb publish_root_cb)
55297 +{
55298 +       int err = 0;
55299 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
55300 +       struct pci_dev_entry *dev_entry, *e;
55301 +       struct pci_dev *dev;
55302 +       int found;
55303 +       unsigned int domain, bus;
55304 +
55305 +       spin_lock(&dev_data->lock);
55306 +
55307 +       list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
55308 +               /* Only publish this device as a root if none of its
55309 +                * parent bridges are exported
55310 +                */
55311 +               found = 0;
55312 +               dev = dev_entry->dev->bus->self;
55313 +               for (; !found && dev != NULL; dev = dev->bus->self) {
55314 +                       list_for_each_entry(e, &dev_data->dev_list, list) {
55315 +                               if (dev == e->dev) {
55316 +                                       found = 1;
55317 +                                       break;
55318 +                               }
55319 +                       }
55320 +               }
55321 +
55322 +               domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
55323 +               bus = (unsigned int)dev_entry->dev->bus->number;
55324 +
55325 +               if (!found) {
55326 +                       err = publish_root_cb(pdev, domain, bus);
55327 +                       if (err)
55328 +                               break;
55329 +               }
55330 +       }
55331 +
55332 +       spin_unlock(&dev_data->lock);
55333 +
55334 +       return err;
55335 +}
55336 +
55337 +void pciback_release_devices(struct pciback_device *pdev)
55338 +{
55339 +       struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
55340 +       struct pci_dev_entry *dev_entry, *t;
55341 +
55342 +       list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
55343 +               list_del(&dev_entry->list);
55344 +               pcistub_put_pci_dev(dev_entry->dev);
55345 +               kfree(dev_entry);
55346 +       }
55347 +
55348 +       kfree(dev_data);
55349 +       pdev->pci_dev_data = NULL;
55350 +}
55351 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pciback/pci_stub.c linux-2.6.16/drivers/xen/pciback/pci_stub.c
55352 --- linux-2.6.16.orig/drivers/xen/pciback/pci_stub.c    1970-01-01 01:00:00.000000000 +0100
55353 +++ linux-2.6.16/drivers/xen/pciback/pci_stub.c 2006-06-26 09:51:32.000000000 +0200
55354 @@ -0,0 +1,695 @@
55355 +/*
55356 + * PCI Stub Driver - Grabs devices in backend to be exported later
55357 + *
55358 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
55359 + */
55360 +#include <linux/module.h>
55361 +#include <linux/init.h>
55362 +#include <linux/list.h>
55363 +#include <linux/spinlock.h>
55364 +#include <linux/kref.h>
55365 +#include <asm/atomic.h>
55366 +#include "pciback.h"
55367 +
55368 +static char *pci_devs_to_hide = NULL;
55369 +module_param_named(hide, pci_devs_to_hide, charp, 0444);
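+/* Example usage (a common form; the domain part is optional and defaults
+ * to 0 -- see the parser in pcistub_init below):
+ *   built-in:  pciback.hide=(0000:00:1d.0)(02:00.0)
+ *   module:    modprobe pciback hide='(0000:00:1d.0)(02:00.0)'
+ */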
55370 +
55371 +struct pcistub_device_id {
55372 +       struct list_head slot_list;
55373 +       int domain;
55374 +       unsigned char bus;
55375 +       unsigned int devfn;
55376 +};
55377 +static LIST_HEAD(pcistub_device_ids);
55378 +static DEFINE_SPINLOCK(device_ids_lock);
55379 +
55380 +struct pcistub_device {
55381 +       struct kref kref;
55382 +       struct list_head dev_list;
55383 +       spinlock_t lock;
55384 +
55385 +       struct pci_dev *dev;
55386 +       struct pciback_device *pdev;    /* non-NULL if struct pci_dev is in use */
55387 +};
55388 +/* Access to the pcistub_devices & seized_devices lists and to the
55389 + * initialize_devices flag must be protected by pcistub_devices_lock.
55390 + */
55391 +static DEFINE_SPINLOCK(pcistub_devices_lock);
55392 +static LIST_HEAD(pcistub_devices);
55393 +
55394 +/* wait for device_initcall before initializing our devices
55395 + * (see pcistub_init_devices_late)
55396 + */
55397 +static int initialize_devices = 0;
55398 +static LIST_HEAD(seized_devices);
55399 +
55400 +static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
55401 +{
55402 +       struct pcistub_device *psdev;
55403 +
55404 +       dev_dbg(&dev->dev, "pcistub_device_alloc\n");
55405 +
55406 +       psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
55407 +       if (!psdev)
55408 +               return NULL;
55409 +
55410 +       psdev->dev = pci_dev_get(dev);
55411 +       if (!psdev->dev) {
55412 +               kfree(psdev);
55413 +               return NULL;
55414 +       }
55415 +
55416 +       kref_init(&psdev->kref);
55417 +       spin_lock_init(&psdev->lock);
55418 +
55419 +       return psdev;
55420 +}
55421 +
55422 +/* Don't call this directly as it's called by pcistub_device_put */
55423 +static void pcistub_device_release(struct kref *kref)
55424 +{
55425 +       struct pcistub_device *psdev;
55426 +
55427 +       psdev = container_of(kref, struct pcistub_device, kref);
55428 +
55429 +       dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
55430 +
55431 +       /* Clean-up the device */
55432 +       pciback_reset_device(psdev->dev);
55433 +       pciback_config_free(psdev->dev);
55434 +       kfree(pci_get_drvdata(psdev->dev));
55435 +       pci_set_drvdata(psdev->dev, NULL);
55436 +
55437 +       pci_dev_put(psdev->dev);
55438 +
55439 +       kfree(psdev);
55440 +}
55441 +
55442 +static inline void pcistub_device_get(struct pcistub_device *psdev)
55443 +{
55444 +       kref_get(&psdev->kref);
55445 +}
55446 +
55447 +static inline void pcistub_device_put(struct pcistub_device *psdev)
55448 +{
55449 +       kref_put(&psdev->kref, pcistub_device_release);
55450 +}
55451 +
55452 +static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
55453 +                                                 struct pcistub_device *psdev)
55454 +{
55455 +       struct pci_dev *pci_dev = NULL;
55456 +       unsigned long flags;
55457 +
55458 +       pcistub_device_get(psdev);
55459 +
55460 +       spin_lock_irqsave(&psdev->lock, flags);
55461 +       if (!psdev->pdev) {
55462 +               psdev->pdev = pdev;
55463 +               pci_dev = psdev->dev;
55464 +       }
55465 +       spin_unlock_irqrestore(&psdev->lock, flags);
55466 +
55467 +       if (!pci_dev)
55468 +               pcistub_device_put(psdev);
55469 +
55470 +       return pci_dev;
55471 +}
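+/* Reference-count discipline: the list's reference is the one taken in
+ * pcistub_device_alloc (kref_init).  pcistub_device_get_pci_dev takes an
+ * extra reference on behalf of the pciback_device that now owns the
+ * struct pci_dev, and drops it again immediately if the device was
+ * already in use.  pcistub_put_pci_dev below releases that reference.
+ */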
55472 +
55473 +struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
55474 +                                           int domain, int bus,
55475 +                                           int slot, int func)
55476 +{
55477 +       struct pcistub_device *psdev;
55478 +       struct pci_dev *found_dev = NULL;
55479 +       unsigned long flags;
55480 +
55481 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
55482 +
55483 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
55484 +               if (psdev->dev != NULL
55485 +                   && domain == pci_domain_nr(psdev->dev->bus)
55486 +                   && bus == psdev->dev->bus->number
55487 +                   && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
55488 +                       found_dev = pcistub_device_get_pci_dev(pdev, psdev);
55489 +                       break;
55490 +               }
55491 +       }
55492 +
55493 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55494 +       return found_dev;
55495 +}
55496 +
55497 +struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
55498 +                                   struct pci_dev *dev)
55499 +{
55500 +       struct pcistub_device *psdev;
55501 +       struct pci_dev *found_dev = NULL;
55502 +       unsigned long flags;
55503 +
55504 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
55505 +
55506 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
55507 +               if (psdev->dev == dev) {
55508 +                       found_dev = pcistub_device_get_pci_dev(pdev, psdev);
55509 +                       break;
55510 +               }
55511 +       }
55512 +
55513 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55514 +       return found_dev;
55515 +}
55516 +
55517 +void pcistub_put_pci_dev(struct pci_dev *dev)
55518 +{
55519 +       struct pcistub_device *psdev, *found_psdev = NULL;
55520 +       unsigned long flags;
55521 +
55522 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
55523 +
55524 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
55525 +               if (psdev->dev == dev) {
55526 +                       found_psdev = psdev;
55527 +                       break;
55528 +               }
55529 +       }
55530 +
55531 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55532 +
55533 +       /* Clean up the device
55534 +        * (so it's ready for the next domain)
55535 +        */
55536 +       pciback_reset_device(found_psdev->dev);
55537 +       pciback_config_reset(found_psdev->dev);
55538 +
55539 +       spin_lock_irqsave(&found_psdev->lock, flags);
55540 +       found_psdev->pdev = NULL;
55541 +       spin_unlock_irqrestore(&found_psdev->lock, flags);
55542 +
55543 +       pcistub_device_put(found_psdev);
55544 +}
55545 +
55546 +static int __devinit pcistub_match_one(struct pci_dev *dev,
55547 +                                      struct pcistub_device_id *pdev_id)
55548 +{
55549 +       /* Match the specified device by domain, bus, slot and func; the
55550 +        * device also matches if any of its parent bridges match.
55551 +        */
55552 +       for (; dev != NULL; dev = dev->bus->self) {
55553 +               if (pci_domain_nr(dev->bus) == pdev_id->domain
55554 +                   && dev->bus->number == pdev_id->bus
55555 +                   && dev->devfn == pdev_id->devfn)
55556 +                       return 1;
55557 +       }
55558 +
55559 +       return 0;
55560 +}
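+/* Because the loop above walks up through dev->bus->self, listing a
+ * bridge in the hide list implicitly seizes every device behind that
+ * bridge as well.
+ */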
55561 +
55562 +static int __devinit pcistub_match(struct pci_dev *dev)
55563 +{
55564 +       struct pcistub_device_id *pdev_id;
55565 +       unsigned long flags;
55566 +       int found = 0;
55567 +
55568 +       spin_lock_irqsave(&device_ids_lock, flags);
55569 +       list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
55570 +               if (pcistub_match_one(dev, pdev_id)) {
55571 +                       found = 1;
55572 +                       break;
55573 +               }
55574 +       }
55575 +       spin_unlock_irqrestore(&device_ids_lock, flags);
55576 +
55577 +       return found;
55578 +}
55579 +
55580 +static int __devinit pcistub_init_device(struct pci_dev *dev)
55581 +{
55582 +       struct pciback_dev_data *dev_data;
55583 +       int err = 0;
55584 +
55585 +       dev_dbg(&dev->dev, "initializing...\n");
55586 +
55587 +       /* The PCI backend is not intended to be a module (or to work with
55588 +        * removable PCI devices) yet. If it were, pciback_config_free()
55589 +        * would need to be called somewhere to free the memory allocated
55590 +        * here, followed by kfree(pci_get_drvdata(psdev->dev)).
55591 +        */
55592 +       dev_data = kmalloc(sizeof(*dev_data), GFP_ATOMIC);
55593 +       if (!dev_data) {
55594 +               err = -ENOMEM;
55595 +               goto out;
55596 +       }
55597 +       pci_set_drvdata(dev, dev_data);
55598 +
55599 +       dev_dbg(&dev->dev, "initializing config\n");
55600 +       err = pciback_config_init(dev);
55601 +       if (err)
55602 +               goto out;
55603 +
55604 +       /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
55605 +        * must do this here because pcibios_enable_device may specify
55606 +        * the pci device's true irq (and possibly its other resources)
55607 +        * if they differ from what's in the configuration space.
55608 +        * This makes the assumption that the device's resources won't
55609 +        * change after this point (otherwise this code may break!)
55610 +        */
55611 +       dev_dbg(&dev->dev, "enabling device\n");
55612 +       err = pci_enable_device(dev);
55613 +       if (err)
55614 +               goto config_release;
55615 +
55616 +       /* Now disable the device (this also ensures some private device
55617 +        * data is setup before we export)
55618 +        */
55619 +       dev_dbg(&dev->dev, "reset device\n");
55620 +       pciback_reset_device(dev);
55621 +
55622 +       return 0;
55623 +
55624 +      config_release:
55625 +       pciback_config_free(dev);
55626 +
55627 +      out:
55628 +       pci_set_drvdata(dev, NULL);
55629 +       kfree(dev_data);
55630 +       return err;
55631 +}
55632 +
55633 +/*
55634 + * Because some initialization still happens on
55635 + * devices during fs_initcall, we need to defer
55636 + * full initialization of our devices until
55637 + * device_initcall.
55638 + */
55639 +static int __init pcistub_init_devices_late(void)
55640 +{
55641 +       struct pcistub_device *psdev;
55642 +       unsigned long flags;
55643 +       int err = 0;
55644 +
55645 +       pr_debug("pciback: pcistub_init_devices_late\n");
55646 +
55647 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
55648 +
55649 +       while (!list_empty(&seized_devices)) {
55650 +               psdev = container_of(seized_devices.next,
55651 +                                    struct pcistub_device, dev_list);
55652 +               list_del(&psdev->dev_list);
55653 +
55654 +               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55655 +
55656 +               err = pcistub_init_device(psdev->dev);
55657 +               if (err) {
55658 +                       dev_err(&psdev->dev->dev,
55659 +                               "error %d initializing device\n", err);
55660 +                       kfree(psdev);
55661 +                       psdev = NULL;
55662 +               }
55663 +
55664 +               spin_lock_irqsave(&pcistub_devices_lock, flags);
55665 +
55666 +               if (psdev)
55667 +                       list_add_tail(&psdev->dev_list, &pcistub_devices);
55668 +       }
55669 +
55670 +       initialize_devices = 1;
55671 +
55672 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55673 +
55674 +       return 0;
55675 +}
55676 +
55677 +static int __devinit pcistub_seize(struct pci_dev *dev)
55678 +{
55679 +       struct pcistub_device *psdev;
55680 +       unsigned long flags;
55681 +       int initialize_devices_copy;
55682 +       int err = 0;
55683 +
55684 +       psdev = pcistub_device_alloc(dev);
55685 +       if (!psdev)
55686 +               return -ENOMEM;
55687 +
55688 +       /* initialize_devices must be read under the spin lock. But since
55689 +        * it only ever changes from 0 -> 1, once it is 1 it can never
55690 +        * change back, so we don't have to worry about it changing under
55691 +        * us. That's why we can take a *copy* under the lock and check
55692 +        * the copy for 1 outside the lock (never check it for 0 there).
55693 +        */
55694 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
55695 +
55696 +       initialize_devices_copy = initialize_devices;
55697 +
55698 +       if (!initialize_devices_copy) {
55699 +               dev_dbg(&dev->dev, "deferring initialization\n");
55700 +               list_add(&psdev->dev_list, &seized_devices);
55701 +       }
55702 +
55703 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55704 +
55705 +       if (initialize_devices_copy) {
55706 +               /* don't want irqs disabled when calling pcistub_init_device */
55707 +               err = pcistub_init_device(psdev->dev);
55708 +               if (err)
55709 +                       goto out;
55710 +
55711 +               list_add(&psdev->dev_list, &pcistub_devices);
55712 +       }
55713 +
55714 +      out:
55715 +       if (err)
55716 +               pcistub_device_put(psdev);
55717 +
55718 +       return err;
55719 +}
55720 +
55721 +static int __devinit pcistub_probe(struct pci_dev *dev,
55722 +                                  const struct pci_device_id *id)
55723 +{
55724 +       int err = 0;
55725 +
55726 +       dev_dbg(&dev->dev, "probing...\n");
55727 +
55728 +       if (pcistub_match(dev)) {
55729 +
55730 +               if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
55731 +                   && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
55732 +                       dev_err(&dev->dev, "can't export pci devices that "
55733 +                               "don't have a normal (0) or bridge (1) "
55734 +                               "header type!\n");
55735 +                       err = -ENODEV;
55736 +                       goto out;
55737 +               }
55738 +
55739 +               dev_info(&dev->dev, "seizing device\n");
55740 +               err = pcistub_seize(dev);
55741 +       } else
55742 +               /* Device isn't on the hide list: decline to claim it */
55743 +               err = -ENODEV;
55744 +
55745 +      out:
55746 +       return err;
55747 +}
55748 +
55749 +static void pcistub_remove(struct pci_dev *dev)
55750 +{
55751 +       struct pcistub_device *psdev, *found_psdev = NULL;
55752 +       unsigned long flags;
55753 +
55754 +       dev_dbg(&dev->dev, "removing\n");
55755 +
55756 +       spin_lock_irqsave(&pcistub_devices_lock, flags);
55757 +
55758 +       list_for_each_entry(psdev, &pcistub_devices, dev_list) {
55759 +               if (psdev->dev == dev) {
55760 +                       found_psdev = psdev;
55761 +                       break;
55762 +               }
55763 +       }
55764 +
55765 +       spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55766 +
55767 +       if (found_psdev) {
55768 +               dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
55769 +                       found_psdev->pdev);
55770 +
55771 +               if (found_psdev->pdev) {
55772 +                       printk(KERN_WARNING "pciback: ****** removing device "
55773 +                              "%s while still in-use! ******\n",
55774 +                              pci_name(found_psdev->dev));
55775 +                       printk(KERN_WARNING "pciback: ****** driver domain may "
55776 +                              "still access this device's i/o resources!\n");
55777 +                       printk(KERN_WARNING "pciback: ****** shutdown driver "
55778 +                              "domain before binding device\n");
55779 +                       printk(KERN_WARNING "pciback: ****** to other drivers "
55780 +                              "or domains\n");
55781 +
55782 +                       pciback_release_pci_dev(found_psdev->pdev,
55783 +                                               found_psdev->dev);
55784 +               }
55785 +
55786 +               spin_lock_irqsave(&pcistub_devices_lock, flags);
55787 +               list_del(&found_psdev->dev_list);
55788 +               spin_unlock_irqrestore(&pcistub_devices_lock, flags);
55789 +
55790 +               /* the final put for releasing from the list */
55791 +               pcistub_device_put(found_psdev);
55792 +       }
55793 +}
55794 +
55795 +static struct pci_device_id pcistub_ids[] = {
55796 +       {
55797 +        .vendor = PCI_ANY_ID,
55798 +        .device = PCI_ANY_ID,
55799 +        .subvendor = PCI_ANY_ID,
55800 +        .subdevice = PCI_ANY_ID,
55801 +        },
55802 +       {0,},
55803 +};
55804 +
55805 +/*
55806 + * Note: There is no MODULE_DEVICE_TABLE entry here because this driver
55807 + * isn't for a normal device; it should never be loaded automatically.
55808 + */
55809 +
55810 +static struct pci_driver pciback_pci_driver = {
55811 +       .name = "pciback",
55812 +       .id_table = pcistub_ids,
55813 +       .probe = pcistub_probe,
55814 +       .remove = pcistub_remove,
55815 +};
55816 +
55817 +static inline int str_to_slot(const char *buf, int *domain, int *bus,
55818 +                             int *slot, int *func)
55819 +{
55820 +       int err;
55821 +
55822 +       err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
55823 +       if (err == 4)
55824 +               return 0;
55825 +       else if (err < 0)
55826 +               return -EINVAL;
55827 +
55828 +       /* try again without domain */
55829 +       *domain = 0;
55830 +       err = sscanf(buf, " %x:%x.%x", bus, slot, func);
55831 +       if (err == 3)
55832 +               return 0;
55833 +
55834 +       return -EINVAL;
55835 +}
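+/* Accepted slot syntax, per the sscanf formats above:
+ *   "0000:00:1d.1"  (domain:bus:slot.func, all hex)
+ *   "00:1d.1"       (domain omitted and taken as 0)
+ */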
55836 +
55837 +static int pcistub_device_id_add(int domain, int bus, int slot, int func)
55838 +{
55839 +       struct pcistub_device_id *pci_dev_id;
55840 +       unsigned long flags;
55841 +
55842 +       pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
55843 +       if (!pci_dev_id)
55844 +               return -ENOMEM;
55845 +
55846 +       pci_dev_id->domain = domain;
55847 +       pci_dev_id->bus = bus;
55848 +       pci_dev_id->devfn = PCI_DEVFN(slot, func);
55849 +
55850 +       pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
55851 +                domain, bus, slot, func);
55852 +
55853 +       spin_lock_irqsave(&device_ids_lock, flags);
55854 +       list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
55855 +       spin_unlock_irqrestore(&device_ids_lock, flags);
55856 +
55857 +       return 0;
55858 +}
55859 +
55860 +static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
55861 +{
55862 +       struct pcistub_device_id *pci_dev_id, *t;
55863 +       int devfn = PCI_DEVFN(slot, func);
55864 +       int err = -ENOENT;
55865 +       unsigned long flags;
55866 +
55867 +       spin_lock_irqsave(&device_ids_lock, flags);
55868 +       list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, slot_list) {
55869 +
55870 +               if (pci_dev_id->domain == domain
55871 +                   && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
55872 +                       /* Don't break; here because it's possible the same
55873 +                        * slot could be in the list more than once
55874 +                        */
55875 +                       list_del(&pci_dev_id->slot_list);
55876 +                       kfree(pci_dev_id);
55877 +
55878 +                       err = 0;
55879 +
55880 +                       pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
55881 +                                "seize list\n", domain, bus, slot, func);
55882 +               }
55883 +       }
55884 +       spin_unlock_irqrestore(&device_ids_lock, flags);
55885 +
55886 +       return err;
55887 +}
55888 +
55889 +static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
55890 +                               size_t count)
55891 +{
55892 +       int domain, bus, slot, func;
55893 +       int err;
55894 +
55895 +       err = str_to_slot(buf, &domain, &bus, &slot, &func);
55896 +       if (err)
55897 +               goto out;
55898 +
55899 +       err = pcistub_device_id_add(domain, bus, slot, func);
55900 +
55901 +      out:
55902 +       if (!err)
55903 +               err = count;
55904 +       return err;
55905 +}
55906 +
55907 +DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
55908 +
55909 +static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
55910 +                                  size_t count)
55911 +{
55912 +       int domain, bus, slot, func;
55913 +       int err;
55914 +
55915 +       err = str_to_slot(buf, &domain, &bus, &slot, &func);
55916 +       if (err)
55917 +               goto out;
55918 +
55919 +       err = pcistub_device_id_remove(domain, bus, slot, func);
55920 +
55921 +      out:
55922 +       if (!err)
55923 +               err = count;
55924 +       return err;
55925 +}
55926 +
55927 +DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
55928 +
55929 +static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
55930 +{
55931 +       struct pcistub_device_id *pci_dev_id;
55932 +       size_t count = 0;
55933 +       unsigned long flags;
55934 +
55935 +       spin_lock_irqsave(&device_ids_lock, flags);
55936 +       list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
55937 +               if (count >= PAGE_SIZE)
55938 +                       break;
55939 +
55940 +               count += scnprintf(buf + count, PAGE_SIZE - count,
55941 +                                  "%04x:%02x:%02x.%01x\n",
55942 +                                  pci_dev_id->domain, pci_dev_id->bus,
55943 +                                  PCI_SLOT(pci_dev_id->devfn),
55944 +                                  PCI_FUNC(pci_dev_id->devfn));
55945 +       }
55946 +       spin_unlock_irqrestore(&device_ids_lock, flags);
55947 +
55948 +       return count;
55949 +}
55950 +
55951 +DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
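+/* These driver attributes normally appear in sysfs, e.g.:
+ *   echo 0000:00:1d.0 > /sys/bus/pci/drivers/pciback/new_slot
+ *   cat /sys/bus/pci/drivers/pciback/slots
+ * (paths assume the usual /sys/bus/pci/drivers/<name>/ layout)
+ */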
55952 +
55953 +static int __init pcistub_init(void)
55954 +{
55955 +       int pos = 0;
55956 +       int err = 0;
55957 +       int domain, bus, slot, func;
55958 +       int parsed;
55959 +
55960 +       if (pci_devs_to_hide && *pci_devs_to_hide) {
55961 +               do {
55962 +                       parsed = 0;
55963 +
55964 +                       err = sscanf(pci_devs_to_hide + pos,
55965 +                                    " (%x:%x:%x.%x) %n",
55966 +                                    &domain, &bus, &slot, &func, &parsed);
55967 +                       if (err != 4) {
55968 +                               domain = 0;
55969 +                               err = sscanf(pci_devs_to_hide + pos,
55970 +                                            " (%x:%x.%x) %n",
55971 +                                            &bus, &slot, &func, &parsed);
55972 +                               if (err != 3)
55973 +                                       goto parse_error;
55974 +                       }
55975 +
55976 +                       err = pcistub_device_id_add(domain, bus, slot, func);
55977 +                       if (err)
55978 +                               goto out;
55979 +
55980 +                       /* if parsed <= 0, we've reached the end of the string */
55981 +                       pos += parsed;
55982 +               } while (parsed > 0 && pci_devs_to_hide[pos]);
55983 +       }
55984 +
55985 +       /* If we're the first PCI Device Driver to register, we're the
55986 +        * first one to get offered PCI devices as they become
55987 +        * available (and thus we can be the first to grab them)
55988 +        */
55989 +       err = pci_register_driver(&pciback_pci_driver);
55990 +       if (err < 0)
55991 +               goto out;
55992 +
55993 +       driver_create_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
55994 +       driver_create_file(&pciback_pci_driver.driver,
55995 +                          &driver_attr_remove_slot);
55996 +       driver_create_file(&pciback_pci_driver.driver, &driver_attr_slots);
55997 +
55998 +      out:
55999 +       return err;
56000 +
56001 +      parse_error:
56002 +       printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
56003 +              pci_devs_to_hide + pos);
56004 +       return -EINVAL;
56005 +}
56006 +
56007 +#ifndef MODULE
56008 +/*
56009 + * fs_initcall happens before device_initcall,
56010 + * so pciback *should* get called first (because we
56011 + * want to claim any hidden device before other drivers
56012 + * get a chance, by being the first PCI device
56013 + * driver to register).
56014 + */
56015 +fs_initcall(pcistub_init);
56016 +#endif
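+/* When pciback is built in, initialization is thus two-phase:
+ * pcistub_init runs at fs_initcall time (above) so the driver registers
+ * early, while pciback_init below runs at device_initcall time
+ * (module_init maps to device_initcall for built-in code) and finishes
+ * device setup via pcistub_init_devices_late.
+ */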
56017 +
56018 +static int __init pciback_init(void)
56019 +{
56020 +#ifdef MODULE
56021 +       int err;
56022 +
56023 +       err = pcistub_init();
56024 +       if (err < 0)
56025 +               return err;
56026 +#endif
56027 +
56028 +       pcistub_init_devices_late();
56029 +       pciback_xenbus_register();
56030 +
56031 +       return 0;
56032 +}
56033 +
56034 +static void __exit pciback_cleanup(void)
56035 +{
56036 +       pciback_xenbus_unregister();
56037 +
56038 +       driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
56039 +       driver_remove_file(&pciback_pci_driver.driver,
56040 +                          &driver_attr_remove_slot);
56041 +       driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
56042 +
56043 +       pci_unregister_driver(&pciback_pci_driver);
56044 +}
56045 +
56046 +module_init(pciback_init);
56047 +module_exit(pciback_cleanup);
56048 +
56049 +MODULE_LICENSE("Dual BSD/GPL");
56050 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pciback/pciback.h linux-2.6.16/drivers/xen/pciback/pciback.h
56051 --- linux-2.6.16.orig/drivers/xen/pciback/pciback.h     1970-01-01 01:00:00.000000000 +0100
56052 +++ linux-2.6.16/drivers/xen/pciback/pciback.h  2006-06-26 09:51:32.000000000 +0200
56053 @@ -0,0 +1,78 @@
56054 +/*
56055 + * PCI Backend Common Data Structures & Function Declarations
56056 + *
56057 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
56058 + */
56059 +#ifndef __XEN_PCIBACK_H__
56060 +#define __XEN_PCIBACK_H__
56061 +
56062 +#include <linux/pci.h>
56063 +#include <linux/interrupt.h>
56064 +#include <xen/xenbus.h>
56065 +#include <linux/list.h>
56066 +#include <linux/spinlock.h>
56067 +#include <xen/interface/io/pciif.h>
56068 +
56069 +struct pci_dev_entry {
56070 +       struct list_head list;
56071 +       struct pci_dev *dev;
56072 +};
56073 +
56074 +struct pciback_device {
56075 +       void *pci_dev_data;
56076 +       spinlock_t dev_lock;
56077 +
56078 +       struct xenbus_device *xdev;
56079 +
56080 +       struct xenbus_watch be_watch;
56081 +       u8 be_watching;
56082 +
56083 +       int evtchn_irq;
56084 +
56085 +       struct xen_pci_sharedinfo *sh_info;
56086 +};
56087 +
56088 +struct pciback_dev_data {
56089 +       struct list_head config_fields;
56090 +};
56091 +
56092 +/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
56093 +struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
56094 +                                           int domain, int bus,
56095 +                                           int slot, int func);
56096 +struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
56097 +                                   struct pci_dev *dev);
56098 +void pcistub_put_pci_dev(struct pci_dev *dev);
56099 +
56100 +/* Ensure a device is turned off or reset */
56101 +void pciback_reset_device(struct pci_dev *pdev);
56102 +
56103 +/* Access a virtual configuration space for a PCI device */
56104 +int pciback_config_init(struct pci_dev *dev);
56105 +void pciback_config_reset(struct pci_dev *dev);
56106 +void pciback_config_free(struct pci_dev *dev);
56107 +int pciback_config_read(struct pci_dev *dev, int offset, int size,
56108 +                       u32 * ret_val);
56109 +int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
56110 +
56111 +/* Handle requests for specific devices from the frontend */
56112 +typedef int (*publish_pci_root_cb) (struct pciback_device * pdev,
56113 +                                   unsigned int domain, unsigned int bus);
56114 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
56115 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
56116 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
56117 +                                   unsigned int domain, unsigned int bus,
56118 +                                   unsigned int devfn);
56119 +int pciback_init_devices(struct pciback_device *pdev);
56120 +int pciback_publish_pci_roots(struct pciback_device *pdev,
56121 +                             publish_pci_root_cb cb);
56122 +void pciback_release_devices(struct pciback_device *pdev);
56123 +
56124 +/* Handles events from front-end */
56125 +irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs);
56126 +
56127 +int pciback_xenbus_register(void);
56128 +void pciback_xenbus_unregister(void);
56129 +
56130 +extern int verbose_request;
56131 +#endif
56132 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pciback/pciback_ops.c linux-2.6.16/drivers/xen/pciback/pciback_ops.c
56133 --- linux-2.6.16.orig/drivers/xen/pciback/pciback_ops.c 1970-01-01 01:00:00.000000000 +0100
56134 +++ linux-2.6.16/drivers/xen/pciback/pciback_ops.c      2006-06-26 09:51:32.000000000 +0200
56135 @@ -0,0 +1,74 @@
56136 +/*
56137 + * PCI Backend Operations - respond to PCI requests from Frontend
56138 + *
56139 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
56140 + */
56141 +#include <linux/module.h>
56142 +#include <asm/bitops.h>
56143 +#include <xen/evtchn.h>
56144 +#include "pciback.h"
56145 +
56146 +int verbose_request = 0;
56147 +module_param(verbose_request, int, 0644);
56148 +
56149 +/* Ensure a device is "turned off" and ready to be exported.
56150 + * (Also see pciback_config_reset to ensure virtual configuration space is
56151 + * ready to be re-exported)
56152 + */
56153 +void pciback_reset_device(struct pci_dev *dev)
56154 +{
56155 +       u16 cmd;
56156 +
56157 +       /* Disable devices (but not bridges) */
56158 +       if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
56159 +               pci_disable_device(dev);
56160 +
56161 +               pci_write_config_word(dev, PCI_COMMAND, 0);
56162 +
56163 +               dev->is_enabled = 0;
56164 +               dev->is_busmaster = 0;
56165 +       } else {
56166 +               pci_read_config_word(dev, PCI_COMMAND, &cmd);
56167 +               if (cmd & (PCI_COMMAND_INVALIDATE)) {
56168 +                       cmd &= ~(PCI_COMMAND_INVALIDATE);
56169 +                       pci_write_config_word(dev, PCI_COMMAND, cmd);
56170 +
56171 +                       dev->is_busmaster = 0;
56172 +               }
56173 +       }
56174 +
56175 +       pciback_config_reset(dev);
56176 +}
56177 +
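+/* Event-channel protocol with pcifront: the frontend fills in
+ * sh_info->op, sets _XEN_PCIF_active and notifies us; we perform the one
+ * outstanding config-space op, write op->err, then clear the active bit
+ * (after a write barrier, so the result is visible first) and notify the
+ * frontend back over the same event channel.
+ */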
56178 +irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs)
56179 +{
56180 +       struct pciback_device *pdev = dev_id;
56181 +       struct pci_dev *dev;
56182 +       struct xen_pci_op *op = &pdev->sh_info->op;
56183 +
56184 +       if (unlikely(!test_bit(_XEN_PCIF_active,
56185 +                              (unsigned long *)&pdev->sh_info->flags))) {
56186 +               pr_debug("pciback: interrupt, but no active operation\n");
56187 +               goto out;
56188 +       }
56189 +
56190 +       dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
56191 +
56192 +       if (dev == NULL)
56193 +               op->err = XEN_PCI_ERR_dev_not_found;
56194 +       else if (op->cmd == XEN_PCI_OP_conf_read)
56195 +               op->err = pciback_config_read(dev, op->offset, op->size,
56196 +                                             &op->value);
56197 +       else if (op->cmd == XEN_PCI_OP_conf_write)
56198 +               op->err = pciback_config_write(dev, op->offset, op->size,
56199 +                                              op->value);
56200 +       else
56201 +               op->err = XEN_PCI_ERR_not_implemented;
56202 +
56203 +       wmb();
56204 +       clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
56205 +       notify_remote_via_irq(pdev->evtchn_irq);
56206 +
56207 +      out:
56208 +       return IRQ_HANDLED;
56209 +}
56210 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pciback/vpci.c linux-2.6.16/drivers/xen/pciback/vpci.c
56211 --- linux-2.6.16.orig/drivers/xen/pciback/vpci.c        1970-01-01 01:00:00.000000000 +0100
56212 +++ linux-2.6.16/drivers/xen/pciback/vpci.c     2006-06-26 09:51:32.000000000 +0200
56213 @@ -0,0 +1,204 @@
56214 +/*
56215 + * PCI Backend - Provides a Virtual PCI bus (with real devices)
56216 + *               to the frontend
56217 + *
56218 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
56219 + */
56220 +
56221 +#include <linux/list.h>
56222 +#include <linux/slab.h>
56223 +#include <linux/pci.h>
56224 +#include <linux/spinlock.h>
56225 +#include "pciback.h"
56226 +
56227 +#define PCI_SLOT_MAX 32
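+/* 32 is the architectural limit: the PCI device (slot) number is the
+ * upper five bits of devfn, so one bus holds at most 32 slots.
+ */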
56228 +
56229 +struct vpci_dev_data {
56230 +       /* Access to dev_list must be protected by lock */
56231 +       struct list_head dev_list[PCI_SLOT_MAX];
56232 +       spinlock_t lock;
56233 +};
56234 +
56235 +static inline struct list_head *list_first(struct list_head *head)
56236 +{
56237 +       return head->next;
56238 +}
56239 +
56240 +struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
56241 +                                   unsigned int domain, unsigned int bus,
56242 +                                   unsigned int devfn)
56243 +{
56244 +       struct pci_dev_entry *entry;
56245 +       struct pci_dev *dev = NULL;
56246 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
56247 +       unsigned long flags;
56248 +
56249 +       if (domain != 0 || bus != 0)
56250 +               return NULL;
56251 +
56252 +       if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
56253 +               spin_lock_irqsave(&vpci_dev->lock, flags);
56254 +
56255 +               list_for_each_entry(entry,
56256 +                                   &vpci_dev->dev_list[PCI_SLOT(devfn)],
56257 +                                   list) {
56258 +                       if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
56259 +                               dev = entry->dev;
56260 +                               break;
56261 +                       }
56262 +               }
56263 +
56264 +               spin_unlock_irqrestore(&vpci_dev->lock, flags);
56265 +       }
56266 +       return dev;
56267 +}
56268 +
56269 +static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
56270 +{
56271 +       if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
56272 +           && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
56273 +               return 1;
56274 +
56275 +       return 0;
56276 +}
56277 +
56278 +int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
56279 +{
56280 +       int err = 0, slot;
56281 +       struct pci_dev_entry *t, *dev_entry;
56282 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
56283 +       unsigned long flags;
56284 +
56285 +       if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
56286 +               err = -EFAULT;
56287 +               xenbus_dev_fatal(pdev->xdev, err,
56288 +                                "Can't export bridges on the virtual PCI bus");
56289 +               goto out;
56290 +       }
56291 +
56292 +       dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
56293 +       if (!dev_entry) {
56294 +               err = -ENOMEM;
56295 +               xenbus_dev_fatal(pdev->xdev, err,
56296 +                                "Error adding entry to virtual PCI bus");
56297 +               goto out;
56298 +       }
56299 +
56300 +       dev_entry->dev = dev;
56301 +
56302 +       spin_lock_irqsave(&vpci_dev->lock, flags);
56303 +
56304 +       /* Keep multi-function devices together on the virtual PCI bus */
56305 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
56306 +               if (!list_empty(&vpci_dev->dev_list[slot])) {
56307 +                       t = list_entry(list_first(&vpci_dev->dev_list[slot]),
56308 +                                      struct pci_dev_entry, list);
56309 +
56310 +                       if (match_slot(dev, t->dev)) {
56311 +                               pr_info("pciback: vpci: %s: "
56312 +                                       "assign to virtual slot %d func %d\n",
56313 +                                       pci_name(dev), slot,
56314 +                                       PCI_FUNC(dev->devfn));
56315 +                               list_add_tail(&dev_entry->list,
56316 +                                             &vpci_dev->dev_list[slot]);
56317 +                               goto unlock;
56318 +                       }
56319 +               }
56320 +       }
56321 +
56322 +       /* Assign to a new slot on the virtual PCI bus */
56323 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
56324 +               if (list_empty(&vpci_dev->dev_list[slot])) {
56325 +                       printk(KERN_INFO
56326 +                              "pciback: vpci: %s: assign to virtual slot %d\n",
56327 +                              pci_name(dev), slot);
56328 +                       list_add_tail(&dev_entry->list,
56329 +                                     &vpci_dev->dev_list[slot]);
56330 +                       goto unlock;
56331 +               }
56332 +       }
56333 +
56334 +       err = -ENOMEM;
56335 +       xenbus_dev_fatal(pdev->xdev, err,
56336 +                        "No more space on root virtual PCI bus");
56337 +
56338 +      unlock:
56339 +       spin_unlock_irqrestore(&vpci_dev->lock, flags);
56340 +      out:
56341 +       return err;
56342 +}
56343 +
56344 +void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
56345 +{
56346 +       int slot;
56347 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
56348 +       struct pci_dev *found_dev = NULL;
56349 +       unsigned long flags;
56350 +
56351 +       spin_lock_irqsave(&vpci_dev->lock, flags);
56352 +
56353 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
56354 +               struct pci_dev_entry *e, *tmp;
56355 +               list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
56356 +                                        list) {
56357 +                       if (e->dev == dev) {
56358 +                               list_del(&e->list);
56359 +                               found_dev = e->dev;
56360 +                               kfree(e);
56361 +                               goto out;
56362 +                       }
56363 +               }
56364 +       }
56365 +
56366 +      out:
56367 +       spin_unlock_irqrestore(&vpci_dev->lock, flags);
56368 +
56369 +       if (found_dev)
56370 +               pcistub_put_pci_dev(found_dev);
56371 +}
56372 +
56373 +int pciback_init_devices(struct pciback_device *pdev)
56374 +{
56375 +       int slot;
56376 +       struct vpci_dev_data *vpci_dev;
56377 +
56378 +       vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
56379 +       if (!vpci_dev)
56380 +               return -ENOMEM;
56381 +
56382 +       spin_lock_init(&vpci_dev->lock);
56383 +
56384 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
56385 +               INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
56386 +       }
56387 +
56388 +       pdev->pci_dev_data = vpci_dev;
56389 +
56390 +       return 0;
56391 +}
56392 +
56393 +int pciback_publish_pci_roots(struct pciback_device *pdev,
56394 +                             publish_pci_root_cb publish_cb)
56395 +{
56396 +       /* The Virtual PCI bus has only one root */
56397 +       return publish_cb(pdev, 0, 0);
56398 +}
56399 +
56400 +void pciback_release_devices(struct pciback_device *pdev)
56401 +{
56402 +       int slot;
56403 +       struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
56404 +
56405 +       for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
56406 +               struct pci_dev_entry *e, *tmp;
56407 +               list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
56408 +                                        list) {
56409 +                       list_del(&e->list);
56410 +                       pcistub_put_pci_dev(e->dev);
56411 +                       kfree(e);
56412 +               }
56413 +       }
56414 +
56415 +       kfree(vpci_dev);
56416 +       pdev->pci_dev_data = NULL;
56417 +}
56418 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pciback/xenbus.c linux-2.6.16/drivers/xen/pciback/xenbus.c
56419 --- linux-2.6.16.orig/drivers/xen/pciback/xenbus.c      1970-01-01 01:00:00.000000000 +0100
56420 +++ linux-2.6.16/drivers/xen/pciback/xenbus.c   2006-06-26 09:51:32.000000000 +0200
56421 @@ -0,0 +1,441 @@
56422 +/*
56423 + * PCI Backend Xenbus Setup - handles setup with frontend and xend
56424 + *
56425 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
56426 + */
56427 +#include <linux/module.h>
56428 +#include <linux/init.h>
56429 +#include <linux/list.h>
56430 +#include <xen/xenbus.h>
56431 +#include <xen/evtchn.h>
56432 +#include "pciback.h"
56433 +
56434 +#define INVALID_EVTCHN_IRQ  (-1)
56435 +
56436 +static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
56437 +{
56438 +       struct pciback_device *pdev;
56439 +
56440 +       pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
56441 +       if (pdev == NULL)
56442 +               goto out;
56443 +       dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
56444 +
56445 +       pdev->xdev = xdev;
56446 +       xdev->data = pdev;
56447 +
56448 +       spin_lock_init(&pdev->dev_lock);
56449 +
56450 +       pdev->sh_info = NULL;
56451 +       pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
56452 +       pdev->be_watching = 0;
56453 +
56454 +       if (pciback_init_devices(pdev)) {
56455 +               kfree(pdev);
56456 +               pdev = NULL;
56457 +       }
56458 +      out:
56459 +       return pdev;
56460 +}
56461 +
56462 +static void free_pdev(struct pciback_device *pdev)
56463 +{
56464 +       if (pdev->be_watching)
56465 +               unregister_xenbus_watch(&pdev->be_watch);
56466 +
56467 +       /* Ensure the guest can't trigger our handler before removing devices */
56468 +       if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ)
56469 +               unbind_from_irqhandler(pdev->evtchn_irq, pdev);
56470 +
56471 +       if (pdev->sh_info)
56472 +               xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
56473 +
56474 +       pciback_release_devices(pdev);
56475 +
56476 +       pdev->xdev->data = NULL;
56477 +       pdev->xdev = NULL;
56478 +
56479 +       kfree(pdev);
56480 +}
56481 +
56482 +static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
56483 +                            int remote_evtchn)
56484 +{
56485 +       int err = 0;
56486 +       int evtchn;
56487 +       dev_dbg(&pdev->xdev->dev,
56488 +               "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
56489 +               gnt_ref, remote_evtchn);
56490 +
56491 +       err = xenbus_map_ring_valloc(pdev->xdev,
56492 +                                    gnt_ref,
56493 +                                    (void **)&pdev->sh_info);
56494 +       if (err)
56495 +               goto out;
56496 +
56497 +       err = xenbus_bind_evtchn(pdev->xdev, remote_evtchn, &evtchn);
56498 +       if (err)
56499 +               goto out;
56500 +
56501 +       err = bind_evtchn_to_irqhandler(evtchn, pciback_handle_event,
56502 +                                       SA_SAMPLE_RANDOM, "pciback", pdev);
56503 +       if (err < 0) {
56504 +               xenbus_dev_fatal(pdev->xdev, err,
56505 +                                "Error binding event channel to IRQ");
56506 +               goto out;
56507 +       }
56508 +       pdev->evtchn_irq = err;
56509 +       err = 0;
56510 +
56511 +       dev_dbg(&pdev->xdev->dev, "Attached!\n");
56512 +      out:
56513 +       return err;
56514 +}
56515 +
56516 +static int pciback_attach(struct pciback_device *pdev)
56517 +{
56518 +       int err = 0;
56519 +       int gnt_ref, remote_evtchn;
56520 +       char *magic = NULL;
56521 +
56522 +       spin_lock(&pdev->dev_lock);
56523 +
56524 +       /* Make sure we only do this setup once */
56525 +       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
56526 +           XenbusStateInitialised)
56527 +               goto out;
56528 +
56529 +       /* Wait for the frontend's xenbus state to show that it has published its configuration */
56530 +       if (xenbus_read_driver_state(pdev->xdev->otherend) !=
56531 +           XenbusStateInitialised)
56532 +               goto out;
56533 +
56534 +       dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
56535 +
56536 +       err = xenbus_gather(XBT_NULL, pdev->xdev->otherend,
56537 +                           "pci-op-ref", "%u", &gnt_ref,
56538 +                           "event-channel", "%u", &remote_evtchn,
56539 +                           "magic", NULL, &magic, NULL);
56540 +       if (err) {
56541 +               /* If configuration didn't get read correctly, wait longer */
56542 +               xenbus_dev_fatal(pdev->xdev, err,
56543 +                                "Error reading configuration from frontend");
56544 +               goto out;
56545 +       }
56546 +
56547 +       if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
56548 +               xenbus_dev_fatal(pdev->xdev, -EFAULT,
56549 +                                "version mismatch (%s/%s) with pcifront - "
56550 +                                "halting pciback",
56551 +                                magic, XEN_PCI_MAGIC);
56552 +               goto out;
56553 +       }
56554 +
56555 +       err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
56556 +       if (err)
56557 +               goto out;
56558 +
56559 +       dev_dbg(&pdev->xdev->dev, "Connecting...\n");
56560 +
56561 +       err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
56562 +       if (err)
56563 +               xenbus_dev_fatal(pdev->xdev, err,
56564 +                                "Error switching to connected state!");
56565 +
56566 +       dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
56567 +      out:
56568 +       spin_unlock(&pdev->dev_lock);
56569 +
56570 +       /* kfree(NULL) is a no-op, so magic needn't be checked first */
56571 +       kfree(magic);
56572 +
56573 +       return err;
56574 +}
56575 +
56576 +static void pciback_frontend_changed(struct xenbus_device *xdev,
56577 +                                    XenbusState fe_state)
56578 +{
56579 +       struct pciback_device *pdev = xdev->data;
56580 +
56581 +       dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
56582 +
56583 +       switch (fe_state) {
56584 +       case XenbusStateInitialised:
56585 +               pciback_attach(pdev);
56586 +               break;
56587 +
56588 +       case XenbusStateClosing:
56589 +               xenbus_switch_state(xdev, XenbusStateClosing);
56590 +               break;
56591 +
56592 +       case XenbusStateClosed:
56593 +               dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
56594 +               device_unregister(&xdev->dev);
56595 +               break;
56596 +
56597 +       default:
56598 +               break;
56599 +       }
56600 +}
56601 +
56602 +static int pciback_publish_pci_root(struct pciback_device *pdev,
56603 +                                   unsigned int domain, unsigned int bus)
56604 +{
56605 +       unsigned int d, b;
56606 +       int i, root_num, len, err;
56607 +       char str[64];
56608 +
56609 +       dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
56610 +
56611 +       err = xenbus_scanf(XBT_NULL, pdev->xdev->nodename,
56612 +                          "root_num", "%d", &root_num);
56613 +       if (err == 0 || err == -ENOENT)
56614 +               root_num = 0;
56615 +       else if (err < 0)
56616 +               goto out;
56617 +
56618 +       /* Verify that we haven't already published this pci root */
56619 +       for (i = 0; i < root_num; i++) {
56620 +               len = snprintf(str, sizeof(str), "root-%d", i);
56621 +               if (unlikely(len >= (sizeof(str) - 1))) {
56622 +                       err = -ENOMEM;
56623 +                       goto out;
56624 +               }
56625 +
56626 +               err = xenbus_scanf(XBT_NULL, pdev->xdev->nodename,
56627 +                                  str, "%x:%x", &d, &b);
56628 +               if (err < 0)
56629 +                       goto out;
56630 +               if (err != 2) {
56631 +                       err = -EINVAL;
56632 +                       goto out;
56633 +               }
56634 +
56635 +               if (d == domain && b == bus) {
56636 +                       err = 0;
56637 +                       goto out;
56638 +               }
56639 +       }
56640 +
56641 +       len = snprintf(str, sizeof(str), "root-%d", root_num);
56642 +       if (unlikely(len >= (sizeof(str) - 1))) {
56643 +               err = -ENOMEM;
56644 +               goto out;
56645 +       }
56646 +
56647 +       dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
56648 +               root_num, domain, bus);
56649 +
56650 +       err = xenbus_printf(XBT_NULL, pdev->xdev->nodename, str,
56651 +                           "%04x:%02x", domain, bus);
56652 +       if (err)
56653 +               goto out;
56654 +
56655 +       err = xenbus_printf(XBT_NULL, pdev->xdev->nodename,
56656 +                           "root_num", "%d", (root_num + 1));
56657 +
56658 +      out:
56659 +       return err;
56660 +}
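
pciback_publish_pci_root() keeps a root_num counter next to one root-%d entry per published bus. A sketch of the resulting xenstore layout under pdev->xdev->nodename, with example values only:

/*
 * root_num = "2"
 * root-0   = "0000:00"   (written with "%04x:%02x")
 * root-1   = "0001:02"
 */
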
56661 +
56662 +static int pciback_export_device(struct pciback_device *pdev,
56663 +                                int domain, int bus, int slot, int func)
56664 +{
56665 +       struct pci_dev *dev;
56666 +       int err = 0;
56667 +
56668 +       dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
56669 +               domain, bus, slot, func);
56670 +
56671 +       dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
56672 +       if (!dev) {
56673 +               err = -EINVAL;
56674 +               xenbus_dev_fatal(pdev->xdev, err,
56675 +                                "Couldn't locate PCI device "
56676 +                                "(%04x:%02x:%02x.%01x)! "
56677 +                                "Perhaps already in use?",
56678 +                                domain, bus, slot, func);
56679 +               goto out;
56680 +       }
56681 +
56682 +       err = pciback_add_pci_dev(pdev, dev);
56683 +       if (err)
56684 +               goto out;
56685 +
56686 +       /* TODO: It'd be nice to export a bridge and have all of its children
56687 +        * get exported with it. This may be best done in xend (which will
56688 +        * have to calculate resource usage anyway) but we probably want to
56689 +        * put something in here to ensure that if a bridge gets given to a
56690 +        * driver domain, that all devices under that bridge are not given
56691 +        * driver domain, all devices under that bridge are not given
56692 +        * to other driver domains (since whoever controls the bridge can
56693 +        * disable it and stop the other devices from working).
56694 +      out:
56695 +       return err;
56696 +}
56697 +
56698 +static int pciback_setup_backend(struct pciback_device *pdev)
56699 +{
56700 +       /* Get configuration from xend (if available now) */
56701 +       int domain, bus, slot, func;
56702 +       int err = 0;
56703 +       int i, num_devs;
56704 +       char dev_str[64];
56705 +
56706 +       spin_lock(&pdev->dev_lock);
56707 +
56708 +       /* It's possible we could get the call to setup twice, so make sure
56709 +        * we're not already connected.
56710 +        */
56711 +       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
56712 +           XenbusStateInitWait)
56713 +               goto out;
56714 +
56715 +       dev_dbg(&pdev->xdev->dev, "getting be setup\n");
56716 +
56717 +       err = xenbus_scanf(XBT_NULL, pdev->xdev->nodename, "num_devs", "%d",
56718 +                          &num_devs);
56719 +       if (err != 1) {
56720 +               if (err >= 0)
56721 +                       err = -EINVAL;
56722 +               xenbus_dev_fatal(pdev->xdev, err,
56723 +                                "Error reading number of devices");
56724 +               goto out;
56725 +       }
56726 +
56727 +       for (i = 0; i < num_devs; i++) {
56728 +               int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
56729 +               if (unlikely(l >= (sizeof(dev_str) - 1))) {
56730 +                       err = -ENOMEM;
56731 +                       xenbus_dev_fatal(pdev->xdev, err,
56732 +                                        "String overflow while reading "
56733 +                                        "configuration");
56734 +                       goto out;
56735 +               }
56736 +
56737 +               err = xenbus_scanf(XBT_NULL, pdev->xdev->nodename, dev_str,
56738 +                                  "%x:%x:%x.%x", &domain, &bus, &slot, &func);
56739 +               if (err < 0) {
56740 +                       xenbus_dev_fatal(pdev->xdev, err,
56741 +                                        "Error reading device configuration");
56742 +                       goto out;
56743 +               }
56744 +               if (err != 4) {
56745 +                       err = -EINVAL;
56746 +                       xenbus_dev_fatal(pdev->xdev, err,
56747 +                                        "Error parsing pci device "
56748 +                                        "configuration");
56749 +                       goto out;
56750 +               }
56751 +
56752 +               err = pciback_export_device(pdev, domain, bus, slot, func);
56753 +               if (err)
56754 +                       goto out;
56755 +       }
56756 +
56757 +       err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
56758 +       if (err) {
56759 +               xenbus_dev_fatal(pdev->xdev, err,
56760 +                                "Error while publishing PCI root buses "
56761 +                                "for frontend");
56762 +               goto out;
56763 +       }
56764 +
56765 +       err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
56766 +       if (err)
56767 +               xenbus_dev_fatal(pdev->xdev, err,
56768 +                                "Error switching to initialised state!");
56769 +
56770 +      out:
56771 +       spin_unlock(&pdev->dev_lock);
56772 +
56773 +       if (!err)
56774 +               /* see if pcifront is already configured (if not, we'll wait) */
56775 +               pciback_attach(pdev);
56776 +
56777 +       return err;
56778 +}
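
For reference, a sketch of the configuration pciback_setup_backend() expects xend to have written under the backend node before it leaves XenbusStateInitWait (example values only; the dev-N entries are parsed above with "%x:%x:%x.%x"):

/*
 * num_devs = "1"
 * dev-0    = "0000:01:00.0"
 */
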
56779 +
56780 +static void pciback_be_watch(struct xenbus_watch *watch,
56781 +                            const char **vec, unsigned int len)
56782 +{
56783 +       struct pciback_device *pdev =
56784 +           container_of(watch, struct pciback_device, be_watch);
56785 +
56786 +       switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
56787 +       case XenbusStateInitWait:
56788 +               pciback_setup_backend(pdev);
56789 +               break;
56790 +
56791 +       default:
56792 +               break;
56793 +       }
56794 +}
56795 +
56796 +static int pciback_xenbus_probe(struct xenbus_device *dev,
56797 +                               const struct xenbus_device_id *id)
56798 +{
56799 +       int err = 0;
56800 +       struct pciback_device *pdev = alloc_pdev(dev);
56801 +
56802 +       if (pdev == NULL) {
56803 +               err = -ENOMEM;
56804 +               xenbus_dev_fatal(dev, err,
56805 +                                "Error allocating pciback_device struct");
56806 +               goto out;
56807 +       }
56808 +
56809 +       /* wait for xend to configure us */
56810 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
56811 +       if (err)
56812 +               goto out;
56813 +
56814 +       /* watch the backend node for backend configuration information */
56815 +       err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
56816 +                               pciback_be_watch);
56817 +       if (err)
56818 +               goto out;
56819 +       pdev->be_watching = 1;
56820 +
56821 +       /* We need to force a call to our callback here in case
56822 +        * xend already configured us!
56823 +        */
56824 +       pciback_be_watch(&pdev->be_watch, NULL, 0);
56825 +
56826 +      out:
56827 +       return err;
56828 +}
56829 +
56830 +static int pciback_xenbus_remove(struct xenbus_device *dev)
56831 +{
56832 +       struct pciback_device *pdev = dev->data;
56833 +
56834 +       if (pdev != NULL)
56835 +               free_pdev(pdev);
56836 +
56837 +       return 0;
56838 +}
56839 +
56840 +static struct xenbus_device_id xenpci_ids[] = {
56841 +       {"pci"},
56842 +       {{0}},
56843 +};
56844 +
56845 +static struct xenbus_driver xenbus_pciback_driver = {
56846 +       .name                   = "pciback",
56847 +       .owner                  = THIS_MODULE,
56848 +       .ids                    = xenpci_ids,
56849 +       .probe                  = pciback_xenbus_probe,
56850 +       .remove                 = pciback_xenbus_remove,
56851 +       .otherend_changed       = pciback_frontend_changed,
56852 +};
56853 +
56854 +int __init pciback_xenbus_register(void)
56855 +{
56856 +       return xenbus_register_backend(&xenbus_pciback_driver);
56857 +}
56858 +
56859 +void __exit pciback_xenbus_unregister(void)
56860 +{
56861 +       xenbus_unregister_driver(&xenbus_pciback_driver);
56862 +}
56863 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pcifront/Makefile linux-2.6.16/drivers/xen/pcifront/Makefile
56864 --- linux-2.6.16.orig/drivers/xen/pcifront/Makefile     1970-01-01 01:00:00.000000000 +0100
56865 +++ linux-2.6.16/drivers/xen/pcifront/Makefile  2006-06-26 09:51:32.000000000 +0200
56866 @@ -0,0 +1,7 @@
56867 +obj-y += pcifront.o
56868 +
56869 +pcifront-y := pci_op.o xenbus.o pci.o
56870 +
56871 +ifeq ($(CONFIG_XEN_PCIDEV_FE_DEBUG),y)
56872 +EXTRA_CFLAGS += -DDEBUG
56873 +endif
56874 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pcifront/pci.c linux-2.6.16/drivers/xen/pcifront/pci.c
56875 --- linux-2.6.16.orig/drivers/xen/pcifront/pci.c        1970-01-01 01:00:00.000000000 +0100
56876 +++ linux-2.6.16/drivers/xen/pcifront/pci.c     2006-06-26 09:51:32.000000000 +0200
56877 @@ -0,0 +1,46 @@
56878 +/*
56879 + * PCI Frontend Operations - ensure only one PCI frontend runs at a time
56880 + *
56881 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
56882 + */
56883 +#include <linux/module.h>
56884 +#include <linux/init.h>
56885 +#include <linux/pci.h>
56886 +#include <linux/spinlock.h>
56887 +#include "pcifront.h"
56888 +
56889 +DEFINE_SPINLOCK(pcifront_dev_lock);
56890 +static struct pcifront_device *pcifront_dev = NULL;
56891 +
56892 +int pcifront_connect(struct pcifront_device *pdev)
56893 +{
56894 +       int err = 0;
56895 +
56896 +       spin_lock(&pcifront_dev_lock);
56897 +
56898 +       if (!pcifront_dev) {
56899 +               dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
56900 +               pcifront_dev = pdev;
56901 +       }
56902 +       else {
56903 +               dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
56904 +               err = -EEXIST;
56905 +       }
56906 +
56907 +       spin_unlock(&pcifront_dev_lock);
56908 +
56909 +       return err;
56910 +}
56911 +
56912 +void pcifront_disconnect(struct pcifront_device *pdev)
56913 +{
56914 +       spin_lock(&pcifront_dev_lock);
56915 +
56916 +       if (pdev == pcifront_dev) {
56917 +               dev_info(&pdev->xdev->dev,
56918 +                        "Disconnecting PCI Frontend Buses\n");
56919 +               pcifront_dev = NULL;
56920 +       }
56921 +
56922 +       spin_unlock(&pcifront_dev_lock);
56923 +}
56924 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pcifront/pci_op.c linux-2.6.16/drivers/xen/pcifront/pci_op.c
56925 --- linux-2.6.16.orig/drivers/xen/pcifront/pci_op.c     1970-01-01 01:00:00.000000000 +0100
56926 +++ linux-2.6.16/drivers/xen/pcifront/pci_op.c  2006-06-26 09:51:32.000000000 +0200
56927 @@ -0,0 +1,272 @@
56928 +/*
56929 + * PCI Frontend Operations - Communicates with the backend (pciback)
56930 + *
56931 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
56932 + */
56933 +#include <linux/module.h>
56934 +#include <linux/version.h>
56935 +#include <linux/init.h>
56936 +#include <linux/pci.h>
56937 +#include <linux/spinlock.h>
56938 +#include <linux/time.h>
56939 +#include <xen/evtchn.h>
56940 +#include "pcifront.h"
56941 +
56942 +static int verbose_request = 0;
56943 +module_param(verbose_request, int, 0644);
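
Since pcifront is built in unconditionally (obj-y in the Makefile above), the 0644 permission suggests verbose_request is meant to be flipped at runtime through sysfs; the path below is the usual convention and is an assumption here, not something this patch establishes:

/* assumed runtime knob: /sys/module/pcifront/parameters/verbose_request */
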
56944 +
56945 +static int errno_to_pcibios_err(int errno)
56946 +{
56947 +       switch (errno) {
56948 +       case XEN_PCI_ERR_success:
56949 +               return PCIBIOS_SUCCESSFUL;
56950 +
56951 +       case XEN_PCI_ERR_dev_not_found:
56952 +               return PCIBIOS_DEVICE_NOT_FOUND;
56953 +
56954 +       case XEN_PCI_ERR_invalid_offset:
56955 +       case XEN_PCI_ERR_op_failed:
56956 +               return PCIBIOS_BAD_REGISTER_NUMBER;
56957 +
56958 +       case XEN_PCI_ERR_not_implemented:
56959 +               return PCIBIOS_FUNC_NOT_SUPPORTED;
56960 +
56961 +       case XEN_PCI_ERR_access_denied:
56962 +               return PCIBIOS_SET_FAILED;
56963 +       }
56964 +       return errno;
56965 +}
56966 +
56967 +static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
56968 +{
56969 +       int err = 0;
56970 +       struct xen_pci_op *active_op = &pdev->sh_info->op;
56971 +       unsigned long irq_flags;
56972 +       evtchn_port_t port = pdev->evtchn;
56973 +       nsec_t ns, ns_timeout;
56974 +       struct timeval tv;
56975 +
56976 +       spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
56977 +
56978 +       memcpy(active_op, op, sizeof(struct xen_pci_op));
56979 +
56980 +       /* Go */
56981 +       wmb();
56982 +       set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
56983 +       notify_remote_via_evtchn(port);
56984 +
56985 +       /*
56986 +        * We set a poll timeout of 3 seconds but give up on return after
56987 +        * 2 seconds. It is better to time out too late rather than too early
56988 +        * (in the latter case we end up continually re-executing poll() with a
56989 +        * timeout in the past). 1s difference gives plenty of slack for error.
56990 +        */
56991 +       do_gettimeofday(&tv);
56992 +       ns_timeout = timeval_to_ns(&tv) + 2 * (nsec_t)NSEC_PER_SEC;
56993 +
56994 +       clear_evtchn(port);
56995 +
56996 +       while (test_bit(_XEN_PCIF_active,
56997 +                       (unsigned long *)&pdev->sh_info->flags)) {
56998 +               if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ))
56999 +                       BUG();
57000 +               clear_evtchn(port);
57001 +               do_gettimeofday(&tv);
57002 +               ns = timeval_to_ns(&tv);
57003 +               if (ns > ns_timeout) {
57004 +                       dev_err(&pdev->xdev->dev,
57005 +                               "pciback not responding!\n");
57006 +                       clear_bit(_XEN_PCIF_active,
57007 +                                 (unsigned long *)&pdev->sh_info->flags);
57008 +                       err = XEN_PCI_ERR_dev_not_found;
57009 +                       goto out;
57010 +               }
57011 +       }
57012 +
57013 +       memcpy(op, active_op, sizeof(struct xen_pci_op));
57014 +
57015 +       err = op->err;
57016 +      out:
57017 +       spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
57018 +       return err;
57019 +}
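
The loop above mixes two timescales: a 3-second HYPERVISOR_poll() block and a 2-second wall-clock give-up, for the reason given in the comment. A stripped-down sketch of the same pattern; wait_with_deadline() is a hypothetical helper, not part of the patch:

/* Hypothetical helper illustrating the two-timescale wait above. */
static int wait_with_deadline(evtchn_port_t port, volatile unsigned long *flags)
{
	struct timeval tv;
	nsec_t deadline;

	do_gettimeofday(&tv);
	deadline = timeval_to_ns(&tv) + 2 * (nsec_t)NSEC_PER_SEC;

	while (test_bit(_XEN_PCIF_active, flags)) {
		/* The poll deliberately outlives the deadline (3s > 2s). */
		if (HYPERVISOR_poll(&port, 1, jiffies + 3 * HZ))
			BUG();
		clear_evtchn(port);
		do_gettimeofday(&tv);
		if (timeval_to_ns(&tv) > deadline)
			return -ETIMEDOUT;
	}
	return 0;
}
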
57020 +
57021 +/* Access to this function is spinlocked in drivers/pci/access.c */
57022 +static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
57023 +                            int where, int size, u32 * val)
57024 +{
57025 +       int err = 0;
57026 +       struct xen_pci_op op = {
57027 +               .cmd    = XEN_PCI_OP_conf_read,
57028 +               .domain = pci_domain_nr(bus),
57029 +               .bus    = bus->number,
57030 +               .devfn  = devfn,
57031 +               .offset = where,
57032 +               .size   = size,
57033 +       };
57034 +       struct pcifront_sd *sd = bus->sysdata;
57035 +       struct pcifront_device *pdev = sd->pdev;
57036 +
57037 +       if (verbose_request)
57038 +               dev_info(&pdev->xdev->dev,
57039 +                        "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n",
57040 +                        pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
57041 +                        PCI_FUNC(devfn), where, size);
57042 +
57043 +       err = do_pci_op(pdev, &op);
57044 +
57045 +       if (likely(!err)) {
57046 +               if (verbose_request)
57047 +                       dev_info(&pdev->xdev->dev, "read got back value %x\n",
57048 +                                op.value);
57049 +
57050 +               *val = op.value;
57051 +       } else if (err == -ENODEV) {
57052 +               /* No device here, pretend that it just returned 0 */
57053 +               err = 0;
57054 +               *val = 0;
57055 +       }
57056 +
57057 +       return errno_to_pcibios_err(err);
57058 +}
57059 +
57060 +/* Access to this function is spinlocked in drivers/pci/access.c */
57061 +static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
57062 +                             int where, int size, u32 val)
57063 +{
57064 +       struct xen_pci_op op = {
57065 +               .cmd    = XEN_PCI_OP_conf_write,
57066 +               .domain = pci_domain_nr(bus),
57067 +               .bus    = bus->number,
57068 +               .devfn  = devfn,
57069 +               .offset = where,
57070 +               .size   = size,
57071 +               .value  = val,
57072 +       };
57073 +       struct pcifront_sd *sd = bus->sysdata;
57074 +       struct pcifront_device *pdev = sd->pdev;
57075 +
57076 +       if (verbose_request)
57077 +               dev_info(&pdev->xdev->dev,
57078 +                        "write dev=%04x:%02x:%02x.%01x - "
57079 +                        "offset %x size %d val %x\n",
57080 +                        pci_domain_nr(bus), bus->number,
57081 +                        PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
57082 +
57083 +       return errno_to_pcibios_err(do_pci_op(pdev, &op));
57084 +}
57085 +
57086 +struct pci_ops pcifront_bus_ops = {
57087 +       .read = pcifront_bus_read,
57088 +       .write = pcifront_bus_write,
57089 +};
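
These ops are not called directly; the generic PCI config accessors dispatch through bus->ops. A minimal sketch of how a driver's config-space read ends up in pcifront_bus_read(); example_read_vendor() is illustrative only:

/* Illustrative caller; dispatches via dev->bus->ops == &pcifront_bus_ops. */
static int example_read_vendor(struct pci_dev *dev)
{
	u16 vendor;
	int rc;

	rc = pci_bus_read_config_word(dev->bus, dev->devfn,
				      PCI_VENDOR_ID, &vendor);
	if (rc == PCIBIOS_SUCCESSFUL)
		printk(KERN_DEBUG "vendor id %04x\n", vendor);
	return rc;
}
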
57090 +
57091 +/* Claim resources for the PCI frontend as-is, backend won't allow changes */
57092 +static void pcifront_claim_resource(struct pci_dev *dev, void *data)
57093 +{
57094 +       struct pcifront_device *pdev = data;
57095 +       int i;
57096 +       struct resource *r;
57097 +
57098 +       for (i = 0; i < PCI_NUM_RESOURCES; i++) {
57099 +               r = &dev->resource[i];
57100 +
57101 +               if (!r->parent && r->start && r->flags) {
57102 +                       dev_dbg(&pdev->xdev->dev, "claiming resource %s/%d\n",
57103 +                               pci_name(dev), i);
57104 +                       pci_claim_resource(dev, i);
57105 +               }
57106 +       }
57107 +}
57108 +
57109 +int pcifront_scan_root(struct pcifront_device *pdev,
57110 +                      unsigned int domain, unsigned int bus)
57111 +{
57112 +       struct pci_bus *b;
57113 +       struct pcifront_sd *sd = NULL;
57114 +       struct pci_bus_entry *bus_entry = NULL;
57115 +       int err = 0;
57116 +
57117 +#ifndef CONFIG_PCI_DOMAINS
57118 +       if (domain != 0) {
57119 +               dev_err(&pdev->xdev->dev,
57120 +                       "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
57121 +               dev_err(&pdev->xdev->dev,
57122 +                       "Please compile with CONFIG_PCI_DOMAINS\n");
57123 +               err = -EINVAL;
57124 +               goto err_out;
57125 +       }
57126 +#endif
57127 +
57128 +       dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
57129 +                domain, bus);
57130 +
57131 +       bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
57132 +       sd = kmalloc(sizeof(*sd), GFP_KERNEL);
57133 +       if (!bus_entry || !sd) {
57134 +               err = -ENOMEM;
57135 +               goto err_out;
57136 +       }
57137 +       sd->domain = domain;
57138 +       sd->pdev = pdev;
57139 +
57140 +       b = pci_scan_bus_parented(&pdev->xdev->dev, bus, &pcifront_bus_ops, sd);
57141 +       if (!b) {
57142 +               dev_err(&pdev->xdev->dev, "Error creating PCI Frontend Bus!\n");
57143 +               err = -ENOMEM;
57144 +               goto err_out;
57145 +       }
57146 +       bus_entry->bus = b;
57147 +
57148 +       list_add(&bus_entry->list, &pdev->root_buses);
57149 +
57150 +       /* Claim resources before going "live" with our devices */
57151 +       pci_walk_bus(b, pcifront_claim_resource, pdev);
57152 +
57153 +       pci_bus_add_devices(b);
57154 +
57155 +       return 0;
57156 +
57157 +      err_out:
57158 +       kfree(bus_entry);
57159 +       kfree(sd);
57160 +
57161 +       return err;
57162 +}
57163 +
57164 +static void free_root_bus_devs(struct pci_bus *bus)
57165 +{
57166 +       struct pci_dev *dev;
57167 +
57168 +       spin_lock(&pci_bus_lock);
57169 +       while (!list_empty(&bus->devices)) {
57170 +               dev = container_of(bus->devices.next, struct pci_dev, bus_list);
57171 +               spin_unlock(&pci_bus_lock);
57172 +
57173 +               dev_dbg(&dev->dev, "removing device\n");
57174 +               pci_remove_bus_device(dev);
57175 +
57176 +               spin_lock(&pci_bus_lock);
57177 +       }
57178 +       spin_unlock(&pci_bus_lock);
57179 +}
57180 +
57181 +void pcifront_free_roots(struct pcifront_device *pdev)
57182 +{
57183 +       struct pci_bus_entry *bus_entry, *t;
57184 +
57185 +       dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
57186 +
57187 +       list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
57188 +               list_del(&bus_entry->list);
57189 +
57190 +               free_root_bus_devs(bus_entry->bus);
57191 +
57192 +               kfree(bus_entry->bus->sysdata);
57193 +
57194 +               device_unregister(bus_entry->bus->bridge);
57195 +               pci_remove_bus(bus_entry->bus);
57196 +
57197 +               kfree(bus_entry);
57198 +       }
57199 +}
57200 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pcifront/pcifront.h linux-2.6.16/drivers/xen/pcifront/pcifront.h
57201 --- linux-2.6.16.orig/drivers/xen/pcifront/pcifront.h   1970-01-01 01:00:00.000000000 +0100
57202 +++ linux-2.6.16/drivers/xen/pcifront/pcifront.h        2006-06-26 09:51:32.000000000 +0200
57203 @@ -0,0 +1,40 @@
57204 +/*
57205 + * PCI Frontend - Common data structures & function declarations
57206 + *
57207 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
57208 + */
57209 +#ifndef __XEN_PCIFRONT_H__
57210 +#define __XEN_PCIFRONT_H__
57211 +
57212 +#include <linux/spinlock.h>
57213 +#include <linux/pci.h>
57214 +#include <xen/xenbus.h>
57215 +#include <xen/interface/io/pciif.h>
57216 +#include <xen/pcifront.h>
57217 +
57218 +struct pci_bus_entry {
57219 +       struct list_head list;
57220 +       struct pci_bus *bus;
57221 +};
57222 +
57223 +struct pcifront_device {
57224 +       struct xenbus_device *xdev;
57225 +       struct list_head root_buses;
57226 +       spinlock_t dev_lock;
57227 +
57228 +       int evtchn;
57229 +       int gnt_ref;
57230 +
57231 +       /* Lock this when doing any operations in sh_info */
57232 +       spinlock_t sh_info_lock;
57233 +       struct xen_pci_sharedinfo *sh_info;
57234 +};
57235 +
57236 +int pcifront_connect(struct pcifront_device *pdev);
57237 +void pcifront_disconnect(struct pcifront_device *pdev);
57238 +
57239 +int pcifront_scan_root(struct pcifront_device *pdev,
57240 +                      unsigned int domain, unsigned int bus);
57241 +void pcifront_free_roots(struct pcifront_device *pdev);
57242 +
57243 +#endif /* __XEN_PCIFRONT_H__ */
57244 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/pcifront/xenbus.c linux-2.6.16/drivers/xen/pcifront/xenbus.c
57245 --- linux-2.6.16.orig/drivers/xen/pcifront/xenbus.c     1970-01-01 01:00:00.000000000 +0100
57246 +++ linux-2.6.16/drivers/xen/pcifront/xenbus.c  2006-06-26 09:51:32.000000000 +0200
57247 @@ -0,0 +1,294 @@
57248 +/*
57249 + * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
57250 + *
57251 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
57252 + */
57253 +#include <linux/module.h>
57254 +#include <linux/init.h>
57255 +#include <linux/mm.h>
57256 +#include <xen/xenbus.h>
57257 +#include "pcifront.h"
57258 +
57259 +#define INVALID_GRANT_REF (0)
57260 +#define INVALID_EVTCHN    (-1)
57261 +
57262 +static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
57263 +{
57264 +       struct pcifront_device *pdev;
57265 +
57266 +       pdev = kmalloc(sizeof(struct pcifront_device), GFP_KERNEL);
57267 +       if (pdev == NULL)
57268 +               goto out;
57269 +
57270 +       pdev->sh_info =
57271 +           (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL);
57272 +       if (pdev->sh_info == NULL) {
57273 +               kfree(pdev);
57274 +               pdev = NULL;
57275 +               goto out;
57276 +       }
57277 +       pdev->sh_info->flags = 0;
57278 +
57279 +       xdev->data = pdev;
57280 +       pdev->xdev = xdev;
57281 +
57282 +       INIT_LIST_HEAD(&pdev->root_buses);
57283 +
57284 +       spin_lock_init(&pdev->dev_lock);
57285 +       spin_lock_init(&pdev->sh_info_lock);
57286 +
57287 +       pdev->evtchn = INVALID_EVTCHN;
57288 +       pdev->gnt_ref = INVALID_GRANT_REF;
57289 +
57290 +       dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
57291 +               pdev, pdev->sh_info);
57292 +      out:
57293 +       return pdev;
57294 +}
57295 +
57296 +static void free_pdev(struct pcifront_device *pdev)
57297 +{
57298 +       dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev);
57299 +
57300 +       pcifront_free_roots(pdev);
57301 +
57302 +       if (pdev->evtchn != INVALID_EVTCHN)
57303 +               xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
57304 +
57305 +       if (pdev->gnt_ref != INVALID_GRANT_REF)
57306 +               gnttab_end_foreign_access(pdev->gnt_ref, 0,
57307 +                                         (unsigned long)pdev->sh_info);
57308 +
57309 +       pdev->xdev->data = NULL;
57310 +
57311 +       kfree(pdev);
57312 +}
57313 +
57314 +static int pcifront_publish_info(struct pcifront_device *pdev)
57315 +{
57316 +       int err = 0;
57317 +       xenbus_transaction_t trans;
57318 +
57319 +       err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
57320 +       if (err < 0)
57321 +               goto out;
57322 +
57323 +       pdev->gnt_ref = err;
57324 +
57325 +       err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
57326 +       if (err)
57327 +               goto out;
57328 +
57329 +      do_publish:
57330 +       err = xenbus_transaction_start(&trans);
57331 +       if (err) {
57332 +               xenbus_dev_fatal(pdev->xdev, err,
57333 +                                "Error writing configuration for backend "
57334 +                                "(start transaction)");
57335 +               goto out;
57336 +       }
57337 +
57338 +       err = xenbus_printf(trans, pdev->xdev->nodename,
57339 +                           "pci-op-ref", "%u", pdev->gnt_ref);
57340 +       if (!err)
57341 +               err = xenbus_printf(trans, pdev->xdev->nodename,
57342 +                                   "event-channel", "%u", pdev->evtchn);
57343 +       if (!err)
57344 +               err = xenbus_printf(trans, pdev->xdev->nodename,
57345 +                                   "magic", XEN_PCI_MAGIC);
57346 +
57347 +       if (err) {
57348 +               xenbus_transaction_end(trans, 1);
57349 +               xenbus_dev_fatal(pdev->xdev, err,
57350 +                                "Error writing configuration for backend");
57351 +               goto out;
57352 +       } else {
57353 +               err = xenbus_transaction_end(trans, 0);
57354 +               if (err == -EAGAIN)
57355 +                       goto do_publish;
57356 +               else if (err) {
57357 +                       xenbus_dev_fatal(pdev->xdev, err,
57358 +                                        "Error completing transaction "
57359 +                                        "for backend");
57360 +                       goto out;
57361 +               }
57362 +       }
57363 +
57364 +       xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
57365 +
57366 +       dev_dbg(&pdev->xdev->dev, "publishing successful!\n");
57367 +
57368 +      out:
57369 +       return err;
57370 +}
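
The do_publish label above implements the standard xenbus transaction retry: a store conflict makes xenbus_transaction_end() fail with -EAGAIN, in which case the whole transaction is replayed. The same idiom written as a loop, for clarity; publish_one_key_sketch() is not part of the patch:

static int publish_one_key_sketch(struct pcifront_device *pdev)
{
	xenbus_transaction_t trans;
	int err;

	do {
		err = xenbus_transaction_start(&trans);
		if (err)
			return err;
		err = xenbus_printf(trans, pdev->xdev->nodename,
				    "pci-op-ref", "%u", pdev->gnt_ref);
		if (err) {
			xenbus_transaction_end(trans, 1);	/* abort */
			return err;
		}
		err = xenbus_transaction_end(trans, 0);		/* commit */
	} while (err == -EAGAIN);

	return err;
}
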
57371 +
57372 +static int pcifront_try_connect(struct pcifront_device *pdev)
57373 +{
57374 +       int err = -EFAULT;
57375 +       int i, num_roots, len;
57376 +       char str[64];
57377 +       unsigned int domain, bus;
57378 +
57379 +       spin_lock(&pdev->dev_lock);
57380 +
57381 +       /* Only connect once */
57382 +       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
57383 +           XenbusStateInitialised)
57384 +               goto out;
57385 +
57386 +       err = pcifront_connect(pdev);
57387 +       if (err) {
57388 +               xenbus_dev_fatal(pdev->xdev, err,
57389 +                                "Error connecting PCI Frontend");
57390 +               goto out;
57391 +       }
57392 +
57393 +       err = xenbus_scanf(XBT_NULL, pdev->xdev->otherend,
57394 +                          "root_num", "%d", &num_roots);
57395 +       if (err == -ENOENT) {
57396 +               xenbus_dev_error(pdev->xdev, err,
57397 +                                "No PCI Roots found, trying 0000:00");
57398 +               err = pcifront_scan_root(pdev, 0, 0);
57399 +               num_roots = 0;
57400 +       } else if (err != 1) {
57401 +               if (err == 0)
57402 +                       err = -EINVAL;
57403 +               xenbus_dev_fatal(pdev->xdev, err,
57404 +                                "Error reading number of PCI roots");
57405 +               goto out;
57406 +       }
57407 +
57408 +       for (i = 0; i < num_roots; i++) {
57409 +               len = snprintf(str, sizeof(str), "root-%d", i);
57410 +               if (unlikely(len >= (sizeof(str) - 1))) {
57411 +                       err = -ENOMEM;
57412 +                       goto out;
57413 +               }
57414 +
57415 +               err = xenbus_scanf(XBT_NULL, pdev->xdev->otherend, str,
57416 +                                  "%x:%x", &domain, &bus);
57417 +               if (err != 2) {
57418 +                       if (err >= 0)
57419 +                               err = -EINVAL;
57420 +                       xenbus_dev_fatal(pdev->xdev, err,
57421 +                                        "Error reading PCI root %d", i);
57422 +                       goto out;
57423 +               }
57424 +
57425 +               err = pcifront_scan_root(pdev, domain, bus);
57426 +               if (err) {
57427 +                       xenbus_dev_fatal(pdev->xdev, err,
57428 +                                        "Error scanning PCI root %04x:%02x",
57429 +                                        domain, bus);
57430 +                       goto out;
57431 +               }
57432 +       }
57433 +
57434 +       err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
57435 +       if (err)
57436 +               goto out;
57437 +
57438 +      out:
57439 +       spin_unlock(&pdev->dev_lock);
57440 +       return err;
57441 +}
57442 +
57443 +static int pcifront_try_disconnect(struct pcifront_device *pdev)
57444 +{
57445 +       int err = 0;
57446 +       XenbusState prev_state;
57447 +
57448 +       spin_lock(&pdev->dev_lock);
57449 +
57450 +       prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
57451 +
57452 +       if (prev_state < XenbusStateClosing)
57453 +               err = xenbus_switch_state(pdev->xdev, XenbusStateClosing);
57454 +
57455 +       if (!err && prev_state == XenbusStateConnected)
57456 +               pcifront_disconnect(pdev);
57457 +
57458 +       spin_unlock(&pdev->dev_lock);
57459 +
57460 +       return err;
57461 +}
57462 +
57463 +static void pcifront_backend_changed(struct xenbus_device *xdev,
57464 +                                    XenbusState be_state)
57465 +{
57466 +       struct pcifront_device *pdev = xdev->data;
57467 +
57468 +       switch (be_state) {
57469 +       case XenbusStateClosing:
57470 +               dev_warn(&xdev->dev, "backend going away!\n");
57471 +               pcifront_try_disconnect(pdev);
57472 +               break;
57473 +
57474 +       case XenbusStateClosed:
57475 +               dev_warn(&xdev->dev, "backend went away!\n");
57476 +               pcifront_try_disconnect(pdev);
57477 +
57478 +               device_unregister(&pdev->xdev->dev);
57479 +               break;
57480 +
57481 +       case XenbusStateConnected:
57482 +               pcifront_try_connect(pdev);
57483 +               break;
57484 +
57485 +       default:
57486 +               break;
57487 +       }
57488 +}
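
In summary, the frontend reacts to the backend's state as follows (a sketch of the cases above):

/*
 * backend Connected -> pcifront_try_connect(): read root_num/root-N,
 *                      scan each root, switch to Connected
 * backend Closing   -> pcifront_try_disconnect(): move toward Closing
 * backend Closed    -> disconnect, then unregister the xenbus device
 */
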
57489 +
57490 +static int pcifront_xenbus_probe(struct xenbus_device *xdev,
57491 +                                const struct xenbus_device_id *id)
57492 +{
57493 +       int err = 0;
57494 +       struct pcifront_device *pdev = alloc_pdev(xdev);
57495 +
57496 +       if (pdev == NULL) {
57497 +               err = -ENOMEM;
57498 +               xenbus_dev_fatal(xdev, err,
57499 +                                "Error allocating pcifront_device struct");
57500 +               goto out;
57501 +       }
57502 +
57503 +       err = pcifront_publish_info(pdev);
57504 +
57505 +      out:
57506 +       return err;
57507 +}
57508 +
57509 +static int pcifront_xenbus_remove(struct xenbus_device *xdev)
57510 +{
57511 +       if (xdev->data)
57512 +               free_pdev(xdev->data);
57513 +
57514 +       return 0;
57515 +}
57516 +
57517 +static struct xenbus_device_id xenpci_ids[] = {
57518 +       {"pci"},
57519 +       {{0}},
57520 +};
57521 +
57522 +static struct xenbus_driver xenbus_pcifront_driver = {
57523 +       .name                   = "pcifront",
57524 +       .owner                  = THIS_MODULE,
57525 +       .ids                    = xenpci_ids,
57526 +       .probe                  = pcifront_xenbus_probe,
57527 +       .remove                 = pcifront_xenbus_remove,
57528 +       .otherend_changed       = pcifront_backend_changed,
57529 +};
57530 +
57531 +static int __init pcifront_init(void)
57532 +{
57533 +       int err = 0;
57534 +
57535 +       err = xenbus_register_frontend(&xenbus_pcifront_driver);
57536 +
57537 +       return err;
57538 +}
57539 +
57540 +/* Initialize after the Xen PCI Frontend Stub is initialized */
57541 +subsys_initcall(pcifront_init);
57542 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/privcmd/Makefile linux-2.6.16/drivers/xen/privcmd/Makefile
57543 --- linux-2.6.16.orig/drivers/xen/privcmd/Makefile      1970-01-01 01:00:00.000000000 +0100
57544 +++ linux-2.6.16/drivers/xen/privcmd/Makefile   2006-06-26 09:51:32.000000000 +0200
57545 @@ -0,0 +1,2 @@
57546 +
57547 +obj-y  := privcmd.o
57548 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/privcmd/privcmd.c linux-2.6.16/drivers/xen/privcmd/privcmd.c
57549 --- linux-2.6.16.orig/drivers/xen/privcmd/privcmd.c     1970-01-01 01:00:00.000000000 +0100
57550 +++ linux-2.6.16/drivers/xen/privcmd/privcmd.c  2006-06-26 09:51:32.000000000 +0200
57551 @@ -0,0 +1,302 @@
57552 +/******************************************************************************
57553 + * privcmd.c
57554 + * 
57555 + * Interface to privileged domain-0 commands.
57556 + * 
57557 + * Copyright (c) 2002-2004, K A Fraser, B Dragovic
57558 + */
57559 +
57560 +#include <linux/config.h>
57561 +#include <linux/kernel.h>
57562 +#include <linux/sched.h>
57563 +#include <linux/slab.h>
57564 +#include <linux/string.h>
57565 +#include <linux/errno.h>
57566 +#include <linux/mm.h>
57567 +#include <linux/mman.h>
57568 +#include <linux/swap.h>
57569 +#include <linux/smp_lock.h>
57570 +#include <linux/highmem.h>
57571 +#include <linux/pagemap.h>
57572 +#include <linux/seq_file.h>
57573 +#include <linux/kthread.h>
57574 +#include <asm/hypervisor.h>
57575 +
57576 +#include <asm/pgalloc.h>
57577 +#include <asm/pgtable.h>
57578 +#include <asm/uaccess.h>
57579 +#include <asm/tlb.h>
57580 +#include <asm/hypervisor.h>
57581 +#include <xen/public/privcmd.h>
57582 +#include <xen/interface/xen.h>
57583 +#include <xen/interface/dom0_ops.h>
57584 +#include <xen/xen_proc.h>
57585 +
57586 +static struct proc_dir_entry *privcmd_intf;
57587 +static struct proc_dir_entry *capabilities_intf;
57588 +
57589 +#define NR_HYPERCALLS 32
57590 +static DECLARE_BITMAP(hypercall_permission_map, NR_HYPERCALLS);
57591 +
57592 +static int privcmd_ioctl(struct inode *inode, struct file *file,
57593 +                         unsigned int cmd, unsigned long data)
57594 +{
57595 +       int ret = -ENOSYS;
57596 +       void __user *udata = (void __user *) data;
57597 +
57598 +       switch (cmd) {
57599 +       case IOCTL_PRIVCMD_HYPERCALL: {
57600 +               privcmd_hypercall_t hypercall;
57601 +  
57602 +               if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
57603 +                       return -EFAULT;
57604 +
57605 +               /* Check hypercall number for validity. */
57606 +               if (hypercall.op >= NR_HYPERCALLS)
57607 +                       return -EINVAL;
57608 +               if (!test_bit(hypercall.op, hypercall_permission_map))
57609 +                       return -EINVAL;
57610 +
57611 +#if defined(__i386__)
57612 +               __asm__ __volatile__ (
57613 +                       "pushl %%ebx; pushl %%ecx; pushl %%edx; "
57614 +                       "pushl %%esi; pushl %%edi; "
57615 +                       "movl  4(%%eax),%%ebx ;"
57616 +                       "movl  8(%%eax),%%ecx ;"
57617 +                       "movl 12(%%eax),%%edx ;"
57618 +                       "movl 16(%%eax),%%esi ;"
57619 +                       "movl 20(%%eax),%%edi ;"
57620 +                       "movl   (%%eax),%%eax ;"
57621 +                       "shll $5,%%eax ;"
57622 +                       "addl $hypercall_page,%%eax ;"
57623 +                       "call *%%eax ;"
57624 +                       "popl %%edi; popl %%esi; popl %%edx; "
57625 +                       "popl %%ecx; popl %%ebx"
57626 +                       : "=a" (ret) : "0" (&hypercall) : "memory" );
57627 +#elif defined (__x86_64__)
57628 +               {
57629 +                       long ign1, ign2, ign3;
57630 +                       __asm__ __volatile__ (
57631 +                               "movq %8,%%r10; movq %9,%%r8;"
57632 +                               "shlq $5,%%rax ;"
57633 +                               "addq $hypercall_page,%%rax ;"
57634 +                               "call *%%rax"
57635 +                               : "=a" (ret), "=D" (ign1),
57636 +                                 "=S" (ign2), "=d" (ign3)
57637 +                               : "0" ((unsigned long)hypercall.op), 
57638 +                               "1" ((unsigned long)hypercall.arg[0]), 
57639 +                               "2" ((unsigned long)hypercall.arg[1]),
57640 +                               "3" ((unsigned long)hypercall.arg[2]), 
57641 +                               "g" ((unsigned long)hypercall.arg[3]),
57642 +                               "g" ((unsigned long)hypercall.arg[4])
57643 +                               : "r8", "r10", "memory" );
57644 +               }
57645 +#elif defined (__ia64__)
57646 +               __asm__ __volatile__ (
57647 +                       ";; mov r14=%2; mov r15=%3; "
57648 +                       "mov r16=%4; mov r17=%5; mov r18=%6;"
57649 +                       "mov r2=%1; break 0x1000;; mov %0=r8 ;;"
57650 +                       : "=r" (ret)
57651 +                       : "r" (hypercall.op),
57652 +                       "r" (hypercall.arg[0]),
57653 +                       "r" (hypercall.arg[1]),
57654 +                       "r" (hypercall.arg[2]),
57655 +                       "r" (hypercall.arg[3]),
57656 +                       "r" (hypercall.arg[4])
57657 +                       : "r14","r15","r16","r17","r18","r2","r8","memory");
57658 +#endif
57659 +       }
57660 +       break;
57661 +
57662 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST)
57663 +       case IOCTL_PRIVCMD_MMAP: {
57664 +#define PRIVCMD_MMAP_SZ 32
57665 +               privcmd_mmap_t mmapcmd;
57666 +               privcmd_mmap_entry_t msg[PRIVCMD_MMAP_SZ];
57667 +               privcmd_mmap_entry_t __user *p;
57668 +               int i, rc;
57669 +
57670 +               if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
57671 +                       return -EFAULT;
57672 +
57673 +               p = mmapcmd.entry;
57674 +
57675 +               for (i = 0; i < mmapcmd.num;
57676 +                    i += PRIVCMD_MMAP_SZ, p += PRIVCMD_MMAP_SZ) {
57677 +                       int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)?
57678 +                               PRIVCMD_MMAP_SZ:(mmapcmd.num-i);
57679 +
57680 +                       if (copy_from_user(&msg, p,
57681 +                                          n*sizeof(privcmd_mmap_entry_t)))
57682 +                               return -EFAULT;
57683 +     
57684 +                       for (j = 0; j < n; j++) {
57685 +                               struct vm_area_struct *vma = 
57686 +                                       find_vma( current->mm, msg[j].va );
57687 +
57688 +                               if (!vma)
57689 +                                       return -EINVAL;
57690 +
57691 +                               if (msg[j].va > PAGE_OFFSET)
57692 +                                       return -EINVAL;
57693 +
57694 +                               if ((msg[j].va + (msg[j].npages << PAGE_SHIFT))
57695 +                                   > vma->vm_end )
57696 +                                       return -EINVAL;
57697 +
57698 +                               if ((rc = direct_remap_pfn_range(
57699 +                                       vma,
57700 +                                       msg[j].va&PAGE_MASK, 
57701 +                                       msg[j].mfn, 
57702 +                                       msg[j].npages<<PAGE_SHIFT, 
57703 +                                       vma->vm_page_prot,
57704 +                                       mmapcmd.dom)) < 0)
57705 +                                       return rc;
57706 +                       }
57707 +               }
57708 +               ret = 0;
57709 +       }
57710 +       break;
57711 +
57712 +       case IOCTL_PRIVCMD_MMAPBATCH: {
57713 +               mmu_update_t u;
57714 +               privcmd_mmapbatch_t m;
57715 +               struct vm_area_struct *vma = NULL;
57716 +               unsigned long __user *p;
57717 +               unsigned long addr, mfn; 
57718 +               uint64_t ptep;
57719 +               int i;
57720 +
57721 +               if (copy_from_user(&m, udata, sizeof(m))) {
57722 +                       ret = -EFAULT;
57723 +                       goto batch_err;
57724 +               }
57725 +
57726 +               if (m.dom == DOMID_SELF) {
57727 +                       ret = -EINVAL;
57728 +                       goto batch_err;
57729 +               }
57730 +
57731 +               vma = find_vma(current->mm, m.addr);
57732 +               if (!vma) {
57733 +                       ret = -EINVAL;
57734 +                       goto batch_err;
57735 +               }
57736 +
57737 +               if (m.addr > PAGE_OFFSET) {
57738 +                       ret = -EFAULT;
57739 +                       goto batch_err;
57740 +               }
57741 +
57742 +               if ((m.addr + (m.num<<PAGE_SHIFT)) > vma->vm_end) {
57743 +                       ret = -EFAULT;
57744 +                       goto batch_err;
57745 +               }
57746 +
57747 +               p = m.arr;
57748 +               addr = m.addr;
57749 +               for (i = 0; i < m.num; i++, addr += PAGE_SIZE, p++) {
57750 +                       if (get_user(mfn, p))
57751 +                               return -EFAULT;
57752 +#ifdef __ia64__
57753 +                       ret = remap_pfn_range(vma,
57754 +                                             addr&PAGE_MASK,
57755 +                                             mfn,
57756 +                                             1<<PAGE_SHIFT,
57757 +                                             vma->vm_page_prot);
57758 +                       if (ret < 0)
57759 +                           goto batch_err;
57760 +#else
57761 +
57762 +                       ret = create_lookup_pte_addr(vma->vm_mm, addr, &ptep);
57763 +                       if (ret)
57764 +                               goto batch_err;
57765 +
57766 +                       u.val = pte_val_ma(pfn_pte_ma(mfn, vma->vm_page_prot));
57767 +                       u.ptr = ptep;
57768 +
57769 +                       if (HYPERVISOR_mmu_update(&u, 1, NULL, m.dom) < 0)
57770 +                               put_user(0xF0000000 | mfn, p);
57771 +#endif
57772 +               }
57773 +
57774 +               ret = 0;
57775 +               break;
57776 +
57777 +       batch_err:
57778 +               printk(KERN_WARNING "batch_err ret=%d vma=%p addr=%lx "
57779 +                      "num=%d arr=%p %lx-%lx\n", 
57780 +                      ret, vma, m.addr, m.num, m.arr,
57781 +                      vma ? vma->vm_start : 0, vma ? vma->vm_end : 0);
57782 +               break;
57783 +       }
57784 +       break;
57785 +#endif
57786 +
57787 +       default:
57788 +               ret = -EINVAL;
57789 +               break;
57790 +       }
57791 +
57792 +       return ret;
57793 +}
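
For context, a user-space sketch of driving the hypercall ioctl above through /proc/xen/privcmd; the header paths and the XENVER_version sub-op value (0) are assumptions typical of Xen trees of this era, and error handling is elided:

/* User-space sketch, not part of the patch. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xen/public/privcmd.h>		/* privcmd_hypercall_t, ioctl numbers */
#include <xen/interface/xen.h>		/* __HYPERVISOR_xen_version */

int xen_version_example(void)
{
	privcmd_hypercall_t call = {
		.op  = __HYPERVISOR_xen_version,
		.arg = { 0 },		/* XENVER_version (assumed value) */
	};
	int fd = open("/proc/xen/privcmd", O_RDWR);
	int ret;

	if (fd < 0)
		return -1;
	ret = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
	close(fd);
	return ret;
}
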
57794 +
57795 +static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
57796 +{
57797 +       /* DONTCOPY is essential for Xen as copy_page_range is broken. */
57798 +       vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
57799 +
57800 +       return 0;
57801 +}
57802 +
57803 +static struct file_operations privcmd_file_ops = {
57804 +       .ioctl = privcmd_ioctl,
57805 +       .mmap  = privcmd_mmap,
57806 +};
57807 +
57808 +static int capabilities_read(char *page, char **start, off_t off,
57809 +                        int count, int *eof, void *data)
57810 +{
57811 +       int len = 0;
57812 +       *page = 0;
57813 +
57814 +       if (xen_start_info->flags & SIF_INITDOMAIN)
57815 +               len = sprintf(page, "control_d\n");
57816 +
57817 +       *eof = 1;
57818 +       return len;
57819 +}
57820 +
57821 +static int __init privcmd_init(void)
57822 +{
57823 +       /* Set of hypercalls that privileged applications may execute. */
57824 +       set_bit(__HYPERVISOR_acm_op,           hypercall_permission_map);
57825 +       set_bit(__HYPERVISOR_dom0_op,          hypercall_permission_map);
57826 +       set_bit(__HYPERVISOR_event_channel_op, hypercall_permission_map);
57827 +       set_bit(__HYPERVISOR_memory_op,        hypercall_permission_map);
57828 +       set_bit(__HYPERVISOR_mmu_update,       hypercall_permission_map);
57829 +       set_bit(__HYPERVISOR_mmuext_op,        hypercall_permission_map);
57830 +       set_bit(__HYPERVISOR_xen_version,      hypercall_permission_map);
57831 +
57832 +       privcmd_intf = create_xen_proc_entry("privcmd", 0400);
57833 +       if (privcmd_intf != NULL)
57834 +               privcmd_intf->proc_fops = &privcmd_file_ops;
57835 +
57836 +       capabilities_intf = create_xen_proc_entry("capabilities", 0400);
57837 +       if (capabilities_intf != NULL)
57838 +               capabilities_intf->read_proc = capabilities_read;
57839 +
57840 +       return 0;
57841 +}
57842 +
57843 +__initcall(privcmd_init);
57844 +
57845 +/*
57846 + * Local variables:
57847 + *  c-file-style: "linux"
57848 + *  indent-tabs-mode: t
57849 + *  c-indent-level: 8
57850 + *  c-basic-offset: 8
57851 + *  tab-width: 8
57852 + * End:
57853 + */
57854 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/tpmback/Makefile linux-2.6.16/drivers/xen/tpmback/Makefile
57855 --- linux-2.6.16.orig/drivers/xen/tpmback/Makefile      1970-01-01 01:00:00.000000000 +0100
57856 +++ linux-2.6.16/drivers/xen/tpmback/Makefile   2006-06-26 09:51:32.000000000 +0200
57857 @@ -0,0 +1,4 @@
57858 +
57859 +obj-$(CONFIG_XEN_TPMDEV_BACKEND)       += tpmbk.o
57860 +
57861 +tpmbk-y += tpmback.o interface.o xenbus.o
57862 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/tpmback/common.h linux-2.6.16/drivers/xen/tpmback/common.h
57863 --- linux-2.6.16.orig/drivers/xen/tpmback/common.h      1970-01-01 01:00:00.000000000 +0100
57864 +++ linux-2.6.16/drivers/xen/tpmback/common.h   2006-06-26 09:51:32.000000000 +0200
57865 @@ -0,0 +1,91 @@
57866 +/******************************************************************************
57867 + * drivers/xen/tpmback/common.h
57868 + */
57869 +
57870 +#ifndef __TPMIF__BACKEND__COMMON_H__
57871 +#define __TPMIF__BACKEND__COMMON_H__
57872 +
57873 +#include <linux/config.h>
57874 +#include <linux/version.h>
57875 +#include <linux/module.h>
57876 +#include <linux/interrupt.h>
57877 +#include <linux/slab.h>
57878 +#include <xen/evtchn.h>
57879 +#include <xen/driver_util.h>
57880 +#include <xen/interface/grant_table.h>
57881 +#include <xen/interface/io/tpmif.h>
57882 +#include <asm/io.h>
57883 +#include <asm/pgalloc.h>
57884 +
57885 +#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
57886 +                                    __FILE__ , __LINE__ , ## _a )
57887 +
57888 +typedef struct tpmif_st {
57889 +       struct list_head tpmif_list;
57890 +       /* Unique identifier for this interface. */
57891 +       domid_t domid;
57892 +       unsigned int handle;
57893 +
57894 +       /* Physical parameters of the comms window. */
57895 +       unsigned int evtchn;
57896 +       unsigned int irq;
57897 +
57898 +       /* The shared rings and indexes. */
57899 +       tpmif_tx_interface_t *tx;
57900 +       struct vm_struct *tx_area;
57901 +
57902 +       /* Miscellaneous private stuff. */
57903 +       enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
57904 +       int active;
57905 +
57906 +       struct tpmif_st *hash_next;
57907 +       struct list_head list;  /* scheduling list */
57908 +       atomic_t refcnt;
57909 +
57910 +       long int tpm_instance;
57911 +       unsigned long mmap_vstart;
57912 +
57913 +       struct work_struct work;
57914 +
57915 +       grant_handle_t shmem_handle;
57916 +       grant_ref_t shmem_ref;
57917 +} tpmif_t;
57918 +
57919 +void tpmif_disconnect_complete(tpmif_t * tpmif);
57920 +tpmif_t *tpmif_find(domid_t domid, long int instance);
57921 +void tpmif_interface_init(void);
57922 +void tpmif_interface_exit(void);
57923 +void tpmif_schedule_work(tpmif_t * tpmif);
57924 +void tpmif_deschedule_work(tpmif_t * tpmif);
57925 +void tpmif_xenbus_init(void);
57926 +void tpmif_xenbus_exit(void);
57927 +int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
57928 +irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
57929 +int tpmif_vtpm_open(tpmif_t *tpmif, domid_t domain, u32 instance);
57930 +int tpmif_vtpm_close(u32 instance);
57931 +
57932 +int vtpm_release_packets(tpmif_t * tpmif, int send_msgs);
57933 +
57934 +#define tpmif_get(_b) (atomic_inc(&(_b)->refcnt))
57935 +#define tpmif_put(_b)                             \
57936 +    do {                                          \
57937 +        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
57938 +            tpmif_disconnect_complete(_b);        \
57939 +    } while (0)
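
The macros encode the usual last-reference rule: the final tpmif_put() drops refcnt to zero and triggers tpmif_disconnect_complete() (defined in interface.c, later in this patch). An illustrative pairing; refcount_example() is not part of the patch:

static void refcount_example(domid_t domid, long int instance)
{
	tpmif_t *tpmif = tpmif_find(domid, instance);	/* takes a reference */

	if (IS_ERR(tpmif))
		return;
	/* ... use the interface ... */
	tpmif_put(tpmif);	/* last put schedules the disconnect */
}
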
57940 +
57941 +
57942 +extern int num_frontends;
57943 +
57944 +#define MMAP_VADDR(t,_req) ((t)->mmap_vstart + ((_req) * PAGE_SIZE))
57945 +
57946 +#endif /* __TPMIF__BACKEND__COMMON_H__ */
57947 +
57948 +/*
57949 + * Local variables:
57950 + *  c-file-style: "linux"
57951 + *  indent-tabs-mode: t
57952 + *  c-indent-level: 8
57953 + *  c-basic-offset: 8
57954 + *  tab-width: 8
57955 + * End:
57956 + */
57957 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/tpmback/interface.c linux-2.6.16/drivers/xen/tpmback/interface.c
57958 --- linux-2.6.16.orig/drivers/xen/tpmback/interface.c   1970-01-01 01:00:00.000000000 +0100
57959 +++ linux-2.6.16/drivers/xen/tpmback/interface.c        2006-06-26 09:51:32.000000000 +0200
57960 @@ -0,0 +1,194 @@
57961 +/******************************************************************************
57962 + * drivers/xen/tpmback/interface.c
57963 + *
57964 + * Virtual TPM interface management.
57965 + *
57966 + * Copyright (c) 2005, IBM Corporation
57967 + *
57968 + * Author: Stefan Berger, stefanb@us.ibm.com
57969 + *
57970 + * This code has been derived from drivers/xen/netback/interface.c
57971 + * Copyright (c) 2004, Keir Fraser
57972 + */
57973 +
57974 +#include "common.h"
57975 +#include <xen/balloon.h>
57976 +
57977 +static kmem_cache_t *tpmif_cachep;
57978 +int num_frontends = 0;
57979 +
57980 +LIST_HEAD(tpmif_list);
57981 +
57982 +static tpmif_t *alloc_tpmif(domid_t domid, long int instance)
57983 +{
57984 +       struct page *page;
57985 +       tpmif_t *tpmif;
57986 +
57987 +       tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL);
57988 +       if (!tpmif)
57989 +               return ERR_PTR(-ENOMEM);
57990 +
57991 +       memset(tpmif, 0, sizeof (*tpmif));
57992 +       tpmif->domid = domid;
57993 +       tpmif->status = DISCONNECTED;
57994 +       tpmif->tpm_instance = instance;
57995 +       atomic_set(&tpmif->refcnt, 1);
57996 +
57997 +       page = balloon_alloc_empty_page_range(TPMIF_TX_RING_SIZE);
57998 +       BUG_ON(page == NULL);
57999 +       tpmif->mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
58000 +
58001 +       list_add(&tpmif->tpmif_list, &tpmif_list);
58002 +       num_frontends++;
58003 +
58004 +       return tpmif;
58005 +}
58006 +
58007 +static void free_tpmif(tpmif_t * tpmif)
58008 +{
58009 +       num_frontends--;
58010 +       list_del(&tpmif->tpmif_list);
58011 +       kmem_cache_free(tpmif_cachep, tpmif);
58012 +}
58013 +
58014 +tpmif_t *tpmif_find(domid_t domid, long int instance)
58015 +{
58016 +       tpmif_t *tpmif;
58017 +
58018 +       list_for_each_entry(tpmif, &tpmif_list, tpmif_list) {
58019 +               if (tpmif->tpm_instance == instance) {
58020 +                       if (tpmif->domid == domid) {
58021 +                               tpmif_get(tpmif);
58022 +                               return tpmif;
58023 +                       } else {
58024 +                               return ERR_PTR(-EEXIST);
58025 +                       }
58026 +               }
58027 +       }
58028 +
58029 +       return alloc_tpmif(domid, instance);
58030 +}
58031 +
58032 +static int map_frontend_page(tpmif_t *tpmif, unsigned long shared_page)
58033 +{
58034 +       int ret;
58035 +       struct gnttab_map_grant_ref op = {
58036 +               .host_addr = (unsigned long)tpmif->tx_area->addr,
58037 +               .flags = GNTMAP_host_map,
58038 +               .ref = shared_page,
58039 +               .dom = tpmif->domid,
58040 +       };
58041 +
58042 +       lock_vm_area(tpmif->tx_area);
58043 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
58044 +       unlock_vm_area(tpmif->tx_area);
58045 +       BUG_ON(ret);
58046 +
58047 +       if (op.status) {
58048 +               DPRINTK("Grant table operation failure!\n");
58049 +               return op.status;
58050 +       }
58051 +
58052 +       tpmif->shmem_ref = shared_page;
58053 +       tpmif->shmem_handle = op.handle;
58054 +
58055 +       return 0;
58056 +}
58057 +
58058 +static void unmap_frontend_page(tpmif_t *tpmif)
58059 +{
58060 +       struct gnttab_unmap_grant_ref op;
58061 +       int ret;
58062 +
58063 +       op.host_addr    = (unsigned long)tpmif->tx_area->addr;
58064 +       op.handle       = tpmif->shmem_handle;
58065 +       op.dev_bus_addr = 0;
58066 +
58067 +       lock_vm_area(tpmif->tx_area);
58068 +       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
58069 +       unlock_vm_area(tpmif->tx_area);
58070 +       BUG_ON(ret);
58071 +}
58072 +
58073 +int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn)
58074 +{
58075 +       int err;
58076 +       evtchn_op_t op = {
58077 +               .cmd = EVTCHNOP_bind_interdomain,
58078 +               .u.bind_interdomain.remote_dom = tpmif->domid,
58079 +               .u.bind_interdomain.remote_port = evtchn,
58080 +        };
58081 +
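+       /* A non-zero irq means the ring is already mapped and the event
+        * channel bound; nothing to do. */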
58082 +        if (tpmif->irq) {
58083 +                return 0;
58084 +        }
58085 +
58086 +       if ((tpmif->tx_area = alloc_vm_area(PAGE_SIZE)) == NULL)
58087 +               return -ENOMEM;
58088 +
58089 +       err = map_frontend_page(tpmif, shared_page);
58090 +       if (err) {
58091 +               free_vm_area(tpmif->tx_area);
58092 +               return err;
58093 +       }
58094 +
58095 +       err = HYPERVISOR_event_channel_op(&op);
58096 +       if (err) {
58097 +               unmap_frontend_page(tpmif);
58098 +               free_vm_area(tpmif->tx_area);
58099 +               return err;
58100 +       }
58101 +
58102 +       tpmif->evtchn = op.u.bind_interdomain.local_port;
58103 +
58104 +       tpmif->tx = (tpmif_tx_interface_t *)tpmif->tx_area->addr;
58105 +
58106 +       tpmif->irq = bind_evtchn_to_irqhandler(
58107 +               tpmif->evtchn, tpmif_be_int, 0, "tpmif-backend", tpmif);
58108 +       tpmif->shmem_ref = shared_page;
58109 +       tpmif->active = 1;
58110 +
58111 +       return 0;
58112 +}
58113 +
58114 +static void __tpmif_disconnect_complete(void *arg)
58115 +{
58116 +       tpmif_t *tpmif = (tpmif_t *) arg;
58117 +
58118 +       if (tpmif->irq)
58119 +               unbind_from_irqhandler(tpmif->irq, tpmif);
58120 +
58121 +       if (tpmif->tx) {
58122 +               unmap_frontend_page(tpmif);
58123 +               free_vm_area(tpmif->tx_area);
58124 +       }
58125 +
58126 +       free_tpmif(tpmif);
58127 +}
58128 +
58129 +void tpmif_disconnect_complete(tpmif_t * tpmif)
58130 +{
58131 +       INIT_WORK(&tpmif->work, __tpmif_disconnect_complete, (void *)tpmif);
58132 +       schedule_work(&tpmif->work);
58133 +}
58134 +
58135 +void __init tpmif_interface_init(void)
58136 +{
58137 +       tpmif_cachep = kmem_cache_create("tpmif_cache", sizeof (tpmif_t),
58138 +                                        0, 0, NULL, NULL);
58139 +}
58140 +
58141 +void tpmif_interface_exit(void)
58142 +{
58143 +       kmem_cache_destroy(tpmif_cachep);
58144 +}
58145 +
58146 +/*
58147 + * Local variables:
58148 + *  c-file-style: "linux"
58149 + *  indent-tabs-mode: t
58150 + *  c-indent-level: 8
58151 + *  c-basic-offset: 8
58152 + *  tab-width: 8
58153 + * End:
58154 + */
58155 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/tpmback/tpmback.c linux-2.6.16/drivers/xen/tpmback/tpmback.c
58156 --- linux-2.6.16.orig/drivers/xen/tpmback/tpmback.c     1970-01-01 01:00:00.000000000 +0100
58157 +++ linux-2.6.16/drivers/xen/tpmback/tpmback.c  2006-06-26 09:51:32.000000000 +0200
58158 @@ -0,0 +1,1060 @@
58159 +/******************************************************************************
58160 + * drivers/xen/tpmback/tpmback.c
58161 + *
58162 + * Copyright (c) 2005, IBM Corporation
58163 + *
58164 + * Author: Stefan Berger, stefanb@us.ibm.com
58165 + * Grant table support: Mahadevan Gomathisankaran
58166 + *
58167 + * This code has been derived from drivers/xen/netback/netback.c
58168 + * Copyright (c) 2002-2004, K A Fraser
58169 + *
58170 + */
58171 +
58172 +#include "common.h"
58173 +#include <xen/evtchn.h>
58174 +
58175 +#include <linux/types.h>
58176 +#include <linux/list.h>
58177 +#include <linux/miscdevice.h>
58178 +#include <linux/poll.h>
58179 +#include <asm/uaccess.h>
58180 +#include <xen/xenbus.h>
58181 +#include <xen/interface/grant_table.h>
58182 +
58183 +/* local data structures */
58184 +struct data_exchange {
58185 +       struct list_head pending_pak;
58186 +       struct list_head current_pak;
58187 +       unsigned int copied_so_far;
58188 +       u8 has_opener;
58189 +       rwlock_t pak_lock;      // protects all of the previous fields
58190 +       wait_queue_head_t wait_queue;
58191 +};
58192 +
58193 +struct vtpm_resp_hdr {
58194 +       uint32_t instance_no;
58195 +       uint16_t tag_no;
58196 +       uint32_t len_no;
58197 +       uint32_t ordinal_no;
58198 +} __attribute__ ((packed));
58199 +
58200 +struct packet {
58201 +       struct list_head next;
58202 +       unsigned int data_len;
58203 +       u8 *data_buffer;
58204 +       tpmif_t *tpmif;
58205 +       u32 tpm_instance;
58206 +       u8 req_tag;
58207 +       u32 last_read;
58208 +       u8 flags;
58209 +       struct timer_list processing_timer;
58210 +};
58211 +
58212 +enum {
58213 +       PACKET_FLAG_DISCARD_RESPONSE = 1,
58214 +       PACKET_FLAG_CHECK_RESPONSESTATUS = 2,
58215 +};
58216 +
58217 +/* local variables */
58218 +static struct data_exchange dataex;
58219 +
58220 +/* local function prototypes */
58221 +static int _packet_write(struct packet *pak,
58222 +                        const char *data, size_t size, int userbuffer);
58223 +static void processing_timeout(unsigned long ptr);
58224 +static int packet_read_shmem(struct packet *pak,
58225 +                            tpmif_t * tpmif,
58226 +                            u32 offset,
58227 +                            char *buffer, int isuserbuffer, u32 left);
58228 +static int vtpm_queue_packet(struct packet *pak);
58229 +
58230 +#define MIN(x,y)  (((x) < (y)) ? (x) : (y))
58231 +
58232 +/***************************************************************
58233 + Buffer copying for user and kernel space buffers.
58234 +***************************************************************/
58235 +static inline int copy_from_buffer(void *to,
58236 +                                  const void *from, unsigned long size,
58237 +                                  int isuserbuffer)
58238 +{
58239 +       if (isuserbuffer) {
58240 +               if (copy_from_user(to, (void __user *)from, size))
58241 +                       return -EFAULT;
58242 +       } else {
58243 +               memcpy(to, from, size);
58244 +       }
58245 +       return 0;
58246 +}
58247 +
58248 +static inline int copy_to_buffer(void *to,
58249 +                                const void *from, unsigned long size,
58250 +                                int isuserbuffer)
58251 +{
58252 +       if (isuserbuffer) {
58253 +               if (copy_to_user((void __user *)to, from, size))
58254 +                       return -EFAULT;
58255 +       } else {
58256 +               memcpy(to, from, size);
58257 +       }
58258 +       return 0;
58259 +}
58260 +
58261 +/***************************************************************
58262 + Packet-related functions
58263 +***************************************************************/
58264 +
58265 +static struct packet *packet_find_instance(struct list_head *head,
58266 +                                          u32 tpm_instance)
58267 +{
58268 +       struct packet *pak;
58269 +       struct list_head *p;
58270 +
58271 +       /*
58272 +        * traverse the list of packets and return the first
58273 +        * one with the given instance number
58274 +        */
58275 +       list_for_each(p, head) {
58276 +               pak = list_entry(p, struct packet, next);
58277 +
58278 +               if (pak->tpm_instance == tpm_instance) {
58279 +                       return pak;
58280 +               }
58281 +       }
58282 +       return NULL;
58283 +}
58284 +
58285 +static struct packet *packet_find_packet(struct list_head *head, void *packet)
58286 +{
58287 +       struct packet *pak;
58288 +       struct list_head *p;
58289 +
58290 +       /*
58291 +        * traverse the list of packets and return the
58292 +        * one that matches the given packet pointer
58293 +        */
58294 +       list_for_each(p, head) {
58295 +               pak = list_entry(p, struct packet, next);
58296 +
58297 +               if (pak == packet) {
58298 +                       return pak;
58299 +               }
58300 +       }
58301 +       return NULL;
58302 +}
58303 +
58304 +static struct packet *packet_alloc(tpmif_t * tpmif,
58305 +                                  u32 size, u8 req_tag, u8 flags)
58306 +{
58307 +       struct packet *pak = NULL;
58308 +       pak = kzalloc(sizeof (struct packet), GFP_KERNEL);
58309 +       if (NULL != pak) {
58310 +               if (tpmif) {
58311 +                       pak->tpmif = tpmif;
58312 +                       pak->tpm_instance = tpmif->tpm_instance;
58313 +               }
58314 +               pak->data_len = size;
58315 +               pak->req_tag = req_tag;
58316 +               pak->last_read = 0;
58317 +               pak->flags = flags;
58318 +
58319 +               /*
58320 +                * cannot do tpmif_get(tpmif); bad things happen
58321 +                * on the last tpmif_put()
58322 +                */
58323 +               init_timer(&pak->processing_timer);
58324 +               pak->processing_timer.function = processing_timeout;
58325 +               pak->processing_timer.data = (unsigned long)pak;
58326 +       }
58327 +       return pak;
58328 +}
58329 +
58330 +static inline void packet_reset(struct packet *pak)
58331 +{
58332 +       pak->last_read = 0;
58333 +}
58334 +
58335 +static void packet_free(struct packet *pak)
58336 +{
58337 +       if (timer_pending(&pak->processing_timer)) {
58338 +               BUG();
58339 +       }
58340 +       kfree(pak->data_buffer);
58341 +       /*
58342 +        * cannot do tpmif_put(pak->tpmif); bad things happen
58343 +        * on the last tpmif_put()
58344 +        */
58345 +       kfree(pak);
58346 +}
58347 +
58348 +static int packet_set(struct packet *pak,
58349 +                     const unsigned char *buffer, u32 size)
58350 +{
58351 +       int rc = 0;
58352 +       unsigned char *buf = kmalloc(size, GFP_KERNEL);
58353 +
58354 +       if (buf) {
58355 +               pak->data_buffer = buf;
58356 +               memcpy(buf, buffer, size);
58357 +               pak->data_len = size;
58358 +       } else {
58359 +               rc = -ENOMEM;
58360 +       }
58361 +       return rc;
58362 +}
58363 +
58364 +/*
58365 + * Write data to the shared memory and send it to the FE.
58366 + */
58367 +static int packet_write(struct packet *pak,
58368 +                       const char *data, size_t size, int isuserbuffer)
58369 +{
58370 +       int rc = 0;
58371 +
58372 +       if ((pak->flags & PACKET_FLAG_CHECK_RESPONSESTATUS)) {
58373 +#ifdef CONFIG_XEN_TPMDEV_CLOSE_IF_VTPM_FAILS
58374 +               u32 res;
58375 +
58376 +               if (copy_from_buffer(&res,
58377 +                                    &data[2 + 4], sizeof (res),
58378 +                                    isuserbuffer)) {
58379 +                       return -EFAULT;
58380 +               }
58381 +
58382 +               if (res != 0) {
58383 +                       /*
58384 +                        * Close down this device. The FE should be
58385 +                        * notified about the closure.
58386 +                        */
58387 +                       if (!pak->tpmif) {
58388 +                               return -EFAULT;
58389 +                       }
58390 +                       pak->tpmif->status = DISCONNECTING;
58391 +               }
58392 +#endif
58393 +       }
58394 +
58395 +       if (0 != (pak->flags & PACKET_FLAG_DISCARD_RESPONSE)) {
58396 +               /* Don't send a response to this packet. Just acknowledge it. */
58397 +               rc = size;
58398 +       } else {
58399 +               rc = _packet_write(pak, data, size, isuserbuffer);
58400 +       }
58401 +
58402 +       return rc;
58403 +}
58404 +
58405 +int _packet_write(struct packet *pak,
58406 +                 const char *data, size_t size, int isuserbuffer)
58407 +{
58408 +       /*
58409 +        * Write into the shared memory pages directly
58410 +        * and send it to the front end.
58411 +        */
58412 +       tpmif_t *tpmif = pak->tpmif;
58413 +       grant_handle_t handle;
58414 +       int rc = 0;
58415 +       unsigned int i = 0;
58416 +       unsigned int offset = 0;
58417 +
58418 +       if (tpmif == NULL) {
58419 +               return -EFAULT;
58420 +       }
58421 +
58422 +       if (tpmif->status == DISCONNECTED) {
58423 +               return size;
58424 +       }
58425 +
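+       /* Walk the tx ring: map each granted page, copy up to one page of
+        * data into it, then unmap it again. */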
58426 +       while (offset < size && i < TPMIF_TX_RING_SIZE) {
58427 +               unsigned int tocopy;
58428 +               struct gnttab_map_grant_ref map_op;
58429 +               struct gnttab_unmap_grant_ref unmap_op;
58430 +               tpmif_tx_request_t *tx;
58431 +
58432 +               tx = &tpmif->tx->ring[i].req;
58433 +
58434 +               if (0 == tx->addr) {
58435 +                       DPRINTK("ERROR: Buffer for outgoing packet NULL?! i=%d\n", i);
58436 +                       return 0;
58437 +               }
58438 +
58439 +               map_op.host_addr = MMAP_VADDR(tpmif, i);
58440 +               map_op.flags = GNTMAP_host_map;
58441 +               map_op.ref = tx->ref;
58442 +               map_op.dom = tpmif->domid;
58443 +
58444 +               if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
58445 +                                                      &map_op, 1))) {
58446 +                       BUG();
58447 +               }
58448 +
58449 +               handle = map_op.handle;
58450 +
58451 +               if (map_op.status) {
58452 +                       DPRINTK(" Grant table operation failure !\n");
58453 +                       return 0;
58454 +               }
58455 +               set_phys_to_machine(__pa(MMAP_VADDR(tpmif, i)) >> PAGE_SHIFT,
58456 +                                   FOREIGN_FRAME(map_op.dev_bus_addr >> PAGE_SHIFT));
58458 +
58459 +               tocopy = MIN(size - offset, PAGE_SIZE);
58460 +
58461 +               if (copy_from_buffer((void *)(MMAP_VADDR(tpmif, i) |
58462 +                                             (tx->addr & ~PAGE_MASK)),
58463 +                                    &data[offset], tocopy, isuserbuffer)) {
58464 +                       tpmif_put(tpmif);
58465 +                       return -EFAULT;
58466 +               }
58467 +               tx->size = tocopy;
58468 +
58469 +               unmap_op.host_addr = MMAP_VADDR(tpmif, i);
58470 +               unmap_op.handle = handle;
58471 +               unmap_op.dev_bus_addr = 0;
58472 +
58473 +               if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
58474 +                                                      &unmap_op, 1))) {
58476 +                       BUG();
58477 +               }
58478 +
58479 +               offset += tocopy;
58480 +               i++;
58481 +       }
58482 +
58483 +       rc = offset;
58484 +       DPRINTK("Notifying frontend via irq %d\n", tpmif->irq);
58485 +       notify_remote_via_irq(tpmif->irq);
58486 +
58487 +       return rc;
58488 +}
58489 +
58490 +/*
58491 + * Read data from the shared memory and copy it directly into the
58492 + * provided buffer. Advance the read_last indicator which tells
58493 + * how many bytes have already been read.
58494 + */
58495 +static int packet_read(struct packet *pak, size_t numbytes,
58496 +                      char *buffer, size_t buffersize, int isuserbuffer)
58497 +{
58498 +       tpmif_t *tpmif = pak->tpmif;
58499 +
58500 +       /*
58501 +        * Read 'numbytes' of data from the buffer. The first 4
58502 +        * bytes are the instance number in network byte order,
58503 +        * after that come the data from the shared memory buffer.
58504 +        */
58505 +       u32 to_copy;
58506 +       u32 offset = 0;
58507 +       u32 room_left = buffersize;
58508 +
58509 +       if (pak->last_read < 4) {
58510 +               /*
58511 +                * copy the instance number into the buffer
58512 +                */
58513 +               u32 instance_no = htonl(pak->tpm_instance);
58514 +               u32 last_read = pak->last_read;
58515 +
58516 +               to_copy = MIN(4 - last_read, numbytes);
58517 +
58518 +               if (copy_to_buffer(&buffer[0],
58519 +                                  &(((u8 *)&instance_no)[last_read]),
58520 +                                  to_copy, isuserbuffer)) {
58521 +                       return -EFAULT;
58522 +               }
58523 +
58524 +               pak->last_read += to_copy;
58525 +               offset += to_copy;
58526 +               room_left -= to_copy;
58527 +       }
58528 +
58529 +       /*
58530 +        * If the packet has a data buffer appended, read from it...
58531 +        */
58532 +
58533 +       if (room_left > 0) {
58534 +               if (pak->data_buffer) {
58535 +                       u32 to_copy = MIN(pak->data_len - offset, room_left);
58536 +                       u32 last_read = pak->last_read - 4;
58537 +
58538 +                       if (copy_to_buffer(&buffer[offset],
58539 +                                          &pak->data_buffer[last_read],
58540 +                                          to_copy, isuserbuffer)) {
58541 +                               return -EFAULT;
58542 +                       }
58543 +                       pak->last_read += to_copy;
58544 +                       offset += to_copy;
58545 +               } else {
58546 +                       offset = packet_read_shmem(pak,
58547 +                                                  tpmif,
58548 +                                                  offset,
58549 +                                                  buffer,
58550 +                                                  isuserbuffer, room_left);
58551 +               }
58552 +       }
58553 +       return offset;
58554 +}
58555 +
58556 +static int packet_read_shmem(struct packet *pak,
58557 +                            tpmif_t * tpmif,
58558 +                            u32 offset, char *buffer, int isuserbuffer,
58559 +                            u32 room_left)
58560 +{
58561 +       u32 last_read = pak->last_read - 4;
58562 +       u32 i = (last_read / PAGE_SIZE);
58563 +       u32 pg_offset = last_read & (PAGE_SIZE - 1);
58564 +       u32 to_copy;
58565 +       grant_handle_t handle;
58566 +
58567 +       tpmif_tx_request_t *tx;
58568 +
58569 +       tx = &tpmif->tx->ring[0].req;
58570 +       /*
58571 +        * Start copying data at the page with index 'i'
58572 +        * and within that page at offset 'pg_offset'.
58573 +        * Copy a maximum of 'room_left' bytes.
58574 +        */
58575 +       to_copy = MIN(PAGE_SIZE - pg_offset, room_left);
58576 +       while (to_copy > 0) {
58577 +               void *src;
58578 +               struct gnttab_map_grant_ref map_op;
58579 +               struct gnttab_unmap_grant_ref unmap_op;
58580 +
58581 +               tx = &tpmif->tx->ring[i].req;
58582 +
58583 +               map_op.host_addr = MMAP_VADDR(tpmif, i);
58584 +               map_op.flags = GNTMAP_host_map;
58585 +               map_op.ref = tx->ref;
58586 +               map_op.dom = tpmif->domid;
58587 +
58588 +               if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
58589 +                                                      &map_op, 1))) {
58590 +                       BUG();
58591 +               }
58592 +
58593 +               if (map_op.status) {
58594 +                       DPRINTK(" Grant table operation failure !\n");
58595 +                       return -EFAULT;
58596 +               }
58597 +
58598 +               handle = map_op.handle;
58599 +
58600 +               if (to_copy > tx->size) {
58601 +                       /*
58602 +                        * Caller asked for more than this slot holds
58603 +                        */
58604 +                       to_copy = tx->size;
58605 +               }
58606 +
58607 +               DPRINTK("Copying from mapped memory at %08lx\n",
58608 +                       (unsigned long)(MMAP_VADDR(tpmif, i) |
58609 +                                       (tx->addr & ~PAGE_MASK)));
58610 +
58611 +               src = (void *)(MMAP_VADDR(tpmif, i) |
58612 +                              ((tx->addr & ~PAGE_MASK) + pg_offset));
58613 +               if (copy_to_buffer(&buffer[offset],
58614 +                                  src, to_copy, isuserbuffer)) {
58615 +                       return -EFAULT;
58616 +               }
58617 +
58618 +               DPRINTK("Data from TPM-FE of domain %d are %d %d %d %d\n",
58619 +                       tpmif->domid, buffer[offset], buffer[offset + 1],
58620 +                       buffer[offset + 2], buffer[offset + 3]);
58621 +
58622 +               unmap_op.host_addr = MMAP_VADDR(tpmif, i);
58623 +               unmap_op.handle = handle;
58624 +               unmap_op.dev_bus_addr = 0;
58625 +
58626 +               if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
58627 +                                                      &unmap_op, 1))) {
58629 +                       BUG();
58630 +               }
58631 +
58632 +               offset += to_copy;
58633 +               pg_offset = 0;
58634 +               last_read += to_copy;
58635 +               room_left -= to_copy;
58636 +
58637 +               to_copy = MIN(PAGE_SIZE, room_left);
58638 +               i++;
58639 +       }                       /* while (to_copy > 0) */
58640 +       /*
58641 +        * Adjust the last_read pointer
58642 +        */
58643 +       pak->last_read = last_read + 4;
58644 +       return offset;
58645 +}
58646 +
58647 +/* ============================================================
58648 + * The file layer for reading data from this device
58649 + * ============================================================
58650 + */
58651 +static int vtpm_op_open(struct inode *inode, struct file *f)
58652 +{
58653 +       int rc = 0;
58654 +       unsigned long flags;
58655 +
58656 +       write_lock_irqsave(&dataex.pak_lock, flags);
58657 +       if (dataex.has_opener == 0) {
58658 +               dataex.has_opener = 1;
58659 +       } else {
58660 +               rc = -EPERM;
58661 +       }
58662 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
58663 +       return rc;
58664 +}
58665 +
58666 +static ssize_t vtpm_op_read(struct file *file,
58667 +                           char __user * data, size_t size, loff_t * offset)
58668 +{
58669 +       int ret_size = -ENODATA;
58670 +       struct packet *pak = NULL;
58671 +       unsigned long flags;
58672 +
58673 +       write_lock_irqsave(&dataex.pak_lock, flags);
58674 +
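+       /* Nothing pending: sleep without the lock; the wait can also end on
+        * a signal, so the list is re-checked below. */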
58675 +       if (list_empty(&dataex.pending_pak)) {
58676 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
58677 +               wait_event_interruptible(dataex.wait_queue,
58678 +                                        !list_empty(&dataex.pending_pak));
58679 +               write_lock_irqsave(&dataex.pak_lock, flags);
58680 +       }
58681 +
58682 +       if (!list_empty(&dataex.pending_pak)) {
58683 +               unsigned int left;
58684 +               pak = list_entry(dataex.pending_pak.next, struct packet, next);
58685 +
58686 +               left = pak->data_len - dataex.copied_so_far;
58687 +
58688 +               DPRINTK("size given by app: %d, available: %d\n", size, left);
58689 +
58690 +               ret_size = MIN(size, left);
58691 +
58692 +               ret_size = packet_read(pak, ret_size, data, size, 1);
58693 +               if (ret_size < 0) {
58694 +                       ret_size = -EFAULT;
58695 +               } else {
58696 +                       DPRINTK("Copied %d bytes to user buffer\n", ret_size);
58697 +
58698 +                       dataex.copied_so_far += ret_size;
58699 +                       if (dataex.copied_so_far >= pak->data_len + 4) {
58700 +                               DPRINTK("All data from this packet given to app.\n");
58701 +                               /* All data given to app */
58702 +
58703 +                               del_singleshot_timer_sync(&pak->
58704 +                                                         processing_timer);
58705 +                               list_del(&pak->next);
58706 +                               list_add_tail(&pak->next, &dataex.current_pak);
58707 +                               /*
58708 +                                * The more frontends that are handled at the same time,
58709 +                                * the more time we give the TPM to process the request.
58710 +                                */
58711 +                               mod_timer(&pak->processing_timer,
58712 +                                         jiffies + (num_frontends * 60 * HZ));
58713 +                               dataex.copied_so_far = 0;
58714 +                       }
58715 +               }
58716 +       }
58717 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
58718 +
58719 +       DPRINTK("Returning result from read to app: %d\n", ret_size);
58720 +
58721 +       return ret_size;
58722 +}
58723 +
58724 +/*
58725 + * Write operation - only works after a previous read operation!
58726 + */
58727 +static ssize_t vtpm_op_write(struct file *file,
58728 +                            const char __user * data, size_t size,
58729 +                            loff_t * offset)
58730 +{
58731 +       struct packet *pak;
58732 +       int rc = 0;
58733 +       unsigned int off = 4;
58734 +       unsigned long flags;
58735 +       struct vtpm_resp_hdr vrh;
58736 +
58737 +       /*
58738 +        * Minimum required packet size is:
58739 +        * 4 bytes for instance number
58740 +        * 2 bytes for tag
58741 +        * 4 bytes for paramSize
58742 +        * 4 bytes for the ordinal
58743 +        * sum: 14 bytes
58744 +        */
58745 +       if (size < sizeof (vrh))
58746 +               return -EFAULT;
58747 +
58748 +       if (copy_from_user(&vrh, data, sizeof (vrh)))
58749 +               return -EFAULT;
58750 +
58751 +       /* malformed packet? */
58752 +       if ((off + ntohl(vrh.len_no)) != size)
58753 +               return -EFAULT;
58754 +
58755 +       write_lock_irqsave(&dataex.pak_lock, flags);
58756 +       pak = packet_find_instance(&dataex.current_pak,
58757 +                                  ntohl(vrh.instance_no));
58758 +
58759 +       if (pak == NULL) {
58760 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
58761 +               printk(KERN_ALERT "No associated packet! (inst=%d)\n",
58762 +                      ntohl(vrh.instance_no));
58763 +               return -EFAULT;
58764 +       }
58765 +
58766 +       del_singleshot_timer_sync(&pak->processing_timer);
58767 +       list_del(&pak->next);
58768 +
58769 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
58770 +
58771 +       /*
58772 +        * The first 'off' bytes must be the instance number - skip them.
58773 +        */
58774 +       size -= off;
58775 +
58776 +       rc = packet_write(pak, &data[off], size, 1);
58777 +
58778 +       if (rc > 0) {
58779 +               /* account for the 4 instance-number bytes skipped above */
58780 +               rc += off;
58781 +       }
58782 +       packet_free(pak);
58783 +       return rc;
58784 +}
58785 +
58786 +static int vtpm_op_release(struct inode *inode, struct file *file)
58787 +{
58788 +       unsigned long flags;
58789 +
58790 +       vtpm_release_packets(NULL, 1);
58791 +       write_lock_irqsave(&dataex.pak_lock, flags);
58792 +       dataex.has_opener = 0;
58793 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
58794 +       return 0;
58795 +}
58796 +
58797 +static unsigned int vtpm_op_poll(struct file *file,
58798 +                                struct poll_table_struct *pts)
58799 +{
58800 +       unsigned int flags = POLLOUT | POLLWRNORM;
58801 +
58802 +       poll_wait(file, &dataex.wait_queue, pts);
58803 +       if (!list_empty(&dataex.pending_pak)) {
58804 +               flags |= POLLIN | POLLRDNORM;
58805 +       }
58806 +       return flags;
58807 +}
58808 +
58809 +static struct file_operations vtpm_ops = {
58810 +       .owner = THIS_MODULE,
58811 +       .llseek = no_llseek,
58812 +       .open = vtpm_op_open,
58813 +       .read = vtpm_op_read,
58814 +       .write = vtpm_op_write,
58815 +       .release = vtpm_op_release,
58816 +       .poll = vtpm_op_poll,
58817 +};
58818 +
58819 +static struct miscdevice vtpms_miscdevice = {
58820 +       .minor = 225,
58821 +       .name = "vtpm",
58822 +       .fops = &vtpm_ops,
58823 +};
58824 +
58825 +/***************************************************************
58826 + Virtual TPM functions and data structures
58827 +***************************************************************/
58828 +
58829 +static u8 create_cmd[] = {
58830 +       1, 193,                 /* 0: TPM_TAG_RQU_COMMAND */
58831 +       0, 0, 0, 19,            /* 2: length */
58832 +       0, 0, 0, 0x1,           /* 6: VTPM_ORD_OPEN */
58833 +       0,                      /* 10: VTPM type */
58834 +       0, 0, 0, 0,             /* 11: domain id */
58835 +       0, 0, 0, 0              /* 15: instance id */
58836 +};
58837 +
58838 +int tpmif_vtpm_open(tpmif_t * tpmif, domid_t domid, u32 instance)
58839 +{
58840 +       int rc = 0;
58841 +       struct packet *pak;
58842 +
58843 +       pak = packet_alloc(tpmif,
58844 +                          sizeof (create_cmd),
58845 +                          create_cmd[1],
58846 +                          PACKET_FLAG_DISCARD_RESPONSE |
58847 +                          PACKET_FLAG_CHECK_RESPONSESTATUS);
58848 +       if (pak) {
58849 +               u8 buf[sizeof (create_cmd)];
58850 +               u32 domid_no = htonl((u32) domid);
58851 +               u32 instance_no = htonl(instance);
58852 +
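+               /* Patch the domain and instance ids (network byte order)
+                * into the command at the offsets noted above. */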
58853 +               memcpy(buf, create_cmd, sizeof (create_cmd));
58854 +
58855 +               memcpy(&buf[11], &domid_no, sizeof (u32));
58856 +               memcpy(&buf[15], &instance_no, sizeof (u32));
58857 +
58858 +               /* copy the buffer into the packet */
58859 +               rc = packet_set(pak, buf, sizeof (buf));
58860 +
58861 +               if (rc == 0) {
58862 +                       pak->tpm_instance = 0;
58863 +                       rc = vtpm_queue_packet(pak);
58864 +               }
58865 +               if (rc < 0) {
58866 +                       /* could not be queued or built */
58867 +                       packet_free(pak);
58868 +               }
58869 +       } else {
58870 +               rc = -ENOMEM;
58871 +       }
58872 +       return rc;
58873 +}
58874 +
58875 +static u8 destroy_cmd[] = {
58876 +       1, 193,                 /* 0: TPM_TAG_RQU_COMMAND */
58877 +       0, 0, 0, 14,            /* 2: length */
58878 +       0, 0, 0, 0x2,           /* 6: VTPM_ORD_CLOSE */
58879 +       0, 0, 0, 0              /* 10: instance id */
58880 +};
58881 +
58882 +int tpmif_vtpm_close(u32 instid)
58883 +{
58884 +       int rc = 0;
58885 +       struct packet *pak;
58886 +
58887 +       pak = packet_alloc(NULL,
58888 +                          sizeof (destroy_cmd),
58889 +                          destroy_cmd[1], PACKET_FLAG_DISCARD_RESPONSE);
58890 +       if (pak) {
58891 +               u8 buf[sizeof (destroy_cmd)];
58892 +               u32 instid_no = htonl(instid);
58893 +
58894 +               memcpy(buf, destroy_cmd, sizeof (destroy_cmd));
58895 +               memcpy(&buf[10], &instid_no, sizeof (u32));
58896 +
58897 +               /* copy the buffer into the packet */
58898 +               rc = packet_set(pak, buf, sizeof (buf));
58899 +
58900 +               if (rc == 0) {
58901 +                       pak->tpm_instance = 0;
58902 +                       rc = vtpm_queue_packet(pak);
58903 +               }
58904 +               if (rc < 0) {
58905 +                       /* could not be queued or built */
58906 +                       packet_free(pak);
58907 +               }
58908 +       } else {
58909 +               rc = -ENOMEM;
58910 +       }
58911 +       return rc;
58912 +}
58913 +
58914 +/***************************************************************
58915 + Utility functions
58916 +***************************************************************/
58917 +
58918 +static int tpm_send_fail_message(struct packet *pak, u8 req_tag)
58919 +{
58920 +       int rc;
58921 +       static const unsigned char tpm_error_message_fail[] = {
58922 +               0x00, 0x00,
58923 +               0x00, 0x00, 0x00, 0x0a,
58924 +               0x00, 0x00, 0x00, 0x09  /* TPM_FAIL */
58925 +       };
58926 +       unsigned char buffer[sizeof (tpm_error_message_fail)];
58927 +
58928 +       memcpy(buffer, tpm_error_message_fail,
58929 +              sizeof (tpm_error_message_fail));
58930 +       /*
58931 +        * Insert the right response tag depending on the given tag.
58932 +        * Each response tag is the request tag plus 3.
58933 +        */
58934 +       buffer[1] = req_tag + 3;
58935 +
58936 +       /*
58937 +        * Write the data to shared memory and notify the front-end
58938 +        */
58939 +       rc = packet_write(pak, buffer, sizeof (buffer), 0);
58940 +
58941 +       return rc;
58942 +}
58943 +
58944 +static void _vtpm_release_packets(struct list_head *head,
58945 +                                 tpmif_t * tpmif, int send_msgs)
58946 +{
58947 +       struct packet *pak;
58948 +       struct list_head *pos, *tmp;
58950 +
58951 +       list_for_each_safe(pos, tmp, head) {
58952 +               pak = list_entry(pos, struct packet, next);
58953 +
58954 +               if (tpmif == NULL || pak->tpmif == tpmif) {
58955 +                       int can_send = 0;
58956 +
58957 +                       del_singleshot_timer_sync(&pak->processing_timer);
58958 +                       list_del(&pak->next);
58959 +
58960 +                       if (pak->tpmif && pak->tpmif->status == CONNECTED) {
58961 +                               can_send = 1;
58962 +                       }
58963 +
58964 +                       if (send_msgs && can_send) {
58965 +                               tpm_send_fail_message(pak, pak->req_tag);
58966 +                       }
58967 +                       packet_free(pak);
58968 +               }
58969 +       }
58970 +}
58971 +
58972 +int vtpm_release_packets(tpmif_t * tpmif, int send_msgs)
58973 +{
58974 +       unsigned long flags;
58975 +
58976 +       write_lock_irqsave(&dataex.pak_lock, flags);
58977 +
58978 +       _vtpm_release_packets(&dataex.pending_pak, tpmif, send_msgs);
58979 +       _vtpm_release_packets(&dataex.current_pak, tpmif, send_msgs);
58980 +
58981 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
58982 +       return 0;
58983 +}
58984 +
58985 +static int vtpm_queue_packet(struct packet *pak)
58986 +{
58987 +       int rc = 0;
58988 +
58989 +       if (dataex.has_opener) {
58990 +               unsigned long flags;
58991 +
58992 +               write_lock_irqsave(&dataex.pak_lock, flags);
58993 +               list_add_tail(&pak->next, &dataex.pending_pak);
58994 +               /* give the TPM some time to pick up the request */
58995 +               mod_timer(&pak->processing_timer, jiffies + (30 * HZ));
58996 +               write_unlock_irqrestore(&dataex.pak_lock, flags);
58997 +
58998 +               wake_up_interruptible(&dataex.wait_queue);
58999 +       } else {
59000 +               rc = -EFAULT;
59001 +       }
59002 +       return rc;
59003 +}
59004 +
59005 +static int vtpm_receive(tpmif_t * tpmif, u32 size)
59006 +{
59007 +       int rc = 0;
59008 +       unsigned char buffer[10];
59009 +       __be32 *native_size;
59010 +       struct packet *pak = packet_alloc(tpmif, size, 0, 0);
59011 +
59012 +       if (!pak)
59013 +               return -ENOMEM;
59014 +       /*
59015 +        * Read 10 bytes from the received buffer to test its
59016 +        * content for validity.
59017 +        */
59018 +       if (sizeof (buffer) != packet_read(pak,
59019 +                                          sizeof (buffer), buffer,
59020 +                                          sizeof (buffer), 0)) {
59021 +               goto failexit;
59022 +       }
59023 +       /*
59024 +        * Reset the packet read pointer so we can read all its
59025 +        * contents again.
59026 +        */
59027 +       packet_reset(pak);
59028 +
59029 +       native_size = (__force __be32 *) (&buffer[4 + 2]);
59030 +       /*
59031 +        * Verify that the size of the packet is correct
59032 +        * as indicated and that there's actually someone reading packets.
59033 +        * The minimum size of the packet is '10' for tag, size indicator
59034 +        * and ordinal.
59035 +        */
59036 +       if (size < 10 ||
59037 +           be32_to_cpu(*native_size) != size ||
59038 +           0 == dataex.has_opener || tpmif->status != CONNECTED) {
59039 +               rc = -EINVAL;
59040 +               goto failexit;
59041 +       } else {
59042 +               rc = vtpm_queue_packet(pak);
59043 +               if (rc < 0)
59044 +                       goto failexit;
59045 +       }
59046 +       return 0;
59047 +
59048 +      failexit:
59049 +       if (pak) {
59050 +               tpm_send_fail_message(pak, buffer[4 + 1]);
59051 +               packet_free(pak);
59052 +       }
59053 +       return rc;
59054 +}
59055 +
59056 +/*
59057 + * Timeout function that gets invoked when a packet has not been processed
59058 + * during the timeout period.
59059 + * The packet must be on a list when this function is invoked. This
59060 + * also means that once it is taken off a list, the timer must be
59061 + * destroyed as well.
59062 + */
59063 +static void processing_timeout(unsigned long ptr)
59064 +{
59065 +       struct packet *pak = (struct packet *)ptr;
59066 +       unsigned long flags;
59067 +
59068 +       write_lock_irqsave(&dataex.pak_lock, flags);
59069 +       /*
59070 +        * Check whether the packet is still on one
59071 +        * of the lists.
59072 +        */
59073 +       if (pak == packet_find_packet(&dataex.pending_pak, pak) ||
59074 +           pak == packet_find_packet(&dataex.current_pak, pak)) {
59075 +               list_del(&pak->next);
59076 +               if ((pak->flags & PACKET_FLAG_DISCARD_RESPONSE) == 0) {
59077 +                       tpm_send_fail_message(pak, pak->req_tag);
59078 +               }
59079 +               packet_free(pak);
59080 +       }
59081 +
59082 +       write_unlock_irqrestore(&dataex.pak_lock, flags);
59083 +}
59084 +
59085 +static void tpm_tx_action(unsigned long unused);
59086 +static DECLARE_TASKLET(tpm_tx_tasklet, tpm_tx_action, 0);
59087 +
59088 +static struct list_head tpm_schedule_list;
59089 +static spinlock_t tpm_schedule_list_lock;
59090 +
59091 +static inline void maybe_schedule_tx_action(void)
59092 +{
59093 +       smp_mb();
59094 +       tasklet_schedule(&tpm_tx_tasklet);
59095 +}
59096 +
59097 +static inline int __on_tpm_schedule_list(tpmif_t * tpmif)
59098 +{
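+       /* list.next is cleared on removal, so a non-NULL next means the
+        * interface is queued. */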
59099 +       return tpmif->list.next != NULL;
59100 +}
59101 +
59102 +static void remove_from_tpm_schedule_list(tpmif_t * tpmif)
59103 +{
59104 +       spin_lock_irq(&tpm_schedule_list_lock);
59105 +       if (likely(__on_tpm_schedule_list(tpmif))) {
59106 +               list_del(&tpmif->list);
59107 +               tpmif->list.next = NULL;
59108 +               tpmif_put(tpmif);
59109 +       }
59110 +       spin_unlock_irq(&tpm_schedule_list_lock);
59111 +}
59112 +
59113 +static void add_to_tpm_schedule_list_tail(tpmif_t * tpmif)
59114 +{
59115 +       if (__on_tpm_schedule_list(tpmif))
59116 +               return;
59117 +
59118 +       spin_lock_irq(&tpm_schedule_list_lock);
59119 +       if (!__on_tpm_schedule_list(tpmif) && tpmif->active) {
59120 +               list_add_tail(&tpmif->list, &tpm_schedule_list);
59121 +               tpmif_get(tpmif);
59122 +       }
59123 +       spin_unlock_irq(&tpm_schedule_list_lock);
59124 +}
59125 +
59126 +void tpmif_schedule_work(tpmif_t * tpmif)
59127 +{
59128 +       add_to_tpm_schedule_list_tail(tpmif);
59129 +       maybe_schedule_tx_action();
59130 +}
59131 +
59132 +void tpmif_deschedule_work(tpmif_t * tpmif)
59133 +{
59134 +       remove_from_tpm_schedule_list(tpmif);
59135 +}
59136 +
59137 +static void tpm_tx_action(unsigned long unused)
59138 +{
59139 +       struct list_head *ent;
59140 +       tpmif_t *tpmif;
59141 +       tpmif_tx_request_t *tx;
59142 +
59143 +       DPRINTK("%s: Getting data from front-end(s)!\n", __FUNCTION__);
59144 +
59145 +       while (!list_empty(&tpm_schedule_list)) {
59146 +               /* Get a tpmif from the list with work to do. */
59147 +               ent = tpm_schedule_list.next;
59148 +               tpmif = list_entry(ent, tpmif_t, list);
59149 +               tpmif_get(tpmif);
59150 +               remove_from_tpm_schedule_list(tpmif);
59151 +
59152 +               tx = &tpmif->tx->ring[0].req;
59153 +
59154 +               /* pass it up */
59155 +               vtpm_receive(tpmif, tx->size);
59156 +
59157 +               tpmif_put(tpmif);
59158 +       }
59159 +}
59160 +
59161 +irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs)
59162 +{
59163 +       tpmif_t *tpmif = (tpmif_t *) dev_id;
59164 +
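+       /* The hard irq handler only queues the interface; the ring is
+        * drained later in the tasklet. */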
59165 +       add_to_tpm_schedule_list_tail(tpmif);
59166 +       maybe_schedule_tx_action();
59167 +       return IRQ_HANDLED;
59168 +}
59169 +
59170 +static int __init tpmback_init(void)
59171 +{
59172 +       int rc;
59173 +
59174 +       if ((rc = misc_register(&vtpms_miscdevice)) != 0) {
59175 +               printk(KERN_ALERT
59176 +                      "Could not register misc device for TPM BE.\n");
59177 +               return rc;
59178 +       }
59179 +
59180 +       INIT_LIST_HEAD(&dataex.pending_pak);
59181 +       INIT_LIST_HEAD(&dataex.current_pak);
59182 +       dataex.has_opener = 0;
59183 +       rwlock_init(&dataex.pak_lock);
59184 +       init_waitqueue_head(&dataex.wait_queue);
59185 +
59186 +       spin_lock_init(&tpm_schedule_list_lock);
59187 +       INIT_LIST_HEAD(&tpm_schedule_list);
59188 +
59189 +       tpmif_interface_init();
59190 +       tpmif_xenbus_init();
59191 +
59192 +       printk(KERN_INFO "Successfully initialized TPM backend driver.\n");
59193 +
59194 +       return 0;
59195 +}
59196 +
59197 +module_init(tpmback_init);
59198 +
59199 +static void __exit tpmback_exit(void)
59200 +{
59201 +       tpmif_xenbus_exit();
59202 +       tpmif_interface_exit();
59203 +       misc_deregister(&vtpms_miscdevice);
59204 +}
59205 +
59206 +module_exit(tpmback_exit);
59207 +
59208 +MODULE_LICENSE("Dual BSD/GPL");
59209 +
59210 +/*
59211 + * Local variables:
59212 + *  c-file-style: "linux"
59213 + *  indent-tabs-mode: t
59214 + *  c-indent-level: 8
59215 + *  c-basic-offset: 8
59216 + *  tab-width: 8
59217 + * End:
59218 + */
59219 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/tpmback/xenbus.c linux-2.6.16/drivers/xen/tpmback/xenbus.c
59220 --- linux-2.6.16.orig/drivers/xen/tpmback/xenbus.c      1970-01-01 01:00:00.000000000 +0100
59221 +++ linux-2.6.16/drivers/xen/tpmback/xenbus.c   2006-06-26 09:51:32.000000000 +0200
59222 @@ -0,0 +1,328 @@
59223 +/*  Xenbus code for tpmif backend
59224 +    Copyright (C) 2005 IBM Corporation
59225 +    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
59226 +
59227 +    This program is free software; you can redistribute it and/or modify
59228 +    it under the terms of the GNU General Public License as published by
59229 +    the Free Software Foundation; either version 2 of the License, or
59230 +    (at your option) any later version.
59231 +
59232 +    This program is distributed in the hope that it will be useful,
59233 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
59234 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
59235 +    GNU General Public License for more details.
59236 +
59237 +    You should have received a copy of the GNU General Public License
59238 +    along with this program; if not, write to the Free Software
59239 +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
59240 +*/
59241 +#include <stdarg.h>
59242 +#include <linux/module.h>
59243 +#include <xen/xenbus.h>
59244 +#include "common.h"
59245 +
59246 +struct backend_info
59247 +{
59248 +       struct xenbus_device *dev;
59249 +
59250 +       /* our communications channel */
59251 +       tpmif_t *tpmif;
59252 +
59253 +       long int frontend_id;
59254 +       long int instance; // instance of TPM
59255 +       u8 is_instance_set; // whether the instance number has been set
59256 +
59257 +       /* watch front end for changes */
59258 +       struct xenbus_watch backend_watch;
59259 +       XenbusState frontend_state;
59260 +};
59261 +
59262 +static void maybe_connect(struct backend_info *be);
59263 +static void connect(struct backend_info *be);
59264 +static int connect_ring(struct backend_info *be);
59265 +static void backend_changed(struct xenbus_watch *watch,
59266 +                            const char **vec, unsigned int len);
59267 +static void frontend_changed(struct xenbus_device *dev,
59268 +                             XenbusState frontend_state);
59269 +
59270 +static int tpmback_remove(struct xenbus_device *dev)
59271 +{
59272 +       struct backend_info *be = dev->data;
59273 +
59274 +       if (be->backend_watch.node) {
59275 +               unregister_xenbus_watch(&be->backend_watch);
59276 +               kfree(be->backend_watch.node);
59277 +               be->backend_watch.node = NULL;
59278 +       }
59279 +       if (be->tpmif) {
59280 +               tpmif_put(be->tpmif);
59281 +               be->tpmif = NULL;
59282 +       }
59283 +       kfree(be);
59284 +       dev->data = NULL;
59285 +       return 0;
59286 +}
59287 +
59288 +static int tpmback_probe(struct xenbus_device *dev,
59289 +                         const struct xenbus_device_id *id)
59290 +{
59291 +       int err;
59292 +       struct backend_info *be = kzalloc(sizeof(struct backend_info),
59293 +                                         GFP_KERNEL);
59294 +
59295 +       if (!be) {
59296 +               xenbus_dev_fatal(dev, -ENOMEM,
59297 +                                "allocating backend structure");
59298 +               return -ENOMEM;
59299 +       }
59300 +
59301 +       be->is_instance_set = 0;
59302 +       be->dev = dev;
59303 +       dev->data = be;
59304 +
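+       /* Watch the backend's "instance" node; backend_changed() fires once
+        * it is written (presumably by the management tools). */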
59305 +       err = xenbus_watch_path2(dev, dev->nodename,
59306 +                               "instance", &be->backend_watch,
59307 +                               backend_changed);
59308 +       if (err) {
59309 +               goto fail;
59310 +       }
59311 +
59312 +       err = xenbus_switch_state(dev, XenbusStateInitWait);
59313 +       if (err) {
59314 +               goto fail;
59315 +       }
59316 +       return 0;
59317 +fail:
59318 +       tpmback_remove(dev);
59319 +       return err;
59320 +}
59321 +
59322 +
59323 +static void backend_changed(struct xenbus_watch *watch,
59324 +                            const char **vec, unsigned int len)
59325 +{
59326 +       int err;
59327 +       long instance;
59328 +       struct backend_info *be
59329 +               = container_of(watch, struct backend_info, backend_watch);
59330 +       struct xenbus_device *dev = be->dev;
59331 +
59332 +       err = xenbus_scanf(XBT_NULL, dev->nodename,
59333 +                          "instance","%li", &instance);
59334 +       if (XENBUS_EXIST_ERR(err)) {
59335 +               return;
59336 +       }
59337 +
59338 +       if (err != 1) {
59339 +               xenbus_dev_fatal(dev, err, "reading instance");
59340 +               return;
59341 +       }
59342 +
59343 +       if (be->is_instance_set != 0 && be->instance != instance) {
59344 +               printk(KERN_WARNING
59345 +                      "tpmback: changing instance (from %ld to %ld) "
59346 +                      "not allowed.\n",
59347 +                      be->instance, instance);
59348 +               return;
59349 +       }
59350 +
59351 +       if (be->is_instance_set == 0) {
59352 +               be->tpmif = tpmif_find(dev->otherend_id,
59353 +                                      instance);
59354 +               if (IS_ERR(be->tpmif)) {
59355 +                       err = PTR_ERR(be->tpmif);
59356 +                       be->tpmif = NULL;
59357 +                       xenbus_dev_fatal(dev, err, "creating vtpm interface");
59358 +                       return;
59359 +               }
59360 +               be->instance = instance;
59361 +               be->is_instance_set = 1;
59362 +
59363 +               /*
59364 +                * There's an unfortunate problem:
59365 +                * Sometimes after a suspend/resume the
59366 +                * state switch to XenbusStateInitialised happens
59367 +                * *before* we get to this point. In that case the
59368 +                * earlier connect_ring() must have failed (be->tpmif
59369 +                * was still NULL), so call it again indirectly here.
59370 +                */
59371 +               if (be->frontend_state == XenbusStateInitialised) {
59372 +                       frontend_changed(dev, be->frontend_state);
59373 +               }
59374 +       }
59375 +}
59376 +
59377 +
59378 +static void frontend_changed(struct xenbus_device *dev,
59379 +                             XenbusState frontend_state)
59380 +{
59381 +       struct backend_info *be = dev->data;
59382 +       int err;
59383 +
59384 +       be->frontend_state = frontend_state;
59385 +
59386 +       switch (frontend_state) {
59387 +       case XenbusStateInitialising:
59388 +       case XenbusStateConnected:
59389 +               break;
59390 +
59391 +       case XenbusStateInitialised:
59392 +               err = connect_ring(be);
59393 +               if (err) {
59394 +                       return;
59395 +               }
59396 +               maybe_connect(be);
59397 +               break;
59398 +
59399 +       case XenbusStateClosing:
59400 +               xenbus_switch_state(dev, XenbusStateClosing);
59401 +               break;
59402 +
59403 +       case XenbusStateClosed:
59404 +               /*
59405 +                * Notify the vTPM manager about the front-end
59406 +                * having left.
59407 +                */
59408 +               tpmif_vtpm_close(be->instance);
59409 +               device_unregister(&be->dev->dev);
59410 +               break;
59411 +
59412 +       case XenbusStateUnknown:
59413 +       case XenbusStateInitWait:
59414 +       default:
59415 +               xenbus_dev_fatal(dev, -EINVAL,
59416 +                                "saw state %d at frontend",
59417 +                                frontend_state);
59418 +               break;
59419 +       }
59420 +}
59421 +
59422 +
59423 +
59424 +static void maybe_connect(struct backend_info *be)
59425 +{
59426 +       int err;
59427 +
59428 +       if (be->tpmif == NULL || be->tpmif->status == CONNECTED)
59429 +               return;
59430 +
59431 +       connect(be);
59432 +
59433 +       /*
59434 +        * Notify the vTPM manager about a new front-end.
59435 +        */
59436 +       err = tpmif_vtpm_open(be->tpmif,
59437 +                             be->frontend_id,
59438 +                             be->instance);
59439 +       if (err) {
59440 +               xenbus_dev_error(be->dev, err,
59441 +                                "queueing vtpm open packet");
59442 +               /*
59443 +                * Should close down this device and notify FE
59444 +                * about closure.
59445 +                */
59446 +               return;
59447 +       }
59448 +}
59449 +
59450 +
59451 +static void connect(struct backend_info *be)
59452 +{
59453 +       xenbus_transaction_t xbt;
59454 +       int err;
59455 +       struct xenbus_device *dev = be->dev;
59456 +       unsigned long ready = 1;
59457 +
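+       /* Usual xenbus transaction pattern: restart from here when the
+        * commit returns -EAGAIN. */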
59458 +again:
59459 +       err = xenbus_transaction_start(&xbt);
59460 +       if (err) {
59461 +               xenbus_dev_fatal(be->dev, err, "starting transaction");
59462 +               return;
59463 +       }
59464 +
59465 +       err = xenbus_printf(xbt, be->dev->nodename,
59466 +                           "ready", "%lu", ready);
59467 +       if (err) {
59468 +               xenbus_dev_fatal(be->dev, err, "writing 'ready'");
59469 +               goto abort;
59470 +       }
59471 +
59472 +       err = xenbus_transaction_end(xbt, 0);
59473 +       if (err == -EAGAIN)
59474 +               goto again;
59475 +       if (err)
59476 +               xenbus_dev_fatal(be->dev, err, "end of transaction");
59477 +
59478 +       err = xenbus_switch_state(dev, XenbusStateConnected);
59479 +       if (!err)
59480 +               be->tpmif->status = CONNECTED;
59481 +       return;
59482 +abort:
59483 +       xenbus_transaction_end(xbt, 1);
59484 +}
59485 +
59486 +
59487 +static int connect_ring(struct backend_info *be)
59488 +{
59489 +       struct xenbus_device *dev = be->dev;
59490 +       unsigned long ring_ref;
59491 +       unsigned int evtchn;
59492 +       int err;
59493 +
59494 +       err = xenbus_gather(XBT_NULL, dev->otherend,
59495 +                           "ring-ref", "%lu", &ring_ref,
59496 +                           "event-channel", "%u", &evtchn, NULL);
59497 +       if (err) {
59498 +               xenbus_dev_error(dev, err,
59499 +                                "reading %s/ring-ref and event-channel",
59500 +                                dev->otherend);
59501 +               return err;
59502 +       }
59503 +       if (be->tpmif != NULL) {
59504 +               err = tpmif_map(be->tpmif, ring_ref, evtchn);
59505 +               if (err) {
59506 +                       xenbus_dev_error(dev, err,
59507 +                                        "mapping shared-frame %lu port %u",
59508 +                                        ring_ref, evtchn);
59509 +                       return err;
59510 +               }
59511 +       }
59512 +       return 0;
59513 +}
59514 +
59515 +
59516 +static struct xenbus_device_id tpmback_ids[] = {
59517 +       { "vtpm" },
59518 +       { "" }
59519 +};
59520 +
59521 +
59522 +static struct xenbus_driver tpmback = {
59523 +       .name = "vtpm",
59524 +       .owner = THIS_MODULE,
59525 +       .ids = tpmback_ids,
59526 +       .probe = tpmback_probe,
59527 +       .remove = tpmback_remove,
59528 +       .otherend_changed = frontend_changed,
59529 +};
59530 +
59531 +
59532 +void tpmif_xenbus_init(void)
59533 +{
59534 +       xenbus_register_backend(&tpmback);
59535 +}
59536 +
59537 +void tpmif_xenbus_exit(void)
59538 +{
59539 +       xenbus_unregister_driver(&tpmback);
59540 +}
59541 +
59542 +/*
59543 + * Local variables:
59544 + *  c-file-style: "linux"
59545 + *  indent-tabs-mode: t
59546 + *  c-indent-level: 8
59547 + *  c-basic-offset: 8
59548 + *  tab-width: 8
59549 + * End:
59550 + */
59551 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/tpmfront/Makefile linux-2.6.16/drivers/xen/tpmfront/Makefile
59552 --- linux-2.6.16.orig/drivers/xen/tpmfront/Makefile     1970-01-01 01:00:00.000000000 +0100
59553 +++ linux-2.6.16/drivers/xen/tpmfront/Makefile  2006-06-26 09:51:32.000000000 +0200
59554 @@ -0,0 +1,2 @@
59555 +
59556 +obj-$(CONFIG_XEN_TPMDEV_FRONTEND)      += tpmfront.o
59557 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/tpmfront/tpmfront.c linux-2.6.16/drivers/xen/tpmfront/tpmfront.c
59558 --- linux-2.6.16.orig/drivers/xen/tpmfront/tpmfront.c   1970-01-01 01:00:00.000000000 +0100
59559 +++ linux-2.6.16/drivers/xen/tpmfront/tpmfront.c        2006-06-26 09:51:32.000000000 +0200
59560 @@ -0,0 +1,731 @@
59561 +/*
59562 + * Copyright (c) 2005, IBM Corporation
59563 + *
59564 + * Author: Stefan Berger, stefanb@us.ibm.com
59565 + * Grant table support: Mahadevan Gomathisankaran
59566 + *
59567 + * This code has been derived from drivers/xen/netfront/netfront.c
59568 + *
59569 + * Copyright (c) 2002-2004, K A Fraser
59570 + *
59571 + * This program is free software; you can redistribute it and/or
59572 + * modify it under the terms of the GNU General Public License version 2
59573 + * as published by the Free Software Foundation; or, when distributed
59574 + * separately from the Linux kernel or incorporated into other
59575 + * software packages, subject to the following license:
59576 + * 
59577 + * Permission is hereby granted, free of charge, to any person obtaining a copy
59578 + * of this source file (the "Software"), to deal in the Software without
59579 + * restriction, including without limitation the rights to use, copy, modify,
59580 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
59581 + * and to permit persons to whom the Software is furnished to do so, subject to
59582 + * the following conditions:
59583 + *
59584 + * The above copyright notice and this permission notice shall be included in
59585 + * all copies or substantial portions of the Software.
59586 + *
59587 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
59588 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
59589 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
59590 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
59591 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
59592 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
59593 + * IN THE SOFTWARE.
59594 + */
59595 +
59596 +#include <linux/config.h>
59597 +#include <linux/module.h>
59598 +#include <linux/version.h>
59599 +#include <linux/kernel.h>
59600 +#include <linux/slab.h>
59601 +#include <linux/errno.h>
59602 +#include <linux/interrupt.h>
59603 +#include <linux/init.h>
59604 +#include <xen/tpmfe.h>
59605 +#include <linux/err.h>
59606 +#include <linux/mutex.h>
59607 +#include <asm/io.h>
59608 +#include <xen/evtchn.h>
59609 +#include <xen/interface/grant_table.h>
59610 +#include <xen/interface/io/tpmif.h>
59611 +#include <asm/uaccess.h>
59612 +#include <xen/xenbus.h>
59613 +#include <xen/interface/grant_table.h>
59614 +
59615 +#include "tpmfront.h"
59616 +
59617 +#undef DEBUG
59618 +
59619 +/* locally visible variables */
59620 +static grant_ref_t gref_head;
59621 +static struct tpm_private *my_priv;
59622 +
59623 +/* local function prototypes */
59624 +static irqreturn_t tpmif_int(int irq,
59625 +                             void *tpm_priv,
59626 +                             struct pt_regs *ptregs);
59627 +static void tpmif_rx_action(unsigned long unused);
59628 +static void tpmif_connect(struct tpm_private *tp, domid_t domid);
59629 +static DECLARE_TASKLET(tpmif_rx_tasklet, tpmif_rx_action, 0);
59630 +static int tpm_allocate_buffers(struct tpm_private *tp);
59631 +static void tpmif_set_connected_state(struct tpm_private *tp,
59632 +                                      u8 newstate);
59633 +static int tpm_xmit(struct tpm_private *tp,
59634 +                    const u8 * buf, size_t count, int userbuffer,
59635 +                    void *remember);
59636 +
59637 +#define DPRINTK(fmt, args...) \
59638 +    pr_debug("xen_tpm_fr (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args)
59639 +#define IPRINTK(fmt, args...) \
59640 +    printk(KERN_INFO "xen_tpm_fr: " fmt, ##args)
59641 +#define WPRINTK(fmt, args...) \
59642 +    printk(KERN_WARNING "xen_tpm_fr: " fmt, ##args)
59643 +
59644 +
59645 +static inline int
59646 +tx_buffer_copy(struct tx_buffer *txb, const u8 * src, int len,
59647 +               int isuserbuffer)
59648 +{
59649 +       int copied = len;
59650 +
59651 +       if (len > txb->size) {
59652 +               copied = txb->size;
59653 +       }
59654 +       if (isuserbuffer) {
59655 +               if (copy_from_user(txb->data, src, copied))
59656 +                       return -EFAULT;
59657 +       } else {
59658 +               memcpy(txb->data, src, copied);
59659 +       }
59660 +       txb->len = len;
59661 +       return copied;
59662 +}
59663 +
59664 +static inline struct tx_buffer *tx_buffer_alloc(void)
59665 +{
59666 +       struct tx_buffer *txb = kzalloc(sizeof(struct tx_buffer),
59667 +                                       GFP_KERNEL);
59668 +
59669 +       if (txb) {
59670 +               txb->len = 0;
59671 +               txb->size = PAGE_SIZE;
59672 +               txb->data = (unsigned char *)__get_free_page(GFP_KERNEL);
59673 +               if (txb->data == NULL) {
59674 +                       kfree(txb);
59675 +                       txb = NULL;
59676 +               }
59677 +       }
59678 +       return txb;
59679 +}
59680 +
59681 +
59682 +/**************************************************************
59683 + Utility functions for the tpm_private structure
59684 +**************************************************************/
59685 +static inline void tpm_private_init(struct tpm_private *tp)
59686 +{
59687 +       spin_lock_init(&tp->tx_lock);
59688 +       init_waitqueue_head(&tp->wait_q);
59689 +}
59690 +
59691 +static struct tpm_private *tpm_private_get(void)
59692 +{
59693 +       if (!my_priv) {
59694 +               my_priv = kzalloc(sizeof(struct tpm_private), GFP_KERNEL);
59695 +               if (my_priv) {
59696 +                       tpm_private_init(my_priv);
59697 +               }
59698 +       }
59699 +       return my_priv;
59700 +}
59701 +
59702 +static inline void tpm_private_free(void)
59703 +{
59704 +       kfree(my_priv);
59705 +       my_priv = NULL;
59706 +}
59707 +
59708 +/**************************************************************
59709 +
59710 + The interface to let the TPM plugin register its callback
59711 + function and send data to another partition using this module.
59712 +
59713 +**************************************************************/
59714 +
59715 +static DEFINE_MUTEX(upperlayer_lock);
59716 +static DEFINE_MUTEX(suspend_lock);
59717 +static struct tpmfe_device *upperlayer_tpmfe;
59718 +
59719 +/*
59720 + * Send data via this module by calling this function
59721 + */
59722 +int tpm_fe_send(struct tpm_private *tp, const u8 * buf, size_t count, void *ptr)
59723 +{
59724 +       int sent;
59725 +
59726 +       mutex_lock(&suspend_lock);
59727 +       sent = tpm_xmit(tp, buf, count, 0, ptr);
59728 +       mutex_unlock(&suspend_lock);
59729 +
59730 +       return sent;
59731 +}
59732 +EXPORT_SYMBOL(tpm_fe_send);
59733 +
59734 +/*
59735 + * Register a callback for receiving data from this module
59736 + */
59737 +int tpm_fe_register_receiver(struct tpmfe_device *tpmfe_dev)
59738 +{
59739 +       int rc = 0;
59740 +
59741 +       mutex_lock(&upperlayer_lock);
59742 +       if (NULL == upperlayer_tpmfe) {
59743 +               upperlayer_tpmfe = tpmfe_dev;
59744 +               tpmfe_dev->max_tx_size = TPMIF_TX_RING_SIZE * PAGE_SIZE;
59745 +               tpmfe_dev->tpm_private = tpm_private_get();
59746 +               if (!tpmfe_dev->tpm_private) {
59747 +                       rc = -ENOMEM;
59748 +               }
59749 +       } else {
59750 +               rc = -EBUSY;
59751 +       }
59752 +       mutex_unlock(&upperlayer_lock);
59753 +       return rc;
59754 +}
59755 +EXPORT_SYMBOL(tpm_fe_register_receiver);
59756 +
59757 +/*
59758 + * Unregister the callback for receiving data from this module
59759 + */
59760 +void tpm_fe_unregister_receiver(void)
59761 +{
59762 +       mutex_lock(&upperlayer_lock);
59763 +       upperlayer_tpmfe = NULL;
59764 +       mutex_unlock(&upperlayer_lock);
59765 +}
59766 +EXPORT_SYMBOL(tpm_fe_unregister_receiver);
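+
+/*
+ * Illustrative sketch (not part of the code): a hypothetical upper-layer
+ * TPM driver would use the interface above roughly as follows; the names
+ * my_receive, my_status, my_fe, cmd and ctx are invented for illustration.
+ *
+ *     static struct tpmfe_device my_fe = {
+ *             .receive = my_receive,  (fed by tpm_fe_send_upperlayer below)
+ *             .status  = my_status,   (connect/disconnect notifications)
+ *     };
+ *
+ *     if (tpm_fe_register_receiver(&my_fe) == 0)
+ *             tpm_fe_send(my_fe.tpm_private, cmd, cmd_len, ctx);
+ */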
59767 +
59768 +/*
59769 + * Call this function to send data to the upper layer's
59770 + * registered receiver function.
59771 + */
59772 +static int tpm_fe_send_upperlayer(const u8 * buf, size_t count,
59773 +                                  const void *ptr)
59774 +{
59775 +       int rc = 0;
59776 +
59777 +       mutex_lock(&upperlayer_lock);
59778 +
59779 +       if (upperlayer_tpmfe && upperlayer_tpmfe->receive)
59780 +               rc = upperlayer_tpmfe->receive(buf, count, ptr);
59781 +
59782 +       mutex_unlock(&upperlayer_lock);
59783 +       return rc;
59784 +}
59785 +
59786 +/**************************************************************
59787 + XENBUS support code
59788 +**************************************************************/
59789 +
59790 +static int setup_tpmring(struct xenbus_device *dev,
59791 +                         struct tpm_private *tp)
59792 +{
59793 +       tpmif_tx_interface_t *sring;
59794 +       int err;
59795 +
59796 +       sring = (void *)__get_free_page(GFP_KERNEL);
59797 +       if (!sring) {
59798 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
59799 +               return -ENOMEM;
59800 +       }
59801 +       tp->tx = sring;
59802 +
59803 +       tpm_allocate_buffers(tp);
59804 +
59805 +       err = xenbus_grant_ring(dev, virt_to_mfn(tp->tx));
59806 +       if (err < 0) {
59807 +               free_page((unsigned long)sring);
59808 +               tp->tx = NULL;
59809 +               xenbus_dev_fatal(dev, err, "allocating grant reference");
59810 +               goto fail;
59811 +       }
59812 +       tp->ring_ref = err;
59813 +
59814 +       err = xenbus_alloc_evtchn(dev, &tp->evtchn);
59815 +       if (err)
59816 +               goto fail;
59817 +
59818 +       tpmif_connect(tp, dev->otherend_id);
59819 +
59820 +       return 0;
59821 +fail:
59822 +       return err;
59823 +}
59824 +
59825 +
59826 +static void destroy_tpmring(struct tpm_private *tp)
59827 +{
59828 +       tpmif_set_connected_state(tp, 0);
59829 +       if (tp->tx != NULL) {
59830 +               gnttab_end_foreign_access(tp->ring_ref, 0,
59831 +                                         (unsigned long)tp->tx);
59832 +               tp->tx = NULL;
59833 +       }
59834 +
59835 +       if (tp->irq)
59836 +               unbind_from_irqhandler(tp->irq, NULL);
59837 +       tp->evtchn = tp->irq = 0;
59838 +}
59839 +
59840 +
59841 +static int talk_to_backend(struct xenbus_device *dev,
59842 +                           struct tpm_private *tp)
59843 +{
59844 +       const char *message = NULL;
59845 +       int err;
59846 +       xenbus_transaction_t xbt;
59847 +
59848 +       err = setup_tpmring(dev, tp);
59849 +       if (err) {
59850 +               xenbus_dev_fatal(dev, err, "setting up ring");
59851 +               goto out;
59852 +       }
59853 +
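+       /*
+        * The xenstore writes below are grouped in one transaction so the
+        * backend sees ring-ref, event-channel and the state change
+        * atomically.  xenbus_transaction_end() returns -EAGAIN if the
+        * store was modified concurrently, in which case the whole
+        * sequence is retried from "again".
+        */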
59854 +again:
59855 +       err = xenbus_transaction_start(&xbt);
59856 +       if (err) {
59857 +               xenbus_dev_fatal(dev, err, "starting transaction");
59858 +               goto destroy_tpmring;
59859 +       }
59860 +
59861 +       err = xenbus_printf(xbt, dev->nodename,
59862 +                           "ring-ref","%u", tp->ring_ref);
59863 +       if (err) {
59864 +               message = "writing ring-ref";
59865 +               goto abort_transaction;
59866 +       }
59867 +
59868 +       err = xenbus_printf(xbt, dev->nodename,
59869 +                           "event-channel", "%u", tp->evtchn);
59870 +       if (err) {
59871 +               message = "writing event-channel";
59872 +               goto abort_transaction;
59873 +       }
59874 +
59875 +       err = xenbus_printf(xbt, dev->nodename,
59876 +                           "state", "%d", XenbusStateInitialised);
59877 +       if (err) {
59878 +               goto abort_transaction;
59879 +       }
59880 +
59881 +       err = xenbus_transaction_end(xbt, 0);
59882 +       if (err == -EAGAIN)
59883 +               goto again;
59884 +       if (err) {
59885 +               xenbus_dev_fatal(dev, err, "completing transaction");
59886 +               goto destroy_tpmring;
59887 +       }
59888 +       return 0;
59889 +
59890 +abort_transaction:
59891 +       xenbus_transaction_end(xbt, 1);
59892 +       if (message)
59893 +               xenbus_dev_error(dev, err, "%s", message);
59894 +destroy_tpmring:
59895 +       destroy_tpmring(tp);
59896 +out:
59897 +       return err;
59898 +}
59899 +
59900 +/**
59901 + * Callback received when the backend's state changes.
59902 + */
59903 +static void backend_changed(struct xenbus_device *dev,
59904 +                           XenbusState backend_state)
59905 +{
59906 +       struct tpm_private *tp = dev->data;
59907 +       DPRINTK("\n");
59908 +
59909 +       switch (backend_state) {
59910 +       case XenbusStateInitialising:
59911 +       case XenbusStateInitWait:
59912 +       case XenbusStateInitialised:
59913 +       case XenbusStateUnknown:
59914 +               break;
59915 +
59916 +       case XenbusStateConnected:
59917 +               tpmif_set_connected_state(tp, 1);
59918 +               break;
59919 +
59920 +       case XenbusStateClosing:
59921 +               tpmif_set_connected_state(tp, 0);
59922 +               break;
59923 +
59924 +       case XenbusStateClosed:
59925 +               if (tp->is_suspended == 0) {
59926 +                       device_unregister(&dev->dev);
59927 +               }
59928 +               break;
59929 +       }
59930 +}
59931 +
59932 +
59933 +static int tpmfront_probe(struct xenbus_device *dev,
59934 +                          const struct xenbus_device_id *id)
59935 +{
59936 +       int err;
59937 +       int handle;
59938 +       struct tpm_private *tp = tpm_private_get();
+
+       if (!tp)
+               return -ENOMEM;
59939 +
59940 +       err = xenbus_scanf(XBT_NULL, dev->nodename,
59941 +                          "handle", "%i", &handle);
59942 +       if (XENBUS_EXIST_ERR(err))
59943 +               return err;
59944 +
59945 +       if (err < 0) {
59946 +               xenbus_dev_fatal(dev, err, "reading handle");
59947 +               return err;
59948 +       }
59949 +
59950 +       tp->dev = dev;
59951 +       dev->data = tp;
59952 +
59953 +       err = talk_to_backend(dev, tp);
59954 +       if (err) {
59955 +               tpm_private_free();
59956 +               dev->data = NULL;
59957 +               return err;
59958 +       }
59959 +       return 0;
59960 +}
59961 +
59962 +
59963 +static int tpmfront_remove(struct xenbus_device *dev)
59964 +{
59965 +       struct tpm_private *tp = dev->data;
59966 +       destroy_tpmring(tp);
59967 +       return 0;
59968 +}
59969 +
59970 +static int
59971 +tpmfront_suspend(struct xenbus_device *dev)
59972 +{
59973 +       struct tpm_private *tp = dev->data;
59974 +       u32 ctr;
59975 +
59976 +       /* lock, so no app can send */
59977 +       mutex_lock(&suspend_lock);
59978 +       tp->is_suspended = 1;
59979 +
59980 +       for (ctr = 0; atomic_read(&tp->tx_busy) && ctr <= 25; ctr++) {
59981 +               if ((ctr % 10) == 0)
59982 +                       printk("TPM-FE [INFO]: Waiting for outstanding request.\n");
59983 +               /*
59984 +                * Wait for a request to be responded to.
59985 +                */
59986 +               interruptible_sleep_on_timeout(&tp->wait_q, 100);
59987 +       }
59988 +
59989 +       if (atomic_read(&tp->tx_busy)) {
59990 +               /*
59991 +                * A temporary work-around.
59992 +                */
59993 +               printk("TPM-FE [WARNING]: Resetting busy flag.");
59994 +               atomic_set(&tp->tx_busy, 0);
59995 +       }
59996 +
59997 +       return 0;
59998 +}
59999 +
60000 +static int
60001 +tpmfront_resume(struct xenbus_device *dev)
60002 +{
60003 +       struct tpm_private *tp = dev->data;
60004 +       return talk_to_backend(dev, tp);
60005 +}
60006 +
60007 +static void
60008 +tpmif_connect(struct tpm_private *tp, domid_t domid)
60009 +{
60010 +       int err;
60011 +
60012 +       tp->backend_id = domid;
60013 +
60014 +       err = bind_evtchn_to_irqhandler(tp->evtchn,
60015 +                                       tpmif_int, SA_SAMPLE_RANDOM, "tpmif",
60016 +                                       tp);
60017 +       if (err <= 0) {
60018 +               WPRINTK("bind_evtchn_to_irqhandler failed (err=%d)\n", err);
60019 +               return;
60020 +       }
60021 +
60022 +       tp->irq = err;
60023 +}
60024 +
60025 +static struct xenbus_device_id tpmfront_ids[] = {
60026 +       { "vtpm" },
60027 +       { "" }
60028 +};
60029 +
60030 +static struct xenbus_driver tpmfront = {
60031 +       .name = "vtpm",
60032 +       .owner = THIS_MODULE,
60033 +       .ids = tpmfront_ids,
60034 +       .probe = tpmfront_probe,
60035 +       .remove = tpmfront_remove,
60036 +       .resume = tpmfront_resume,
60037 +       .otherend_changed = backend_changed,
60038 +       .suspend = tpmfront_suspend,
60039 +};
60040 +
60041 +static void __init init_tpm_xenbus(void)
60042 +{
60043 +       xenbus_register_frontend(&tpmfront);
60044 +}
60045 +
60046 +static void __exit exit_tpm_xenbus(void)
60047 +{
60048 +       xenbus_unregister_driver(&tpmfront);
60049 +}
60050 +
60051 +
60052 +static int
60053 +tpm_allocate_buffers(struct tpm_private *tp)
60054 +{
60055 +       unsigned int i;
60056 +
60057 +       for (i = 0; i < TPMIF_TX_RING_SIZE; i++)
60058 +               tp->tx_buffers[i] = tx_buffer_alloc();
60059 +       return 1;
60060 +}
60061 +
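+/*
+ * Tasklet run after the backend signals completion: the response is
+ * gathered from the per-slot tx buffers (the backend reuses the pages
+ * that carried the request), using ring[0].req.size as the total
+ * response length, and handed to the upper layer's receive callback.
+ */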
60062 +static void
60063 +tpmif_rx_action(unsigned long priv)
60064 +{
60065 +       struct tpm_private *tp = (struct tpm_private *)priv;
60066 +
60067 +       int i = 0;
60068 +       unsigned int received;
60069 +       unsigned int offset = 0;
60070 +       u8 *buffer;
60071 +       tpmif_tx_request_t *tx;
60072 +       tx = &tp->tx->ring[i].req;
60073 +
60074 +       received = tx->size;
60075 +
60076 +       buffer = kmalloc(received, GFP_KERNEL);
60077 +       if (NULL == buffer) {
60078 +               goto exit;
60079 +       }
60080 +
60081 +       for (i = 0; i < TPMIF_TX_RING_SIZE && offset < received; i++) {
60082 +               struct tx_buffer *txb = tp->tx_buffers[i];
60083 +               tpmif_tx_request_t *tx;
60084 +               unsigned int tocopy;
60085 +
60086 +               tx = &tp->tx->ring[i].req;
60087 +               tocopy = tx->size;
60088 +               if (tocopy > PAGE_SIZE) {
60089 +                       tocopy = PAGE_SIZE;
60090 +               }
60091 +
60092 +               memcpy(&buffer[offset], txb->data, tocopy);
60093 +
60094 +               gnttab_release_grant_reference(&gref_head, tx->ref);
60095 +
60096 +               offset += tocopy;
60097 +       }
60098 +
60099 +       tpm_fe_send_upperlayer(buffer, received, tp->tx_remember);
60100 +       kfree(buffer);
60101 +
60102 +exit:
60103 +       atomic_set(&tp->tx_busy, 0);
60104 +       wake_up_interruptible(&tp->wait_q);
60105 +}
60106 +
60107 +
60108 +static irqreturn_t
60109 +tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs)
60110 +{
60111 +       struct tpm_private *tp = tpm_priv;
60112 +       unsigned long flags;
60113 +
60114 +       spin_lock_irqsave(&tp->tx_lock, flags);
60115 +       tpmif_rx_tasklet.data = (unsigned long)tp;
60116 +       tasklet_schedule(&tpmif_rx_tasklet);
60117 +       spin_unlock_irqrestore(&tp->tx_lock, flags);
60118 +
60119 +       return IRQ_HANDLED;
60120 +}
60121 +
60122 +
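+/*
+ * Copy a command into the pre-allocated per-slot pages, grant the backend
+ * access to each page used, and notify it via the event channel.  Only one
+ * request may be in flight at a time: tx_busy is set here and cleared by
+ * tpmif_rx_action() once the response has been passed up.
+ */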
60123 +static int
60124 +tpm_xmit(struct tpm_private *tp,
60125 +         const u8 * buf, size_t count, int isuserbuffer,
60126 +         void *remember)
60127 +{
60128 +       tpmif_tx_request_t *tx;
60129 +       TPMIF_RING_IDX i;
60130 +       unsigned int offset = 0;
60131 +
60132 +       spin_lock_irq(&tp->tx_lock);
60133 +
60134 +       if (unlikely(atomic_read(&tp->tx_busy))) {
60135 +               printk("tpm_xmit: There's an outstanding request/response "
60136 +                      "on the way!\n");
60137 +               spin_unlock_irq(&tp->tx_lock);
60138 +               return -EBUSY;
60139 +       }
60140 +
60141 +       if (tp->is_connected != 1) {
60142 +               spin_unlock_irq(&tp->tx_lock);
60143 +               return -EIO;
60144 +       }
60145 +
60146 +       for (i = 0; count > 0 && i < TPMIF_TX_RING_SIZE; i++) {
60147 +               struct tx_buffer *txb = tp->tx_buffers[i];
60148 +               int copied;
60149 +
60150 +               if (NULL == txb) {
60151 +                       DPRINTK("txb (i=%d) is NULL. buffers initilized?\n"
60152 +                               "Not transmitting anything!\n", i);
60153 +                       spin_unlock_irq(&tp->tx_lock);
60154 +                       return -EFAULT;
60155 +               }
60156 +               copied = tx_buffer_copy(txb, &buf[offset], count,
60157 +                                       isuserbuffer);
60158 +               if (copied < 0) {
60159 +                       /* An error occurred */
60160 +                       spin_unlock_irq(&tp->tx_lock);
60161 +                       return copied;
60162 +               }
60163 +               count -= copied;
60164 +               offset += copied;
60165 +
60166 +               tx = &tp->tx->ring[i].req;
60167 +
60168 +               tx->addr = virt_to_machine(txb->data);
60169 +               tx->size = txb->len;
60170 +
60171 +               DPRINTK("First 4 characters sent by TPM-FE are 0x%02x 0x%02x 0x%02x 0x%02x\n",
60172 +                       txb->data[0],txb->data[1],txb->data[2],txb->data[3]);
60173 +
60174 +               /* get the granttable reference for this page */
60175 +               tx->ref = gnttab_claim_grant_reference(&gref_head);
60176 +
60177 +               if (-ENOSPC == tx->ref) {
60178 +                       spin_unlock_irq(&tp->tx_lock);
60179 +                       DPRINTK(" Grant table claim reference failed in func:%s line:%d file:%s\n", __FUNCTION__, __LINE__, __FILE__);
60180 +                       return -ENOSPC;
60181 +               }
60182 +               gnttab_grant_foreign_access_ref( tx->ref,
60183 +                                                tp->backend_id,
60184 +                                                (tx->addr >> PAGE_SHIFT),
60185 +                                                0 /*RW*/);
60186 +               wmb();
60187 +       }
60188 +
60189 +       atomic_set(&tp->tx_busy, 1);
60190 +       tp->tx_remember = remember;
60191 +       mb();
60192 +
60193 +       DPRINTK("Notifying backend via event channel %d\n",
60194 +               tp->evtchn);
60195 +
60196 +       notify_remote_via_irq(tp->irq);
60197 +
60198 +       spin_unlock_irq(&tp->tx_lock);
60199 +       return offset;
60200 +}
60201 +
60202 +
60203 +static void tpmif_notify_upperlayer(struct tpm_private *tp)
60204 +{
60205 +       /*
60206 +        * Notify upper layer about the state of the connection
60207 +        * to the BE.
60208 +        */
60209 +       mutex_lock(&upperlayer_lock);
60210 +
60211 +       if (upperlayer_tpmfe != NULL) {
60212 +               if (tp->is_connected) {
60213 +                       upperlayer_tpmfe->status(TPMFE_STATUS_CONNECTED);
60214 +               } else {
60215 +                       upperlayer_tpmfe->status(0);
60216 +               }
60217 +       }
60218 +       mutex_unlock(&upperlayer_lock);
60219 +}
60220 +
60221 +
60222 +static void tpmif_set_connected_state(struct tpm_private *tp, u8 is_connected)
60223 +{
60224 +       /*
60225 +        * Don't notify the upper layer if we are in suspend mode and
60226 +        * about to disconnect - the assumption is that we will resume.
60227 +        * The mutex keeps apps from sending.
60228 +        */
60229 +       if (is_connected == 0 && tp->is_suspended == 1) {
60230 +               return;
60231 +       }
60232 +
60233 +       /*
60234 +        * Unlock the mutex if we are connected again
60235 +        * after being suspended - now resuming.
60236 +        * This also removes the suspend state.
60237 +        */
60238 +       if (is_connected == 1 && tp->is_suspended == 1) {
60239 +               tp->is_suspended = 0;
60240 +               /* unlock, so apps can resume sending */
60241 +               mutex_unlock(&suspend_lock);
60242 +       }
60243 +
60244 +       if (is_connected != tp->is_connected) {
60245 +               tp->is_connected = is_connected;
60246 +               tpmif_notify_upperlayer(tp);
60247 +       }
60248 +}
60249 +
60250 +
60251 +/* =================================================================
60252 + * Initialization function.
60253 + * =================================================================
60254 + */
60255 +
60256 +static int __init
60257 +tpmif_init(void)
60258 +{
60259 +       IPRINTK("Initialising the vTPM driver.\n");
60260 +       if (gnttab_alloc_grant_references(TPMIF_TX_RING_SIZE,
60261 +                                         &gref_head) < 0) {
60262 +               return -EFAULT;
60263 +       }
60264 +
60265 +       init_tpm_xenbus();
60266 +
60267 +       return 0;
60268 +}
60269 +
60270 +module_init(tpmif_init);
60271 +
60272 +static void __exit
60273 +tpmif_exit(void)
60274 +{
60275 +       exit_tpm_xenbus();
60276 +       gnttab_free_grant_references(gref_head);
60277 +}
60278 +
60279 +module_exit(tpmif_exit);
60280 +
60281 +MODULE_LICENSE("Dual BSD/GPL");
60282 +
60283 +/*
60284 + * Local variables:
60285 + *  c-file-style: "linux"
60286 + *  indent-tabs-mode: t
60287 + *  c-indent-level: 8
60288 + *  c-basic-offset: 8
60289 + *  tab-width: 8
60290 + * End:
60291 + */
60292 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/tpmfront/tpmfront.h linux-2.6.16/drivers/xen/tpmfront/tpmfront.h
60293 --- linux-2.6.16.orig/drivers/xen/tpmfront/tpmfront.h   1970-01-01 01:00:00.000000000 +0100
60294 +++ linux-2.6.16/drivers/xen/tpmfront/tpmfront.h        2006-06-26 09:51:32.000000000 +0200
60295 @@ -0,0 +1,40 @@
60296 +#ifndef TPM_FRONT_H
60297 +#define TPM_FRONT_H
60298 +
60299 +struct tpm_private {
60300 +       tpmif_tx_interface_t *tx;
60301 +       unsigned int evtchn;
60302 +       unsigned int irq;
60303 +       u8 is_connected;
60304 +       u8 is_suspended;
60305 +
60306 +       spinlock_t tx_lock;
60307 +
60308 +       struct tx_buffer *tx_buffers[TPMIF_TX_RING_SIZE];
60309 +
60310 +       atomic_t tx_busy;
60311 +       void *tx_remember;
60312 +       domid_t backend_id;
60313 +       wait_queue_head_t wait_q;
60314 +
60315 +       struct xenbus_device *dev;
60316 +       int ring_ref;
60317 +};
60318 +
60319 +struct tx_buffer {
60320 +       unsigned int size;      /* available space in data */
60321 +       unsigned int len;       /* used space in data */
60322 +       unsigned char *data;    /* pointer to a page */
60323 +};
60324 +
60325 +#endif
60326 +
60327 +/*
60328 + * Local variables:
60329 + *  c-file-style: "linux"
60330 + *  indent-tabs-mode: t
60331 + *  c-indent-level: 8
60332 + *  c-basic-offset: 8
60333 + *  tab-width: 8
60334 + * End:
60335 + */
60336 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/util.c linux-2.6.16/drivers/xen/util.c
60337 --- linux-2.6.16.orig/drivers/xen/util.c        1970-01-01 01:00:00.000000000 +0100
60338 +++ linux-2.6.16/drivers/xen/util.c     2006-06-26 09:51:32.000000000 +0200
60339 @@ -0,0 +1,80 @@
60340 +#include <linux/config.h>
60341 +#include <linux/mm.h>
60342 +#include <linux/module.h>
60343 +#include <linux/slab.h>
60344 +#include <linux/vmalloc.h>
60345 +#include <asm/uaccess.h>
60346 +#include <xen/driver_util.h>
60347 +
60348 +static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
60349 +{
60350 +       /* apply_to_page_range() does all the hard work. */
60351 +       return 0;
60352 +}
60353 +
60354 +struct vm_struct *alloc_vm_area(unsigned long size)
60355 +{
60356 +       struct vm_struct *area;
60357 +
60358 +       area = get_vm_area(size, VM_IOREMAP);
60359 +       if (area == NULL)
60360 +               return NULL;
60361 +
60362 +       /*
60363 +        * This ensures that page tables are constructed for this region
60364 +        * of kernel virtual address space and mapped into init_mm.
60365 +        */
60366 +       if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
60367 +                               area->size, f, NULL)) {
60368 +               free_vm_area(area);
60369 +               return NULL;
60370 +       }
60371 +
60372 +       return area;
60373 +}
60374 +EXPORT_SYMBOL_GPL(alloc_vm_area);
60375 +
60376 +void free_vm_area(struct vm_struct *area)
60377 +{
60378 +       struct vm_struct *ret;
60379 +       ret = remove_vm_area(area->addr);
60380 +       BUG_ON(ret != area);
60381 +       kfree(area);
60382 +}
60383 +EXPORT_SYMBOL_GPL(free_vm_area);
60384 +
60385 +void lock_vm_area(struct vm_struct *area)
60386 +{
60387 +       unsigned long i;
60388 +       char c;
60389 +
60390 +       /*
60391 +        * Prevent context switch to a lazy mm that doesn't have this area
60392 +        * mapped into its page tables.
60393 +        */
60394 +       preempt_disable();
60395 +
60396 +       /*
60397 +        * Ensure that the page tables are mapped into the current mm. The
60398 +        * page-fault path will copy the page directory pointers from init_mm.
60399 +        */
60400 +       for (i = 0; i < area->size; i += PAGE_SIZE)
60401 +               (void)__get_user(c, (char __user *)area->addr + i);
60402 +}
60403 +EXPORT_SYMBOL_GPL(lock_vm_area);
60404 +
60405 +void unlock_vm_area(struct vm_struct *area)
60406 +{
60407 +       preempt_enable();
60408 +}
60409 +EXPORT_SYMBOL_GPL(unlock_vm_area);
60410 +
60411 +/*
60412 + * Local variables:
60413 + *  c-file-style: "linux"
60414 + *  indent-tabs-mode: t
60415 + *  c-indent-level: 8
60416 + *  c-basic-offset: 8
60417 + *  tab-width: 8
60418 + * End:
60419 + */
60420 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/xenbus/Makefile linux-2.6.16/drivers/xen/xenbus/Makefile
60421 --- linux-2.6.16.orig/drivers/xen/xenbus/Makefile       1970-01-01 01:00:00.000000000 +0100
60422 +++ linux-2.6.16/drivers/xen/xenbus/Makefile    2006-06-26 09:51:32.000000000 +0200
60423 @@ -0,0 +1,8 @@
60424 +obj-y  += xenbus.o
60425 +
60426 +xenbus-objs =
60427 +xenbus-objs += xenbus_client.o
60428 +xenbus-objs += xenbus_comms.o
60429 +xenbus-objs += xenbus_xs.o
60430 +xenbus-objs += xenbus_probe.o
60431 +xenbus-objs += xenbus_dev.o
60432 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/xenbus/xenbus_client.c linux-2.6.16/drivers/xen/xenbus/xenbus_client.c
60433 --- linux-2.6.16.orig/drivers/xen/xenbus/xenbus_client.c        1970-01-01 01:00:00.000000000 +0100
60434 +++ linux-2.6.16/drivers/xen/xenbus/xenbus_client.c     2006-06-26 09:51:32.000000000 +0200
60435 @@ -0,0 +1,412 @@
60436 +/******************************************************************************
60437 + * Client-facing interface for the Xenbus driver.  In other words, the
60438 + * interface between the Xenbus and the device-specific code, be it the
60439 + * frontend or the backend of that driver.
60440 + *
60441 + * Copyright (C) 2005 XenSource Ltd
60442 + * 
60443 + * This program is free software; you can redistribute it and/or
60444 + * modify it under the terms of the GNU General Public License version 2
60445 + * as published by the Free Software Foundation; or, when distributed
60446 + * separately from the Linux kernel or incorporated into other
60447 + * software packages, subject to the following license:
60448 + * 
60449 + * Permission is hereby granted, free of charge, to any person obtaining a copy
60450 + * of this source file (the "Software"), to deal in the Software without
60451 + * restriction, including without limitation the rights to use, copy, modify,
60452 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
60453 + * and to permit persons to whom the Software is furnished to do so, subject to
60454 + * the following conditions:
60455 + * 
60456 + * The above copyright notice and this permission notice shall be included in
60457 + * all copies or substantial portions of the Software.
60458 + * 
60459 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
60460 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60461 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60462 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
60463 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
60464 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
60465 + * IN THE SOFTWARE.
60466 + */
60467 +
60468 +#include <xen/evtchn.h>
60469 +#include <xen/gnttab.h>
60470 +#include <xen/xenbus.h>
60471 +#include <xen/driver_util.h>
60472 +
60473 +/* xenbus_probe.c */
60474 +extern char *kasprintf(const char *fmt, ...);
60475 +
60476 +#define DPRINTK(fmt, args...) \
60477 +    pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
60478 +
60479 +int xenbus_watch_path(struct xenbus_device *dev, const char *path,
60480 +                     struct xenbus_watch *watch,
60481 +                     void (*callback)(struct xenbus_watch *,
60482 +                                      const char **, unsigned int))
60483 +{
60484 +       int err;
60485 +
60486 +       watch->node = path;
60487 +       watch->callback = callback;
60488 +
60489 +       err = register_xenbus_watch(watch);
60490 +
60491 +       if (err) {
60492 +               watch->node = NULL;
60493 +               watch->callback = NULL;
60494 +               xenbus_dev_fatal(dev, err, "adding watch on %s", path);
60495 +       }
60496 +
60497 +       return err;
60498 +}
60499 +EXPORT_SYMBOL_GPL(xenbus_watch_path);
60500 +
60501 +
60502 +int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
60503 +                      const char *path2, struct xenbus_watch *watch,
60504 +                      void (*callback)(struct xenbus_watch *,
60505 +                                       const char **, unsigned int))
60506 +{
60507 +       int err;
60508 +       char *state = kasprintf("%s/%s", path, path2);
60509 +       if (!state) {
60510 +               xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
60511 +               return -ENOMEM;
60512 +       }
60513 +       err = xenbus_watch_path(dev, state, watch, callback);
60514 +
60515 +       if (err)
60516 +               kfree(state);
60517 +       return err;
60518 +}
60519 +EXPORT_SYMBOL_GPL(xenbus_watch_path2);
60520 +
60521 +
60522 +int xenbus_switch_state(struct xenbus_device *dev, XenbusState state)
60523 +{
60524 +       /* We check whether the state is currently set to the given value, and
60525 +          if not, then the state is set.  We don't want to unconditionally
60526 +          write the given state, because we don't want to fire watches
60527 +          unnecessarily.  Furthermore, if the node has gone, we don't write
60528 +          to it, as the device will be tearing down, and we don't want to
60529 +          resurrect that directory.
60530 +
60531 +          Note that, because of this cached value of our state, this function
60532 +          will not work inside a Xenstore transaction (something it used
60533 +          to attempt in the past) because dev->state would not get reset if
60534 +          the transaction was aborted.
60535 +
60536 +        */
60537 +
60538 +       int current_state;
60539 +       int err;
60540 +
60541 +       if (state == dev->state)
60542 +               return 0;
60543 +
60544 +       err = xenbus_scanf(XBT_NULL, dev->nodename, "state", "%d",
60545 +                          &current_state);
60546 +       if (err != 1)
60547 +               return 0;
60548 +
60549 +       err = xenbus_printf(XBT_NULL, dev->nodename, "state", "%d", state);
60550 +       if (err) {
60551 +               if (state != XenbusStateClosing) /* Avoid looping */
60552 +                       xenbus_dev_fatal(dev, err, "writing new state");
60553 +               return err;
60554 +       }
60555 +
60556 +       dev->state = state;
60557 +
60558 +       return 0;
60559 +}
60560 +EXPORT_SYMBOL_GPL(xenbus_switch_state);
60561 +
60562 +
60563 +/**
60564 + * Return the path to the error node for the given device, or NULL on failure.
60565 + * If the value returned is non-NULL, it is the caller's responsibility to kfree it.
60566 + */
60567 +static char *error_path(struct xenbus_device *dev)
60568 +{
60569 +       return kasprintf("error/%s", dev->nodename);
60570 +}
60571 +
60572 +
60573 +void _dev_error(struct xenbus_device *dev, int err, const char *fmt,
60574 +               va_list ap)
60575 +{
60576 +       int ret;
60577 +       unsigned int len;
60578 +       char *printf_buffer = NULL, *path_buffer = NULL;
60579 +
60580 +#define PRINTF_BUFFER_SIZE 4096
60581 +       printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
60582 +       if (printf_buffer == NULL)
60583 +               goto fail;
60584 +
60585 +       len = sprintf(printf_buffer, "%i ", -err);
60586 +       ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
60587 +
60588 +       BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
60589 +
60590 +       dev_err(&dev->dev, "%s\n", printf_buffer);
60591 +
60592 +       path_buffer = error_path(dev);
60593 +
60594 +       if (path_buffer == NULL) {
60595 +               printk("xenbus: failed to write error node for %s (%s)\n",
60596 +                      dev->nodename, printf_buffer);
60597 +               goto fail;
60598 +       }
60599 +
60600 +       if (xenbus_write(XBT_NULL, path_buffer, "error", printf_buffer) != 0) {
60601 +               printk("xenbus: failed to write error node for %s (%s)\n",
60602 +                      dev->nodename, printf_buffer);
60603 +               goto fail;
60604 +       }
60605 +
60606 +fail:
60607 +       kfree(printf_buffer);   /* kfree() tolerates NULL */
60608 +       kfree(path_buffer);
60611 +}
60612 +
60613 +
60614 +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
60615 +                     ...)
60616 +{
60617 +       va_list ap;
60618 +
60619 +       va_start(ap, fmt);
60620 +       _dev_error(dev, err, fmt, ap);
60621 +       va_end(ap);
60622 +}
60623 +EXPORT_SYMBOL_GPL(xenbus_dev_error);
60624 +
60625 +
60626 +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
60627 +                     ...)
60628 +{
60629 +       va_list ap;
60630 +
60631 +       va_start(ap, fmt);
60632 +       _dev_error(dev, err, fmt, ap);
60633 +       va_end(ap);
60634 +
60635 +       xenbus_switch_state(dev, XenbusStateClosing);
60636 +}
60637 +EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
60638 +
60639 +
60640 +int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
60641 +{
60642 +       int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
60643 +       if (err < 0)
60644 +               xenbus_dev_fatal(dev, err, "granting access to ring page");
60645 +       return err;
60646 +}
60647 +EXPORT_SYMBOL_GPL(xenbus_grant_ring);
60648 +
60649 +
60650 +int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
60651 +{
60652 +       evtchn_op_t op = {
60653 +               .cmd = EVTCHNOP_alloc_unbound,
60654 +               .u.alloc_unbound.dom = DOMID_SELF,
60655 +               .u.alloc_unbound.remote_dom = dev->otherend_id
60656 +       };
60657 +       int err = HYPERVISOR_event_channel_op(&op);
60658 +       if (err)
60659 +               xenbus_dev_fatal(dev, err, "allocating event channel");
60660 +       else
60661 +               *port = op.u.alloc_unbound.port;
60662 +       return err;
60663 +}
60664 +EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
60665 +
60666 +
60667 +int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
60668 +{
60669 +       evtchn_op_t op = {
60670 +               .cmd = EVTCHNOP_bind_interdomain,
60671 +               .u.bind_interdomain.remote_dom = dev->otherend_id,
60672 +               .u.bind_interdomain.remote_port = remote_port,
60673 +       };
60674 +       int err = HYPERVISOR_event_channel_op(&op);
60675 +       if (err)
60676 +               xenbus_dev_fatal(dev, err,
60677 +                                "binding to event channel %d from domain %d",
60678 +                                remote_port, dev->otherend_id);
60679 +       else
60680 +               *port = op.u.bind_interdomain.local_port;
60681 +       return err;
60682 +}
60683 +EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
60684 +
60685 +
60686 +int xenbus_free_evtchn(struct xenbus_device *dev, int port)
60687 +{
60688 +       evtchn_op_t op = {
60689 +               .cmd = EVTCHNOP_close,
60690 +               .u.close.port = port,
60691 +       };
60692 +       int err = HYPERVISOR_event_channel_op(&op);
60693 +       if (err)
60694 +               xenbus_dev_error(dev, err, "freeing event channel %d", port);
60695 +       return err;
60696 +}
60697 +
60698 +
60699 +/* Based on Rusty Russell's skeleton driver's map_page */
60700 +int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
60701 +{
60702 +       struct gnttab_map_grant_ref op = {
60703 +               .flags = GNTMAP_host_map,
60704 +               .ref   = gnt_ref,
60705 +               .dom   = dev->otherend_id,
60706 +       };
60707 +       struct vm_struct *area;
60708 +
60709 +       *vaddr = NULL;
60710 +
60711 +       area = alloc_vm_area(PAGE_SIZE);
60712 +       if (!area)
60713 +               return -ENOMEM;
60714 +
60715 +       op.host_addr = (unsigned long)area->addr;
60716 +
60717 +       lock_vm_area(area);
60718 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
60719 +       unlock_vm_area(area);
60720 +
60721 +       if (op.status != GNTST_okay) {
60722 +               free_vm_area(area);
60723 +               xenbus_dev_fatal(dev, op.status,
60724 +                                "mapping in shared page %d from domain %d",
60725 +                                gnt_ref, dev->otherend_id);
60726 +               return op.status;
60727 +       }
60728 +
60729 +       /* Stuff the handle in an unused field */
60730 +       area->phys_addr = (unsigned long)op.handle;
60731 +
60732 +       *vaddr = area->addr;
60733 +       return 0;
60734 +}
60735 +EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
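+
+/*
+ * Note: a mapping set up by xenbus_map_ring_valloc() must be torn down
+ * with xenbus_unmap_ring_vfree(), which recovers the grant handle from
+ * the vm_struct's phys_addr field stuffed above.
+ */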
60736 +
60737 +
60738 +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
60739 +                  grant_handle_t *handle, void *vaddr)
60740 +{
60741 +       struct gnttab_map_grant_ref op = {
60742 +               .host_addr = (unsigned long)vaddr,
60743 +               .flags     = GNTMAP_host_map,
60744 +               .ref       = gnt_ref,
60745 +               .dom       = dev->otherend_id,
60746 +       };
60747 +
60748 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
60749 +
60750 +       if (op.status != GNTST_okay) {
60751 +               xenbus_dev_fatal(dev, op.status,
60752 +                                "mapping in shared page %d from domain %d",
60753 +                                gnt_ref, dev->otherend_id);
60754 +       } else
60755 +               *handle = op.handle;
60756 +
60757 +       return op.status;
60758 +}
60759 +EXPORT_SYMBOL_GPL(xenbus_map_ring);
60760 +
60761 +
60762 +/* Based on Rusty Russell's skeleton driver's unmap_page */
60763 +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
60764 +{
60765 +       struct vm_struct *area;
60766 +       struct gnttab_unmap_grant_ref op = {
60767 +               .host_addr = (unsigned long)vaddr,
60768 +       };
60769 +
60770 +       /* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr)
60771 +        * method so that we don't have to muck with vmalloc internals here.
60772 +        * We could force the user to hang on to their struct vm_struct from
60773 +        * xenbus_map_ring_valloc, but these 6 lines considerably simplify
60774 +        * this API.
60775 +        */
60776 +       read_lock(&vmlist_lock);
60777 +       for (area = vmlist; area != NULL; area = area->next) {
60778 +               if (area->addr == vaddr)
60779 +                       break;
60780 +       }
60781 +       read_unlock(&vmlist_lock);
60782 +
60783 +       if (!area) {
60784 +               xenbus_dev_error(dev, -ENOENT,
60785 +                                "can't find mapped virtual address %p", vaddr);
60786 +               return GNTST_bad_virt_addr;
60787 +       }
60788 +
60789 +       op.handle = (grant_handle_t)area->phys_addr;
60790 +
60791 +       lock_vm_area(area);
60792 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
60793 +       unlock_vm_area(area);
60794 +
60795 +       if (op.status == GNTST_okay)
60796 +               free_vm_area(area);
60797 +       else
60798 +               xenbus_dev_error(dev, op.status,
60799 +                                "unmapping page at handle %d error %d",
60800 +                                (int16_t)area->phys_addr, op.status);
60801 +
60802 +       return op.status;
60803 +}
60804 +EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
60805 +
60806 +
60807 +int xenbus_unmap_ring(struct xenbus_device *dev,
60808 +                    grant_handle_t handle, void *vaddr)
60809 +{
60810 +       struct gnttab_unmap_grant_ref op = {
60811 +               .host_addr = (unsigned long)vaddr,
60812 +               .handle    = handle,
60813 +       };
60814 +
60815 +       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
60816 +
60817 +       if (op.status != GNTST_okay)
60818 +               xenbus_dev_error(dev, op.status,
60819 +                                "unmapping page at handle %d error %d",
60820 +                                handle, op.status);
60821 +
60822 +       return op.status;
60823 +}
60824 +EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
60825 +
60826 +
60827 +XenbusState xenbus_read_driver_state(const char *path)
60828 +{
60829 +       XenbusState result;
60830 +       int err = xenbus_gather(XBT_NULL, path, "state", "%d", &result, NULL);
60831 +       if (err)
60832 +               result = XenbusStateClosed;
60833 +
60834 +       return result;
60835 +}
60836 +EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
60837 +
60838 +
60839 +/*
60840 + * Local variables:
60841 + *  c-file-style: "linux"
60842 + *  indent-tabs-mode: t
60843 + *  c-indent-level: 8
60844 + *  c-basic-offset: 8
60845 + *  tab-width: 8
60846 + * End:
60847 + */
60848 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/xenbus/xenbus_comms.c linux-2.6.16/drivers/xen/xenbus/xenbus_comms.c
60849 --- linux-2.6.16.orig/drivers/xen/xenbus/xenbus_comms.c 1970-01-01 01:00:00.000000000 +0100
60850 +++ linux-2.6.16/drivers/xen/xenbus/xenbus_comms.c      2006-06-26 09:51:32.000000000 +0200
60851 @@ -0,0 +1,218 @@
60852 +/******************************************************************************
60853 + * xenbus_comms.c
60854 + *
60855 + * Low-level code that talks to the Xen Store: ring buffer and event channel.
60856 + *
60857 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
60858 + * 
60859 + * This program is free software; you can redistribute it and/or
60860 + * modify it under the terms of the GNU General Public License version 2
60861 + * as published by the Free Software Foundation; or, when distributed
60862 + * separately from the Linux kernel or incorporated into other
60863 + * software packages, subject to the following license:
60864 + * 
60865 + * Permission is hereby granted, free of charge, to any person obtaining a copy
60866 + * of this source file (the "Software"), to deal in the Software without
60867 + * restriction, including without limitation the rights to use, copy, modify,
60868 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
60869 + * and to permit persons to whom the Software is furnished to do so, subject to
60870 + * the following conditions:
60871 + * 
60872 + * The above copyright notice and this permission notice shall be included in
60873 + * all copies or substantial portions of the Software.
60874 + * 
60875 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
60876 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60877 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60878 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
60879 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
60880 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
60881 + * IN THE SOFTWARE.
60882 + */
60883 +
60884 +#include <asm/hypervisor.h>
60885 +#include <xen/evtchn.h>
60886 +#include <linux/wait.h>
60887 +#include <linux/interrupt.h>
60888 +#include <linux/sched.h>
60889 +#include <linux/err.h>
60890 +#include <xen/xenbus.h>
60891 +#include "xenbus_comms.h"
60892 +
60893 +static int xenbus_irq;
60894 +
60895 +extern void xenbus_probe(void *);
60896 +extern int xenstored_ready;
60897 +static DECLARE_WORK(probe_work, xenbus_probe, NULL);
60898 +
60899 +DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
60900 +
60901 +static inline struct xenstore_domain_interface *xenstore_domain_interface(void)
60902 +{
60903 +       return mfn_to_virt(xen_start_info->store_mfn);
60904 +}
60905 +
60906 +static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs)
60907 +{
60908 +       if (unlikely(xenstored_ready == 0)) {
60909 +               xenstored_ready = 1;
60910 +               schedule_work(&probe_work);
60911 +       }
60912 +
60913 +       wake_up(&xb_waitq);
60914 +       return IRQ_HANDLED;
60915 +}
60916 +
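+/*
+ * req_cons/req_prod and rsp_cons/rsp_prod are free-running counters;
+ * XENSTORE_RING_SIZE is a power of two and MASK_XENSTORE_IDX() reduces an
+ * index modulo that size, so (prod - cons) is the number of bytes in
+ * flight even across wraparound.  check_indexes() rejects index pairs
+ * that could only result from corruption.
+ */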
60917 +static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
60918 +{
60919 +       return ((prod - cons) <= XENSTORE_RING_SIZE);
60920 +}
60921 +
60922 +static void *get_output_chunk(XENSTORE_RING_IDX cons,
60923 +                             XENSTORE_RING_IDX prod,
60924 +                             char *buf, uint32_t *len)
60925 +{
60926 +       *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
60927 +       if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
60928 +               *len = XENSTORE_RING_SIZE - (prod - cons);
60929 +       return buf + MASK_XENSTORE_IDX(prod);
60930 +}
60931 +
60932 +static const void *get_input_chunk(XENSTORE_RING_IDX cons,
60933 +                                  XENSTORE_RING_IDX prod,
60934 +                                  const char *buf, uint32_t *len)
60935 +{
60936 +       *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
60937 +       if ((prod - cons) < *len)
60938 +               *len = prod - cons;
60939 +       return buf + MASK_XENSTORE_IDX(cons);
60940 +}
60941 +
60942 +int xb_write(const void *data, unsigned len)
60943 +{
60944 +       struct xenstore_domain_interface *intf = xenstore_domain_interface();
60945 +       XENSTORE_RING_IDX cons, prod;
60946 +       int rc;
60947 +
60948 +       while (len != 0) {
60949 +               void *dst;
60950 +               unsigned int avail;
60951 +
60952 +               rc = wait_event_interruptible(
60953 +                       xb_waitq,
60954 +                       (intf->req_prod - intf->req_cons) !=
60955 +                       XENSTORE_RING_SIZE);
60956 +               if (rc < 0)
60957 +                       return rc;
60958 +
60959 +               /* Read indexes, then verify. */
60960 +               cons = intf->req_cons;
60961 +               prod = intf->req_prod;
60962 +               mb();
60963 +               if (!check_indexes(cons, prod)) {
60964 +                       intf->req_cons = intf->req_prod = 0;
60965 +                       return -EIO;
60966 +               }
60967 +
60968 +               dst = get_output_chunk(cons, prod, intf->req, &avail);
60969 +               if (avail == 0)
60970 +                       continue;
60971 +               if (avail > len)
60972 +                       avail = len;
60973 +
60974 +               memcpy(dst, data, avail);
60975 +               data += avail;
60976 +               len -= avail;
60977 +
60978 +               /* Other side must not see new header until data is there. */
60979 +               wmb();
60980 +               intf->req_prod += avail;
60981 +
60982 +               /* This implies mb() before other side sees interrupt. */
60983 +               notify_remote_via_evtchn(xen_start_info->store_evtchn);
60984 +       }
60985 +
60986 +       return 0;
60987 +}
60988 +
60989 +int xb_read(void *data, unsigned len)
60990 +{
60991 +       struct xenstore_domain_interface *intf = xenstore_domain_interface();
60992 +       XENSTORE_RING_IDX cons, prod;
60993 +       int rc;
60994 +
60995 +       while (len != 0) {
60996 +               unsigned int avail;
60997 +               const char *src;
60998 +
60999 +               rc = wait_event_interruptible(
61000 +                       xb_waitq,
61001 +                       intf->rsp_cons != intf->rsp_prod);
61002 +               if (rc < 0)
61003 +                       return rc;
61004 +
61005 +               /* Read indexes, then verify. */
61006 +               cons = intf->rsp_cons;
61007 +               prod = intf->rsp_prod;
61008 +               mb();
61009 +               if (!check_indexes(cons, prod)) {
61010 +                       intf->rsp_cons = intf->rsp_prod = 0;
61011 +                       return -EIO;
61012 +               }
61013 +
61014 +               src = get_input_chunk(cons, prod, intf->rsp, &avail);
61015 +               if (avail == 0)
61016 +                       continue;
61017 +               if (avail > len)
61018 +                       avail = len;
61019 +
61020 +               /* We must read header before we read data. */
61021 +               rmb();
61022 +
61023 +               memcpy(data, src, avail);
61024 +               data += avail;
61025 +               len -= avail;
61026 +
61027 +               /* Other side must not see free space until we've copied out */
61028 +               mb();
61029 +               intf->rsp_cons += avail;
61030 +
61031 +               pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
61032 +
61033 +               /* Implies mb(): they will see new header. */
61034 +               notify_remote_via_evtchn(xen_start_info->store_evtchn);
61035 +       }
61036 +
61037 +       return 0;
61038 +}
61039 +
61040 +/* Set up interrupt handler off store event channel. */
61041 +int xb_init_comms(void)
61042 +{
61043 +       int err;
61044 +
61045 +       if (xenbus_irq)
61046 +               unbind_from_irqhandler(xenbus_irq, &xb_waitq);
61047 +
61048 +       err = bind_evtchn_to_irqhandler(
61049 +               xen_start_info->store_evtchn, wake_waiting,
61050 +               0, "xenbus", &xb_waitq);
61051 +       if (err <= 0) {
61052 +               printk(KERN_ERR "XENBUS request irq failed %i\n", err);
61053 +               return err;
61054 +       }
61055 +
61056 +       xenbus_irq = err;
61057 +
61058 +       return 0;
61059 +}
61060 +
61061 +/*
61062 + * Local variables:
61063 + *  c-file-style: "linux"
61064 + *  indent-tabs-mode: t
61065 + *  c-indent-level: 8
61066 + *  c-basic-offset: 8
61067 + *  tab-width: 8
61068 + * End:
61069 + */
61070 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/xenbus/xenbus_comms.h linux-2.6.16/drivers/xen/xenbus/xenbus_comms.h
61071 --- linux-2.6.16.orig/drivers/xen/xenbus/xenbus_comms.h 1970-01-01 01:00:00.000000000 +0100
61072 +++ linux-2.6.16/drivers/xen/xenbus/xenbus_comms.h      2006-06-26 09:51:32.000000000 +0200
61073 @@ -0,0 +1,53 @@
61074 +/*
61075 + * Private include for xenbus communications.
61076 + * 
61077 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
61078 + *
61079 + * This program is free software; you can redistribute it and/or
61080 + * modify it under the terms of the GNU General Public License version 2
61081 + * as published by the Free Software Foundation; or, when distributed
61082 + * separately from the Linux kernel or incorporated into other
61083 + * software packages, subject to the following license:
61084 + * 
61085 + * Permission is hereby granted, free of charge, to any person obtaining a copy
61086 + * of this source file (the "Software"), to deal in the Software without
61087 + * restriction, including without limitation the rights to use, copy, modify,
61088 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
61089 + * and to permit persons to whom the Software is furnished to do so, subject to
61090 + * the following conditions:
61091 + * 
61092 + * The above copyright notice and this permission notice shall be included in
61093 + * all copies or substantial portions of the Software.
61094 + * 
61095 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
61096 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
61097 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
61098 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
61099 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
61100 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
61101 + * IN THE SOFTWARE.
61102 + */
61103 +
61104 +#ifndef _XENBUS_COMMS_H
61105 +#define _XENBUS_COMMS_H
61106 +
61107 +int xs_init(void);
61108 +int xb_init_comms(void);
61109 +
61110 +/* Low level routines. */
61111 +int xb_write(const void *data, unsigned len);
61112 +int xb_read(void *data, unsigned len);
61113 +int xs_input_avail(void);
61114 +extern wait_queue_head_t xb_waitq;
61115 +
61116 +#endif /* _XENBUS_COMMS_H */
61117 +
61118 +/*
61119 + * Local variables:
61120 + *  c-file-style: "linux"
61121 + *  indent-tabs-mode: t
61122 + *  c-indent-level: 8
61123 + *  c-basic-offset: 8
61124 + *  tab-width: 8
61125 + * End:
61126 + */
61127 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/xenbus/xenbus_dev.c linux-2.6.16/drivers/xen/xenbus/xenbus_dev.c
61128 --- linux-2.6.16.orig/drivers/xen/xenbus/xenbus_dev.c   1970-01-01 01:00:00.000000000 +0100
61129 +++ linux-2.6.16/drivers/xen/xenbus/xenbus_dev.c        2006-06-26 09:51:32.000000000 +0200
61130 @@ -0,0 +1,252 @@
61131 +/*
61132 + * xenbus_dev.c
61133 + * 
61134 + * Driver giving user-space access to the kernel's xenbus connection
61135 + * to xenstore.
61136 + * 
61137 + * Copyright (c) 2005, Christian Limpach
61138 + * Copyright (c) 2005, Rusty Russell, IBM Corporation
61139 + * 
61140 + * This program is free software; you can redistribute it and/or
61141 + * modify it under the terms of the GNU General Public License version 2
61142 + * as published by the Free Software Foundation; or, when distributed
61143 + * separately from the Linux kernel or incorporated into other
61144 + * software packages, subject to the following license:
61145 + * 
61146 + * Permission is hereby granted, free of charge, to any person obtaining a copy
61147 + * of this source file (the "Software"), to deal in the Software without
61148 + * restriction, including without limitation the rights to use, copy, modify,
61149 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
61150 + * and to permit persons to whom the Software is furnished to do so, subject to
61151 + * the following conditions:
61152 + * 
61153 + * The above copyright notice and this permission notice shall be included in
61154 + * all copies or substantial portions of the Software.
61155 + * 
61156 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
61157 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
61158 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
61159 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
61160 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
61161 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
61162 + * IN THE SOFTWARE.
61163 + */
61164 +
61165 +#include <linux/config.h>
61166 +#include <linux/kernel.h>
61167 +#include <linux/errno.h>
61168 +#include <linux/uio.h>
61169 +#include <linux/notifier.h>
61170 +#include <linux/wait.h>
61171 +#include <linux/fs.h>
61172 +#include <linux/poll.h>
61173 +
61174 +#include "xenbus_comms.h"
61175 +
61176 +#include <asm/uaccess.h>
61177 +#include <asm/hypervisor.h>
61178 +#include <xen/xenbus.h>
61179 +#include <xen/xen_proc.h>
61180 +#include <asm/hypervisor.h>
61181 +
61182 +struct xenbus_dev_transaction {
61183 +       struct list_head list;
61184 +       xenbus_transaction_t handle;
61185 +};
61186 +
61187 +struct xenbus_dev_data {
61188 +       /* In-progress transaction. */
61189 +       struct list_head transactions;
61190 +
61191 +       /* Partial request. */
61192 +       unsigned int len;
61193 +       union {
61194 +               struct xsd_sockmsg msg;
61195 +               char buffer[PAGE_SIZE];
61196 +       } u;
61197 +
61198 +       /* Response queue. */
61199 +#define MASK_READ_IDX(idx) ((idx)&(PAGE_SIZE-1))
61200 +       char read_buffer[PAGE_SIZE];
61201 +       unsigned int read_cons, read_prod;
61202 +       wait_queue_head_t read_waitq;
61203 +};
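+
+/*
+ * read_buffer above is a power-of-two ring: read_cons and read_prod grow
+ * monotonically and MASK_READ_IDX() reduces them modulo PAGE_SIZE, so
+ * e.g. with PAGE_SIZE == 4096 an index of 4097 maps to slot 1.  The ring
+ * never holds more than PAGE_SIZE bytes (see the BUG_ON in queue_reply()).
+ */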
61204 +
61205 +static struct proc_dir_entry *xenbus_dev_intf;
61206 +
61207 +static ssize_t xenbus_dev_read(struct file *filp,
61208 +                              char __user *ubuf,
61209 +                              size_t len, loff_t *ppos)
61210 +{
61211 +       struct xenbus_dev_data *u = filp->private_data;
61212 +       int i;
61213 +
61214 +       if (wait_event_interruptible(u->read_waitq,
61215 +                                    u->read_prod != u->read_cons))
61216 +               return -EINTR;
61217 +
61218 +       for (i = 0; i < len; i++) {
61219 +               if (u->read_cons == u->read_prod)
61220 +                       break;
61221 +               put_user(u->read_buffer[MASK_READ_IDX(u->read_cons)], ubuf+i);
61222 +               u->read_cons++;
61223 +       }
61224 +
61225 +       return i;
61226 +}
61227 +
61228 +static void queue_reply(struct xenbus_dev_data *u,
61229 +                       char *data, unsigned int len)
61230 +{
61231 +       int i;
61232 +
61233 +       for (i = 0; i < len; i++, u->read_prod++)
61234 +               u->read_buffer[MASK_READ_IDX(u->read_prod)] = data[i];
61235 +
61236 +       BUG_ON((u->read_prod - u->read_cons) > sizeof(u->read_buffer));
61237 +
61238 +       wake_up(&u->read_waitq);
61239 +}
61240 +
61241 +static ssize_t xenbus_dev_write(struct file *filp,
61242 +                               const char __user *ubuf,
61243 +                               size_t len, loff_t *ppos)
61244 +{
61245 +       struct xenbus_dev_data *u = filp->private_data;
61246 +       struct xenbus_dev_transaction *trans = NULL;
61247 +       void *reply;
61248 +
61249 +       if ((len + u->len) > sizeof(u->u.buffer))
61250 +               return -EINVAL;
61251 +
61252 +       if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0)
61253 +               return -EFAULT;
61254 +
61255 +       u->len += len;
61256 +       if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
61257 +               return len;
61258 +
61259 +       switch (u->u.msg.type) {
61260 +       case XS_TRANSACTION_START:
61261 +       case XS_TRANSACTION_END:
61262 +       case XS_DIRECTORY:
61263 +       case XS_READ:
61264 +       case XS_GET_PERMS:
61265 +       case XS_RELEASE:
61266 +       case XS_GET_DOMAIN_PATH:
61267 +       case XS_WRITE:
61268 +       case XS_MKDIR:
61269 +       case XS_RM:
61270 +       case XS_SET_PERMS:
61271 +               if (u->u.msg.type == XS_TRANSACTION_START) {
61272 +                       trans = kmalloc(sizeof(*trans), GFP_KERNEL);
61273 +                       if (!trans)
61274 +                               return -ENOMEM;
61275 +               }
61276 +
61277 +               reply = xenbus_dev_request_and_reply(&u->u.msg);
61278 +               if (IS_ERR(reply)) {
61279 +                       kfree(trans);
61280 +                       return PTR_ERR(reply);
61281 +               }
61282 +
61283 +               if (u->u.msg.type == XS_TRANSACTION_START) {
61284 +                       trans->handle = simple_strtoul(reply, NULL, 0);
61285 +                       list_add(&trans->list, &u->transactions);
61286 +               } else if (u->u.msg.type == XS_TRANSACTION_END) {
61287 +                       list_for_each_entry(trans, &u->transactions, list)
61288 +                               if (trans->handle == u->u.msg.tx_id)
61289 +                                       break;
61290 +                       BUG_ON(&trans->list == &u->transactions);
61291 +                       list_del(&trans->list);
61292 +                       kfree(trans);
61293 +               }
61294 +               queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg));
61295 +               queue_reply(u, (char *)reply, u->u.msg.len);
61296 +               kfree(reply);
61297 +               break;
61298 +
61299 +       default:
61300 +               return -EINVAL;
61301 +       }
61302 +
61303 +       u->len = 0;
61304 +       return len;
61305 +}
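+
+/*
+ * A hypothetical user-space sketch of the request format accepted above
+ * (fd, msg, reply, payload and buf are illustrative names): write a
+ * struct xsd_sockmsg header plus msg.len payload bytes, then read back
+ * the reply header and payload:
+ *
+ *     write(fd, &msg, sizeof(msg));      header, with msg.len set
+ *     write(fd, payload, msg.len);       body; may span several writes
+ *     read(fd, &reply, sizeof(reply));   reply header
+ *     read(fd, buf, reply.len);          reply body
+ */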
61306 +
61307 +static int xenbus_dev_open(struct inode *inode, struct file *filp)
61308 +{
61309 +       struct xenbus_dev_data *u;
61310 +
61311 +       if (xen_start_info->store_evtchn == 0)
61312 +               return -ENOENT;
61313 +
61314 +       nonseekable_open(inode, filp);
61315 +
61316 +       u = kzalloc(sizeof(*u), GFP_KERNEL);
61317 +       if (u == NULL)
61318 +               return -ENOMEM;
61319 +
61320 +       INIT_LIST_HEAD(&u->transactions);
61321 +       init_waitqueue_head(&u->read_waitq);
61322 +
61323 +       filp->private_data = u;
61324 +
61325 +       return 0;
61326 +}
61327 +
61328 +static int xenbus_dev_release(struct inode *inode, struct file *filp)
61329 +{
61330 +       struct xenbus_dev_data *u = filp->private_data;
61331 +       struct xenbus_dev_transaction *trans, *tmp;
61332 +
61333 +       list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
61334 +               xenbus_transaction_end(trans->handle, 1);
61335 +               list_del(&trans->list);
61336 +               kfree(trans);
61337 +       }
61338 +
61339 +       kfree(u);
61340 +
61341 +       return 0;
61342 +}
61343 +
61344 +static unsigned int xenbus_dev_poll(struct file *file, poll_table *wait)
61345 +{
61346 +       struct xenbus_dev_data *u = file->private_data;
61347 +
61348 +       poll_wait(file, &u->read_waitq, wait);
61349 +       if (u->read_cons != u->read_prod)
61350 +               return POLLIN | POLLRDNORM;
61351 +       return 0;
61352 +}
61353 +
61354 +static struct file_operations xenbus_dev_file_ops = {
61355 +       .read = xenbus_dev_read,
61356 +       .write = xenbus_dev_write,
61357 +       .open = xenbus_dev_open,
61358 +       .release = xenbus_dev_release,
61359 +       .poll = xenbus_dev_poll,
61360 +};
61361 +
61362 +static int __init
61363 +xenbus_dev_init(void)
61364 +{
61365 +       xenbus_dev_intf = create_xen_proc_entry("xenbus", 0400);
61366 +       if (xenbus_dev_intf)
61367 +               xenbus_dev_intf->proc_fops = &xenbus_dev_file_ops;
61368 +
61369 +       return 0;
61370 +}
61371 +
61372 +__initcall(xenbus_dev_init);
61373 +
61374 +/*
61375 + * Local variables:
61376 + *  c-file-style: "linux"
61377 + *  indent-tabs-mode: t
61378 + *  c-indent-level: 8
61379 + *  c-basic-offset: 8
61380 + *  tab-width: 8
61381 + * End:
61382 + */
61383 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/xenbus/xenbus_probe.c linux-2.6.16/drivers/xen/xenbus/xenbus_probe.c
61384 --- linux-2.6.16.orig/drivers/xen/xenbus/xenbus_probe.c 1970-01-01 01:00:00.000000000 +0100
61385 +++ linux-2.6.16/drivers/xen/xenbus/xenbus_probe.c      2006-06-26 09:51:32.000000000 +0200
61386 @@ -0,0 +1,1083 @@
61387 +/******************************************************************************
61388 + * Talks to Xen Store to figure out what devices we have.
61389 + *
61390 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
61391 + * Copyright (C) 2005 Mike Wray, Hewlett-Packard
61392 + * Copyright (C) 2005 XenSource Ltd
61393 + * 
61394 + * This program is free software; you can redistribute it and/or
61395 + * modify it under the terms of the GNU General Public License version 2
61396 + * as published by the Free Software Foundation; or, when distributed
61397 + * separately from the Linux kernel or incorporated into other
61398 + * software packages, subject to the following license:
61399 + * 
61400 + * Permission is hereby granted, free of charge, to any person obtaining a copy
61401 + * of this source file (the "Software"), to deal in the Software without
61402 + * restriction, including without limitation the rights to use, copy, modify,
61403 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
61404 + * and to permit persons to whom the Software is furnished to do so, subject to
61405 + * the following conditions:
61406 + * 
61407 + * The above copyright notice and this permission notice shall be included in
61408 + * all copies or substantial portions of the Software.
61409 + * 
61410 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
61411 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
61412 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
61413 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
61414 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
61415 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
61416 + * IN THE SOFTWARE.
61417 + */
61418 +
61419 +#define DPRINTK(fmt, args...) \
61420 +    pr_debug("xenbus_probe (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
61421 +
61422 +#include <linux/kernel.h>
61423 +#include <linux/err.h>
61424 +#include <linux/string.h>
61425 +#include <linux/ctype.h>
61426 +#include <linux/fcntl.h>
61427 +#include <linux/mm.h>
61428 +#include <linux/notifier.h>
61429 +#include <linux/kthread.h>
61430 +
61431 +#include <asm/io.h>
61432 +#include <asm/page.h>
61433 +#include <asm/pgtable.h>
61434 +#include <asm/hypervisor.h>
61435 +#include <xen/xenbus.h>
61436 +#include <xen/xen_proc.h>
61437 +#include <xen/evtchn.h>
61438 +#include <xen/features.h>
61439 +
61440 +#include "xenbus_comms.h"
61441 +
61442 +extern struct mutex xenwatch_mutex;
61443 +
61444 +static struct notifier_block *xenstore_chain;
61445 +
61446 +/* If something in array of ids matches this device, return it. */
61447 +static const struct xenbus_device_id *
61448 +match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
61449 +{
61450 +       for (; *arr->devicetype != '\0'; arr++) {
61451 +               if (!strcmp(arr->devicetype, dev->devicetype))
61452 +                       return arr;
61453 +       }
61454 +       return NULL;
61455 +}
61456 +
61457 +static int xenbus_match(struct device *_dev, struct device_driver *_drv)
61458 +{
61459 +       struct xenbus_driver *drv = to_xenbus_driver(_drv);
61460 +
61461 +       if (!drv->ids)
61462 +               return 0;
61463 +
61464 +       return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
61465 +}
61466 +
61467 +struct xen_bus_type
61468 +{
61469 +       char *root;
61470 +       unsigned int levels;
61471 +       int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
61472 +       int (*probe)(const char *type, const char *dir);
61473 +       struct bus_type bus;
61474 +       struct device dev;
61475 +};
61476 +
61477 +
61478 +/* device/<type>/<id> => <type>-<id> */
61479 +static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
61480 +{
61481 +       nodename = strchr(nodename, '/');
61482 +       if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) {
61483 +               printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
61484 +               return -EINVAL;
61485 +       }
61486 +
61487 +       strlcpy(bus_id, nodename + 1, BUS_ID_SIZE);
61488 +       if (!strchr(bus_id, '/')) {
61489 +               printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
61490 +               return -EINVAL;
61491 +       }
61492 +       *strchr(bus_id, '/') = '-';
61493 +       return 0;
61494 +}
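+
+/* e.g. nodename "device/vbd/768" yields bus_id "vbd-768". */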
61495 +
61496 +
61497 +static void free_otherend_details(struct xenbus_device *dev)
61498 +{
61499 +       kfree(dev->otherend);
61500 +       dev->otherend = NULL;
61501 +}
61502 +
61503 +
61504 +static void free_otherend_watch(struct xenbus_device *dev)
61505 +{
61506 +       if (dev->otherend_watch.node) {
61507 +               unregister_xenbus_watch(&dev->otherend_watch);
61508 +               kfree(dev->otherend_watch.node);
61509 +               dev->otherend_watch.node = NULL;
61510 +       }
61511 +}
61512 +
61513 +
61514 +static int read_otherend_details(struct xenbus_device *xendev,
61515 +                                char *id_node, char *path_node)
61516 +{
61517 +       int err = xenbus_gather(XBT_NULL, xendev->nodename,
61518 +                               id_node, "%i", &xendev->otherend_id,
61519 +                               path_node, NULL, &xendev->otherend,
61520 +                               NULL);
61521 +       if (err) {
61522 +               xenbus_dev_fatal(xendev, err,
61523 +                                "reading other end details from %s",
61524 +                                xendev->nodename);
61525 +               return err;
61526 +       }
61527 +       if (strlen(xendev->otherend) == 0 ||
61528 +           !xenbus_exists(XBT_NULL, xendev->otherend, "")) {
61529 +               xenbus_dev_fatal(xendev, -ENOENT, "missing other end from %s",
61530 +                                xendev->nodename);
61531 +               free_otherend_details(xendev);
61532 +               return -ENOENT;
61533 +       }
61534 +
61535 +       return 0;
61536 +}
61537 +
61538 +
61539 +static int read_backend_details(struct xenbus_device *xendev)
61540 +{
61541 +       return read_otherend_details(xendev, "backend-id", "backend");
61542 +}
61543 +
61544 +
61545 +static int read_frontend_details(struct xenbus_device *xendev)
61546 +{
61547 +       return read_otherend_details(xendev, "frontend-id", "frontend");
61548 +}
61549 +
61550 +
61551 +/* Bus type for frontend drivers. */
61552 +static int xenbus_probe_frontend(const char *type, const char *name);
61553 +static struct xen_bus_type xenbus_frontend = {
61554 +       .root = "device",
61555 +       .levels = 2,            /* device/type/<id> */
61556 +       .get_bus_id = frontend_bus_id,
61557 +       .probe = xenbus_probe_frontend,
61558 +       .bus = {
61559 +               .name  = "xen",
61560 +               .match = xenbus_match,
61561 +       },
61562 +       .dev = {
61563 +               .bus_id = "xen",
61564 +       },
61565 +};
61566 +
61567 +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
61568 +static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
61569 +{
61570 +       int domid, err;
61571 +       const char *devid, *type, *frontend;
61572 +       unsigned int typelen;
61573 +
61574 +       type = strchr(nodename, '/');
61575 +       if (!type)
61576 +               return -EINVAL;
61577 +       type++;
61578 +       typelen = strcspn(type, "/");
61579 +       if (!typelen || type[typelen] != '/')
61580 +               return -EINVAL;
61581 +
61582 +       devid = strrchr(nodename, '/') + 1;
61583 +
61584 +       err = xenbus_gather(XBT_NULL, nodename, "frontend-id", "%i", &domid,
61585 +                           "frontend", NULL, &frontend,
61586 +                           NULL);
61587 +       if (err)
61588 +               return err;
61589 +       if (strlen(frontend) == 0)
61590 +               err = -ERANGE;
61591 +       if (!err && !xenbus_exists(XBT_NULL, frontend, ""))
61592 +               err = -ENOENT;
61593 +
61594 +       kfree(frontend);
61595 +
61596 +       if (err)
61597 +               return err;
61598 +
61599 +       if (snprintf(bus_id, BUS_ID_SIZE,
61600 +                    "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE)
61601 +               return -ENOSPC;
61602 +       return 0;
61603 +}
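+
+/* e.g. "backend/vif/7/0" with frontend-id 7 yields bus_id "vif-7-0". */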
61604 +
61605 +static int xenbus_uevent_backend(struct device *dev, char **envp,
61606 +                                int num_envp, char *buffer, int buffer_size);
61607 +static int xenbus_probe_backend(const char *type, const char *domid);
61608 +static struct xen_bus_type xenbus_backend = {
61609 +       .root = "backend",
61610 +       .levels = 3,            /* backend/type/<frontend>/<id> */
61611 +       .get_bus_id = backend_bus_id,
61612 +       .probe = xenbus_probe_backend,
61613 +       .bus = {
61614 +               .name  = "xen-backend",
61615 +               .match = xenbus_match,
61616 +               .uevent = xenbus_uevent_backend,
61617 +       },
61618 +       .dev = {
61619 +               .bus_id = "xen-backend",
61620 +       },
61621 +};
61622 +
61623 +static int xenbus_uevent_backend(struct device *dev, char **envp,
61624 +                                int num_envp, char *buffer, int buffer_size)
61625 +{
61626 +       struct xenbus_device *xdev;
61627 +       struct xenbus_driver *drv;
61628 +       int i = 0;
61629 +       int length = 0;
61630 +
61631 +       DPRINTK("");
61632 +
61633 +       if (dev == NULL)
61634 +               return -ENODEV;
61635 +
61636 +       xdev = to_xenbus_device(dev);
61637 +       if (xdev == NULL)
61638 +               return -ENODEV;
61639 +
61640 +       /* stuff we want to pass to /sbin/hotplug */
61641 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
61642 +                      "XENBUS_TYPE=%s", xdev->devicetype);
61643 +
61644 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
61645 +                      "XENBUS_PATH=%s", xdev->nodename);
61646 +
61647 +       add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
61648 +                      "XENBUS_BASE_PATH=%s", xenbus_backend.root);
61649 +
61650 +       /* terminate, set to next free slot, shrink available space */
61651 +       envp[i] = NULL;
61652 +       envp = &envp[i];
61653 +       num_envp -= i;
61654 +       buffer = &buffer[length];
61655 +       buffer_size -= length;
61656 +
61657 +       if (dev->driver) {
61658 +               drv = to_xenbus_driver(dev->driver);
61659 +               if (drv && drv->uevent)
61660 +                       return drv->uevent(xdev, envp, num_envp, buffer,
61661 +                                          buffer_size);
61662 +       }
61663 +
61664 +       return 0;
61665 +}
61666 +
61667 +static void otherend_changed(struct xenbus_watch *watch,
61668 +                            const char **vec, unsigned int len)
61669 +{
61670 +       struct xenbus_device *dev =
61671 +               container_of(watch, struct xenbus_device, otherend_watch);
61672 +       struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
61673 +       XenbusState state;
61674 +
61675 +       /* Protect us against watches firing on old details when the otherend
61676 +          details change, say immediately after a resume. */
61677 +       if (!dev->otherend ||
61678 +           strncmp(dev->otherend, vec[XS_WATCH_PATH],
61679 +                   strlen(dev->otherend))) {
61680 +               DPRINTK("Ignoring watch at %s", vec[XS_WATCH_PATH]);
61681 +               return;
61682 +       }
61683 +
61684 +       state = xenbus_read_driver_state(dev->otherend);
61685 +
61686 +       DPRINTK("state is %d, %s, %s",
61687 +               state, dev->otherend_watch.node, vec[XS_WATCH_PATH]);
61688 +       if (drv->otherend_changed)
61689 +               drv->otherend_changed(dev, state);
61690 +}
61691 +
61692 +
61693 +static int talk_to_otherend(struct xenbus_device *dev)
61694 +{
61695 +       struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
61696 +
61697 +       free_otherend_watch(dev);
61698 +       free_otherend_details(dev);
61699 +
61700 +       return drv->read_otherend_details(dev);
61701 +}
61702 +
61703 +
61704 +static int watch_otherend(struct xenbus_device *dev)
61705 +{
61706 +       return xenbus_watch_path2(dev, dev->otherend, "state",
61707 +                                 &dev->otherend_watch, otherend_changed);
61708 +}
61709 +
61710 +
61711 +static int xenbus_dev_probe(struct device *_dev)
61712 +{
61713 +       struct xenbus_device *dev = to_xenbus_device(_dev);
61714 +       struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
61715 +       const struct xenbus_device_id *id;
61716 +       int err;
61717 +
61718 +       DPRINTK("");
61719 +
61720 +       if (!drv->probe) {
61721 +               err = -ENODEV;
61722 +               goto fail;
61723 +       }
61724 +
61725 +       id = match_device(drv->ids, dev);
61726 +       if (!id) {
61727 +               err = -ENODEV;
61728 +               goto fail;
61729 +       }
61730 +
61731 +       err = talk_to_otherend(dev);
61732 +       if (err) {
61733 +               printk(KERN_WARNING
61734 +                      "xenbus_probe: talk_to_otherend on %s failed.\n",
61735 +                      dev->nodename);
61736 +               return err;
61737 +       }
61738 +
61739 +       err = drv->probe(dev, id);
61740 +       if (err)
61741 +               goto fail;
61742 +
61743 +       err = watch_otherend(dev);
61744 +       if (err) {
61745 +               printk(KERN_WARNING
61746 +                      "xenbus_probe: watch_otherend on %s failed.\n",
61747 +                      dev->nodename);
61748 +               return err;
61749 +       }
61750 +
61751 +       return 0;
61752 +fail:
61753 +       xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
61754 +       xenbus_switch_state(dev, XenbusStateClosed);
61755 +       return -ENODEV;
61756 +}
61757 +
61758 +static int xenbus_dev_remove(struct device *_dev)
61759 +{
61760 +       struct xenbus_device *dev = to_xenbus_device(_dev);
61761 +       struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
61762 +
61763 +       DPRINTK("");
61764 +
61765 +       free_otherend_watch(dev);
61766 +       free_otherend_details(dev);
61767 +
61768 +       if (drv->remove)
61769 +               drv->remove(dev);
61770 +
61771 +       xenbus_switch_state(dev, XenbusStateClosed);
61772 +       return 0;
61773 +}
61774 +
61775 +static int xenbus_register_driver_common(struct xenbus_driver *drv,
61776 +                                        struct xen_bus_type *bus)
61777 +{
61778 +       int ret;
61779 +
61780 +       drv->driver.name = drv->name;
61781 +       drv->driver.bus = &bus->bus;
61782 +       drv->driver.owner = drv->owner;
61783 +       drv->driver.probe = xenbus_dev_probe;
61784 +       drv->driver.remove = xenbus_dev_remove;
61785 +
61786 +       mutex_lock(&xenwatch_mutex);
61787 +       ret = driver_register(&drv->driver);
61788 +       mutex_unlock(&xenwatch_mutex);
61789 +       return ret;
61790 +}
61791 +
61792 +int xenbus_register_frontend(struct xenbus_driver *drv)
61793 +{
61794 +       drv->read_otherend_details = read_backend_details;
61795 +
61796 +       return xenbus_register_driver_common(drv, &xenbus_frontend);
61797 +}
61798 +EXPORT_SYMBOL_GPL(xenbus_register_frontend);
61799 +
61800 +int xenbus_register_backend(struct xenbus_driver *drv)
61801 +{
61802 +       drv->read_otherend_details = read_frontend_details;
61803 +
61804 +       return xenbus_register_driver_common(drv, &xenbus_backend);
61805 +}
61806 +EXPORT_SYMBOL_GPL(xenbus_register_backend);
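+
+/*
+ * Note the deliberate cross-wiring in the two registrations above: a
+ * frontend's "other end" is a backend and vice versa, so e.g. a driver
+ * registered with xenbus_register_frontend() gets its otherend_id and
+ * otherend path filled from the backend-id and backend nodes of its
+ * xenstore directory.
+ */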
61807 +
61808 +void xenbus_unregister_driver(struct xenbus_driver *drv)
61809 +{
61810 +       driver_unregister(&drv->driver);
61811 +}
61812 +EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
61813 +
61814 +struct xb_find_info
61815 +{
61816 +       struct xenbus_device *dev;
61817 +       const char *nodename;
61818 +};
61819 +
61820 +static int cmp_dev(struct device *dev, void *data)
61821 +{
61822 +       struct xenbus_device *xendev = to_xenbus_device(dev);
61823 +       struct xb_find_info *info = data;
61824 +
61825 +       if (!strcmp(xendev->nodename, info->nodename)) {
61826 +               info->dev = xendev;
61827 +               get_device(dev);
61828 +               return 1;
61829 +       }
61830 +       return 0;
61831 +}
61832 +
61833 +struct xenbus_device *xenbus_device_find(const char *nodename,
61834 +                                        struct bus_type *bus)
61835 +{
61836 +       struct xb_find_info info = { .dev = NULL, .nodename = nodename };
61837 +
61838 +       bus_for_each_dev(bus, NULL, &info, cmp_dev);
61839 +       return info.dev;
61840 +}
61841 +
61842 +static int cleanup_dev(struct device *dev, void *data)
61843 +{
61844 +       struct xenbus_device *xendev = to_xenbus_device(dev);
61845 +       struct xb_find_info *info = data;
61846 +       int len = strlen(info->nodename);
61847 +
61848 +       DPRINTK("%s", info->nodename);
61849 +
61850 +       /* Match the info->nodename path, or any subdirectory of that path. */
61851 +       if (strncmp(xendev->nodename, info->nodename, len))
61852 +               return 0;
61853 +
61854 +       /* If the node name is longer, ensure it really is a subdirectory. */
61855 +       if ((strlen(xendev->nodename) > len) && (xendev->nodename[len] != '/'))
61856 +               return 0;
61857 +
61858 +       info->dev = xendev;
61859 +       get_device(dev);
61860 +       return 1;
61861 +}
61862 +
61863 +static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
61864 +{
61865 +       struct xb_find_info info = { .nodename = path };
61866 +
61867 +       do {
61868 +               info.dev = NULL;
61869 +               bus_for_each_dev(bus, NULL, &info, cleanup_dev);
61870 +               if (info.dev) {
61871 +                       device_unregister(&info.dev->dev);
61872 +                       put_device(&info.dev->dev);
61873 +               }
61874 +       } while (info.dev);
61875 +}
61876 +
61877 +static void xenbus_dev_release(struct device *dev)
61878 +{
61879 +       if (dev)
61880 +               kfree(to_xenbus_device(dev));
61881 +}
61882 +
61883 +/* Simplified asprintf. */
61884 +char *kasprintf(const char *fmt, ...)
61885 +{
61886 +       va_list ap;
61887 +       unsigned int len;
61888 +       char *p, dummy[1];
61889 +
61890 +       va_start(ap, fmt);
61891 +       /* FIXME: vsnprintf has a bug, NULL should work */
61892 +       len = vsnprintf(dummy, 0, fmt, ap);
61893 +       va_end(ap);
61894 +
61895 +       p = kmalloc(len + 1, GFP_KERNEL);
61896 +       if (!p)
61897 +               return NULL;
61898 +       va_start(ap, fmt);
61899 +       vsprintf(p, fmt, ap);
61900 +       va_end(ap);
61901 +       return p;
61902 +}
61903 +
61904 +static ssize_t xendev_show_nodename(struct device *dev,
61905 +                                   struct device_attribute *attr, char *buf)
61906 +{
61907 +       return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
61908 +}
61909 +DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
61910 +
61911 +static ssize_t xendev_show_devtype(struct device *dev,
61912 +                                  struct device_attribute *attr, char *buf)
61913 +{
61914 +       return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
61915 +}
61916 +DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
61917 +
61918 +
61919 +static int xenbus_probe_node(struct xen_bus_type *bus,
61920 +                            const char *type,
61921 +                            const char *nodename)
61922 +{
61923 +       int err;
61924 +       struct xenbus_device *xendev;
61925 +       size_t stringlen;
61926 +       char *tmpstring;
61927 +
61928 +       XenbusState state = xenbus_read_driver_state(nodename);
61929 +
61930 +       if (state != XenbusStateInitialising) {
61931 +               /* Device is not new, so ignore it.  This can happen if a
61932 +                  device is going away after switching to Closed.  */
61933 +               return 0;
61934 +       }
61935 +
61936 +       stringlen = strlen(nodename) + 1 + strlen(type) + 1;
61937 +       xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
61938 +       if (!xendev)
61939 +               return -ENOMEM;
61940 +
61941 +       /* Copy the strings into the extra space. */
61942 +
61943 +       tmpstring = (char *)(xendev + 1);
61944 +       strcpy(tmpstring, nodename);
61945 +       xendev->nodename = tmpstring;
61946 +
61947 +       tmpstring += strlen(tmpstring) + 1;
61948 +       strcpy(tmpstring, type);
61949 +       xendev->devicetype = tmpstring;
61950 +
61951 +       xendev->dev.parent = &bus->dev;
61952 +       xendev->dev.bus = &bus->bus;
61953 +       xendev->dev.release = xenbus_dev_release;
61954 +
61955 +       err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
61956 +       if (err)
61957 +               goto fail;
61958 +
61959 +       /* Register with generic device framework. */
61960 +       err = device_register(&xendev->dev);
61961 +       if (err)
61962 +               goto fail;
61963 +
61964 +       device_create_file(&xendev->dev, &dev_attr_nodename);
61965 +       device_create_file(&xendev->dev, &dev_attr_devtype);
61966 +
61967 +       return 0;
61968 +fail:
61969 +       kfree(xendev);
61970 +       return err;
61971 +}
61972 +
61973 +/* device/<typename>/<name> */
61974 +static int xenbus_probe_frontend(const char *type, const char *name)
61975 +{
61976 +       char *nodename;
61977 +       int err;
61978 +
61979 +       nodename = kasprintf("%s/%s/%s", xenbus_frontend.root, type, name);
61980 +       if (!nodename)
61981 +               return -ENOMEM;
61982 +
61983 +       DPRINTK("%s", nodename);
61984 +
61985 +       err = xenbus_probe_node(&xenbus_frontend, type, nodename);
61986 +       kfree(nodename);
61987 +       return err;
61988 +}
61989 +
61990 +/* backend/<typename>/<frontend-uuid>/<name> */
61991 +static int xenbus_probe_backend_unit(const char *dir,
61992 +                                    const char *type,
61993 +                                    const char *name)
61994 +{
61995 +       char *nodename;
61996 +       int err;
61997 +
61998 +       nodename = kasprintf("%s/%s", dir, name);
61999 +       if (!nodename)
62000 +               return -ENOMEM;
62001 +
62002 +       DPRINTK("%s\n", nodename);
62003 +
62004 +       err = xenbus_probe_node(&xenbus_backend, type, nodename);
62005 +       kfree(nodename);
62006 +       return err;
62007 +}
62008 +
62009 +/* backend/<typename>/<frontend-domid> */
62010 +static int xenbus_probe_backend(const char *type, const char *domid)
62011 +{
62012 +       char *nodename;
62013 +       int err = 0;
62014 +       char **dir;
62015 +       unsigned int i, dir_n = 0;
62016 +
62017 +       DPRINTK("");
62018 +
62019 +       nodename = kasprintf("%s/%s/%s", xenbus_backend.root, type, domid);
62020 +       if (!nodename)
62021 +               return -ENOMEM;
62022 +
62023 +       dir = xenbus_directory(XBT_NULL, nodename, "", &dir_n);
62024 +       if (IS_ERR(dir)) {
62025 +               kfree(nodename);
62026 +               return PTR_ERR(dir);
62027 +       }
62028 +
62029 +       for (i = 0; i < dir_n; i++) {
62030 +               err = xenbus_probe_backend_unit(nodename, type, dir[i]);
62031 +               if (err)
62032 +                       break;
62033 +       }
62034 +       kfree(dir);
62035 +       kfree(nodename);
62036 +       return err;
62037 +}
62038 +
62039 +static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
62040 +{
62041 +       int err = 0;
62042 +       char **dir;
62043 +       unsigned int dir_n = 0;
62044 +       int i;
62045 +
62046 +       dir = xenbus_directory(XBT_NULL, bus->root, type, &dir_n);
62047 +       if (IS_ERR(dir))
62048 +               return PTR_ERR(dir);
62049 +
62050 +       for (i = 0; i < dir_n; i++) {
62051 +               err = bus->probe(type, dir[i]);
62052 +               if (err)
62053 +                       break;
62054 +       }
62055 +       kfree(dir);
62056 +       return err;
62057 +}
62058 +
62059 +static int xenbus_probe_devices(struct xen_bus_type *bus)
62060 +{
62061 +       int err = 0;
62062 +       char **dir;
62063 +       unsigned int i, dir_n;
62064 +
62065 +       dir = xenbus_directory(XBT_NULL, bus->root, "", &dir_n);
62066 +       if (IS_ERR(dir))
62067 +               return PTR_ERR(dir);
62068 +
62069 +       for (i = 0; i < dir_n; i++) {
62070 +               err = xenbus_probe_device_type(bus, dir[i]);
62071 +               if (err)
62072 +                       break;
62073 +       }
62074 +       kfree(dir);
62075 +       return err;
62076 +}
62077 +
62078 +static unsigned int char_count(const char *str, char c)
62079 +{
62080 +       unsigned int i, ret = 0;
62081 +
62082 +       for (i = 0; str[i]; i++)
62083 +               if (str[i] == c)
62084 +                       ret++;
62085 +       return ret;
62086 +}
62087 +
62088 +static int strsep_len(const char *str, char c, unsigned int len)
62089 +{
62090 +       unsigned int i;
62091 +
62092 +       for (i = 0; str[i]; i++)
62093 +               if (str[i] == c) {
62094 +                       if (len == 0)
62095 +                               return i;
62096 +                       len--;
62097 +               }
62098 +       return (len == 0) ? i : -ERANGE;
62099 +}
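+
+/*
+ * e.g. strsep_len("device/vbd/768/state", '/', 2) == 14, so dev_changed()
+ * below can recover the two-level device root "device/vbd/768" from a
+ * watch event on any node beneath it.
+ */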
62100 +
62101 +static void dev_changed(const char *node, struct xen_bus_type *bus)
62102 +{
62103 +       int exists, rootlen;
62104 +       struct xenbus_device *dev;
62105 +       char type[BUS_ID_SIZE];
62106 +       const char *p, *root;
62107 +
62108 +       if (char_count(node, '/') < 2)
62109 +               return;
62110 +
62111 +       exists = xenbus_exists(XBT_NULL, node, "");
62112 +       if (!exists) {
62113 +               xenbus_cleanup_devices(node, &bus->bus);
62114 +               return;
62115 +       }
62116 +
62117 +       /* backend/<type>/... or device/<type>/... */
62118 +       p = strchr(node, '/') + 1;
62119 +       snprintf(type, BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p);
62120 +       type[BUS_ID_SIZE-1] = '\0';
62121 +
62122 +       rootlen = strsep_len(node, '/', bus->levels);
62123 +       if (rootlen < 0)
62124 +               return;
62125 +       root = kasprintf("%.*s", rootlen, node);
62126 +       if (!root)
62127 +               return;
62128 +
62129 +       dev = xenbus_device_find(root, &bus->bus);
62130 +       if (!dev)
62131 +               xenbus_probe_node(bus, type, root);
62132 +       else
62133 +               put_device(&dev->dev);
62134 +
62135 +       kfree(root);
62136 +}
62137 +
62138 +static void frontend_changed(struct xenbus_watch *watch,
62139 +                            const char **vec, unsigned int len)
62140 +{
62141 +       DPRINTK("");
62142 +
62143 +       dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
62144 +}
62145 +
62146 +static void backend_changed(struct xenbus_watch *watch,
62147 +                           const char **vec, unsigned int len)
62148 +{
62149 +       DPRINTK("");
62150 +
62151 +       dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
62152 +}
62153 +
62154 +/* We watch for devices appearing and vanishing. */
62155 +static struct xenbus_watch fe_watch = {
62156 +       .node = "device",
62157 +       .callback = frontend_changed,
62158 +};
62159 +
62160 +static struct xenbus_watch be_watch = {
62161 +       .node = "backend",
62162 +       .callback = backend_changed,
62163 +};
62164 +
62165 +static int suspend_dev(struct device *dev, void *data)
62166 +{
62167 +       int err = 0;
62168 +       struct xenbus_driver *drv;
62169 +       struct xenbus_device *xdev;
62170 +
62171 +       DPRINTK("");
62172 +
62173 +       if (dev->driver == NULL)
62174 +               return 0;
62175 +       drv = to_xenbus_driver(dev->driver);
62176 +       xdev = container_of(dev, struct xenbus_device, dev);
62177 +       if (drv->suspend)
62178 +               err = drv->suspend(xdev);
62179 +       if (err)
62180 +               printk(KERN_WARNING
62181 +                      "xenbus: suspend %s failed: %i\n", dev->bus_id, err);
62182 +       return 0;
62183 +}
62184 +
62185 +static int resume_dev(struct device *dev, void *data)
62186 +{
62187 +       int err;
62188 +       struct xenbus_driver *drv;
62189 +       struct xenbus_device *xdev;
62190 +
62191 +       DPRINTK("");
62192 +
62193 +       if (dev->driver == NULL)
62194 +               return 0;
62195 +       drv = to_xenbus_driver(dev->driver);
62196 +       xdev = container_of(dev, struct xenbus_device, dev);
62197 +
62198 +       err = talk_to_otherend(xdev);
62199 +       if (err) {
62200 +               printk(KERN_WARNING
62201 +                      "xenbus: resume (talk_to_otherend) %s failed: %i\n",
62202 +                      dev->bus_id, err);
62203 +               return err;
62204 +       }
62205 +
62206 +       err = watch_otherend(xdev);
62207 +       if (err) {
62208 +               printk(KERN_WARNING
62209 +                      "xenbus_probe: resume (watch_otherend) %s failed: "
62210 +                      "%d.\n", dev->bus_id, err);
62211 +               return err;
62212 +       }
62213 +
62214 +       if (drv->resume)
62215 +               err = drv->resume(xdev);
62216 +       if (err)
62217 +               printk(KERN_WARNING
62218 +                      "xenbus: resume %s failed: %i\n", dev->bus_id, err);
62219 +       return err;
62220 +}
62221 +
62222 +void xenbus_suspend(void)
62223 +{
62224 +       DPRINTK("");
62225 +
62226 +       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
62227 +       bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, suspend_dev);
62228 +       xs_suspend();
62229 +}
62230 +EXPORT_SYMBOL_GPL(xenbus_suspend);
62231 +
62232 +void xenbus_resume(void)
62233 +{
62234 +       xb_init_comms();
62235 +       xs_resume();
62236 +       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
62237 +       bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, resume_dev);
62238 +}
62239 +EXPORT_SYMBOL_GPL(xenbus_resume);
62240 +
62241 +
62242 +/* A flag to determine if xenstored is 'ready' (i.e. has started) */
62243 +int xenstored_ready = 0;
62244 +
62245 +
62246 +int register_xenstore_notifier(struct notifier_block *nb)
62247 +{
62248 +       int ret = 0;
62249 +
62250 +       if (xenstored_ready > 0)
62251 +               ret = nb->notifier_call(nb, 0, NULL);
62252 +       else
62253 +               notifier_chain_register(&xenstore_chain, nb);
62254 +
62255 +       return ret;
62256 +}
62257 +EXPORT_SYMBOL_GPL(register_xenstore_notifier);
62258 +
62259 +void unregister_xenstore_notifier(struct notifier_block *nb)
62260 +{
62261 +       notifier_chain_unregister(&xenstore_chain, nb);
62262 +}
62263 +EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
62264 +
62265 +
62266 +static int all_devices_ready_(struct device *dev, void *data)
62267 +{
62268 +       struct xenbus_device *xendev = to_xenbus_device(dev);
62269 +       int *result = data;
62270 +
62271 +       if (xendev->state != XenbusStateConnected) {
62272 +               *result = 0;
62273 +               return 1;
62274 +       }
62275 +
62276 +       return 0;
62277 +}
62278 +
62279 +
62280 +static int all_devices_ready(void)
62281 +{
62282 +       int ready = 1;
62283 +       bus_for_each_dev(&xenbus_frontend.bus, NULL, &ready,
62284 +                        all_devices_ready_);
62285 +       return ready;
62286 +}
62287 +
62288 +
62289 +void xenbus_probe(void *unused)
62290 +{
62291 +       int i;
62292 +
62293 +       BUG_ON(xenstored_ready <= 0);
62294 +
62295 +       /* Enumerate devices in xenstore. */
62296 +       xenbus_probe_devices(&xenbus_frontend);
62297 +       xenbus_probe_devices(&xenbus_backend);
62298 +
62299 +       /* Watch for changes. */
62300 +       register_xenbus_watch(&fe_watch);
62301 +       register_xenbus_watch(&be_watch);
62302 +
62303 +       /* Notify others that xenstore is up */
62304 +       notifier_call_chain(&xenstore_chain, 0, NULL);
62305 +
62306 +       /* Wait, with a 10 second timeout, for all currently configured
62307 +          devices.  We need to do this to guarantee that the filesystems
62308 +          and/or network devices needed for boot are available before we
62309 +          can allow the boot to proceed.
62310 +
62311 +          A possible improvement here would be to have the tools add a
62312 +          per-device flag to the store entry, indicating whether it is needed
62313 +          at boot time.  This would allow people who knew what they were
62314 +          doing to accelerate their boot slightly, but of course needs tools
62315 +          or manual intervention to set up those flags correctly.
62316 +        */
62317 +       for (i = 0; i < 10 * HZ; i++) {
62318 +               if (all_devices_ready())
62319 +                       return;
62320 +
62321 +               set_current_state(TASK_INTERRUPTIBLE);
62322 +               schedule_timeout(1);
62323 +       }
62324 +
62325 +       printk(KERN_WARNING
62326 +              "XENBUS: Timeout connecting to devices!\n");
62327 +}
62328 +
62329 +
62330 +static struct file_operations xsd_kva_fops;
62331 +static struct proc_dir_entry *xsd_kva_intf;
62332 +static struct proc_dir_entry *xsd_port_intf;
62333 +
62334 +static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
62335 +{
62336 +       size_t size = vma->vm_end - vma->vm_start;
62337 +
62338 +       if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
62339 +               return -EINVAL;
62340 +
62341 +       if (remap_pfn_range(vma, vma->vm_start,
62342 +                           mfn_to_pfn(xen_start_info->store_mfn),
62343 +                           size, vma->vm_page_prot))
62344 +               return -EAGAIN;
62345 +
62346 +       return 0;
62347 +}
62348 +
62349 +static int xsd_kva_read(char *page, char **start, off_t off,
62350 +                        int count, int *eof, void *data)
62351 +{
62352 +       int len;
62353 +
62354 +       len  = sprintf(page, "0x%p", mfn_to_virt(xen_start_info->store_mfn));
62355 +       *eof = 1;
62356 +       return len;
62357 +}
62358 +
62359 +static int xsd_port_read(char *page, char **start, off_t off,
62360 +                        int count, int *eof, void *data)
62361 +{
62362 +       int len;
62363 +
62364 +       len  = sprintf(page, "%d", xen_start_info->store_evtchn);
62365 +       *eof = 1;
62366 +       return len;
62367 +}
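+
+/*
+ * The two /proc/xen entries published below (xsd_kva and xsd_port) let
+ * the user-space xenstored in domain 0 find its communication channel:
+ * it is expected to mmap() xsd_kva to reach the store page and to read
+ * the event-channel port number from xsd_port.
+ */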
62368 +
62369 +
62370 +static int __init xenbus_probe_init(void)
62371 +{
62372 +       int err = 0, dom0;
62373 +       unsigned long page = 0;
62374 +
62375 +       DPRINTK("");
62376 +
62377 +       if (xen_init() < 0) {
62378 +               DPRINTK("failed");
62379 +               return -ENODEV;
62380 +       }
62381 +
62382 +       /* Register ourselves with the kernel bus subsystem */
62383 +       bus_register(&xenbus_frontend.bus);
62384 +       bus_register(&xenbus_backend.bus);
62385 +
62386 +       /*
62387 +        * Domain0 doesn't have a store_evtchn or store_mfn yet.
62388 +        */
62389 +       dom0 = (xen_start_info->store_evtchn == 0);
62390 +
62391 +       if (dom0) {
62392 +               evtchn_op_t op = { 0 };
62393 +
62394 +               /* Allocate page. */
62395 +               page = get_zeroed_page(GFP_KERNEL);
62396 +               if (!page)
62397 +                       return -ENOMEM;
62398 +
62399 +               xen_start_info->store_mfn =
62400 +                       pfn_to_mfn(virt_to_phys((void *)page) >>
62401 +                                  PAGE_SHIFT);
62402 +
62403 +               /* Next allocate a local port which xenstored can bind to */
62404 +               op.cmd = EVTCHNOP_alloc_unbound;
62405 +               op.u.alloc_unbound.dom        = DOMID_SELF;
62406 +               op.u.alloc_unbound.remote_dom = 0;
62407 +
62408 +               err = HYPERVISOR_event_channel_op(&op);
62409 +               if (err == -ENOSYS)
62410 +                       goto err;
62411 +               BUG_ON(err);
62412 +               xen_start_info->store_evtchn = op.u.alloc_unbound.port;
62413 +
62414 +               /* And finally publish the above info in /proc/xen */
62415 +               xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600);
62416 +               if (xsd_kva_intf) {
62417 +                       memcpy(&xsd_kva_fops, xsd_kva_intf->proc_fops,
62418 +                              sizeof(xsd_kva_fops));
62419 +                       xsd_kva_fops.mmap = xsd_kva_mmap;
62420 +                       xsd_kva_intf->proc_fops = &xsd_kva_fops;
62421 +                       xsd_kva_intf->read_proc = xsd_kva_read;
62422 +               }
62423 +               xsd_port_intf = create_xen_proc_entry("xsd_port", 0400);
62424 +               if (xsd_port_intf)
62425 +                       xsd_port_intf->read_proc = xsd_port_read;
62426 +       } else
62427 +               xenstored_ready = 1;
62428 +
62429 +       /* Initialize the interface to xenstore. */
62430 +       err = xs_init();
62431 +       if (err) {
62432 +               printk(KERN_WARNING
62433 +                      "XENBUS: Error initializing xenstore comms: %i\n", err);
62434 +               goto err;
62435 +       }
62436 +
62437 +       /* Register ourselves with the kernel device subsystem */
62438 +       device_register(&xenbus_frontend.dev);
62439 +       device_register(&xenbus_backend.dev);
62440 +
62441 +       if (!dom0)
62442 +               xenbus_probe(NULL);
62443 +
62444 +       return 0;
62445 +
62446 + err:
62447 +       if (page)
62448 +               free_page(page);
62449 +
62450 +       /*
62451 +        * Do not unregister the xenbus front/backend buses here. The
62452 +        * buses must exist because front/backend drivers will use
62453 +        * them when they are registered.
62454 +        */
62455 +
62456 +       return err;
62457 +}
62458 +
62459 +postcore_initcall(xenbus_probe_init);
62460 +
62461 +/*
62462 + * Local variables:
62463 + *  c-file-style: "linux"
62464 + *  indent-tabs-mode: t
62465 + *  c-indent-level: 8
62466 + *  c-basic-offset: 8
62467 + *  tab-width: 8
62468 + * End:
62469 + */
62470 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/drivers/xen/xenbus/xenbus_xs.c linux-2.6.16/drivers/xen/xenbus/xenbus_xs.c
62471 --- linux-2.6.16.orig/drivers/xen/xenbus/xenbus_xs.c    1970-01-01 01:00:00.000000000 +0100
62472 +++ linux-2.6.16/drivers/xen/xenbus/xenbus_xs.c 2006-06-26 09:51:32.000000000 +0200
62473 @@ -0,0 +1,856 @@
62474 +/******************************************************************************
62475 + * xenbus_xs.c
62476 + *
62477 + * This is the kernel equivalent of the "xs" library.  We don't need everything
62478 + * and we use xenbus_comms for communication.
62479 + *
62480 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
62481 + * 
62482 + * This program is free software; you can redistribute it and/or
62483 + * modify it under the terms of the GNU General Public License version 2
62484 + * as published by the Free Software Foundation; or, when distributed
62485 + * separately from the Linux kernel or incorporated into other
62486 + * software packages, subject to the following license:
62487 + * 
62488 + * Permission is hereby granted, free of charge, to any person obtaining a copy
62489 + * of this source file (the "Software"), to deal in the Software without
62490 + * restriction, including without limitation the rights to use, copy, modify,
62491 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
62492 + * and to permit persons to whom the Software is furnished to do so, subject to
62493 + * the following conditions:
62494 + * 
62495 + * The above copyright notice and this permission notice shall be included in
62496 + * all copies or substantial portions of the Software.
62497 + * 
62498 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
62499 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
62500 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
62501 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62502 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
62503 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
62504 + * IN THE SOFTWARE.
62505 + */
62506 +
62507 +#include <linux/unistd.h>
62508 +#include <linux/errno.h>
62509 +#include <linux/types.h>
62510 +#include <linux/uio.h>
62511 +#include <linux/kernel.h>
62512 +#include <linux/string.h>
62513 +#include <linux/err.h>
62514 +#include <linux/slab.h>
62515 +#include <linux/fcntl.h>
62516 +#include <linux/kthread.h>
62517 +#include <linux/rwsem.h>
62518 +#include <xen/xenbus.h>
62519 +#include "xenbus_comms.h"
62520 +
62521 +/* xenbus_probe.c */
62522 +extern char *kasprintf(const char *fmt, ...);
62523 +
62524 +struct xs_stored_msg {
62525 +       struct list_head list;
62526 +
62527 +       struct xsd_sockmsg hdr;
62528 +
62529 +       union {
62530 +               /* Queued replies. */
62531 +               struct {
62532 +                       char *body;
62533 +               } reply;
62534 +
62535 +               /* Queued watch events. */
62536 +               struct {
62537 +                       struct xenbus_watch *handle;
62538 +                       char **vec;
62539 +                       unsigned int vec_size;
62540 +               } watch;
62541 +       } u;
62542 +};
62543 +
62544 +struct xs_handle {
62545 +       /* A list of replies. Currently only one will ever be outstanding. */
62546 +       struct list_head reply_list;
62547 +       spinlock_t reply_lock;
62548 +       wait_queue_head_t reply_waitq;
62549 +
62550 +       /* One request at a time. */
62551 +       struct mutex request_mutex;
62552 +
62553 +       /* Protect transactions against save/restore. */
62554 +       struct rw_semaphore suspend_mutex;
62555 +};
62556 +
62557 +static struct xs_handle xs_state;
62558 +
62559 +/* List of registered watches, and a lock to protect it. */
62560 +static LIST_HEAD(watches);
62561 +static DEFINE_SPINLOCK(watches_lock);
62562 +
62563 +/* List of pending watch callback events, and a lock to protect it. */
62564 +static LIST_HEAD(watch_events);
62565 +static DEFINE_SPINLOCK(watch_events_lock);
62566 +
62567 +/*
62568 + * Details of the xenwatch callback kernel thread. The thread waits on the
62569 + * watch_events_waitq for work to do (queued on watch_events list). When it
62570 + * wakes up it acquires the xenwatch_mutex before reading the list and
62571 + * carrying out work.
62572 + */
62573 +static pid_t xenwatch_pid;
62574 +/* static */ DEFINE_MUTEX(xenwatch_mutex);
62575 +static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
62576 +
62577 +static int get_error(const char *errorstring)
62578 +{
62579 +       unsigned int i;
62580 +
62581 +       for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {
62582 +               if (i == ARRAY_SIZE(xsd_errors) - 1) {
62583 +                       printk(KERN_WARNING
62584 +                              "XENBUS xen store gave: unknown error %s\n",
62585 +                              errorstring);
62586 +                       return EINVAL;
62587 +               }
62588 +       }
62589 +       return xsd_errors[i].errnum;
62590 +}
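+
+/* e.g. get_error("ENOENT") == ENOENT; unknown strings fall back to EINVAL. */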
62591 +
62592 +static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
62593 +{
62594 +       struct xs_stored_msg *msg;
62595 +       char *body;
62596 +
62597 +       spin_lock(&xs_state.reply_lock);
62598 +
62599 +       while (list_empty(&xs_state.reply_list)) {
62600 +               spin_unlock(&xs_state.reply_lock);
62601 +               /* XXX FIXME: Avoid synchronous wait for response here. */
62602 +               wait_event(xs_state.reply_waitq,
62603 +                          !list_empty(&xs_state.reply_list));
62604 +               spin_lock(&xs_state.reply_lock);
62605 +       }
62606 +
62607 +       msg = list_entry(xs_state.reply_list.next,
62608 +                        struct xs_stored_msg, list);
62609 +       list_del(&msg->list);
62610 +
62611 +       spin_unlock(&xs_state.reply_lock);
62612 +
62613 +       *type = msg->hdr.type;
62614 +       if (len)
62615 +               *len = msg->hdr.len;
62616 +       body = msg->u.reply.body;
62617 +
62618 +       kfree(msg);
62619 +
62620 +       return body;
62621 +}
62622 +
62623 +/* Emergency write. */
62624 +void xenbus_debug_write(const char *str, unsigned int count)
62625 +{
62626 +       struct xsd_sockmsg msg = { 0 };
62627 +
62628 +       msg.type = XS_DEBUG;
62629 +       msg.len = sizeof("print") + count + 1;
62630 +
62631 +       mutex_lock(&xs_state.request_mutex);
62632 +       xb_write(&msg, sizeof(msg));
62633 +       xb_write("print", sizeof("print"));
62634 +       xb_write(str, count);
62635 +       xb_write("", 1);
62636 +       mutex_unlock(&xs_state.request_mutex);
62637 +}
62638 +
62639 +void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
62640 +{
62641 +       void *ret;
62642 +       struct xsd_sockmsg req_msg = *msg;
62643 +       int err;
62644 +
62645 +       if (req_msg.type == XS_TRANSACTION_START)
62646 +               down_read(&xs_state.suspend_mutex);
62647 +
62648 +       mutex_lock(&xs_state.request_mutex);
62649 +
62650 +       err = xb_write(msg, sizeof(*msg) + msg->len);
62651 +       if (err) {
62652 +               msg->type = XS_ERROR;
62653 +               ret = ERR_PTR(err);
62654 +       } else
62655 +               ret = read_reply(&msg->type, &msg->len);
62656 +
62657 +       mutex_unlock(&xs_state.request_mutex);
62658 +
62659 +       if ((msg->type == XS_TRANSACTION_END) ||
62660 +           ((req_msg.type == XS_TRANSACTION_START) &&
62661 +            (msg->type == XS_ERROR)))
62662 +               up_read(&xs_state.suspend_mutex);
62663 +
62664 +       return ret;
62665 +}
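+
+/*
+ * Locking note for the function above: XS_TRANSACTION_START takes
+ * suspend_mutex for reading, and it is only released once the matching
+ * XS_TRANSACTION_END (or a failed START) comes back, so a save/restore
+ * (which is expected to take the semaphore for writing) cannot land in
+ * the middle of a transaction.
+ */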
62666 +
62667 +/* Send message to xs, get kmalloc'ed reply.  ERR_PTR() on error. */
62668 +static void *xs_talkv(xenbus_transaction_t t,
62669 +                     enum xsd_sockmsg_type type,
62670 +                     const struct kvec *iovec,
62671 +                     unsigned int num_vecs,
62672 +                     unsigned int *len)
62673 +{
62674 +       struct xsd_sockmsg msg;
62675 +       void *ret = NULL;
62676 +       unsigned int i;
62677 +       int err;
62678 +
62679 +       msg.tx_id = t;
62680 +       msg.req_id = 0;
62681 +       msg.type = type;
62682 +       msg.len = 0;
62683 +       for (i = 0; i < num_vecs; i++)
62684 +               msg.len += iovec[i].iov_len;
62685 +
62686 +       mutex_lock(&xs_state.request_mutex);
62687 +
62688 +       err = xb_write(&msg, sizeof(msg));
62689 +       if (err) {
62690 +               mutex_unlock(&xs_state.request_mutex);
62691 +               return ERR_PTR(err);
62692 +       }
62693 +
62694 +       for (i = 0; i < num_vecs; i++) {
62695 +               err = xb_write(iovec[i].iov_base, iovec[i].iov_len);
62696 +               if (err) {
62697 +                       mutex_unlock(&xs_state.request_mutex);
62698 +                       return ERR_PTR(err);
62699 +               }
62700 +       }
62701 +
62702 +       ret = read_reply(&msg.type, len);
62703 +
62704 +       mutex_unlock(&xs_state.request_mutex);
62705 +
62706 +       if (IS_ERR(ret))
62707 +               return ret;
62708 +
62709 +       if (msg.type == XS_ERROR) {
62710 +               err = get_error(ret);
62711 +               kfree(ret);
62712 +               return ERR_PTR(-err);
62713 +       }
62714 +
62715 +       if (msg.type != type) {
62716 +               if (printk_ratelimit())
62717 +                       printk(KERN_WARNING
62718 +                              "XENBUS unexpected type [%d], expected [%d]\n",
62719 +                              msg.type, type);
62720 +               kfree(ret);
62721 +               return ERR_PTR(-EINVAL);
62722 +       }
62723 +       return ret;
62724 +}
62725 +
62726 +/* Simplified version of xs_talkv: single message. */
62727 +static void *xs_single(xenbus_transaction_t t,
62728 +                      enum xsd_sockmsg_type type,
62729 +                      const char *string,
62730 +                      unsigned int *len)
62731 +{
62732 +       struct kvec iovec;
62733 +
62734 +       iovec.iov_base = (void *)string;
62735 +       iovec.iov_len = strlen(string) + 1;
62736 +       return xs_talkv(t, type, &iovec, 1, len);
62737 +}
62738 +
62739 +/* Many commands need only an ack; we don't care what it says. */
62740 +static int xs_error(char *reply)
62741 +{
62742 +       if (IS_ERR(reply))
62743 +               return PTR_ERR(reply);
62744 +       kfree(reply);
62745 +       return 0;
62746 +}
62747 +
62748 +static unsigned int count_strings(const char *strings, unsigned int len)
62749 +{
62750 +       unsigned int num;
62751 +       const char *p;
62752 +
62753 +       for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1)
62754 +               num++;
62755 +
62756 +       return num;
62757 +}
62758 +
62759 +/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
62760 +static char *join(const char *dir, const char *name)
62761 +{
62762 +       char *buffer;
62763 +
62764 +       if (strlen(name) == 0)
62765 +               buffer = kasprintf("%s", dir);
62766 +       else
62767 +               buffer = kasprintf("%s/%s", dir, name);
62768 +       return (!buffer) ? ERR_PTR(-ENOMEM) : buffer;
62769 +}
62770 +
62771 +static char **split(char *strings, unsigned int len, unsigned int *num)
62772 +{
62773 +       char *p, **ret;
62774 +
62775 +       /* Count the strings. */
62776 +       *num = count_strings(strings, len);
62777 +
62778 +       /* Transfer to one big alloc for easy freeing. */
62779 +       ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL);
62780 +       if (!ret) {
62781 +               kfree(strings);
62782 +               return ERR_PTR(-ENOMEM);
62783 +       }
62784 +       memcpy(&ret[*num], strings, len);
62785 +       kfree(strings);
62786 +
62787 +       strings = (char *)&ret[*num];
62788 +       for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
62789 +               ret[(*num)++] = p;
62790 +
62791 +       return ret;
62792 +}
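
/* To make split()'s contract concrete, a minimal sketch with a two-string
 * body ("one\0two\0", len 8); the buffer contents are illustrative only: */
static void split_sketch(void)
{
	unsigned int num;
	char *body;
	char **vec;

	body = kmalloc(8, GFP_KERNEL);
	if (!body)
		return;
	memcpy(body, "one\0two\0", 8);	/* two NUL-terminated strings */
	vec = split(body, 8, &num);	/* consumes 'body' on both paths */
	if (IS_ERR(vec))
		return;
	/* num == 2; vec[0] -> "one", vec[1] -> "two".  The pointer vector
	 * and the copied bytes share one allocation, so a single kfree()
	 * releases everything. */
	kfree(vec);
}
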
62793 +
62794 +char **xenbus_directory(xenbus_transaction_t t,
62795 +                       const char *dir, const char *node, unsigned int *num)
62796 +{
62797 +       char *strings, *path;
62798 +       unsigned int len;
62799 +
62800 +       path = join(dir, node);
62801 +       if (IS_ERR(path))
62802 +               return (char **)path;
62803 +
62804 +       strings = xs_single(t, XS_DIRECTORY, path, &len);
62805 +       kfree(path);
62806 +       if (IS_ERR(strings))
62807 +               return (char **)strings;
62808 +
62809 +       return split(strings, len, num);
62810 +}
62811 +EXPORT_SYMBOL_GPL(xenbus_directory);
62812 +
62813 +/* Check if a path exists. Return 1 if it does. */
62814 +int xenbus_exists(xenbus_transaction_t t,
62815 +                 const char *dir, const char *node)
62816 +{
62817 +       char **d;
62818 +       unsigned int dir_n;
62819 +
62820 +       d = xenbus_directory(t, dir, node, &dir_n);
62821 +       if (IS_ERR(d))
62822 +               return 0;
62823 +       kfree(d);
62824 +       return 1;
62825 +}
62826 +EXPORT_SYMBOL_GPL(xenbus_exists);
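
/* A usage sketch for xenbus_directory(); the "device"/"vif" path is only
 * illustrative.  The returned vector and its strings are one allocation: */
static int list_vifs_sketch(void)
{
	unsigned int i, n;
	char **dir;

	dir = xenbus_directory(XBT_NULL, "device", "vif", &n);
	if (IS_ERR(dir))
		return PTR_ERR(dir);
	for (i = 0; i < n; i++)
		printk(KERN_DEBUG "vif entry: %s\n", dir[i]);
	kfree(dir);		/* frees the vector and the strings together */
	return 0;
}
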
62827 +
62828 +/* Get the value of a single file.
62829 + * Returns a kmalloced value: call kfree() on it after use.
62830 + * len indicates length in bytes.
62831 + */
62832 +void *xenbus_read(xenbus_transaction_t t,
62833 +                 const char *dir, const char *node, unsigned int *len)
62834 +{
62835 +       char *path;
62836 +       void *ret;
62837 +
62838 +       path = join(dir, node);
62839 +       if (IS_ERR(path))
62840 +               return (void *)path;
62841 +
62842 +       ret = xs_single(t, XS_READ, path, len);
62843 +       kfree(path);
62844 +       return ret;
62845 +}
62846 +EXPORT_SYMBOL_GPL(xenbus_read);
62847 +
62848 +/* Write the value of a single file.
62849 + * Returns -err on failure.
62850 + */
62851 +int xenbus_write(xenbus_transaction_t t,
62852 +                const char *dir, const char *node, const char *string)
62853 +{
62854 +       const char *path;
62855 +       struct kvec iovec[2];
62856 +       int ret;
62857 +
62858 +       path = join(dir, node);
62859 +       if (IS_ERR(path))
62860 +               return PTR_ERR(path);
62861 +
62862 +       iovec[0].iov_base = (void *)path;
62863 +       iovec[0].iov_len = strlen(path) + 1;
62864 +       iovec[1].iov_base = (void *)string;
62865 +       iovec[1].iov_len = strlen(string);
62866 +
62867 +       ret = xs_error(xs_talkv(t, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL));
62868 +       kfree(path);
62869 +       return ret;
62870 +}
62871 +EXPORT_SYMBOL_GPL(xenbus_write);
62872 +
62873 +/* Create a new directory. */
62874 +int xenbus_mkdir(xenbus_transaction_t t,
62875 +                const char *dir, const char *node)
62876 +{
62877 +       char *path;
62878 +       int ret;
62879 +
62880 +       path = join(dir, node);
62881 +       if (IS_ERR(path))
62882 +               return PTR_ERR(path);
62883 +
62884 +       ret = xs_error(xs_single(t, XS_MKDIR, path, NULL));
62885 +       kfree(path);
62886 +       return ret;
62887 +}
62888 +EXPORT_SYMBOL_GPL(xenbus_mkdir);
62889 +
62890 +/* Destroy a file or directory (directories must be empty). */
62891 +int xenbus_rm(xenbus_transaction_t t, const char *dir, const char *node)
62892 +{
62893 +       char *path;
62894 +       int ret;
62895 +
62896 +       path = join(dir, node);
62897 +       if (IS_ERR(path))
62898 +               return PTR_ERR(path);
62899 +
62900 +       ret = xs_error(xs_single(t, XS_RM, path, NULL));
62901 +       kfree(path);
62902 +       return ret;
62903 +}
62904 +EXPORT_SYMBOL_GPL(xenbus_rm);
62905 +
62906 +/* Start a transaction: changes by others will not be seen during this
62907 + * transaction, and changes will not be visible to others until end.
62908 + */
62909 +int xenbus_transaction_start(xenbus_transaction_t *t)
62910 +{
62911 +       char *id_str;
62912 +
62913 +       down_read(&xs_state.suspend_mutex);
62914 +
62915 +       id_str = xs_single(XBT_NULL, XS_TRANSACTION_START, "", NULL);
62916 +       if (IS_ERR(id_str)) {
62917 +               up_read(&xs_state.suspend_mutex);
62918 +               return PTR_ERR(id_str);
62919 +       }
62920 +
62921 +       *t = simple_strtoul(id_str, NULL, 0);
62922 +       kfree(id_str);
62923 +       return 0;
62924 +}
62925 +EXPORT_SYMBOL_GPL(xenbus_transaction_start);
62926 +
62927 +/* End a transaction.
62928 + * If abort is true, the transaction is discarded instead of committed.
62929 + */
62930 +int xenbus_transaction_end(xenbus_transaction_t t, int abort)
62931 +{
62932 +       char abortstr[2];
62933 +       int err;
62934 +
62935 +       if (abort)
62936 +               strcpy(abortstr, "F");
62937 +       else
62938 +               strcpy(abortstr, "T");
62939 +
62940 +       err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
62941 +
62942 +       up_read(&xs_state.suspend_mutex);
62943 +
62944 +       return err;
62945 +}
62946 +EXPORT_SYMBOL_GPL(xenbus_transaction_end);
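
/* The usual calling pattern around these two functions is a retry loop:
 * if another client commits a conflicting change first, the commit fails
 * and the whole transaction is replayed.  A sketch with a hypothetical
 * path, assuming xenstored reports the conflict as EAGAIN: */
static int update_state_sketch(void)
{
	xenbus_transaction_t xbt;
	int err;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return err;
	err = xenbus_write(xbt, "device/hypothetical", "state", "1");
	if (err) {
		xenbus_transaction_end(xbt, 1);	/* abort: discard writes */
		return err;
	}
	err = xenbus_transaction_end(xbt, 0);	/* try to commit */
	if (err == -EAGAIN)
		goto again;			/* lost the race: replay */
	return err;
}
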
62947 +
62948 +/* Single read and scanf: returns -errno or num scanned. */
62949 +int xenbus_scanf(xenbus_transaction_t t,
62950 +                const char *dir, const char *node, const char *fmt, ...)
62951 +{
62952 +       va_list ap;
62953 +       int ret;
62954 +       char *val;
62955 +
62956 +       val = xenbus_read(t, dir, node, NULL);
62957 +       if (IS_ERR(val))
62958 +               return PTR_ERR(val);
62959 +
62960 +       va_start(ap, fmt);
62961 +       ret = vsscanf(val, fmt, ap);
62962 +       va_end(ap);
62963 +       kfree(val);
62964 +       /* Distinctive errno: the read succeeded but nothing matched. */
62965 +       if (ret == 0)
62966 +               return -ERANGE;
62967 +       return ret;
62968 +}
62969 +EXPORT_SYMBOL_GPL(xenbus_scanf);
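
/* A read-back sketch using a hypothetical node; note the distinctive
 * -ERANGE when the node was read successfully but matched nothing: */
static int read_ring_ref_sketch(xenbus_transaction_t t)
{
	int ring_ref, err;

	err = xenbus_scanf(t, "device/hypothetical", "ring-ref",
			   "%d", &ring_ref);
	if (err < 0)
		return err;	/* -ERANGE if "%d" scanned nothing */
	printk(KERN_DEBUG "ring-ref = %d\n", ring_ref);
	return 0;
}
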
62970 +
62971 +/* Single printf and write: returns -errno or 0. */
62972 +int xenbus_printf(xenbus_transaction_t t,
62973 +                 const char *dir, const char *node, const char *fmt, ...)
62974 +{
62975 +       va_list ap;
62976 +       int ret;
62977 +#define PRINTF_BUFFER_SIZE 4096
62978 +       char *printf_buffer;
62979 +
62980 +       printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
62981 +       if (printf_buffer == NULL)
62982 +               return -ENOMEM;
62983 +
62984 +       va_start(ap, fmt);
62985 +       ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap);
62986 +       va_end(ap);
62987 +
62988 +       BUG_ON(ret > PRINTF_BUFFER_SIZE-1);
62989 +       ret = xenbus_write(t, dir, node, printf_buffer);
62990 +
62991 +       kfree(printf_buffer);
62992 +
62993 +       return ret;
62994 +}
62995 +EXPORT_SYMBOL_GPL(xenbus_printf);
62996 +
62997 +/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */
62998 +int xenbus_gather(xenbus_transaction_t t, const char *dir, ...)
62999 +{
63000 +       va_list ap;
63001 +       const char *name;
63002 +       int ret = 0;
63003 +
63004 +       va_start(ap, dir);
63005 +       while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
63006 +               const char *fmt = va_arg(ap, char *);
63007 +               void *result = va_arg(ap, void *);
63008 +               char *p;
63009 +
63010 +               p = xenbus_read(t, dir, name, NULL);
63011 +               if (IS_ERR(p)) {
63012 +                       ret = PTR_ERR(p);
63013 +                       break;
63014 +               }
63015 +               if (fmt) {
63016 +                       if (sscanf(p, fmt, result) == 0)
63017 +                               ret = -EINVAL;
63018 +                       kfree(p);
63019 +               } else
63020 +                       *(char **)result = p;
63021 +       }
63022 +       va_end(ap);
63023 +       return ret;
63024 +}
63025 +EXPORT_SYMBOL_GPL(xenbus_gather);
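
/* The vararg triples are easy to get wrong, so a sketch: each triple is
 * (node name, scanf format or NULL, result pointer) and the list ends
 * with a lone NULL.  The names here are hypothetical: */
static int gather_sketch(void)
{
	unsigned long ref;
	char *proto;
	int err;

	err = xenbus_gather(XBT_NULL, "device/hypothetical",
			    "page-ref", "%lu", &ref,
			    "protocol", NULL, &proto,	/* NULL fmt: raw string */
			    NULL);			/* terminator */
	if (err)
		return err;
	kfree(proto);	/* a NULL format hands back the kmalloc'ed string */
	return 0;
}
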
63026 +
63027 +static int xs_watch(const char *path, const char *token)
63028 +{
63029 +       struct kvec iov[2];
63030 +
63031 +       iov[0].iov_base = (void *)path;
63032 +       iov[0].iov_len = strlen(path) + 1;
63033 +       iov[1].iov_base = (void *)token;
63034 +       iov[1].iov_len = strlen(token) + 1;
63035 +
63036 +       return xs_error(xs_talkv(XBT_NULL, XS_WATCH, iov,
63037 +                                ARRAY_SIZE(iov), NULL));
63038 +}
63039 +
63040 +static int xs_unwatch(const char *path, const char *token)
63041 +{
63042 +       struct kvec iov[2];
63043 +
63044 +       iov[0].iov_base = (char *)path;
63045 +       iov[0].iov_len = strlen(path) + 1;
63046 +       iov[1].iov_base = (char *)token;
63047 +       iov[1].iov_len = strlen(token) + 1;
63048 +
63049 +       return xs_error(xs_talkv(XBT_NULL, XS_UNWATCH, iov,
63050 +                                ARRAY_SIZE(iov), NULL));
63051 +}
63052 +
63053 +static struct xenbus_watch *find_watch(const char *token)
63054 +{
63055 +       struct xenbus_watch *i, *cmp;
63056 +
63057 +       cmp = (void *)simple_strtoul(token, NULL, 16);
63058 +
63059 +       list_for_each_entry(i, &watches, list)
63060 +               if (i == cmp)
63061 +                       return i;
63062 +
63063 +       return NULL;
63064 +}
63065 +
63066 +/* Register callback to watch this node. */
63067 +int register_xenbus_watch(struct xenbus_watch *watch)
63068 +{
63069 +       /* Pointer in ascii is the token. */
63070 +       char token[sizeof(watch) * 2 + 1];
63071 +       int err;
63072 +
63073 +       sprintf(token, "%lX", (long)watch);
63074 +
63075 +       down_read(&xs_state.suspend_mutex);
63076 +
63077 +       spin_lock(&watches_lock);
63078 +       BUG_ON(find_watch(token));
63079 +       list_add(&watch->list, &watches);
63080 +       spin_unlock(&watches_lock);
63081 +
63082 +       err = xs_watch(watch->node, token);
63083 +
63084 +       /* Ignore errors due to multiple registration. */
63085 +       if ((err != 0) && (err != -EEXIST)) {
63086 +               spin_lock(&watches_lock);
63087 +               list_del(&watch->list);
63088 +               spin_unlock(&watches_lock);
63089 +       }
63090 +
63091 +       up_read(&xs_state.suspend_mutex);
63092 +
63093 +       return err;
63094 +}
63095 +EXPORT_SYMBOL_GPL(register_xenbus_watch);
63096 +
63097 +void unregister_xenbus_watch(struct xenbus_watch *watch)
63098 +{
63099 +       struct xs_stored_msg *msg, *tmp;
63100 +       char token[sizeof(watch) * 2 + 1];
63101 +       int err;
63102 +
63103 +       sprintf(token, "%lX", (long)watch);
63104 +
63105 +       down_read(&xs_state.suspend_mutex);
63106 +
63107 +       spin_lock(&watches_lock);
63108 +       BUG_ON(!find_watch(token));
63109 +       list_del(&watch->list);
63110 +       spin_unlock(&watches_lock);
63111 +
63112 +       err = xs_unwatch(watch->node, token);
63113 +       if (err)
63114 +               printk(KERN_WARNING
63115 +                      "XENBUS Failed to release watch %s: %i\n",
63116 +                      watch->node, err);
63117 +
63118 +       up_read(&xs_state.suspend_mutex);
63119 +
63120 +       /* Cancel pending watch events. */
63121 +       spin_lock(&watch_events_lock);
63122 +       list_for_each_entry_safe(msg, tmp, &watch_events, list) {
63123 +               if (msg->u.watch.handle != watch)
63124 +                       continue;
63125 +               list_del(&msg->list);
63126 +               kfree(msg->u.watch.vec);
63127 +               kfree(msg);
63128 +       }
63129 +       spin_unlock(&watch_events_lock);
63130 +
63131 +       /* Flush any currently-executing callback, unless we are it. :-) */
63132 +       if (current->pid != xenwatch_pid) {
63133 +               mutex_lock(&xenwatch_mutex);
63134 +               mutex_unlock(&xenwatch_mutex);
63135 +       }
63136 +}
63137 +EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
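
/* A registration sketch against the watch API above; the node path is
 * hypothetical.  The callback signature matches the invocation in
 * xenwatch_handle_callback() below; XS_WATCH_PATH is assumed to be the
 * companion of the XS_WATCH_TOKEN index used by process_msg().  Callbacks
 * run in the single xenwatch thread, so a handler that may sleep for long
 * should set XBWF_new_thread in its flags (see xenwatch_thread() below). */
static void state_changed_sketch(struct xenbus_watch *watch,
				 const char **vec, unsigned int len)
{
	printk(KERN_DEBUG "xenstore node %s changed\n", vec[XS_WATCH_PATH]);
}

static struct xenbus_watch state_watch_sketch = {
	.node     = "backend/hypothetical/state",
	.callback = state_changed_sketch,
};

static int watch_sketch_init(void)
{
	return register_xenbus_watch(&state_watch_sketch);
}

static void watch_sketch_exit(void)
{
	unregister_xenbus_watch(&state_watch_sketch);
}
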
63138 +
63139 +void xs_suspend(void)
63140 +{
63141 +       down_write(&xs_state.suspend_mutex);
63142 +       mutex_lock(&xs_state.request_mutex);
63143 +}
63144 +
63145 +void xs_resume(void)
63146 +{
63147 +       struct xenbus_watch *watch;
63148 +       char token[sizeof(watch) * 2 + 1];
63149 +
63150 +       mutex_unlock(&xs_state.request_mutex);
63151 +
63152 +       /* No need for watches_lock: the suspend_mutex is sufficient. */
63153 +       list_for_each_entry(watch, &watches, list) {
63154 +               sprintf(token, "%lX", (long)watch);
63155 +               xs_watch(watch->node, token);
63156 +       }
63157 +
63158 +       up_write(&xs_state.suspend_mutex);
63159 +}
63160 +
63161 +static int xenwatch_handle_callback(void *data)
63162 +{
63163 +       struct xs_stored_msg *msg = data;
63164 +
63165 +       msg->u.watch.handle->callback(msg->u.watch.handle,
63166 +                                     (const char **)msg->u.watch.vec,
63167 +                                     msg->u.watch.vec_size);
63168 +
63169 +       kfree(msg->u.watch.vec);
63170 +       kfree(msg);
63171 +
63172 +       /* Kill this kthread if we were spawned just for this callback. */
63173 +       if (current->pid != xenwatch_pid)
63174 +               do_exit(0);
63175 +
63176 +       return 0;
63177 +}
63178 +
63179 +static int xenwatch_thread(void *unused)
63180 +{
63181 +       struct list_head *ent;
63182 +       struct xs_stored_msg *msg;
63183 +
63184 +       for (;;) {
63185 +               wait_event_interruptible(watch_events_waitq,
63186 +                                        !list_empty(&watch_events));
63187 +
63188 +               if (kthread_should_stop())
63189 +                       break;
63190 +
63191 +               mutex_lock(&xenwatch_mutex);
63192 +
63193 +               spin_lock(&watch_events_lock);
63194 +               ent = watch_events.next;
63195 +               if (ent != &watch_events)
63196 +                       list_del(ent);
63197 +               spin_unlock(&watch_events_lock);
63198 +
63199 +               if (ent != &watch_events) {
63200 +                       msg = list_entry(ent, struct xs_stored_msg, list);
63201 +                       if (msg->u.watch.handle->flags & XBWF_new_thread)
63202 +                               kthread_run(xenwatch_handle_callback,
63203 +                                           msg, "xenwatch_cb");
63204 +                       else
63205 +                               xenwatch_handle_callback(msg);
63206 +               }
63207 +
63208 +               mutex_unlock(&xenwatch_mutex);
63209 +       }
63210 +
63211 +       return 0;
63212 +}
63213 +
63214 +static int process_msg(void)
63215 +{
63216 +       struct xs_stored_msg *msg;
63217 +       char *body;
63218 +       int err;
63219 +
63220 +       msg = kmalloc(sizeof(*msg), GFP_KERNEL);
63221 +       if (msg == NULL)
63222 +               return -ENOMEM;
63223 +
63224 +       err = xb_read(&msg->hdr, sizeof(msg->hdr));
63225 +       if (err) {
63226 +               kfree(msg);
63227 +               return err;
63228 +       }
63229 +
63230 +       body = kmalloc(msg->hdr.len + 1, GFP_KERNEL);
63231 +       if (body == NULL) {
63232 +               kfree(msg);
63233 +               return -ENOMEM;
63234 +       }
63235 +
63236 +       err = xb_read(body, msg->hdr.len);
63237 +       if (err) {
63238 +               kfree(body);
63239 +               kfree(msg);
63240 +               return err;
63241 +       }
63242 +       body[msg->hdr.len] = '\0';
63243 +
63244 +       if (msg->hdr.type == XS_WATCH_EVENT) {
63245 +               msg->u.watch.vec = split(body, msg->hdr.len,
63246 +                                        &msg->u.watch.vec_size);
63247 +               if (IS_ERR(msg->u.watch.vec)) {
63248 +                       kfree(msg);
63249 +                       return PTR_ERR(msg->u.watch.vec);
63250 +               }
63251 +
63252 +               spin_lock(&watches_lock);
63253 +               msg->u.watch.handle = find_watch(
63254 +                       msg->u.watch.vec[XS_WATCH_TOKEN]);
63255 +               if (msg->u.watch.handle != NULL) {
63256 +                       spin_lock(&watch_events_lock);
63257 +                       list_add_tail(&msg->list, &watch_events);
63258 +                       wake_up(&watch_events_waitq);
63259 +                       spin_unlock(&watch_events_lock);
63260 +               } else {
63261 +                       kfree(msg->u.watch.vec);
63262 +                       kfree(msg);
63263 +               }
63264 +               spin_unlock(&watches_lock);
63265 +       } else {
63266 +               msg->u.reply.body = body;
63267 +               spin_lock(&xs_state.reply_lock);
63268 +               list_add_tail(&msg->list, &xs_state.reply_list);
63269 +               spin_unlock(&xs_state.reply_lock);
63270 +               wake_up(&xs_state.reply_waitq);
63271 +       }
63272 +
63273 +       return 0;
63274 +}
63275 +
63276 +static int xenbus_thread(void *unused)
63277 +{
63278 +       int err;
63279 +
63280 +       for (;;) {
63281 +               err = process_msg();
63282 +               if (err)
63283 +                       printk(KERN_WARNING "XENBUS error %d while reading "
63284 +                              "message\n", err);
63285 +               if (kthread_should_stop())
63286 +                       break;
63287 +       }
63288 +
63289 +       return 0;
63290 +}
63291 +
63292 +int xs_init(void)
63293 +{
63294 +       int err;
63295 +       struct task_struct *task;
63296 +
63297 +       INIT_LIST_HEAD(&xs_state.reply_list);
63298 +       spin_lock_init(&xs_state.reply_lock);
63299 +       init_waitqueue_head(&xs_state.reply_waitq);
63300 +
63301 +       mutex_init(&xs_state.request_mutex);
63302 +       init_rwsem(&xs_state.suspend_mutex);
63303 +
63304 +       /* Initialize the shared memory rings to talk to xenstored */
63305 +       err = xb_init_comms();
63306 +       if (err)
63307 +               return err;
63308 +
63309 +       task = kthread_run(xenwatch_thread, NULL, "xenwatch");
63310 +       if (IS_ERR(task))
63311 +               return PTR_ERR(task);
63312 +       xenwatch_pid = task->pid;
63313 +
63314 +       task = kthread_run(xenbus_thread, NULL, "xenbus");
63315 +       if (IS_ERR(task))
63316 +               return PTR_ERR(task);
63317 +
63318 +       return 0;
63319 +}
63320 +
63321 +/*
63322 + * Local variables:
63323 + *  c-file-style: "linux"
63324 + *  indent-tabs-mode: t
63325 + *  c-indent-level: 8
63326 + *  c-basic-offset: 8
63327 + *  tab-width: 8
63328 + * End:
63329 + */
63330 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/fs/Kconfig linux-2.6.16/fs/Kconfig
63331 --- linux-2.6.16.orig/fs/Kconfig        2006-06-26 09:49:45.000000000 +0200
63332 +++ linux-2.6.16/fs/Kconfig     2006-06-26 09:51:32.000000000 +0200
63333 @@ -843,6 +843,7 @@
63334  config HUGETLBFS
63335         bool "HugeTLB file system support"
63336         depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
63337 +       depends !XEN
63338         help
63339           hugetlbfs is a filesystem backing for HugeTLB pages, based on
63340           ramfs. For architectures that support it, say Y here and read
63341 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/a.out.h linux-2.6.16/include/asm-i386/a.out.h
63342 --- linux-2.6.16.orig/include/asm-i386/a.out.h  2006-03-20 06:53:29.000000000 +0100
63343 +++ linux-2.6.16/include/asm-i386/a.out.h       2006-06-26 09:51:32.000000000 +0200
63344 @@ -19,7 +19,7 @@
63345  
63346  #ifdef __KERNEL__
63347  
63348 -#define STACK_TOP      TASK_SIZE
63349 +#define STACK_TOP      (TASK_SIZE - 3*PAGE_SIZE)
63350  
63351  #endif
63352  
63353 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/apic.h linux-2.6.16/include/asm-i386/apic.h
63354 --- linux-2.6.16.orig/include/asm-i386/apic.h   2006-03-20 06:53:29.000000000 +0100
63355 +++ linux-2.6.16/include/asm-i386/apic.h        2006-06-26 09:51:32.000000000 +0200
63356 @@ -132,10 +132,12 @@
63357  
63358  extern int disable_timer_pin_1;
63359  
63360 +#ifndef CONFIG_XEN
63361  void smp_send_timer_broadcast_ipi(struct pt_regs *regs);
63362  void switch_APIC_timer_to_ipi(void *cpumask);
63363  void switch_ipi_to_APIC_timer(void *cpumask);
63364  #define ARCH_APICTIMER_STOPS_ON_C3     1
63365 +#endif
63366  
63367  extern int timer_over_8254;
63368  
63369 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/atomic.h linux-2.6.16/include/asm-i386/atomic.h
63370 --- linux-2.6.16.orig/include/asm-i386/atomic.h 2006-03-20 06:53:29.000000000 +0100
63371 +++ linux-2.6.16/include/asm-i386/atomic.h      2006-06-26 09:51:32.000000000 +0200
63372 @@ -4,18 +4,13 @@
63373  #include <linux/config.h>
63374  #include <linux/compiler.h>
63375  #include <asm/processor.h>
63376 +#include <asm/smp_alt.h>
63377  
63378  /*
63379   * Atomic operations that C can't guarantee us.  Useful for
63380   * resource counting etc..
63381   */
63382  
63383 -#ifdef CONFIG_SMP
63384 -#define LOCK "lock ; "
63385 -#else
63386 -#define LOCK ""
63387 -#endif
63388 -
63389  /*
63390   * Make sure gcc doesn't try to be clever and move things around
63391   * on us. We need to use _exactly_ the address the user gave us,
63392 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/bitops.h linux-2.6.16/include/asm-i386/bitops.h
63393 --- linux-2.6.16.orig/include/asm-i386/bitops.h 2006-03-20 06:53:29.000000000 +0100
63394 +++ linux-2.6.16/include/asm-i386/bitops.h      2006-06-26 09:51:32.000000000 +0200
63395 @@ -7,6 +7,7 @@
63396  
63397  #include <linux/config.h>
63398  #include <linux/compiler.h>
63399 +#include <asm/smp_alt.h>
63400  
63401  /*
63402   * These have to be done with inline assembly: that way the bit-setting
63403 @@ -16,12 +17,6 @@
63404   * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
63405   */
63406  
63407 -#ifdef CONFIG_SMP
63408 -#define LOCK_PREFIX "lock ; "
63409 -#else
63410 -#define LOCK_PREFIX ""
63411 -#endif
63412 -
63413  #define ADDR (*(volatile long *) addr)
63414  
63415  /**
63416 @@ -41,7 +36,7 @@
63417   */
63418  static inline void set_bit(int nr, volatile unsigned long * addr)
63419  {
63420 -       __asm__ __volatile__( LOCK_PREFIX
63421 +       __asm__ __volatile__( LOCK
63422                 "btsl %1,%0"
63423                 :"+m" (ADDR)
63424                 :"Ir" (nr));
63425 @@ -76,7 +71,7 @@
63426   */
63427  static inline void clear_bit(int nr, volatile unsigned long * addr)
63428  {
63429 -       __asm__ __volatile__( LOCK_PREFIX
63430 +       __asm__ __volatile__( LOCK
63431                 "btrl %1,%0"
63432                 :"+m" (ADDR)
63433                 :"Ir" (nr));
63434 @@ -121,7 +116,7 @@
63435   */
63436  static inline void change_bit(int nr, volatile unsigned long * addr)
63437  {
63438 -       __asm__ __volatile__( LOCK_PREFIX
63439 +       __asm__ __volatile__( LOCK
63440                 "btcl %1,%0"
63441                 :"+m" (ADDR)
63442                 :"Ir" (nr));
63443 @@ -140,7 +135,7 @@
63444  {
63445         int oldbit;
63446  
63447 -       __asm__ __volatile__( LOCK_PREFIX
63448 +       __asm__ __volatile__( LOCK
63449                 "btsl %2,%1\n\tsbbl %0,%0"
63450                 :"=r" (oldbit),"+m" (ADDR)
63451                 :"Ir" (nr) : "memory");
63452 @@ -180,7 +175,7 @@
63453  {
63454         int oldbit;
63455  
63456 -       __asm__ __volatile__( LOCK_PREFIX
63457 +       __asm__ __volatile__( LOCK
63458                 "btrl %2,%1\n\tsbbl %0,%0"
63459                 :"=r" (oldbit),"+m" (ADDR)
63460                 :"Ir" (nr) : "memory");
63461 @@ -231,7 +226,7 @@
63462  {
63463         int oldbit;
63464  
63465 -       __asm__ __volatile__( LOCK_PREFIX
63466 +       __asm__ __volatile__( LOCK
63467                 "btcl %2,%1\n\tsbbl %0,%0"
63468                 :"=r" (oldbit),"+m" (ADDR)
63469                 :"Ir" (nr) : "memory");
63470 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/elf.h linux-2.6.16/include/asm-i386/elf.h
63471 --- linux-2.6.16.orig/include/asm-i386/elf.h    2006-06-26 09:49:46.000000000 +0200
63472 +++ linux-2.6.16/include/asm-i386/elf.h 2006-06-26 09:51:32.000000000 +0200
63473 @@ -129,11 +129,16 @@
63474  #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
63475  #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
63476  
63477 -#define VSYSCALL_BASE  (__fix_to_virt(FIX_VSYSCALL))
63478 +#define VSYSCALL_BASE  (PAGE_OFFSET - 2*PAGE_SIZE)
63479  #define VSYSCALL_EHDR  ((const struct elfhdr *) VSYSCALL_BASE)
63480  #define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
63481  extern void __kernel_vsyscall;
63482  
63483 +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
63484 +struct linux_binprm;
63485 +extern int arch_setup_additional_pages(struct linux_binprm *bprm,
63486 +                                       int executable_stack);
63487 +
63488  #define ARCH_DLINFO                                            \
63489  do {                                                           \
63490                 NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY);        \
63491 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/fixmap.h linux-2.6.16/include/asm-i386/fixmap.h
63492 --- linux-2.6.16.orig/include/asm-i386/fixmap.h 2006-03-20 06:53:29.000000000 +0100
63493 +++ linux-2.6.16/include/asm-i386/fixmap.h      2006-06-26 09:51:32.000000000 +0200
63494 @@ -20,7 +20,7 @@
63495   * Leave one empty page between vmalloc'ed areas and
63496   * the start of the fixmap.
63497   */
63498 -#define __FIXADDR_TOP  0xfffff000
63499 +extern unsigned long __FIXADDR_TOP;
63500  
63501  #ifndef __ASSEMBLY__
63502  #include <linux/kernel.h>
63503 @@ -52,7 +52,6 @@
63504   */
63505  enum fixed_addresses {
63506         FIX_HOLE,
63507 -       FIX_VSYSCALL,
63508  #ifdef CONFIG_X86_LOCAL_APIC
63509         FIX_APIC_BASE,  /* local (CPU) APIC) -- required for SMP or not */
63510  #endif
63511 @@ -95,6 +94,8 @@
63512  extern void __set_fixmap (enum fixed_addresses idx,
63513                                         unsigned long phys, pgprot_t flags);
63514  
63515 +extern void set_fixaddr_top(unsigned long top);
63516 +
63517  #define set_fixmap(idx, phys) \
63518                 __set_fixmap(idx, phys, PAGE_KERNEL)
63519  /*
63520 @@ -116,14 +117,6 @@
63521  #define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
63522  #define __virt_to_fix(x)       ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
63523  
63524 -/*
63525 - * This is the range that is readable by user mode, and things
63526 - * acting like user mode such as get_user_pages.
63527 - */
63528 -#define FIXADDR_USER_START     (__fix_to_virt(FIX_VSYSCALL))
63529 -#define FIXADDR_USER_END       (FIXADDR_USER_START + PAGE_SIZE)
63530 -
63531 -
63532  extern void __this_fixmap_does_not_exist(void);
63533  
63534  /*
63535 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/futex.h linux-2.6.16/include/asm-i386/futex.h
63536 --- linux-2.6.16.orig/include/asm-i386/futex.h  2006-03-20 06:53:29.000000000 +0100
63537 +++ linux-2.6.16/include/asm-i386/futex.h       2006-06-26 09:51:32.000000000 +0200
63538 @@ -28,7 +28,7 @@
63539  "1:    movl    %2, %0\n\
63540         movl    %0, %3\n"                                       \
63541         insn "\n"                                               \
63542 -"2:    " LOCK_PREFIX "cmpxchgl %3, %2\n\
63543 +"2:    " LOCK "cmpxchgl %3, %2\n\
63544         jnz     1b\n\
63545  3:     .section .fixup,\"ax\"\n\
63546  4:     mov     %5, %1\n\
63547 @@ -68,7 +68,7 @@
63548  #endif
63549                 switch (op) {
63550                 case FUTEX_OP_ADD:
63551 -                       __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
63552 +                       __futex_atomic_op1(LOCK "xaddl %0, %2", ret,
63553                                            oldval, uaddr, oparg);
63554                         break;
63555                 case FUTEX_OP_OR:
63556 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-default/mach_traps.h linux-2.6.16/include/asm-i386/mach-default/mach_traps.h
63557 --- linux-2.6.16.orig/include/asm-i386/mach-default/mach_traps.h        2006-03-20 06:53:29.000000000 +0100
63558 +++ linux-2.6.16/include/asm-i386/mach-default/mach_traps.h     2006-06-26 09:51:32.000000000 +0200
63559 @@ -15,6 +15,18 @@
63560         outb(reason, 0x61);
63561  }
63562  
63563 +static inline void clear_io_check_error(unsigned char reason)
63564 +{
63565 +       unsigned long i;
63566 +
63567 +       reason = (reason & 0xf) | 8;
63568 +       outb(reason, 0x61);
63569 +       i = 2000;
63570 +       while (--i) udelay(1000);
63571 +       reason &= ~8;
63572 +       outb(reason, 0x61);
63573 +}
63574 +
63575  static inline unsigned char get_nmi_reason(void)
63576  {
63577         return inb(0x61);
63578 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/agp.h linux-2.6.16/include/asm-i386/mach-xen/asm/agp.h
63579 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/agp.h       1970-01-01 01:00:00.000000000 +0100
63580 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/agp.h    2006-06-26 09:51:32.000000000 +0200
63581 @@ -0,0 +1,37 @@
63582 +#ifndef AGP_H
63583 +#define AGP_H 1
63584 +
63585 +#include <asm/pgtable.h>
63586 +#include <asm/cacheflush.h>
63587 +#include <asm/system.h>
63588 +
63589 +/* 
63590 + * Functions to keep the agpgart mappings coherent with the MMU.
63591 + * The GART gives the CPU a physical alias of pages in memory. The alias region is
63592 + * mapped uncacheable. Make sure there are no conflicting mappings
63593 + * with different cacheability attributes for the same page. This avoids
63594 + * data corruption on some CPUs.
63595 + */
63596 +
63597 +int map_page_into_agp(struct page *page);
63598 +int unmap_page_from_agp(struct page *page);
63599 +#define flush_agp_mappings() global_flush_tlb()
63600 +
63601 +/* Could use CLFLUSH here if the cpu supports it. But then it would
63602 +   need to be called for each cacheline of the whole page so it may not be 
63603 +   worth it. Would need a page for it. */
63604 +#define flush_agp_cache() wbinvd()
63605 +
63606 +/* Convert a physical address to an address suitable for the GART. */
63607 +#define phys_to_gart(x) phys_to_machine(x)
63608 +#define gart_to_phys(x) machine_to_phys(x)
63609 +
63610 +/* GATT allocation. Returns/accepts GATT kernel virtual address. */
63611 +#define alloc_gatt_pages(order)        ({                                          \
63612 +       char *_t; dma_addr_t _d;                                            \
63613 +       _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL);    \
63614 +       _t; })
63615 +#define free_gatt_pages(table, order)  \
63616 +       dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
63617 +
63618 +#endif
63619 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/desc.h linux-2.6.16/include/asm-i386/mach-xen/asm/desc.h
63620 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/desc.h      1970-01-01 01:00:00.000000000 +0100
63621 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/desc.h   2006-06-26 09:51:32.000000000 +0200
63622 @@ -0,0 +1,164 @@
63623 +#ifndef __ARCH_DESC_H
63624 +#define __ARCH_DESC_H
63625 +
63626 +#include <asm/ldt.h>
63627 +#include <asm/segment.h>
63628 +
63629 +#define CPU_16BIT_STACK_SIZE 1024
63630 +
63631 +#ifndef __ASSEMBLY__
63632 +
63633 +#include <linux/preempt.h>
63634 +#include <linux/smp.h>
63635 +
63636 +#include <asm/mmu.h>
63637 +
63638 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
63639 +
63640 +DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
63641 +
63642 +struct Xgt_desc_struct {
63643 +       unsigned short size;
63644 +       unsigned long address __attribute__((packed));
63645 +       unsigned short pad;
63646 +} __attribute__ ((packed));
63647 +
63648 +extern struct Xgt_desc_struct idt_descr;
63649 +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
63650 +
63651 +
63652 +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
63653 +{
63654 +       return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
63655 +}
63656 +
63657 +#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
63658 +#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
63659 +
63660 +#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
63661 +#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
63662 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
63663 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
63664 +
63665 +#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
63666 +#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
63667 +#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
63668 +#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
63669 +
63670 +/*
63671 + * This is the ldt that every process will get unless we need
63672 + * something other than this.
63673 + */
63674 +extern struct desc_struct default_ldt[];
63675 +extern void set_intr_gate(unsigned int irq, void * addr);
63676 +
63677 +#define _set_tssldt_desc(n,addr,limit,type) \
63678 +__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
63679 +       "movw %w1,2(%2)\n\t" \
63680 +       "rorl $16,%1\n\t" \
63681 +       "movb %b1,4(%2)\n\t" \
63682 +       "movb %4,5(%2)\n\t" \
63683 +       "movb $0,6(%2)\n\t" \
63684 +       "movb %h1,7(%2)\n\t" \
63685 +       "rorl $16,%1" \
63686 +       : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
63687 +
63688 +#ifndef CONFIG_X86_NO_TSS
63689 +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
63690 +{
63691 +       _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
63692 +               offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
63693 +}
63694 +
63695 +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
63696 +#endif
63697 +
63698 +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
63699 +{
63700 +       _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
63701 +}
63702 +
63703 +#define LDT_entry_a(info) \
63704 +       ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
63705 +
63706 +#define LDT_entry_b(info) \
63707 +       (((info)->base_addr & 0xff000000) | \
63708 +       (((info)->base_addr & 0x00ff0000) >> 16) | \
63709 +       ((info)->limit & 0xf0000) | \
63710 +       (((info)->read_exec_only ^ 1) << 9) | \
63711 +       ((info)->contents << 10) | \
63712 +       (((info)->seg_not_present ^ 1) << 15) | \
63713 +       ((info)->seg_32bit << 22) | \
63714 +       ((info)->limit_in_pages << 23) | \
63715 +       ((info)->useable << 20) | \
63716 +       0x7000)
63717 +
63718 +#define LDT_empty(info) (\
63719 +       (info)->base_addr       == 0    && \
63720 +       (info)->limit           == 0    && \
63721 +       (info)->contents        == 0    && \
63722 +       (info)->read_exec_only  == 1    && \
63723 +       (info)->seg_32bit       == 0    && \
63724 +       (info)->limit_in_pages  == 0    && \
63725 +       (info)->seg_not_present == 1    && \
63726 +       (info)->useable         == 0    )
63727 +
63728 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
63729 +
63730 +#if TLS_SIZE != 24
63731 +# error update this code.
63732 +#endif
63733 +
63734 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
63735 +{
63736 +#define C(i) HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), *(u64 *)&t->tls_array[i])
63737 +       C(0); C(1); C(2);
63738 +#undef C
63739 +}
63740 +
63741 +static inline void clear_LDT(void)
63742 +{
63743 +       int cpu = get_cpu();
63744 +
63745 +       /*
63746 +        * NB. We load the default_ldt for lcall7/27 handling on demand, as
63747 +        * it slows down context switching. No one uses it anyway.
63748 +        */
63749 +       cpu = cpu;              /* XXX avoid compiler warning */
63750 +       xen_set_ldt(0UL, 0);
63751 +       put_cpu();
63752 +}
63753 +
63754 +/*
63755 + * load one particular LDT into the current CPU
63756 + */
63757 +static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
63758 +{
63759 +       void *segments = pc->ldt;
63760 +       int count = pc->size;
63761 +
63762 +       if (likely(!count))
63763 +               segments = NULL;
63764 +
63765 +       xen_set_ldt((unsigned long)segments, count);
63766 +}
63767 +
63768 +static inline void load_LDT(mm_context_t *pc)
63769 +{
63770 +       int cpu = get_cpu();
63771 +       load_LDT_nolock(pc, cpu);
63772 +       put_cpu();
63773 +}
63774 +
63775 +static inline unsigned long get_desc_base(unsigned long *desc)
63776 +{
63777 +       unsigned long base;
63778 +       base = ((desc[0] >> 16)  & 0x0000ffff) |
63779 +               ((desc[1] << 16) & 0x00ff0000) |
63780 +               (desc[1] & 0xff000000);
63781 +       return base;
63782 +}
63783 +
63784 +#endif /* !__ASSEMBLY__ */
63785 +
63786 +#endif
63787 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/dma-mapping.h linux-2.6.16/include/asm-i386/mach-xen/asm/dma-mapping.h
63788 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/dma-mapping.h       1970-01-01 01:00:00.000000000 +0100
63789 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/dma-mapping.h    2006-06-26 09:51:32.000000000 +0200
63790 @@ -0,0 +1,152 @@
63791 +#ifndef _ASM_I386_DMA_MAPPING_H
63792 +#define _ASM_I386_DMA_MAPPING_H
63793 +
63794 +/*
63795 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
63796 + * documentation.
63797 + */
63798 +
63799 +#include <linux/config.h>
63800 +#include <linux/mm.h>
63801 +#include <asm/cache.h>
63802 +#include <asm/io.h>
63803 +#include <asm/scatterlist.h>
63804 +#include <asm/swiotlb.h>
63805 +
63806 +static inline int
63807 +address_needs_mapping(struct device *hwdev, dma_addr_t addr)
63808 +{
63809 +       dma_addr_t mask = 0xffffffff;
63810 +       /* If the device has a mask, use it, otherwise default to 32 bits */
63811 +       if (hwdev && hwdev->dma_mask)
63812 +               mask = *hwdev->dma_mask;
63813 +       return (addr & ~mask) != 0;
63814 +}
63815 +
63816 +static inline int
63817 +range_straddles_page_boundary(void *p, size_t size)
63818 +{
63819 +       extern unsigned long *contiguous_bitmap;
63820 +       return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE) &&
63821 +               !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap));
63822 +}
63823 +
63824 +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
63825 +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
63826 +
63827 +void *dma_alloc_coherent(struct device *dev, size_t size,
63828 +                          dma_addr_t *dma_handle, gfp_t flag);
63829 +
63830 +void dma_free_coherent(struct device *dev, size_t size,
63831 +                        void *vaddr, dma_addr_t dma_handle);
63832 +
63833 +extern dma_addr_t
63834 +dma_map_single(struct device *dev, void *ptr, size_t size,
63835 +              enum dma_data_direction direction);
63836 +
63837 +extern void
63838 +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
63839 +                enum dma_data_direction direction);
63840 +
63841 +extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
63842 +                     int nents, enum dma_data_direction direction);
63843 +extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
63844 +                        int nents, enum dma_data_direction direction);
63845 +
63846 +extern dma_addr_t
63847 +dma_map_page(struct device *dev, struct page *page, unsigned long offset,
63848 +            size_t size, enum dma_data_direction direction);
63849 +
63850 +extern void
63851 +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
63852 +              enum dma_data_direction direction);
63853 +
63854 +extern void
63855 +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
63856 +                       enum dma_data_direction direction);
63857 +
63858 +extern void
63859 +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
63860 +                           enum dma_data_direction direction);
63861 +
63862 +static inline void
63863 +dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
63864 +                             unsigned long offset, size_t size,
63865 +                             enum dma_data_direction direction)
63866 +{
63867 +       dma_sync_single_for_cpu(dev, dma_handle+offset, size, direction);
63868 +}
63869 +
63870 +static inline void
63871 +dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
63872 +                                unsigned long offset, size_t size,
63873 +                                enum dma_data_direction direction)
63874 +{
63875 +       dma_sync_single_for_device(dev, dma_handle+offset, size, direction);
63876 +}
63877 +
63878 +static inline void
63879 +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
63880 +                   enum dma_data_direction direction)
63881 +{
63882 +       if (swiotlb)
63883 +               swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction);
63884 +       flush_write_buffers();
63885 +}
63886 +
63887 +static inline void
63888 +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
63889 +                   enum dma_data_direction direction)
63890 +{
63891 +       if (swiotlb)
63892 +               swiotlb_sync_sg_for_device(dev,sg,nelems,direction);
63893 +       flush_write_buffers();
63894 +}
63895 +
63896 +extern int
63897 +dma_mapping_error(dma_addr_t dma_addr);
63898 +
63899 +extern int
63900 +dma_supported(struct device *dev, u64 mask);
63901 +
63902 +static inline int
63903 +dma_set_mask(struct device *dev, u64 mask)
63904 +{
63905 +       if(!dev->dma_mask || !dma_supported(dev, mask))
63906 +               return -EIO;
63907 +
63908 +       *dev->dma_mask = mask;
63909 +
63910 +       return 0;
63911 +}
63912 +
63913 +static inline int
63914 +dma_get_cache_alignment(void)
63915 +{
63916 +       /* no easy way to get cache size on all x86, so return the
63917 +        * maximum possible, to be safe */
63918 +       return (1 << INTERNODE_CACHE_SHIFT);
63919 +}
63920 +
63921 +#define dma_is_consistent(d)   (1)
63922 +
63923 +static inline void
63924 +dma_cache_sync(void *vaddr, size_t size,
63925 +              enum dma_data_direction direction)
63926 +{
63927 +       flush_write_buffers();
63928 +}
63929 +
63930 +#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY
63931 +extern int
63932 +dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
63933 +                           dma_addr_t device_addr, size_t size, int flags);
63934 +
63935 +extern void
63936 +dma_release_declared_memory(struct device *dev);
63937 +
63938 +extern void *
63939 +dma_mark_declared_memory_occupied(struct device *dev,
63940 +                                 dma_addr_t device_addr, size_t size);
63941 +
63942 +#endif
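
/* A minimal streaming-mapping sketch against the declarations above;
 * 'dev', 'buf' and 'len' are assumed to come from driver context: */
static int dma_sketch(struct device *dev, void *buf, size_t len)
{
	dma_addr_t bus;

	bus = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(bus))
		return -EIO;
	/* ... hand 'bus' to the hardware and wait for completion ... */
	dma_unmap_single(dev, bus, len, DMA_TO_DEVICE);
	return 0;
}
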
63943 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/fixmap.h linux-2.6.16/include/asm-i386/mach-xen/asm/fixmap.h
63944 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/fixmap.h    1970-01-01 01:00:00.000000000 +0100
63945 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/fixmap.h 2006-06-26 09:51:32.000000000 +0200
63946 @@ -0,0 +1,156 @@
63947 +/*
63948 + * fixmap.h: compile-time virtual memory allocation
63949 + *
63950 + * This file is subject to the terms and conditions of the GNU General Public
63951 + * License.  See the file "COPYING" in the main directory of this archive
63952 + * for more details.
63953 + *
63954 + * Copyright (C) 1998 Ingo Molnar
63955 + *
63956 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
63957 + */
63958 +
63959 +#ifndef _ASM_FIXMAP_H
63960 +#define _ASM_FIXMAP_H
63961 +
63962 +#include <linux/config.h>
63963 +
63964 +/* used by vmalloc.c, vsyscall.lds.S.
63965 + *
63966 + * Leave one empty page between vmalloc'ed areas and
63967 + * the start of the fixmap.
63968 + */
63969 +extern unsigned long __FIXADDR_TOP;
63970 +
63971 +#ifndef __ASSEMBLY__
63972 +#include <linux/kernel.h>
63973 +#include <asm/acpi.h>
63974 +#include <asm/apicdef.h>
63975 +#include <asm/page.h>
63976 +#include <xen/gnttab.h>
63977 +#ifdef CONFIG_HIGHMEM
63978 +#include <linux/threads.h>
63979 +#include <asm/kmap_types.h>
63980 +#endif
63981 +
63982 +/*
63983 + * Here we define all the compile-time 'special' virtual
63984 + * addresses. The point is to have a constant address at
63985 + * compile time, but to set the physical address only
63986 + * in the boot process. We allocate these special addresses
63987 + * from the end of virtual memory (0xfffff000) backwards.
63988 + * Also this lets us do fail-safe vmalloc(): we
63989 + * can guarantee that these special addresses and
63990 + * vmalloc()-ed addresses never overlap.
63991 + *
63992 + * these 'compile-time allocated' memory buffers are
63993 + * fixed-size 4k pages. (or larger if used with an increment
63994 + * higher than 1) use set_fixmap(idx,phys) to associate
63995 + * physical memory with fixmap indices.
63996 + *
63997 + * TLB entries of such buffers will not be flushed across
63998 + * task switches.
63999 + */
64000 +enum fixed_addresses {
64001 +       FIX_HOLE,
64002 +#ifdef CONFIG_X86_LOCAL_APIC
64003 +       FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
64004 +#endif
64005 +#ifdef CONFIG_X86_IO_APIC
64006 +       FIX_IO_APIC_BASE_0,
64007 +       FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
64008 +#endif
64009 +#ifdef CONFIG_X86_VISWS_APIC
64010 +       FIX_CO_CPU,     /* Cobalt timer */
64011 +       FIX_CO_APIC,    /* Cobalt APIC Redirection Table */ 
64012 +       FIX_LI_PCIA,    /* Lithium PCI Bridge A */
64013 +       FIX_LI_PCIB,    /* Lithium PCI Bridge B */
64014 +#endif
64015 +#ifdef CONFIG_X86_F00F_BUG
64016 +       FIX_F00F_IDT,   /* Virtual mapping for IDT */
64017 +#endif
64018 +#ifdef CONFIG_X86_CYCLONE_TIMER
64019 +       FIX_CYCLONE_TIMER, /*cyclone timer register*/
64020 +#endif 
64021 +#ifdef CONFIG_HIGHMEM
64022 +       FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
64023 +       FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
64024 +#endif
64025 +#ifdef CONFIG_ACPI
64026 +       FIX_ACPI_BEGIN,
64027 +       FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
64028 +#endif
64029 +#ifdef CONFIG_PCI_MMCONFIG
64030 +       FIX_PCIE_MCFG,
64031 +#endif
64032 +       FIX_SHARED_INFO,
64033 +#define NR_FIX_ISAMAPS 256
64034 +       FIX_ISAMAP_END,
64035 +       FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
64036 +       __end_of_permanent_fixed_addresses,
64037 +       /* temporary boot-time mappings, used before ioremap() is functional */
64038 +#define NR_FIX_BTMAPS  16
64039 +       FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
64040 +       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
64041 +       FIX_WP_TEST,
64042 +       __end_of_fixed_addresses
64043 +};
64044 +
64045 +extern void __set_fixmap(enum fixed_addresses idx,
64046 +                                       maddr_t phys, pgprot_t flags);
64047 +
64048 +extern void set_fixaddr_top(unsigned long top);
64049 +
64050 +#define set_fixmap(idx, phys) \
64051 +               __set_fixmap(idx, phys, PAGE_KERNEL)
64052 +/*
64053 + * Some hardware wants to get fixmapped without caching.
64054 + */
64055 +#define set_fixmap_nocache(idx, phys) \
64056 +               __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
64057 +
64058 +#define clear_fixmap(idx) \
64059 +               __set_fixmap(idx, 0, __pgprot(0))
64060 +
64061 +#define FIXADDR_TOP    ((unsigned long)__FIXADDR_TOP)
64062 +
64063 +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
64064 +#define __FIXADDR_BOOT_SIZE    (__end_of_fixed_addresses << PAGE_SHIFT)
64065 +#define FIXADDR_START          (FIXADDR_TOP - __FIXADDR_SIZE)
64066 +#define FIXADDR_BOOT_START     (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
64067 +
64068 +#define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
64069 +#define __virt_to_fix(x)       ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
64070 +
64071 +extern void __this_fixmap_does_not_exist(void);
64072 +
64073 +/*
64074 + * 'index to address' translation. If anyone tries to use the idx
64075 + * directly without translation, we catch the bug with a NULL-dereference
64076 + * kernel oops. Illegal ranges of incoming indices are caught too.
64077 + */
64078 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
64079 +{
64080 +       /*
64081 +        * this branch gets completely eliminated after inlining,
64082 +        * except when someone tries to use fixaddr indices in an
64083 +        * illegal way. (such as mixing up address types or using
64084 +        * out-of-range indices).
64085 +        *
64086 +        * If it doesn't get removed, the linker will complain
64087 +        * loudly with a reasonably clear error message.
64088 +        */
64089 +       if (idx >= __end_of_fixed_addresses)
64090 +               __this_fixmap_does_not_exist();
64091 +
64092 +        return __fix_to_virt(idx);
64093 +}
64094 +
64095 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
64096 +{
64097 +       BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
64098 +       return __virt_to_fix(vaddr);
64099 +}
64100 +
64101 +#endif /* !__ASSEMBLY__ */
64102 +#endif
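
/* An illustrative use of the slots above: bind a machine frame to the
 * Xen shared-info slot and take its fixed virtual address ('maddr' is
 * an assumed maddr_t value; fix_to_virt() folds to a constant): */
static unsigned long map_shared_info_sketch(maddr_t maddr)
{
	set_fixmap(FIX_SHARED_INFO, maddr);
	return fix_to_virt(FIX_SHARED_INFO);
}
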
64103 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/floppy.h linux-2.6.16/include/asm-i386/mach-xen/asm/floppy.h
64104 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/floppy.h    1970-01-01 01:00:00.000000000 +0100
64105 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/floppy.h 2006-06-26 09:51:32.000000000 +0200
64106 @@ -0,0 +1,147 @@
64107 +/*
64108 + * Architecture specific parts of the Floppy driver
64109 + *
64110 + * This file is subject to the terms and conditions of the GNU General Public
64111 + * License.  See the file "COPYING" in the main directory of this archive
64112 + * for more details.
64113 + *
64114 + * Copyright (C) 1995
64115 + *
64116 + * Modifications for Xen are Copyright (c) 2004, Keir Fraser.
64117 + */
64118 +#ifndef __ASM_XEN_I386_FLOPPY_H
64119 +#define __ASM_XEN_I386_FLOPPY_H
64120 +
64121 +#include <linux/vmalloc.h>
64122 +
64123 +/* XEN: Hit DMA paths on the head. This trick from asm-m68k/floppy.h. */
64124 +#include <asm/dma.h>
64125 +#undef MAX_DMA_ADDRESS
64126 +#define MAX_DMA_ADDRESS 0
64127 +#define CROSS_64KB(a,s) (0)
64128 +
64129 +#define fd_inb(port)                   inb_p(port)
64130 +#define fd_outb(value,port)            outb_p(value,port)
64131 +
64132 +#define fd_request_dma()        (0)
64133 +#define fd_free_dma()           ((void)0)
64134 +#define fd_enable_irq()         enable_irq(FLOPPY_IRQ)
64135 +#define fd_disable_irq()        disable_irq(FLOPPY_IRQ)
64136 +#define fd_free_irq()          free_irq(FLOPPY_IRQ, NULL)
64137 +#define fd_get_dma_residue()    (virtual_dma_count + virtual_dma_residue)
64138 +#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io)
64139 +/*
64140 + * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from
64141 + * softirq context via motor_off_callback. A generic bug we happen to trigger.
64142 + */
64143 +#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL, get_order(size))
64144 +#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
64145 +
64146 +static int virtual_dma_count;
64147 +static int virtual_dma_residue;
64148 +static char *virtual_dma_addr;
64149 +static int virtual_dma_mode;
64150 +static int doing_pdma;
64151 +
64152 +static irqreturn_t floppy_hardint(int irq, void *dev_id, struct pt_regs * regs)
64153 +{
64154 +       register unsigned char st;
64155 +       register int lcount;
64156 +       register char *lptr;
64157 +
64158 +       if (!doing_pdma)
64159 +               return floppy_interrupt(irq, dev_id, regs);
64160 +
64161 +       st = 1;
64162 +       for(lcount=virtual_dma_count, lptr=virtual_dma_addr; 
64163 +           lcount; lcount--, lptr++) {
64164 +               st=inb(virtual_dma_port+4) & 0xa0 ;
64165 +               if(st != 0xa0) 
64166 +                       break;
64167 +               if(virtual_dma_mode)
64168 +                       outb_p(*lptr, virtual_dma_port+5);
64169 +               else
64170 +                       *lptr = inb_p(virtual_dma_port+5);
64171 +       }
64172 +       virtual_dma_count = lcount;
64173 +       virtual_dma_addr = lptr;
64174 +       st = inb(virtual_dma_port+4);
64175 +
64176 +       if(st == 0x20)
64177 +               return IRQ_HANDLED;
64178 +       if(!(st & 0x20)) {
64179 +               virtual_dma_residue += virtual_dma_count;
64180 +               virtual_dma_count=0;
64181 +               doing_pdma = 0;
64182 +               floppy_interrupt(irq, dev_id, regs);
64183 +               return IRQ_HANDLED;
64184 +       }
64185 +       return IRQ_HANDLED;
64186 +}
64187 +
64188 +static void fd_disable_dma(void)
64189 +{
64190 +       doing_pdma = 0;
64191 +       virtual_dma_residue += virtual_dma_count;
64192 +       virtual_dma_count=0;
64193 +}
64194 +
64195 +static int fd_request_irq(void)
64196 +{
64197 +       return request_irq(FLOPPY_IRQ, floppy_hardint,SA_INTERRUPT,
64198 +                                          "floppy", NULL);
64199 +}
64200 +
64201 +static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
64202 +{
64203 +       doing_pdma = 1;
64204 +       virtual_dma_port = io;
64205 +       virtual_dma_mode = (mode  == DMA_MODE_WRITE);
64206 +       virtual_dma_addr = addr;
64207 +       virtual_dma_count = size;
64208 +       virtual_dma_residue = 0;
64209 +       return 0;
64210 +}
64211 +
64212 +/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */
64213 +#define FDC1 xen_floppy_init()
64214 +static int FDC2 = -1;
64215 +
64216 +static int xen_floppy_init(void)
64217 +{
64218 +       use_virtual_dma = 1;
64219 +       can_use_virtual_dma = 1;
64220 +       return 0x3f0;
64221 +}
64222 +
64223 +/*
64224 + * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
64225 + * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
64226 + * coincides with another rtc CMOS user.               Paul G.
64227 + */
64228 +#define FLOPPY0_TYPE   ({                              \
64229 +       unsigned long flags;                            \
64230 +       unsigned char val;                              \
64231 +       spin_lock_irqsave(&rtc_lock, flags);            \
64232 +       val = (CMOS_READ(0x10) >> 4) & 15;              \
64233 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
64234 +       val;                                            \
64235 +})
64236 +
64237 +#define FLOPPY1_TYPE   ({                              \
64238 +       unsigned long flags;                            \
64239 +       unsigned char val;                              \
64240 +       spin_lock_irqsave(&rtc_lock, flags);            \
64241 +       val = CMOS_READ(0x10) & 15;                     \
64242 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
64243 +       val;                                            \
64244 +})
64245 +
64246 +#define N_FDC 2
64247 +#define N_DRIVE 8
64248 +
64249 +#define FLOPPY_MOTOR_MASK 0xf0
64250 +
64251 +#define EXTRA_FLOPPY_PARAMS
64252 +
64253 +#endif /* __ASM_XEN_I386_FLOPPY_H */
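Context for the header above: it stubs out real ISA DMA (MAX_DMA_ADDRESS is forced to 0) so the floppy driver always takes its virtual-DMA path, in which floppy_hardint() moves every byte by programmed I/O. A minimal sketch of that handshake, assuming only the port layout used above (base+4 is the FDC main status register, base+5 the data FIFO; the helper name is hypothetical, not part of the patch):

        /* Move one byte over the FDC data FIFO; returns -1 when the
         * status bits (RQM|DIO, the 0xa0 mask above) say "not ready". */
        static int fd_pio_byte(int base, int writing, unsigned char *byte)
        {
                unsigned char st = inb(base + 4) & 0xa0;

                if (st != 0xa0)
                        return -1;
                if (writing)
                        outb_p(*byte, base + 5);
                else
                        *byte = inb_p(base + 5);
                return 0;
        }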
64254 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/highmem.h linux-2.6.16/include/asm-i386/mach-xen/asm/highmem.h
64255 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/highmem.h   1970-01-01 01:00:00.000000000 +0100
64256 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/highmem.h        2006-06-26 09:51:32.000000000 +0200
64257 @@ -0,0 +1,81 @@
64258 +/*
64259 + * highmem.h: virtual kernel memory mappings for high memory
64260 + *
64261 + * Used in CONFIG_HIGHMEM systems for memory pages which
64262 + * are not addressable by direct kernel virtual addresses.
64263 + *
64264 + * Copyright (C) 1999 Gerhard Wichert, Siemens AG
64265 + *                   Gerhard.Wichert@pdb.siemens.de
64266 + *
64267 + *
64268 + * Redesigned the x86 32-bit VM architecture to deal with 
64269 + * up to 16 Terabyte physical memory. With current x86 CPUs
64270 + * we now support up to 64 Gigabytes physical RAM.
64271 + *
64272 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
64273 + */
64274 +
64275 +#ifndef _ASM_HIGHMEM_H
64276 +#define _ASM_HIGHMEM_H
64277 +
64278 +#ifdef __KERNEL__
64279 +
64280 +#include <linux/config.h>
64281 +#include <linux/interrupt.h>
64282 +#include <linux/threads.h>
64283 +#include <asm/kmap_types.h>
64284 +#include <asm/tlbflush.h>
64285 +
64286 +/* declarations for highmem.c */
64287 +extern unsigned long highstart_pfn, highend_pfn;
64288 +
64289 +extern pte_t *kmap_pte;
64290 +extern pgprot_t kmap_prot;
64291 +extern pte_t *pkmap_page_table;
64292 +
64293 +/*
64294 + * Right now we initialize only a single pte table. It can be extended
64295 + * easily, subsequent pte tables have to be allocated in one physical
64296 + * chunk of RAM.
64297 + */
64298 +#ifdef CONFIG_X86_PAE
64299 +#define LAST_PKMAP 512
64300 +#else
64301 +#define LAST_PKMAP 1024
64302 +#endif
64303 +/*
64304 + * Ordering is:
64305 + *
64306 + * FIXADDR_TOP
64307 + *                     fixed_addresses
64308 + * FIXADDR_START
64309 + *                     temp fixed addresses
64310 + * FIXADDR_BOOT_START
64311 + *                     Persistent kmap area
64312 + * PKMAP_BASE
64313 + * VMALLOC_END
64314 + *                     Vmalloc area
64315 + * VMALLOC_START
64316 + * high_memory
64317 + */
64318 +#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK )
64319 +#define LAST_PKMAP_MASK (LAST_PKMAP-1)
64320 +#define PKMAP_NR(virt)  ((virt-PKMAP_BASE) >> PAGE_SHIFT)
64321 +#define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
64322 +
64323 +extern void * FASTCALL(kmap_high(struct page *page));
64324 +extern void FASTCALL(kunmap_high(struct page *page));
64325 +
64326 +void *kmap(struct page *page);
64327 +void kunmap(struct page *page);
64328 +void *kmap_atomic(struct page *page, enum km_type type);
64329 +void *kmap_atomic_pte(struct page *page, enum km_type type);
64330 +void kunmap_atomic(void *kvaddr, enum km_type type);
64331 +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
64332 +struct page *kmap_atomic_to_page(void *ptr);
64333 +
64334 +#define flush_cache_kmaps()    do { } while (0)
64335 +
64336 +#endif /* __KERNEL__ */
64337 +
64338 +#endif /* _ASM_HIGHMEM_H */
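As a usage illustration of the atomic kmap interface declared above (the function name and the KM_USER0 slot choice are illustrative, not part of the patch):

        /* Copy a buffer into a possibly-highmem page through a fixmap pte. */
        static void copy_buf_to_page(struct page *page, const void *src, size_t len)
        {
                char *vaddr = kmap_atomic(page, KM_USER0);

                memcpy(vaddr, src, len);
                kunmap_atomic(vaddr, KM_USER0);
        }

kmap() may sleep and is meant for longer-lived mappings; the kmap_atomic() slots are per-CPU and must be released before anything else can reuse the same km_type slot.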
64339 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/hw_irq.h linux-2.6.16/include/asm-i386/mach-xen/asm/hw_irq.h
64340 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/hw_irq.h    1970-01-01 01:00:00.000000000 +0100
64341 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/hw_irq.h 2006-06-26 09:51:32.000000000 +0200
64342 @@ -0,0 +1,73 @@
64343 +#ifndef _ASM_HW_IRQ_H
64344 +#define _ASM_HW_IRQ_H
64345 +
64346 +/*
64347 + *     linux/include/asm/hw_irq.h
64348 + *
64349 + *     (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
64350 + *
64351 + *     moved some of the old arch/i386/kernel/irq.h to here. VY
64352 + *
64353 + *     IRQ/IPI changes taken from work by Thomas Radke
64354 + *     <tomsoft@informatik.tu-chemnitz.de>
64355 + */
64356 +
64357 +#include <linux/config.h>
64358 +#include <linux/profile.h>
64359 +#include <asm/atomic.h>
64360 +#include <asm/irq.h>
64361 +#include <asm/sections.h>
64362 +
64363 +struct hw_interrupt_type;
64364 +
64365 +/*
64366 + * Various low-level irq details needed by irq.c, process.c,
64367 + * time.c, io_apic.c and smp.c
64368 + *
64369 + * Interrupt entry/exit code at both C and assembly level
64370 + */
64371 +
64372 +extern u8 irq_vector[NR_IRQ_VECTORS];
64373 +#define IO_APIC_VECTOR(irq)    (irq_vector[irq])
64374 +#define AUTO_ASSIGN            -1
64375 +
64376 +extern void (*interrupt[NR_IRQS])(void);
64377 +
64378 +#ifdef CONFIG_SMP
64379 +fastcall void reschedule_interrupt(void);
64380 +fastcall void invalidate_interrupt(void);
64381 +fastcall void call_function_interrupt(void);
64382 +#endif
64383 +
64384 +#ifdef CONFIG_X86_LOCAL_APIC
64385 +fastcall void apic_timer_interrupt(void);
64386 +fastcall void error_interrupt(void);
64387 +fastcall void spurious_interrupt(void);
64388 +fastcall void thermal_interrupt(struct pt_regs *);
64389 +#define platform_legacy_irq(irq)       ((irq) < 16)
64390 +#endif
64391 +
64392 +void disable_8259A_irq(unsigned int irq);
64393 +void enable_8259A_irq(unsigned int irq);
64394 +int i8259A_irq_pending(unsigned int irq);
64395 +void make_8259A_irq(unsigned int irq);
64396 +void init_8259A(int aeoi);
64397 +void FASTCALL(send_IPI_self(int vector));
64398 +void init_VISWS_APIC_irqs(void);
64399 +void setup_IO_APIC(void);
64400 +void disable_IO_APIC(void);
64401 +void print_IO_APIC(void);
64402 +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
64403 +void send_IPI(int dest, int vector);
64404 +void setup_ioapic_dest(void);
64405 +
64406 +extern unsigned long io_apic_irqs;
64407 +
64408 +extern atomic_t irq_err_count;
64409 +extern atomic_t irq_mis_count;
64410 +
64411 +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
64412 +
64413 +extern void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i);
64414 +
64415 +#endif /* _ASM_HW_IRQ_H */
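io_apic_irqs above is a bitmask of the legacy IRQs (0-15) that are routed through the IO-APIC rather than the 8259A; IRQ numbers 16 and up exist only on the IO-APIC. Restated as a function for clarity (hypothetical helper, equivalent to the IO_APIC_IRQ() macro above):

        static inline int irq_on_ioapic(unsigned int irq)
        {
                return irq >= 16 || ((1UL << irq) & io_apic_irqs);
        }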
64416 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/hypercall.h linux-2.6.16/include/asm-i386/mach-xen/asm/hypercall.h
64417 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/hypercall.h 1970-01-01 01:00:00.000000000 +0100
64418 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/hypercall.h      2006-06-26 09:51:32.000000000 +0200
64419 @@ -0,0 +1,342 @@
64420 +/******************************************************************************
64421 + * hypercall.h
64422 + * 
64423 + * Linux-specific hypervisor handling.
64424 + * 
64425 + * Copyright (c) 2002-2004, K A Fraser
64426 + * 
64427 + * This program is free software; you can redistribute it and/or
64428 + * modify it under the terms of the GNU General Public License version 2
64429 + * as published by the Free Software Foundation; or, when distributed
64430 + * separately from the Linux kernel or incorporated into other
64431 + * software packages, subject to the following license:
64432 + * 
64433 + * Permission is hereby granted, free of charge, to any person obtaining a copy
64434 + * of this source file (the "Software"), to deal in the Software without
64435 + * restriction, including without limitation the rights to use, copy, modify,
64436 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
64437 + * and to permit persons to whom the Software is furnished to do so, subject to
64438 + * the following conditions:
64439 + * 
64440 + * The above copyright notice and this permission notice shall be included in
64441 + * all copies or substantial portions of the Software.
64442 + * 
64443 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64444 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
64445 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64446 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
64447 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
64448 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
64449 + * IN THE SOFTWARE.
64450 + */
64451 +
64452 +#ifndef __HYPERCALL_H__
64453 +#define __HYPERCALL_H__
64454 +
64455 +#ifndef __HYPERVISOR_H__
64456 +# error "please don't include this file directly"
64457 +#endif
64458 +
64459 +#define __STR(x) #x
64460 +#define STR(x) __STR(x)
64461 +
64462 +#define _hypercall0(type, name)                        \
64463 +({                                             \
64464 +       long __res;                             \
64465 +       asm volatile (                          \
64466 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
64467 +               : "=a" (__res)                  \
64468 +               :                               \
64469 +               : "memory" );                   \
64470 +       (type)__res;                            \
64471 +})
64472 +
64473 +#define _hypercall1(type, name, a1)                            \
64474 +({                                                             \
64475 +       long __res, __ign1;                                     \
64476 +       asm volatile (                                          \
64477 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
64478 +               : "=a" (__res), "=b" (__ign1)                   \
64479 +               : "1" ((long)(a1))                              \
64480 +               : "memory" );                                   \
64481 +       (type)__res;                                            \
64482 +})
64483 +
64484 +#define _hypercall2(type, name, a1, a2)                                \
64485 +({                                                             \
64486 +       long __res, __ign1, __ign2;                             \
64487 +       asm volatile (                                          \
64488 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
64489 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2)    \
64490 +               : "1" ((long)(a1)), "2" ((long)(a2))            \
64491 +               : "memory" );                                   \
64492 +       (type)__res;                                            \
64493 +})
64494 +
64495 +#define _hypercall3(type, name, a1, a2, a3)                    \
64496 +({                                                             \
64497 +       long __res, __ign1, __ign2, __ign3;                     \
64498 +       asm volatile (                                          \
64499 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
64500 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),   \
64501 +               "=d" (__ign3)                                   \
64502 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
64503 +               "3" ((long)(a3))                                \
64504 +               : "memory" );                                   \
64505 +       (type)__res;                                            \
64506 +})
64507 +
64508 +#define _hypercall4(type, name, a1, a2, a3, a4)                        \
64509 +({                                                             \
64510 +       long __res, __ign1, __ign2, __ign3, __ign4;             \
64511 +       asm volatile (                                          \
64512 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
64513 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),   \
64514 +               "=d" (__ign3), "=S" (__ign4)                    \
64515 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
64516 +               "3" ((long)(a3)), "4" ((long)(a4))              \
64517 +               : "memory" );                                   \
64518 +       (type)__res;                                            \
64519 +})
64520 +
64521 +#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
64522 +({                                                             \
64523 +       long __res, __ign1, __ign2, __ign3, __ign4, __ign5;     \
64524 +       asm volatile (                                          \
64525 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
64526 +               : "=a" (__res), "=b" (__ign1), "=c" (__ign2),   \
64527 +               "=d" (__ign3), "=S" (__ign4), "=D" (__ign5)     \
64528 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
64529 +               "3" ((long)(a3)), "4" ((long)(a4)),             \
64530 +               "5" ((long)(a5))                                \
64531 +               : "memory" );                                   \
64532 +       (type)__res;                                            \
64533 +})
64534 +
64535 +static inline int
64536 +HYPERVISOR_set_trap_table(
64537 +       trap_info_t *table)
64538 +{
64539 +       return _hypercall1(int, set_trap_table, table);
64540 +}
64541 +
64542 +static inline int
64543 +HYPERVISOR_mmu_update(
64544 +       mmu_update_t *req, int count, int *success_count, domid_t domid)
64545 +{
64546 +       return _hypercall4(int, mmu_update, req, count, success_count, domid);
64547 +}
64548 +
64549 +static inline int
64550 +HYPERVISOR_mmuext_op(
64551 +       struct mmuext_op *op, int count, int *success_count, domid_t domid)
64552 +{
64553 +       return _hypercall4(int, mmuext_op, op, count, success_count, domid);
64554 +}
64555 +
64556 +static inline int
64557 +HYPERVISOR_set_gdt(
64558 +       unsigned long *frame_list, int entries)
64559 +{
64560 +       return _hypercall2(int, set_gdt, frame_list, entries);
64561 +}
64562 +
64563 +static inline int
64564 +HYPERVISOR_stack_switch(
64565 +       unsigned long ss, unsigned long esp)
64566 +{
64567 +       return _hypercall2(int, stack_switch, ss, esp);
64568 +}
64569 +
64570 +static inline int
64571 +HYPERVISOR_set_callbacks(
64572 +       unsigned long event_selector, unsigned long event_address,
64573 +       unsigned long failsafe_selector, unsigned long failsafe_address)
64574 +{
64575 +       return _hypercall4(int, set_callbacks,
64576 +                          event_selector, event_address,
64577 +                          failsafe_selector, failsafe_address);
64578 +}
64579 +
64580 +static inline int
64581 +HYPERVISOR_fpu_taskswitch(
64582 +       int set)
64583 +{
64584 +       return _hypercall1(int, fpu_taskswitch, set);
64585 +}
64586 +
64587 +static inline int
64588 +HYPERVISOR_sched_op_compat(
64589 +       int cmd, unsigned long arg)
64590 +{
64591 +       return _hypercall2(int, sched_op_compat, cmd, arg);
64592 +}
64593 +
64594 +static inline int
64595 +HYPERVISOR_sched_op(
64596 +       int cmd, void *arg)
64597 +{
64598 +       return _hypercall2(int, sched_op, cmd, arg);
64599 +}
64600 +
64601 +static inline long
64602 +HYPERVISOR_set_timer_op(
64603 +       u64 timeout)
64604 +{
64605 +       unsigned long timeout_hi = (unsigned long)(timeout>>32);
64606 +       unsigned long timeout_lo = (unsigned long)timeout;
64607 +       return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
64608 +}
64609 +
64610 +static inline int
64611 +HYPERVISOR_dom0_op(
64612 +       dom0_op_t *dom0_op)
64613 +{
64614 +       dom0_op->interface_version = DOM0_INTERFACE_VERSION;
64615 +       return _hypercall1(int, dom0_op, dom0_op);
64616 +}
64617 +
64618 +static inline int
64619 +HYPERVISOR_set_debugreg(
64620 +       int reg, unsigned long value)
64621 +{
64622 +       return _hypercall2(int, set_debugreg, reg, value);
64623 +}
64624 +
64625 +static inline unsigned long
64626 +HYPERVISOR_get_debugreg(
64627 +       int reg)
64628 +{
64629 +       return _hypercall1(unsigned long, get_debugreg, reg);
64630 +}
64631 +
64632 +static inline int
64633 +HYPERVISOR_update_descriptor(
64634 +       u64 ma, u64 desc)
64635 +{
64636 +       return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
64637 +}
64638 +
64639 +static inline int
64640 +HYPERVISOR_memory_op(
64641 +       unsigned int cmd, void *arg)
64642 +{
64643 +       return _hypercall2(int, memory_op, cmd, arg);
64644 +}
64645 +
64646 +static inline int
64647 +HYPERVISOR_multicall(
64648 +       void *call_list, int nr_calls)
64649 +{
64650 +       return _hypercall2(int, multicall, call_list, nr_calls);
64651 +}
64652 +
64653 +static inline int
64654 +HYPERVISOR_update_va_mapping(
64655 +       unsigned long va, pte_t new_val, unsigned long flags)
64656 +{
64657 +       unsigned long pte_hi = 0;
64658 +#ifdef CONFIG_X86_PAE
64659 +       pte_hi = new_val.pte_high;
64660 +#endif
64661 +       return _hypercall4(int, update_va_mapping, va,
64662 +                          new_val.pte_low, pte_hi, flags);
64663 +}
64664 +
64665 +static inline int
64666 +HYPERVISOR_event_channel_op(
64667 +       void *op)
64668 +{
64669 +       return _hypercall1(int, event_channel_op, op);
64670 +}
64671 +
64672 +static inline int
64673 +HYPERVISOR_xen_version(
64674 +       int cmd, void *arg)
64675 +{
64676 +       return _hypercall2(int, xen_version, cmd, arg);
64677 +}
64678 +
64679 +static inline int
64680 +HYPERVISOR_console_io(
64681 +       int cmd, int count, char *str)
64682 +{
64683 +       return _hypercall3(int, console_io, cmd, count, str);
64684 +}
64685 +
64686 +static inline int
64687 +HYPERVISOR_physdev_op(
64688 +       void *physdev_op)
64689 +{
64690 +       return _hypercall1(int, physdev_op, physdev_op);
64691 +}
64692 +
64693 +static inline int
64694 +HYPERVISOR_grant_table_op(
64695 +       unsigned int cmd, void *uop, unsigned int count)
64696 +{
64697 +       return _hypercall3(int, grant_table_op, cmd, uop, count);
64698 +}
64699 +
64700 +static inline int
64701 +HYPERVISOR_update_va_mapping_otherdomain(
64702 +       unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
64703 +{
64704 +       unsigned long pte_hi = 0;
64705 +#ifdef CONFIG_X86_PAE
64706 +       pte_hi = new_val.pte_high;
64707 +#endif
64708 +       return _hypercall5(int, update_va_mapping_otherdomain, va,
64709 +                          new_val.pte_low, pte_hi, flags, domid);
64710 +}
64711 +
64712 +static inline int
64713 +HYPERVISOR_vm_assist(
64714 +       unsigned int cmd, unsigned int type)
64715 +{
64716 +       return _hypercall2(int, vm_assist, cmd, type);
64717 +}
64718 +
64719 +static inline int
64720 +HYPERVISOR_vcpu_op(
64721 +       int cmd, int vcpuid, void *extra_args)
64722 +{
64723 +       return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
64724 +}
64725 +
64726 +static inline int
64727 +HYPERVISOR_suspend(
64728 +       unsigned long srec)
64729 +{
64730 +       struct sched_shutdown sched_shutdown = {
64731 +               .reason = SHUTDOWN_suspend
64732 +       };
64733 +
64734 +       int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
64735 +                            &sched_shutdown, srec);
64736 +
64737 +       if (rc == -ENOSYS)
64738 +               rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
64739 +                                SHUTDOWN_suspend, srec);
64740 +
64741 +       return rc;
64742 +}
64743 +
64744 +static inline int
64745 +HYPERVISOR_nmi_op(
64746 +       unsigned long op, void *arg)
64747 +{
64748 +       return _hypercall2(int, nmi_op, op, arg);
64749 +}
64750 +
64751 +#endif /* __HYPERCALL_H__ */
64752 +
64753 +/*
64754 + * Local variables:
64755 + *  c-file-style: "linux"
64756 + *  indent-tabs-mode: t
64757 + *  c-indent-level: 8
64758 + *  c-basic-offset: 8
64759 + *  tab-width: 8
64760 + * End:
64761 + */
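To make the macro machinery concrete: STR(__HYPERVISOR_##name) stringifies the numeric hypercall index from xen/interface/xen.h, so each wrapper calls the 32-byte stub that Xen installs at that offset inside hypercall_page. Expanded by hand for HYPERVISOR_xen_version(), assuming the Xen 3.0 numbering where __HYPERVISOR_xen_version is 17, the wrapper is roughly:

        static inline int xen_version_expanded(int cmd, void *arg)
        {
                long res, ign1, ign2;

                asm volatile (
                        "call hypercall_page + (17 * 32)"       /* 17 == __HYPERVISOR_xen_version */
                        : "=a" (res), "=b" (ign1), "=c" (ign2)  /* result comes back in %eax */
                        : "1" ((long)cmd), "2" ((long)arg)      /* args go in %ebx, %ecx */
                        : "memory");
                return (int)res;
        }

The "memory" clobber keeps the compiler from caching values across the hypercall, since Xen may modify guest memory (for example the shared info page) before returning.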
64762 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/hypervisor.h linux-2.6.16/include/asm-i386/mach-xen/asm/hypervisor.h
64763 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/hypervisor.h        1970-01-01 01:00:00.000000000 +0100
64764 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/hypervisor.h     2006-06-26 09:51:32.000000000 +0200
64765 @@ -0,0 +1,224 @@
64766 +/******************************************************************************
64767 + * hypervisor.h
64768 + * 
64769 + * Linux-specific hypervisor handling.
64770 + * 
64771 + * Copyright (c) 2002-2004, K A Fraser
64772 + * 
64773 + * This program is free software; you can redistribute it and/or
64774 + * modify it under the terms of the GNU General Public License version 2
64775 + * as published by the Free Software Foundation; or, when distributed
64776 + * separately from the Linux kernel or incorporated into other
64777 + * software packages, subject to the following license:
64778 + * 
64779 + * Permission is hereby granted, free of charge, to any person obtaining a copy
64780 + * of this source file (the "Software"), to deal in the Software without
64781 + * restriction, including without limitation the rights to use, copy, modify,
64782 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
64783 + * and to permit persons to whom the Software is furnished to do so, subject to
64784 + * the following conditions:
64785 + * 
64786 + * The above copyright notice and this permission notice shall be included in
64787 + * all copies or substantial portions of the Software.
64788 + * 
64789 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64790 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
64791 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
64792 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
64793 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
64794 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
64795 + * IN THE SOFTWARE.
64796 + */
64797 +
64798 +#ifndef __HYPERVISOR_H__
64799 +#define __HYPERVISOR_H__
64800 +
64801 +#include <linux/config.h>
64802 +#include <linux/types.h>
64803 +#include <linux/kernel.h>
64804 +#include <linux/version.h>
64805 +#include <linux/errno.h>
64806 +#include <xen/interface/xen.h>
64807 +#include <xen/interface/dom0_ops.h>
64808 +#include <xen/interface/sched.h>
64809 +#include <xen/interface/nmi.h>
64810 +#include <asm/ptrace.h>
64811 +#include <asm/page.h>
64812 +#if defined(__i386__)
64813 +#  ifdef CONFIG_X86_PAE
64814 +#   include <asm-generic/pgtable-nopud.h>
64815 +#  else
64816 +#   include <asm-generic/pgtable-nopmd.h>
64817 +#  endif
64818 +#endif
64819 +
64820 +extern shared_info_t *HYPERVISOR_shared_info;
64821 +
64822 +/* arch/xen/i386/kernel/setup.c */
64823 +extern start_info_t *xen_start_info;
64824 +
64825 +/* arch/xen/kernel/evtchn.c */
64826 +/* Force a proper event-channel callback from Xen. */
64827 +void force_evtchn_callback(void);
64828 +
64829 +/* arch/xen/kernel/process.c */
64830 +void xen_cpu_idle (void);
64831 +
64832 +/* arch/xen/i386/kernel/hypervisor.c */
64833 +void do_hypervisor_callback(struct pt_regs *regs);
64834 +
64835 +/* arch/xen/i386/mm/hypervisor.c */
64836 +/*
64837 + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should already
64838 + * be MACHINE addresses.
64839 + */
64840 +
64841 +void xen_pt_switch(unsigned long ptr);
64842 +void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
64843 +void xen_load_gs(unsigned int selector); /* x86_64 only */
64844 +void xen_tlb_flush(void);
64845 +void xen_invlpg(unsigned long ptr);
64846 +
64847 +void xen_l1_entry_update(pte_t *ptr, pte_t val);
64848 +void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
64849 +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
64850 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
64851 +void xen_pgd_pin(unsigned long ptr);
64852 +void xen_pgd_unpin(unsigned long ptr);
64853 +
64854 +void xen_set_ldt(unsigned long ptr, unsigned long bytes);
64855 +void xen_machphys_update(unsigned long mfn, unsigned long pfn);
64856 +
64857 +#ifdef CONFIG_SMP
64858 +#include <linux/cpumask.h>
64859 +void xen_tlb_flush_all(void);
64860 +void xen_invlpg_all(unsigned long ptr);
64861 +void xen_tlb_flush_mask(cpumask_t *mask);
64862 +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr);
64863 +#endif
64864 +
64865 +/* Returns zero on success else negative errno. */
64866 +int xen_create_contiguous_region(
64867 +    unsigned long vstart, unsigned int order, unsigned int address_bits);
64868 +void xen_destroy_contiguous_region(
64869 +    unsigned long vstart, unsigned int order);
64870 +
64871 +/* Turn jiffies into Xen system time. */
64872 +u64 jiffies_to_st(unsigned long jiffies);
64873 +
64874 +#include <asm/hypercall.h>
64875 +
64876 +#if defined(CONFIG_X86_64)
64877 +#define MULTI_UVMFLAGS_INDEX 2
64878 +#define MULTI_UVMDOMID_INDEX 3
64879 +#else
64880 +#define MULTI_UVMFLAGS_INDEX 3
64881 +#define MULTI_UVMDOMID_INDEX 4
64882 +#endif
64883 +
64884 +#define xen_init()     (0)
64885 +
64886 +static inline int
64887 +HYPERVISOR_yield(
64888 +       void)
64889 +{
64890 +       int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
64891 +
64892 +       if (rc == -ENOSYS)
64893 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
64894 +
64895 +       return rc;
64896 +}
64897 +
64898 +static inline int
64899 +HYPERVISOR_block(
64900 +       void)
64901 +{
64902 +       int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
64903 +
64904 +       if (rc == -ENOSYS)
64905 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
64906 +
64907 +       return rc;
64908 +}
64909 +
64910 +static inline int
64911 +HYPERVISOR_shutdown(
64912 +       unsigned int reason)
64913 +{
64914 +       struct sched_shutdown sched_shutdown = {
64915 +               .reason = reason
64916 +       };
64917 +
64918 +       int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
64919 +
64920 +       if (rc == -ENOSYS)
64921 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason);
64922 +
64923 +       return rc;
64924 +}
64925 +
64926 +static inline int
64927 +HYPERVISOR_poll(
64928 +       evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
64929 +{
64930 +       struct sched_poll sched_poll = {
64931 +               .ports = ports,
64932 +               .nr_ports = nr_ports,
64933 +               .timeout = jiffies_to_st(timeout)
64934 +       };
64935 +
64936 +       int rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
64937 +
64938 +       if (rc == -ENOSYS)
64939 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
64940 +
64941 +       return rc;
64942 +}
64943 +
64944 +static inline void
64945 +MULTI_update_va_mapping(
64946 +    multicall_entry_t *mcl, unsigned long va,
64947 +    pte_t new_val, unsigned long flags)
64948 +{
64949 +    mcl->op = __HYPERVISOR_update_va_mapping;
64950 +    mcl->args[0] = va;
64951 +#if defined(CONFIG_X86_64)
64952 +    mcl->args[1] = new_val.pte;
64953 +    mcl->args[2] = flags;
64954 +#elif defined(CONFIG_X86_PAE)
64955 +    mcl->args[1] = new_val.pte_low;
64956 +    mcl->args[2] = new_val.pte_high;
64957 +    mcl->args[3] = flags;
64958 +#else
64959 +    mcl->args[1] = new_val.pte_low;
64960 +    mcl->args[2] = 0;
64961 +    mcl->args[3] = flags;
64962 +#endif
64963 +}
64964 +
64965 +static inline void
64966 +MULTI_update_va_mapping_otherdomain(
64967 +    multicall_entry_t *mcl, unsigned long va,
64968 +    pte_t new_val, unsigned long flags, domid_t domid)
64969 +{
64970 +    mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
64971 +    mcl->args[0] = va;
64972 +#if defined(CONFIG_X86_64)
64973 +    mcl->args[1] = new_val.pte;
64974 +    mcl->args[2] = flags;
64975 +    mcl->args[3] = domid;
64976 +#elif defined(CONFIG_X86_PAE)
64977 +    mcl->args[1] = new_val.pte_low;
64978 +    mcl->args[2] = new_val.pte_high;
64979 +    mcl->args[3] = flags;
64980 +    mcl->args[4] = domid;
64981 +#else
64982 +    mcl->args[1] = new_val.pte_low;
64983 +    mcl->args[2] = 0;
64984 +    mcl->args[3] = flags;
64985 +    mcl->args[4] = domid;
64986 +#endif
64987 +}
64988 +
64989 +#endif /* __HYPERVISOR_H__ */
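The MULTI_* helpers above let callers queue several update_va_mapping requests and issue them with one HYPERVISOR_multicall(), paying for a single guest/hypervisor transition instead of one per PTE. A minimal sketch built on the helpers above (hypothetical function; UVMF_INVLPG is the per-VA TLB-flush flag from xen/interface/xen.h):

        static void update_two_ptes(unsigned long va1, pte_t p1,
                                    unsigned long va2, pte_t p2)
        {
                multicall_entry_t mcl[2];

                MULTI_update_va_mapping(&mcl[0], va1, p1, UVMF_INVLPG);
                MULTI_update_va_mapping(&mcl[1], va2, p2, UVMF_INVLPG);
                BUG_ON(HYPERVISOR_multicall(mcl, 2));
        }

MULTI_UVMFLAGS_INDEX and MULTI_UVMDOMID_INDEX exist so generic code can patch the flags or target-domain argument into an already-built entry without knowing the per-arch (PAE vs. non-PAE vs. x86_64) argument layout.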
64990 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/io.h linux-2.6.16/include/asm-i386/mach-xen/asm/io.h
64991 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/io.h        1970-01-01 01:00:00.000000000 +0100
64992 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/io.h     2006-06-26 09:51:32.000000000 +0200
64993 @@ -0,0 +1,401 @@
64994 +#ifndef _ASM_IO_H
64995 +#define _ASM_IO_H
64996 +
64997 +#include <linux/config.h>
64998 +#include <linux/string.h>
64999 +#include <linux/compiler.h>
65000 +
65001 +/*
65002 + * This file contains the definitions for the x86 IO instructions
65003 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
65004 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
65005 + * versions of the single-IO instructions (inb_p/inw_p/..).
65006 + *
65007 + * This file is not meant to be obfuscating: it's just complicated
65008 + * to (a) handle it all in a way that makes gcc able to optimize it
65009 + * as well as possible and (b) to avoid writing the same thing
65010 + * over and over again with slight variations and possibly making a
65011 + * mistake somewhere.
65012 + */
65013 +
65014 +/*
65015 + * Thanks to James van Artsdalen for a better timing-fix than
65016 + * the two short jumps: using outb's to a nonexistent port seems
65017 + * to guarantee better timings even on fast machines.
65018 + *
65019 + * On the other hand, I'd like to be sure of a non-existent port:
65020 + * I feel a bit unsafe about using 0x80 (should be safe, though)
65021 + *
65022 + *             Linus
65023 + */
65024 +
65025 + /*
65026 +  *  A bit simplified and optimized by Jan Hubicka
65027 +  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
65028 +  *
65029 +  *  isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
65030 +  *  isa_read[wl] and isa_write[wl] fixed
65031 +  *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
65032 +  */
65033 +
65034 +#define IO_SPACE_LIMIT 0xffff
65035 +
65036 +#define XQUAD_PORTIO_BASE 0xfe400000
65037 +#define XQUAD_PORTIO_QUAD 0x40000  /* 256k per quad. */
65038 +
65039 +#ifdef __KERNEL__
65040 +
65041 +#include <asm-generic/iomap.h>
65042 +
65043 +#include <linux/vmalloc.h>
65044 +#include <asm/fixmap.h>
65045 +
65046 +/*
65047 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
65048 + * access
65049 + */
65050 +#define xlate_dev_mem_ptr(p)   __va(p)
65051 +
65052 +/*
65053 + * Convert a virtual cached pointer to an uncached pointer
65054 + */
65055 +#define xlate_dev_kmem_ptr(p)  p
65056 +
65057 +/**
65058 + *     virt_to_phys    -       map virtual addresses to physical
65059 + *     @address: address to remap
65060 + *
65061 + *     The returned physical address is the physical (CPU) mapping for
65062 + *     the memory address given. It is only valid to use this function on
65063 + *     addresses directly mapped or allocated via kmalloc. 
65064 + *
65065 + *     This function does not give bus mappings for DMA transfers. In
65066 + *     almost all conceivable cases a device driver should not be using
65067 + *     this function
65068 + */
65069 + */
65070 +static inline unsigned long virt_to_phys(volatile void * address)
65071 +{
65072 +       return __pa(address);
65073 +}
65074 +
65075 +/**
65076 + *     phys_to_virt    -       map physical address to virtual
65077 + *     @address: address to remap
65078 + *
65079 + *     The returned virtual address is a current CPU mapping for
65080 + *     the memory address given. It is only valid to use this function on
65081 + *     addresses that have a kernel mapping
65082 + *
65083 + *     This function does not handle bus mappings for DMA transfers. In
65084 + *     almost all conceivable cases a device driver should not be using
65085 + *     this function
65086 + */
65087 +
65088 +static inline void * phys_to_virt(unsigned long address)
65089 +{
65090 +       return __va(address);
65091 +}
65092 +
65093 +/*
65094 + * Change "struct page" to physical address.
65095 + */
65096 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
65097 +#define page_to_phys(page)      (phys_to_machine(page_to_pseudophys(page)))
65098 +
65099 +#define bio_to_pseudophys(bio)  (page_to_pseudophys(bio_page((bio))) + \
65100 +                                 (unsigned long) bio_offset((bio)))
65101 +#define bvec_to_pseudophys(bv)  (page_to_pseudophys((bv)->bv_page) + \
65102 +                                 (unsigned long) (bv)->bv_offset)
65103 +
65104 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)      \
65105 +       (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
65106 +        ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
65107 +         bvec_to_pseudophys((vec2))))
65108 +
65109 +extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
65110 +
65111 +/**
65112 + * ioremap     -   map bus memory into CPU space
65113 + * @offset:    bus address of the memory
65114 + * @size:      size of the resource to map
65115 + *
65116 + * ioremap performs a platform specific sequence of operations to
65117 + * make bus memory CPU accessible via the readb/readw/readl/writeb/
65118 + * writew/writel functions and the other mmio helpers. The returned
65119 + * address is not guaranteed to be usable directly as a virtual
65120 + * address. 
65121 + */
65122 +
65123 +static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
65124 +{
65125 +       return __ioremap(offset, size, 0);
65126 +}
65127 +
65128 +extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size);
65129 +extern void iounmap(volatile void __iomem *addr);
65130 +
65131 +/*
65132 + * bt_ioremap() and bt_iounmap() are for temporary early boot-time
65133 + * mappings, before the real ioremap() is functional.
65134 + * A boot-time mapping is currently limited to at most 16 pages.
65135 + */
65136 +extern void *bt_ioremap(unsigned long offset, unsigned long size);
65137 +extern void bt_iounmap(void *addr, unsigned long size);
65138 +
65139 +/* Use early IO mappings for DMI because it's initialized early */
65140 +#define dmi_ioremap bt_ioremap
65141 +#define dmi_iounmap bt_iounmap
65142 +#define dmi_alloc alloc_bootmem
65143 +
65144 +/*
65145 + * ISA I/O bus memory addresses are 1:1 with the physical address.
65146 + */
65147 +#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x
65148 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
65149 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
65150 +
65151 +/*
65152 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
65153 + * are forbidden in portable PCI drivers.
65154 + *
65155 + * Allow them on x86 for legacy drivers, though.
65156 + */
65157 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
65158 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
65159 +
65160 +/*
65161 + * readX/writeX() are used to access memory mapped devices. On some
65162 + * architectures the memory mapped IO stuff needs to be accessed
65163 + * differently. On the x86 architecture, we just read/write the
65164 + * memory location directly.
65165 + */
65166 +
65167 +static inline unsigned char readb(const volatile void __iomem *addr)
65168 +{
65169 +       return *(volatile unsigned char __force *) addr;
65170 +}
65171 +static inline unsigned short readw(const volatile void __iomem *addr)
65172 +{
65173 +       return *(volatile unsigned short __force *) addr;
65174 +}
65175 +static inline unsigned int readl(const volatile void __iomem *addr)
65176 +{
65177 +       return *(volatile unsigned int __force *) addr;
65178 +}
65179 +#define readb_relaxed(addr) readb(addr)
65180 +#define readw_relaxed(addr) readw(addr)
65181 +#define readl_relaxed(addr) readl(addr)
65182 +#define __raw_readb readb
65183 +#define __raw_readw readw
65184 +#define __raw_readl readl
65185 +
65186 +static inline void writeb(unsigned char b, volatile void __iomem *addr)
65187 +{
65188 +       *(volatile unsigned char __force *) addr = b;
65189 +}
65190 +static inline void writew(unsigned short b, volatile void __iomem *addr)
65191 +{
65192 +       *(volatile unsigned short __force *) addr = b;
65193 +}
65194 +static inline void writel(unsigned int b, volatile void __iomem *addr)
65195 +{
65196 +       *(volatile unsigned int __force *) addr = b;
65197 +}
65198 +#define __raw_writeb writeb
65199 +#define __raw_writew writew
65200 +#define __raw_writel writel
65201 +
65202 +#define mmiowb()
65203 +
65204 +static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count)
65205 +{
65206 +       memset((void __force *) addr, val, count);
65207 +}
65208 +static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
65209 +{
65210 +       __memcpy(dst, (void __force *) src, count);
65211 +}
65212 +static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
65213 +{
65214 +       __memcpy((void __force *) dst, src, count);
65215 +}
65216 +
65217 +/*
65218 + * ISA space is 'always mapped' on a typical x86 system, no need to
65219 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
65220 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
65221 + * are physical addresses. The following constant pointer can be
65222 + * used as the IO-area pointer (it can be iounmapped as well, so the
65223 + * analogy with PCI is quite large):
65224 + */
65225 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
65226 +
65227 +#define isa_readb(a) readb(__ISA_IO_base + (a))
65228 +#define isa_readw(a) readw(__ISA_IO_base + (a))
65229 +#define isa_readl(a) readl(__ISA_IO_base + (a))
65230 +#define isa_writeb(b,a) writeb(b,__ISA_IO_base + (a))
65231 +#define isa_writew(w,a) writew(w,__ISA_IO_base + (a))
65232 +#define isa_writel(l,a) writel(l,__ISA_IO_base + (a))
65233 +#define isa_memset_io(a,b,c)           memset_io(__ISA_IO_base + (a),(b),(c))
65234 +#define isa_memcpy_fromio(a,b,c)       memcpy_fromio((a),__ISA_IO_base + (b),(c))
65235 +#define isa_memcpy_toio(a,b,c)         memcpy_toio(__ISA_IO_base + (a),(b),(c))
65236 +
65237 +
65238 +/*
65239 + * Again, i386 does not require memory-IO-specific functions.
65240 + */
65241 +
65242 +#define eth_io_copy_and_sum(a,b,c,d)           eth_copy_and_sum((a),(void __force *)(b),(c),(d))
65243 +#define isa_eth_io_copy_and_sum(a,b,c,d)       eth_copy_and_sum((a),(void __force *)(__ISA_IO_base + (b)),(c),(d))
65244 +
65245 +/**
65246 + *     check_signature         -       find BIOS signatures
65247 + *     @io_addr: mmio address to check 
65248 + *     @signature:  signature block
65249 + *     @length: length of signature
65250 + *
65251 + *     Perform a signature comparison with the mmio address io_addr. This
65252 + *     address should have been obtained by ioremap.
65253 + *     Returns 1 on a match.
65254 + */
65255 + */
65256 +static inline int check_signature(volatile void __iomem * io_addr,
65257 +       const unsigned char *signature, int length)
65258 +{
65259 +       int retval = 0;
65260 +       do {
65261 +               if (readb(io_addr) != *signature)
65262 +                       goto out;
65263 +               io_addr++;
65264 +               signature++;
65265 +               length--;
65266 +       } while (length);
65267 +       retval = 1;
65268 +out:
65269 +       return retval;
65270 +}
65271 +
65272 +/*
65273 + *     Cache management
65274 + *
65275 + *     This is needed for two cases
65276 + *     1. Out of order aware processors
65277 + *     2. Accidentally out of order processors (PPro errata #51)
65278 + */
65279 +
65280 +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
65281 +
65282 +static inline void flush_write_buffers(void)
65283 +{
65284 +       __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
65285 +}
65286 +
65287 +#define dma_cache_inv(_start,_size)            flush_write_buffers()
65288 +#define dma_cache_wback(_start,_size)          flush_write_buffers()
65289 +#define dma_cache_wback_inv(_start,_size)      flush_write_buffers()
65290 +
65291 +#else
65292 +
65293 +/* Nothing to do */
65294 +
65295 +#define dma_cache_inv(_start,_size)            do { } while (0)
65296 +#define dma_cache_wback(_start,_size)          do { } while (0)
65297 +#define dma_cache_wback_inv(_start,_size)      do { } while (0)
65298 +#define flush_write_buffers()
65299 +
65300 +#endif
65301 +
65302 +#endif /* __KERNEL__ */
65303 +
65304 +#ifdef SLOW_IO_BY_JUMPING
65305 +#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
65306 +#else
65307 +#define __SLOW_DOWN_IO "outb %%al,$0x80;"
65308 +#endif
65309 +
65310 +static inline void slow_down_io(void) {
65311 +       __asm__ __volatile__(
65312 +               __SLOW_DOWN_IO
65313 +#ifdef REALLY_SLOW_IO
65314 +               __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
65315 +#endif
65316 +               : : );
65317 +}
65318 +
65319 +#ifdef CONFIG_X86_NUMAQ
65320 +extern void *xquad_portio;    /* Where the IO area was mapped */
65321 +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
65322 +#define __BUILDIO(bwl,bw,type) \
65323 +static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \
65324 +       if (xquad_portio) \
65325 +               write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \
65326 +       else \
65327 +               out##bwl##_local(value, port); \
65328 +} \
65329 +static inline void out##bwl(unsigned type value, int port) { \
65330 +       out##bwl##_quad(value, port, 0); \
65331 +} \
65332 +static inline unsigned type in##bwl##_quad(int port, int quad) { \
65333 +       if (xquad_portio) \
65334 +               return read##bwl(XQUAD_PORT_ADDR(port, quad)); \
65335 +       else \
65336 +               return in##bwl##_local(port); \
65337 +} \
65338 +static inline unsigned type in##bwl(int port) { \
65339 +       return in##bwl##_quad(port, 0); \
65340 +}
65341 +#else
65342 +#define __BUILDIO(bwl,bw,type) \
65343 +static inline void out##bwl(unsigned type value, int port) { \
65344 +       out##bwl##_local(value, port); \
65345 +} \
65346 +static inline unsigned type in##bwl(int port) { \
65347 +       return in##bwl##_local(port); \
65348 +}
65349 +#endif
65350 +
65351 +
65352 +#define BUILDIO(bwl,bw,type) \
65353 +static inline void out##bwl##_local(unsigned type value, int port) { \
65354 +       __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \
65355 +} \
65356 +static inline unsigned type in##bwl##_local(int port) { \
65357 +       unsigned type value; \
65358 +       __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \
65359 +       return value; \
65360 +} \
65361 +static inline void out##bwl##_local_p(unsigned type value, int port) { \
65362 +       out##bwl##_local(value, port); \
65363 +       slow_down_io(); \
65364 +} \
65365 +static inline unsigned type in##bwl##_local_p(int port) { \
65366 +       unsigned type value = in##bwl##_local(port); \
65367 +       slow_down_io(); \
65368 +       return value; \
65369 +} \
65370 +__BUILDIO(bwl,bw,type) \
65371 +static inline void out##bwl##_p(unsigned type value, int port) { \
65372 +       out##bwl(value, port); \
65373 +       slow_down_io(); \
65374 +} \
65375 +static inline unsigned type in##bwl##_p(int port) { \
65376 +       unsigned type value = in##bwl(port); \
65377 +       slow_down_io(); \
65378 +       return value; \
65379 +} \
65380 +static inline void outs##bwl(int port, const void *addr, unsigned long count) { \
65381 +       __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \
65382 +} \
65383 +static inline void ins##bwl(int port, void *addr, unsigned long count) { \
65384 +       __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \
65385 +}
65386 +
65387 +BUILDIO(b,b,char)
65388 +BUILDIO(w,w,short)
65389 +BUILDIO(l,,int)
65390 +
65391 +/* We will be supplying our own /dev/mem implementation */
65392 +#define ARCH_HAS_DEV_MEM
65393 +
65394 +#endif
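BUILDIO() above stamps out the whole inb/outb family from one template. Expanded by hand for BUILDIO(b,b,char) on the output side (illustrative; renamed to avoid clashing with the generated outb):

        static inline void outb_expanded(unsigned char value, int port)
        {
                /* "out" #bwl " %" #bw "0, %w1" with bwl == bw == b */
                __asm__ __volatile__("outb %b0, %w1" : : "a"(value), "Nd"(port));
        }

The _p variants append slow_down_io(), and the NUMAQ build reroutes each access through the per-quad mapped IO window when xquad_portio is set.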
65395 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/kmap_types.h linux-2.6.16/include/asm-i386/mach-xen/asm/kmap_types.h
65396 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/kmap_types.h        1970-01-01 01:00:00.000000000 +0100
65397 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/kmap_types.h     2006-06-26 09:51:32.000000000 +0200
65398 @@ -0,0 +1,32 @@
65399 +#ifndef _ASM_KMAP_TYPES_H
65400 +#define _ASM_KMAP_TYPES_H
65401 +
65402 +#include <linux/config.h>
65403 +
65404 +#ifdef CONFIG_DEBUG_HIGHMEM
65405 +# define D(n) __KM_FENCE_##n ,
65406 +#else
65407 +# define D(n)
65408 +#endif
65409 +
65410 +enum km_type {
65411 +D(0)   KM_BOUNCE_READ,
65412 +D(1)   KM_SKB_SUNRPC_DATA,
65413 +D(2)   KM_SKB_DATA_SOFTIRQ,
65414 +D(3)   KM_USER0,
65415 +D(4)   KM_USER1,
65416 +D(5)   KM_BIO_SRC_IRQ,
65417 +D(6)   KM_BIO_DST_IRQ,
65418 +D(7)   KM_PTE0,
65419 +D(8)   KM_PTE1,
65420 +D(9)   KM_IRQ0,
65421 +D(10)  KM_IRQ1,
65422 +D(11)  KM_SOFTIRQ0,
65423 +D(12)  KM_SOFTIRQ1,
65424 +D(13)  KM_SWIOTLB,
65425 +D(14)  KM_TYPE_NR
65426 +};
65427 +
65428 +#undef D
65429 +
65430 +#endif
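With CONFIG_DEBUG_HIGHMEM set, D(n) expands to a fence enumerator, so the head of the enum above becomes (mechanical expansion, shown for illustration):

        enum km_type {
                __KM_FENCE_0,  KM_BOUNCE_READ,
                __KM_FENCE_1,  KM_SKB_SUNRPC_DATA,
                __KM_FENCE_2,  KM_SKB_DATA_SOFTIRQ,
                /* ... */
                __KM_FENCE_14, KM_TYPE_NR
        };

The numbered fence names tie each slot to its expected position: an entry added or removed without renumbering the D(n) annotations produces a duplicate __KM_FENCE_* name and fails to compile in debug builds.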
65431 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/mmu.h linux-2.6.16/include/asm-i386/mach-xen/asm/mmu.h
65432 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/mmu.h       1970-01-01 01:00:00.000000000 +0100
65433 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/mmu.h    2006-06-26 09:51:32.000000000 +0200
65434 @@ -0,0 +1,21 @@
65435 +#ifndef __i386_MMU_H
65436 +#define __i386_MMU_H
65437 +
65438 +#include <asm/semaphore.h>
65439 +/*
65440 + * The i386 doesn't have a mmu context, but
65441 + * we put the segment information here.
65442 + *
65443 + * cpu_vm_mask is used to optimize ldt flushing.
65444 + */
65445 +typedef struct { 
65446 +       int size;
65447 +       struct semaphore sem;
65448 +       void *ldt;
65449 +} mm_context_t;
65450 +
65451 +/* mm/memory.c:exit_mmap hook */
65452 +extern void _arch_exit_mmap(struct mm_struct *mm);
65453 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
65454 +
65455 +#endif
65456 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/mmu_context.h linux-2.6.16/include/asm-i386/mach-xen/asm/mmu_context.h
65457 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/mmu_context.h       1970-01-01 01:00:00.000000000 +0100
65458 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/mmu_context.h    2006-06-26 09:51:32.000000000 +0200
65459 @@ -0,0 +1,105 @@
65460 +#ifndef __I386_SCHED_H
65461 +#define __I386_SCHED_H
65462 +
65463 +#include <linux/config.h>
65464 +#include <asm/desc.h>
65465 +#include <asm/atomic.h>
65466 +#include <asm/pgalloc.h>
65467 +#include <asm/tlbflush.h>
65468 +
65469 +/*
65470 + * Used for LDT copy/destruction.
65471 + */
65472 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
65473 +void destroy_context(struct mm_struct *mm);
65474 +
65475 +
65476 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
65477 +{
65478 +#if 0 /* XEN: no lazy tlb */
65479 +       unsigned cpu = smp_processor_id();
65480 +       if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
65481 +               per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
65482 +#endif
65483 +}
65484 +
65485 +#define prepare_arch_switch(next)      __prepare_arch_switch()
65486 +
65487 +static inline void __prepare_arch_switch(void)
65488 +{
65489 +       /*
65490 +        * Save away %fs and %gs. No need to save %es and %ds, as those
65491 +        * are always kernel segments while inside the kernel. Must
65492 +        * happen before reload of cr3/ldt (i.e., not in __switch_to).
65493 +        */
65494 +       asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
65495 +               : "=m" (current->thread.fs),
65496 +                 "=m" (current->thread.gs));
65497 +       asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
65498 +               : : "r" (0) );
65499 +}
65500 +
65501 +extern void mm_pin(struct mm_struct *mm);
65502 +extern void mm_unpin(struct mm_struct *mm);
65503 +void mm_pin_all(void);
65504 +
65505 +static inline void switch_mm(struct mm_struct *prev,
65506 +                            struct mm_struct *next,
65507 +                            struct task_struct *tsk)
65508 +{
65509 +       int cpu = smp_processor_id();
65510 +       struct mmuext_op _op[2], *op = _op;
65511 +
65512 +       if (likely(prev != next)) {
65513 +               if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
65514 +                       mm_pin(next);
65515 +
65516 +               /* stop flush ipis for the previous mm */
65517 +               cpu_clear(cpu, prev->cpu_vm_mask);
65518 +#if 0 /* XEN: no lazy tlb */
65519 +               per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
65520 +               per_cpu(cpu_tlbstate, cpu).active_mm = next;
65521 +#endif
65522 +               cpu_set(cpu, next->cpu_vm_mask);
65523 +
65524 +               /* Re-load page tables: load_cr3(next->pgd) */
65525 +               op->cmd = MMUEXT_NEW_BASEPTR;
65526 +               op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
65527 +               op++;
65528 +
65529 +               /*
65530 +                * load the LDT, if the LDT is different:
65531 +                */
65532 +               if (unlikely(prev->context.ldt != next->context.ldt)) {
65533 +                       /* load_LDT_nolock(&next->context, cpu) */
65534 +                       op->cmd = MMUEXT_SET_LDT;
65535 +                       op->arg1.linear_addr = (unsigned long)next->context.ldt;
65536 +                       op->arg2.nr_ents     = next->context.size;
65537 +                       op++;
65538 +               }
65539 +
65540 +               BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
65541 +       }
65542 +#if 0 /* XEN: no lazy tlb */
65543 +       else {
65544 +               per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
65545 +               BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
65546 +
65547 +               if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
65548 +                       /* We were in lazy tlb mode and leave_mm disabled 
65549 +                        * tlb flush IPI delivery. We must reload %cr3.
65550 +                        */
65551 +                       load_cr3(next->pgd);
65552 +                       load_LDT_nolock(&next->context, cpu);
65553 +               }
65554 +       }
65555 +#endif
65556 +}
65557 +
65558 +#define deactivate_mm(tsk, mm) \
65559 +       asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
65560 +
65561 +#define activate_mm(prev, next) \
65562 +       switch_mm((prev),(next),NULL)
65563 +
65564 +#endif
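switch_mm() above shows the pattern this port uses throughout: queue up to two mmuext ops on the stack and hand them to a single HYPERVISOR_mmuext_op() call instead of writing %cr3 and the LDT register directly. The same batching as a standalone sketch (hypothetical helper; MMUEXT_TLB_FLUSH_LOCAL is the local-flush command from xen/interface/xen.h):

        static void flush_tlb_and_drop_ldt(void)
        {
                struct mmuext_op op[2];

                op[0].cmd = MMUEXT_TLB_FLUSH_LOCAL;
                op[1].cmd = MMUEXT_SET_LDT;
                op[1].arg1.linear_addr = 0;     /* NULL LDT */
                op[1].arg2.nr_ents = 0;

                BUG_ON(HYPERVISOR_mmuext_op(op, 2, NULL, DOMID_SELF));
        }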
65565 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/page.h linux-2.6.16/include/asm-i386/mach-xen/asm/page.h
65566 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/page.h      1970-01-01 01:00:00.000000000 +0100
65567 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/page.h   2006-06-26 09:51:32.000000000 +0200
65568 @@ -0,0 +1,327 @@
65569 +#ifndef _I386_PAGE_H
65570 +#define _I386_PAGE_H
65571 +
65572 +/* PAGE_SHIFT determines the page size */
65573 +#define PAGE_SHIFT     12
65574 +#define PAGE_SIZE      (1UL << PAGE_SHIFT)
65575 +#define PAGE_MASK      (~(PAGE_SIZE-1))
65576 +
65577 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
65578 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
65579 +
65580 +#ifdef __KERNEL__
65581 +#ifndef __ASSEMBLY__
65582 +
65583 +#include <linux/config.h>
65584 +#include <linux/string.h>
65585 +#include <linux/types.h>
65586 +#include <linux/kernel.h>
65587 +#include <asm/bug.h>
65588 +#include <xen/interface/xen.h>
65589 +#include <xen/features.h>
65590 +#include <xen/foreign_page.h>
65591 +
65592 +#define arch_free_page(_page,_order)                   \
65593 +({     int foreign = PageForeign(_page);               \
65594 +       if (foreign)                                    \
65595 +               (PageForeignDestructor(_page))(_page);  \
65596 +       foreign;                                        \
65597 +})
65598 +#define HAVE_ARCH_FREE_PAGE
65599 +
65600 +#ifdef CONFIG_XEN_SCRUB_PAGES
65601 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
65602 +#else
65603 +#define scrub_pages(_p,_n) ((void)0)
65604 +#endif
65605 +
65606 +#ifdef CONFIG_X86_USE_3DNOW
65607 +
65608 +#include <asm/mmx.h>
65609 +
65610 +#define clear_page(page)       mmx_clear_page((void *)(page))
65611 +#define copy_page(to,from)     mmx_copy_page(to,from)
65612 +
65613 +#else
65614 +
65615 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
65616 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
65617 +
65618 +/*
65619 + *     On older X86 processors it's not a win to use MMX here it seems.
65620 + *     Maybe the K6-III ?
65621 + */
65622 +
65623 +#define clear_page(page)       memset((void *)(page), 0, PAGE_SIZE)
65624 +#define copy_page(to,from)     memcpy((void *)(to), (void *)(from), PAGE_SIZE)
65625 +
65626 +#endif
65627 +
65628 +#define clear_user_page(page, vaddr, pg)       clear_page(page)
65629 +#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
65630 +
65631 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
65632 +#define INVALID_P2M_ENTRY      (~0UL)
65633 +#define FOREIGN_FRAME_BIT      (1UL<<31)
65634 +#define FOREIGN_FRAME(m)       ((m) | FOREIGN_FRAME_BIT)
65635 +
65636 +extern unsigned long *phys_to_machine_mapping;
65637 +
65638 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
65639 +{
65640 +       if (xen_feature(XENFEAT_auto_translated_physmap))
65641 +               return pfn;
65642 +       return phys_to_machine_mapping[(unsigned int)(pfn)] &
65643 +               ~FOREIGN_FRAME_BIT;
65644 +}
65645 +
65646 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
65647 +{
65648 +       if (xen_feature(XENFEAT_auto_translated_physmap))
65649 +               return 1;
65650 +       return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
65651 +}
65652 +
65653 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
65654 +{
65655 +       unsigned long pfn;
65656 +
65657 +       if (xen_feature(XENFEAT_auto_translated_physmap))
65658 +               return mfn;
65659 +
65660 +       /*
65661 +        * The array access can fail (e.g., device space beyond end of RAM).
65662 +        * In such cases it doesn't matter what we return (we return garbage),
65663 +        * but we must handle the fault without crashing!
65664 +        */
65665 +       asm (
65666 +               "1:     movl %1,%0\n"
65667 +               "2:\n"
65668 +               ".section __ex_table,\"a\"\n"
65669 +               "       .align 4\n"
65670 +               "       .long 1b,2b\n"
65671 +               ".previous"
65672 +               : "=r" (pfn) : "m" (machine_to_phys_mapping[mfn]) );
65673 +
65674 +       return pfn;
65675 +}
65676 +
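[editor's note — illustrative sketch, not part of the patch] pfn_to_mfn() above is a plain array index plus masking of FOREIGN_FRAME_BIT; the inline asm in mfn_to_pfn() only adds an exception-table entry so an out-of-range m2p access faults harmlessly. A minimal user-space analogue of the p2m side, where toy_p2m and all toy_* names are assumptions invented for illustration:

#include <assert.h>

#define TOY_FOREIGN_FRAME_BIT (1UL << 31)   /* mirrors FOREIGN_FRAME_BIT */
#define TOY_INVALID_P2M       (~0UL)        /* mirrors INVALID_P2M_ENTRY */

/* Mock table: guest pseudo-physical frame -> machine frame (assumed contents). */
static unsigned long toy_p2m[4] = { 7, 3 | TOY_FOREIGN_FRAME_BIT, TOY_INVALID_P2M, 9 };

static unsigned long toy_pfn_to_mfn(unsigned long pfn)
{
        /* Mask the foreign bit, exactly as pfn_to_mfn() does above. */
        return toy_p2m[pfn] & ~TOY_FOREIGN_FRAME_BIT;
}

int main(void)
{
        assert(toy_pfn_to_mfn(0) == 7);        /* ordinary local frame */
        assert(toy_pfn_to_mfn(1) == 3);        /* foreign bit stripped */
        assert(toy_p2m[2] == TOY_INVALID_P2M); /* phys_to_machine_mapping_valid() == 0 */
        return 0;
}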
65677 +/*
65678 + * We detect special mappings in one of two ways:
65679 + *  1. If the MFN is an I/O page then Xen will set the m2p entry
65680 + *     to be outside our maximum possible pseudophys range.
65681 + *  2. If the MFN belongs to a different domain then we will certainly
65682 + *     not have the MFN in our p2m table. Conversely, if the page is ours,
65683 + *     then we'll have p2m(m2p(MFN))==MFN.
65684 + * If we detect a special mapping then it doesn't have a 'struct page'.
65685 + * We force !pfn_valid() by returning an out-of-range pointer.
65686 + *
65687 + * NB. These checks require that, for any MFN that is not in our reservation,
65688 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
65689 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
65690 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
65691 + *
65692 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
65693 + *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
65694 + *      require. In all the cases we care about, the FOREIGN_FRAME bit is
65695 + *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
65696 + */
65697 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
65698 +{
65699 +       extern unsigned long max_mapnr;
65700 +       unsigned long pfn = mfn_to_pfn(mfn);
65701 +       if ((pfn < max_mapnr)
65702 +           && !xen_feature(XENFEAT_auto_translated_physmap)
65703 +           && (phys_to_machine_mapping[pfn] != mfn))
65704 +               return max_mapnr; /* force !pfn_valid() */
65705 +       return pfn;
65706 +}
65707 +
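[editor's note — illustrative sketch] mfn_to_local_pfn() encodes the ownership rule from the comment above: a frame is local only if the round trip p2m(m2p(MFN)) == MFN holds; anything else is forced out of range so pfn_valid() fails. A toy model with assumed tables (all toy_* names are invented):

#include <assert.h>

#define TOY_MAX_MAPNR 4UL

static unsigned long toy_p2m[TOY_MAX_MAPNR] = { 7, 8, 9, 10 }; /* pfn -> mfn (assumed) */
static unsigned long toy_m2p[16];                              /* mfn -> pfn (assumed) */

static unsigned long toy_mfn_to_local_pfn(unsigned long mfn)
{
        unsigned long pfn = toy_m2p[mfn];
        if (pfn < TOY_MAX_MAPNR && toy_p2m[pfn] != mfn)
                return TOY_MAX_MAPNR;   /* out of range => !pfn_valid() */
        return pfn;
}

int main(void)
{
        for (unsigned long pfn = 0; pfn < TOY_MAX_MAPNR; pfn++)
                toy_m2p[toy_p2m[pfn]] = pfn;
        toy_m2p[11] = 1;                 /* foreign frame aliasing our pfn 1 */
        assert(toy_mfn_to_local_pfn(8) == 1);              /* genuinely ours */
        assert(toy_mfn_to_local_pfn(11) == TOY_MAX_MAPNR); /* rejected */
        return 0;
}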
65708 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
65709 +{
65710 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
65711 +               BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
65712 +               return;
65713 +       }
65714 +       phys_to_machine_mapping[pfn] = mfn;
65715 +}
65716 +
65717 +/* Definitions for machine and pseudophysical addresses. */
65718 +#ifdef CONFIG_X86_PAE
65719 +typedef unsigned long long paddr_t;
65720 +typedef unsigned long long maddr_t;
65721 +#else
65722 +typedef unsigned long paddr_t;
65723 +typedef unsigned long maddr_t;
65724 +#endif
65725 +
65726 +static inline maddr_t phys_to_machine(paddr_t phys)
65727 +{
65728 +       maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
65729 +       machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
65730 +       return machine;
65731 +}
65732 +static inline paddr_t machine_to_phys(maddr_t machine)
65733 +{
65734 +       paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
65735 +       phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
65736 +       return phys;
65737 +}
65738 +
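[editor's note — illustrative sketch] phys_to_machine()/machine_to_phys() above translate only the frame number; the byte offset within the page passes through untouched. A self-contained round-trip check under assumed toy_* constants and a single assumed pfn 5 <-> mfn 42 mapping:

#include <assert.h>

#define TOY_PAGE_SHIFT 12
#define TOY_PAGE_MASK  (~((1UL << TOY_PAGE_SHIFT) - 1))

static unsigned long toy_pfn_to_mfn(unsigned long pfn) { return pfn == 5 ? 42 : pfn; }
static unsigned long toy_mfn_to_pfn(unsigned long mfn) { return mfn == 42 ? 5 : mfn; }

static unsigned long toy_phys_to_machine(unsigned long phys)
{
        unsigned long machine = toy_pfn_to_mfn(phys >> TOY_PAGE_SHIFT);
        /* Frame number is translated; the in-page offset is preserved. */
        return (machine << TOY_PAGE_SHIFT) | (phys & ~TOY_PAGE_MASK);
}

int main(void)
{
        unsigned long phys = (5UL << TOY_PAGE_SHIFT) | 0x123;
        unsigned long mach = toy_phys_to_machine(phys);
        assert(mach == ((42UL << TOY_PAGE_SHIFT) | 0x123));
        assert(((toy_mfn_to_pfn(mach >> TOY_PAGE_SHIFT) << TOY_PAGE_SHIFT)
                | (mach & ~TOY_PAGE_MASK)) == phys);   /* round trip */
        return 0;
}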
65739 +/*
65740 + * These are used to make use of C type-checking..
65741 + */
65742 +extern int nx_enabled;
65743 +#ifdef CONFIG_X86_PAE
65744 +extern unsigned long long __supported_pte_mask;
65745 +typedef struct { unsigned long pte_low, pte_high; } pte_t;
65746 +typedef struct { unsigned long long pmd; } pmd_t;
65747 +typedef struct { unsigned long long pgd; } pgd_t;
65748 +typedef struct { unsigned long long pgprot; } pgprot_t;
65749 +#define __pte(x) ({ unsigned long long _x = (x);        \
65750 +    if (_x & 1) _x = phys_to_machine(_x);               \
65751 +    ((pte_t) {(unsigned long)(_x), (unsigned long)(_x>>32)}); })
65752 +#define __pgd(x) ({ unsigned long long _x = (x); \
65753 +    (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); })
65754 +#define __pmd(x) ({ unsigned long long _x = (x); \
65755 +    (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); })
65756 +static inline unsigned long long pte_val(pte_t x)
65757 +{
65758 +       unsigned long long ret;
65759 +
65760 +       if (x.pte_low) {
65761 +               ret = x.pte_low | (unsigned long long)x.pte_high << 32;
65762 +               ret = machine_to_phys(ret) | 1;
65763 +       } else {
65764 +               ret = 0;
65765 +       }
65766 +       return ret;
65767 +}
65768 +static inline unsigned long long pmd_val(pmd_t x)
65769 +{
65770 +       unsigned long long ret = x.pmd;
65771 +       if (ret) ret = machine_to_phys(ret) | 1;
65772 +       return ret;
65773 +}
65774 +static inline unsigned long long pgd_val(pgd_t x)
65775 +{
65776 +       unsigned long long ret = x.pgd;
65777 +       if (ret) ret = machine_to_phys(ret) | 1;
65778 +       return ret;
65779 +}
65780 +static inline unsigned long long pte_val_ma(pte_t x)
65781 +{
65782 +       return (unsigned long long)x.pte_high << 32 | x.pte_low;
65783 +}
65784 +#define HPAGE_SHIFT    21
65785 +#else
65786 +typedef struct { unsigned long pte_low; } pte_t;
65787 +typedef struct { unsigned long pgd; } pgd_t;
65788 +typedef struct { unsigned long pgprot; } pgprot_t;
65789 +#define boot_pte_t pte_t /* or would you rather have a typedef */
65790 +#define pte_val(x)     (((x).pte_low & 1) ? machine_to_phys((x).pte_low) : \
65791 +                        (x).pte_low)
65792 +#define pte_val_ma(x)  ((x).pte_low)
65793 +#define __pte(x) ({ unsigned long _x = (x); \
65794 +    (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); })
65795 +#define __pgd(x) ({ unsigned long _x = (x); \
65796 +    (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); })
65797 +static inline unsigned long pgd_val(pgd_t x)
65798 +{
65799 +       unsigned long ret = x.pgd;
65800 +       if (ret) ret = machine_to_phys(ret) | 1;
65801 +       return ret;
65802 +}
65803 +#define HPAGE_SHIFT    22
65804 +#endif
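[editor's note — illustrative sketch] Under PAE a 64-bit entry is stored as the two 32-bit words pte_low/pte_high, and pte_val_ma() above just reassembles them. A toy version (struct and names assumed):

#include <assert.h>
#include <stdint.h>

/* Mirrors the PAE pte_t above: one 64-bit entry as two 32-bit words. */
struct toy_pte { uint32_t pte_low, pte_high; };

static uint64_t toy_pte_val_ma(struct toy_pte p)
{
        /* Same reassembly as pte_val_ma(): high word shifted up by 32. */
        return ((uint64_t)p.pte_high << 32) | p.pte_low;
}

int main(void)
{
        struct toy_pte p = { .pte_low = 0x12345067u, .pte_high = 0x8000000fu };
        assert(toy_pte_val_ma(p) == 0x8000000f12345067ull);
        return 0;
}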
65805 +#define PTE_MASK       PAGE_MASK
65806 +
65807 +#ifdef CONFIG_HUGETLB_PAGE
65808 +#define HPAGE_SIZE     ((1UL) << HPAGE_SHIFT)
65809 +#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
65810 +#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
65811 +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
65812 +#endif
65813 +
65814 +#define pgprot_val(x)  ((x).pgprot)
65815 +
65816 +#define __pte_ma(x)    ((pte_t) { (x) } )
65817 +#define __pgprot(x)    ((pgprot_t) { (x) } )
65818 +
65819 +#endif /* !__ASSEMBLY__ */
65820 +
65821 +/* to align the pointer to the (next) page boundary */
65822 +#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
65823 +
65824 +/*
65825 + * This handles the memory map.. We could make this a config
65826 + * option, but too many people screw it up, and too few need
65827 + * it.
65828 + *
65829 + * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
65830 + * a virtual address space of one gigabyte, which limits the
65831 + * amount of physical memory you can use to about 950MB. 
65832 + *
65833 + * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
65834 + * and CONFIG_HIGHMEM64G options in the kernel configuration.
65835 + */
65836 +
65837 +#ifndef __ASSEMBLY__
65838 +
65839 +/*
65840 + * This much address space is reserved for vmalloc() and iomap()
65841 + * as well as fixmap mappings.
65842 + */
65843 +extern unsigned int __VMALLOC_RESERVE;
65844 +
65845 +extern int sysctl_legacy_va_layout;
65846 +
65847 +extern int page_is_ram(unsigned long pagenr);
65848 +
65849 +#endif /* __ASSEMBLY__ */
65850 +
65851 +#ifdef __ASSEMBLY__
65852 +#define __PAGE_OFFSET          CONFIG_PAGE_OFFSET
65853 +#define __PHYSICAL_START       CONFIG_PHYSICAL_START
65854 +#else
65855 +#define __PAGE_OFFSET          ((unsigned long)CONFIG_PAGE_OFFSET)
65856 +#define __PHYSICAL_START       ((unsigned long)CONFIG_PHYSICAL_START)
65857 +#endif
65858 +#define __KERNEL_START         (__PAGE_OFFSET + __PHYSICAL_START)
65859 +
65860 +#undef LOAD_OFFSET
65861 +#define LOAD_OFFSET            0
65862 +
65863 +
65864 +#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
65865 +#define VMALLOC_RESERVE                ((unsigned long)__VMALLOC_RESERVE)
65866 +#define MAXMEM                 (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
65867 +#define __pa(x)                        ((unsigned long)(x)-PAGE_OFFSET)
65868 +#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
65869 +#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
65870 +#ifdef CONFIG_FLATMEM
65871 +#define pfn_to_page(pfn)       (mem_map + (pfn))
65872 +#define page_to_pfn(page)      ((unsigned long)((page) - mem_map))
65873 +#define pfn_valid(pfn)         ((pfn) < max_mapnr)
65874 +#endif /* CONFIG_FLATMEM */
65875 +#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
65876 +
65877 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
65878 +
65879 +#define VM_DATA_DEFAULT_FLAGS \
65880 +       (VM_READ | VM_WRITE | \
65881 +       ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
65882 +                VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
65883 +
65884 +/* VIRT <-> MACHINE conversion */
65885 +#define virt_to_machine(v)     (phys_to_machine(__pa(v)))
65886 +#define virt_to_mfn(v)         (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
65887 +#define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
65888 +
65889 +#define __HAVE_ARCH_GATE_AREA 1
65890 +
65891 +#endif /* __KERNEL__ */
65892 +
65893 +#include <asm-generic/page.h>
65894 +
65895 +#endif /* _I386_PAGE_H */
65896 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/param.h linux-2.6.16/include/asm-i386/mach-xen/asm/param.h
65897 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/param.h     1970-01-01 01:00:00.000000000 +0100
65898 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/param.h  2006-06-26 09:51:32.000000000 +0200
65899 @@ -0,0 +1,24 @@
65900 +#ifndef _ASMi386_PARAM_H
65901 +#define _ASMi386_PARAM_H
65902 +
65903 +#ifdef __KERNEL__
65904 +# include <linux/config.h>
65905 +# define HZ            CONFIG_HZ       /* Internal kernel timer frequency */
65906 +# define USER_HZ       100             /* .. some user interfaces are in "ticks" */
65907 +# define CLOCKS_PER_SEC                (USER_HZ)       /* like times() */
65908 +#endif
65909 +
65910 +#ifndef HZ
65911 +#define HZ 100
65912 +#endif
65913 +
65914 +#define EXEC_PAGESIZE  4096
65915 +
65916 +#ifndef NOGROUP
65917 +#define NOGROUP                (-1)
65918 +#endif
65919 +
65920 +#define MAXHOSTNAMELEN 64      /* max length of hostname */
65921 +#define COMMAND_LINE_SIZE 256
65922 +
65923 +#endif
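[editor's note — illustrative sketch] Because HZ is configurable while USER_HZ stays fixed at 100, tick counts handed to user space (times(), CLOCKS_PER_SEC) must be rescaled. A sketch assuming CONFIG_HZ=1000; the divide-only form is valid only when HZ is a multiple of USER_HZ:

#include <assert.h>

#define TOY_HZ      1000  /* assumed CONFIG_HZ value */
#define TOY_USER_HZ 100   /* fixed user-visible tick rate, as above */

/* Rescale internal jiffies to the USER_HZ units that times() reports. */
static unsigned long toy_jiffies_to_clock_t(unsigned long j)
{
        return j / (TOY_HZ / TOY_USER_HZ);
}

int main(void)
{
        assert(toy_jiffies_to_clock_t(2500) == 250);  /* 2.5s -> 250 user ticks */
        return 0;
}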
65924 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pci.h linux-2.6.16/include/asm-i386/mach-xen/asm/pci.h
65925 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pci.h       1970-01-01 01:00:00.000000000 +0100
65926 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/pci.h    2006-06-26 09:51:32.000000000 +0200
65927 @@ -0,0 +1,154 @@
65928 +#ifndef __i386_PCI_H
65929 +#define __i386_PCI_H
65930 +
65931 +#include <linux/config.h>
65932 +
65933 +#ifdef __KERNEL__
65934 +#include <linux/mm.h>          /* for struct page */
65935 +
65936 +/* Can be used to override the logic in pci_scan_bus for skipping
65937 +   already-configured bus numbers - to be used for buggy BIOSes
65938 +   or architectures with incomplete PCI setup by the loader */
65939 +
65940 +#ifdef CONFIG_PCI
65941 +extern unsigned int pcibios_assign_all_busses(void);
65942 +#else
65943 +#define pcibios_assign_all_busses()    0
65944 +#endif
65945 +#define pcibios_scan_all_fns(a, b)     0
65946 +
65947 +extern unsigned long pci_mem_start;
65948 +#define PCIBIOS_MIN_IO         0x1000
65949 +#define PCIBIOS_MIN_MEM                (pci_mem_start)
65950 +
65951 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
65952 +
65953 +void pcibios_config_init(void);
65954 +struct pci_bus * pcibios_scan_root(int bus);
65955 +
65956 +void pcibios_set_master(struct pci_dev *dev);
65957 +void pcibios_penalize_isa_irq(int irq, int active);
65958 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
65959 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
65960 +
65961 +/* Dynamic DMA mapping stuff.
65962 + * i386 has everything mapped statically.
65963 + */
65964 +
65965 +#include <linux/types.h>
65966 +#include <linux/slab.h>
65967 +#include <asm/scatterlist.h>
65968 +#include <linux/string.h>
65969 +#include <asm/io.h>
65970 +
65971 +struct pci_dev;
65972 +
65973 +#ifdef CONFIG_SWIOTLB
65974 +
65975 +
65976 +/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */
65977 +#define PCI_DMA_BUS_IS_PHYS    (0)
65978 +
65979 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
65980 +       dma_addr_t ADDR_NAME;
65981 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)                \
65982 +       __u32 LEN_NAME;
65983 +#define pci_unmap_addr(PTR, ADDR_NAME)                 \
65984 +       ((PTR)->ADDR_NAME)
65985 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)                \
65986 +       (((PTR)->ADDR_NAME) = (VAL))
65987 +#define pci_unmap_len(PTR, LEN_NAME)                   \
65988 +       ((PTR)->LEN_NAME)
65989 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
65990 +       (((PTR)->LEN_NAME) = (VAL))
65991 +
65992 +#else
65993 +
65994 +/* The PCI address space does equal the physical memory
65995 + * address space.  The networking and block device layers use
65996 + * this boolean for bounce buffer decisions.
65997 + */
65998 +#define PCI_DMA_BUS_IS_PHYS    (1)
65999 +
66000 +/* pci_unmap_{page,single} is a nop so... */
66001 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
66002 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
66003 +#define pci_unmap_addr(PTR, ADDR_NAME)         (0)
66004 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)        do { } while (0)
66005 +#define pci_unmap_len(PTR, LEN_NAME)           (0)
66006 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)  do { } while (0)
66007 +
66008 +#endif
66009 +
66010 +/* This is always fine. */
66011 +#define pci_dac_dma_supported(pci_dev, mask)   (1)
66012 +
66013 +static inline dma64_addr_t
66014 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
66015 +{
66016 +       return ((dma64_addr_t) page_to_phys(page) +
66017 +               (dma64_addr_t) offset);
66018 +}
66019 +
66020 +static inline struct page *
66021 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
66022 +{
66023 +       return pfn_to_page(dma_addr >> PAGE_SHIFT);
66024 +}
66025 +
66026 +static inline unsigned long
66027 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
66028 +{
66029 +       return (dma_addr & ~PAGE_MASK);
66030 +}
66031 +
66032 +static inline void
66033 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
66034 +{
66035 +}
66036 +
66037 +static inline void
66038 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
66039 +{
66040 +       flush_write_buffers();
66041 +}
66042 +
66043 +#define HAVE_PCI_MMAP
66044 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
66045 +                              enum pci_mmap_state mmap_state, int write_combine);
66046 +
66047 +
66048 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
66049 +{
66050 +}
66051 +
66052 +#ifdef CONFIG_PCI
66053 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
66054 +                                       enum pci_dma_burst_strategy *strat,
66055 +                                       unsigned long *strategy_parameter)
66056 +{
66057 +       *strat = PCI_DMA_BURST_INFINITY;
66058 +       *strategy_parameter = ~0UL;
66059 +}
66060 +#endif
66061 +
66062 +#endif /* __KERNEL__ */
66063 +
66064 +#ifdef CONFIG_XEN_PCIDEV_FRONTEND
66065 +#include <xen/pcifront.h>
66066 +#endif /* CONFIG_XEN_PCIDEV_FRONTEND */
66067 +
66068 +/* implement the pci_ DMA API in terms of the generic device dma_ one */
66069 +#include <asm-generic/pci-dma-compat.h>
66070 +
66071 +/* generic pci stuff */
66072 +#include <asm-generic/pci.h>
66073 +
66074 +/* On Xen we have to scan all functions since Xen hides bridges from
66075 + * us.  If a bridge is at fn=0 and that slot has a multifunction
66076 + * device, we won't find the additional devices without scanning all
66077 + * functions. */
66078 +#undef pcibios_scan_all_fns
66079 +#define pcibios_scan_all_fns(a, b)     1
66080 +
66081 +#endif /* __i386_PCI_H */
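[editor's note — illustrative sketch] The pcibios_scan_all_fns override above forces the generic scan to probe every function, because Xen may hide the multifunction hint along with the bridge itself. A toy scan loop showing the difference; toy_probe() is a hypothetical stand-in for the real per-function probe:

#include <stdio.h>

static void toy_probe(int bus, int slot, int fn)
{
        printf("probe %02x:%02x.%d\n", bus, slot, fn);
}

static void toy_scan_bus(int bus, int scan_all_fns)
{
        for (int slot = 0; slot < 32; slot++) {
                /* Native code stops at fn 0 unless the header declares a
                 * multifunction device; pcibios_scan_all_fns(...) == 1
                 * forces all 8 functions to be probed instead. */
                int nfns = scan_all_fns ? 8 : 1;
                for (int fn = 0; fn < nfns; fn++)
                        toy_probe(bus, slot, fn);
        }
}

int main(void)
{
        toy_scan_bus(0, 1);   /* Xen-style exhaustive scan */
        return 0;
}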
66082 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pgalloc.h linux-2.6.16/include/asm-i386/mach-xen/asm/pgalloc.h
66083 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pgalloc.h   1970-01-01 01:00:00.000000000 +0100
66084 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/pgalloc.h        2006-06-26 09:51:32.000000000 +0200
66085 @@ -0,0 +1,64 @@
66086 +#ifndef _I386_PGALLOC_H
66087 +#define _I386_PGALLOC_H
66088 +
66089 +#include <linux/config.h>
66090 +#include <asm/fixmap.h>
66091 +#include <linux/threads.h>
66092 +#include <linux/mm.h>          /* for struct page */
66093 +#include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
66094 +
66095 +/* Is this pagetable pinned? */
66096 +#define PG_pinned      PG_arch_1
66097 +
66098 +#define pmd_populate_kernel(mm, pmd, pte) \
66099 +               set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
66100 +
66101 +#define pmd_populate(mm, pmd, pte)                                     \
66102 +do {                                                                   \
66103 +       if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) {     \
66104 +               if (!PageHighMem(pte))                                  \
66105 +                       BUG_ON(HYPERVISOR_update_va_mapping(            \
66106 +                         (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\
66107 +                         pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));\
66108 +               set_pmd(pmd, __pmd(_PAGE_TABLE +                        \
66109 +                       ((unsigned long long)page_to_pfn(pte) <<        \
66110 +                               (unsigned long long) PAGE_SHIFT)));     \
66111 +       } else {                                                        \
66112 +               *(pmd) = __pmd(_PAGE_TABLE +                            \
66113 +                       ((unsigned long long)page_to_pfn(pte) <<        \
66114 +                               (unsigned long long) PAGE_SHIFT));      \
66115 +       }                                                               \
66116 +} while (0)
66117 +
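[editor's note — illustrative sketch] The pinned branch of pmd_populate() above must remap the new pte page read-only before it becomes reachable from the pmd, otherwise the hypervisor would refuse the pagetable update. A stubbed mock that preserves only that ordering; the toy_* stubs stand in for HYPERVISOR_update_va_mapping() and set_pmd():

#include <assert.h>

static int toy_pte_ro;     /* 1 once the pte page is remapped read-only */
static int toy_pmd_set;

/* Stub for HYPERVISOR_update_va_mapping(): records the RO downgrade. */
static int toy_make_pte_readonly(void) { toy_pte_ro = 1; return 0; }

static void toy_pmd_populate(int pgd_pinned)
{
        if (pgd_pinned) {
                /* Pinned pagetables: Xen must see the new pte page as
                 * read-only *before* it is linked in below. */
                assert(toy_make_pte_readonly() == 0);
        }
        toy_pmd_set = 1;   /* stands in for set_pmd() */
}

int main(void)
{
        toy_pmd_populate(1);
        assert(toy_pte_ro && toy_pmd_set);
        return 0;
}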
66118 +/*
66119 + * Allocate and free page tables.
66120 + */
66121 +extern pgd_t *pgd_alloc(struct mm_struct *);
66122 +extern void pgd_free(pgd_t *pgd);
66123 +
66124 +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
66125 +extern struct page *pte_alloc_one(struct mm_struct *, unsigned long);
66126 +
66127 +static inline void pte_free_kernel(pte_t *pte)
66128 +{
66129 +       free_page((unsigned long)pte);
66130 +       make_page_writable(pte, XENFEAT_writable_page_tables);
66131 +}
66132 +
66133 +extern void pte_free(struct page *pte);
66134 +
66135 +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
66136 +
66137 +#ifdef CONFIG_X86_PAE
66138 +/*
66139 + * In the PAE case we free the pmds as part of the pgd.
66140 + */
66141 +#define pmd_alloc_one(mm, addr)                ({ BUG(); ((pmd_t *)2); })
66142 +#define pmd_free(x)                    do { } while (0)
66143 +#define __pmd_free_tlb(tlb,x)          do { } while (0)
66144 +#define pud_populate(mm, pmd, pte)     BUG()
66145 +#endif
66146 +
66147 +#define check_pgt_cache()      do { } while (0)
66148 +
66149 +#endif /* _I386_PGALLOC_H */
66150 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h linux-2.6.16/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h
66151 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h       1970-01-01 01:00:00.000000000 +0100
66152 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h    2006-06-26 09:51:32.000000000 +0200
66153 @@ -0,0 +1,21 @@
66154 +#ifndef _I386_PGTABLE_2LEVEL_DEFS_H
66155 +#define _I386_PGTABLE_2LEVEL_DEFS_H
66156 +
66157 +#define HAVE_SHARED_KERNEL_PMD 0
66158 +
66159 +/*
66160 + * traditional i386 two-level paging structure:
66161 + */
66162 +
66163 +#define PGDIR_SHIFT    22
66164 +#define PTRS_PER_PGD   1024
66165 +#define PTRS_PER_PGD_NO_HV     (HYPERVISOR_VIRT_START >> PGDIR_SHIFT)
66166 +
66167 +/*
66168 + * the i386 is two-level, so we don't really have any
66169 + * PMD directory physically.
66170 + */
66171 +
66172 +#define PTRS_PER_PTE   1024
66173 +
66174 +#endif /* _I386_PGTABLE_2LEVEL_DEFS_H */
66175 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pgtable-2level.h linux-2.6.16/include/asm-i386/mach-xen/asm/pgtable-2level.h
66176 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pgtable-2level.h    1970-01-01 01:00:00.000000000 +0100
66177 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/pgtable-2level.h 2006-06-26 09:51:32.000000000 +0200
66178 @@ -0,0 +1,83 @@
66179 +#ifndef _I386_PGTABLE_2LEVEL_H
66180 +#define _I386_PGTABLE_2LEVEL_H
66181 +
66182 +#include <asm-generic/pgtable-nopmd.h>
66183 +
66184 +#define pte_ERROR(e) \
66185 +       printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
66186 +#define pgd_ERROR(e) \
66187 +       printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
66188 +
66189 +/*
66190 + * Certain architectures need to do special things when PTEs
66191 + * within a page table are directly modified.  Thus, the following
66192 + * hook is made available.
66193 + */
66194 +#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
66195 +
66196 +#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
66197 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
66198 +           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
66199 +               set_pte((ptep), (pteval));                              \
66200 +} while (0)
66201 +
66202 +#define set_pte_at_sync(_mm,addr,ptep,pteval) do {                     \
66203 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
66204 +           HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
66205 +               set_pte((ptep), (pteval));                              \
66206 +               xen_invlpg((addr));                                     \
66207 +       }                                                               \
66208 +} while (0)
66209 +
66210 +#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
66211 +
66212 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
66213 +
66214 +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte_low, 0))
66215 +#define pte_same(a, b)         ((a).pte_low == (b).pte_low)
66216 +#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
66217 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
66218 +
66219 +#define pte_page(_pte) pfn_to_page(pte_pfn(_pte))
66220 +
66221 +#define pte_none(x)            (!(x).pte_low)
66222 +#define pfn_pte(pfn, prot)     __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
66223 +#define pfn_pte_ma(pfn, prot)  __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
66224 +#define pfn_pmd(pfn, prot)     __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
66225 +
66226 +/*
66227 + * All present user pages are user-executable:
66228 + */
66229 +static inline int pte_exec(pte_t pte)
66230 +{
66231 +       return pte_user(pte);
66232 +}
66233 +
66234 +/*
66235 + * All present pages are kernel-executable:
66236 + */
66237 +static inline int pte_exec_kernel(pte_t pte)
66238 +{
66239 +       return 1;
66240 +}
66241 +
66242 +/*
66243 + * Bits 0, 6 and 7 are taken, so the 29 bits of offset are split up
66244 + * into this range:
66245 + */
66246 +#define PTE_FILE_MAX_BITS      29
66247 +
66248 +#define pte_to_pgoff(pte) \
66249 +       ((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
66250 +
66251 +#define pgoff_to_pte(off) \
66252 +       ((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
66253 +
66254 +/* Encode and de-code a swap entry */
66255 +#define __swp_type(x)                  (((x).val >> 1) & 0x1f)
66256 +#define __swp_offset(x)                        ((x).val >> 8)
66257 +#define __swp_entry(type, offset)      ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
66258 +#define __pte_to_swp_entry(pte)                ((swp_entry_t) { (pte).pte_low })
66259 +#define __swp_entry_to_pte(x)          ((pte_t) { (x).val })
66260 +
66261 +#endif /* _I386_PGTABLE_2LEVEL_H */
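[editor's note — illustrative sketch] The nonlinear-file encoding above packs a 29-bit offset around the reserved bits 0, 6 and 7: five bits at position 1, the remaining 24 at position 8. A round-trip test of the same bit layout (toy_* names assumed):

#include <assert.h>

#define TOY_PAGE_FILE 0x040   /* mirrors _PAGE_FILE (bit 6) */

static unsigned long toy_pgoff_to_pte(unsigned long off)
{
        /* Same arithmetic as pgoff_to_pte() above. */
        return ((off & 0x1f) << 1) + ((off >> 5) << 8) + TOY_PAGE_FILE;
}

static unsigned long toy_pte_to_pgoff(unsigned long pte)
{
        /* Same arithmetic as pte_to_pgoff() above. */
        return ((pte >> 1) & 0x1f) + ((pte >> 8) << 5);
}

int main(void)
{
        for (unsigned long off = 0; off < (1UL << 29); off += 0x10001)
                assert(toy_pte_to_pgoff(toy_pgoff_to_pte(off)) == off);
        return 0;
}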
66262 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h linux-2.6.16/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h
66263 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h       1970-01-01 01:00:00.000000000 +0100
66264 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h    2006-06-26 09:51:32.000000000 +0200
66265 @@ -0,0 +1,25 @@
66266 +#ifndef _I386_PGTABLE_3LEVEL_DEFS_H
66267 +#define _I386_PGTABLE_3LEVEL_DEFS_H
66268 +
66269 +#define HAVE_SHARED_KERNEL_PMD 0
66270 +
66271 +/*
66272 + * PGDIR_SHIFT determines what a top-level page table entry can map
66273 + */
66274 +#define PGDIR_SHIFT    30
66275 +#define PTRS_PER_PGD   4
66276 +#define PTRS_PER_PGD_NO_HV 4
66277 +
66278 +/*
66279 + * PMD_SHIFT determines the size of the area a middle-level
66280 + * page table can map
66281 + */
66282 +#define PMD_SHIFT      21
66283 +#define PTRS_PER_PMD   512
66284 +
66285 +/*
66286 + * entries per page directory level
66287 + */
66288 +#define PTRS_PER_PTE   512
66289 +
66290 +#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */
66291 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pgtable-3level.h linux-2.6.16/include/asm-i386/mach-xen/asm/pgtable-3level.h
66292 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pgtable-3level.h    1970-01-01 01:00:00.000000000 +0100
66293 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/pgtable-3level.h 2006-06-26 09:51:32.000000000 +0200
66294 @@ -0,0 +1,181 @@
66295 +#ifndef _I386_PGTABLE_3LEVEL_H
66296 +#define _I386_PGTABLE_3LEVEL_H
66297 +
66298 +#include <asm-generic/pgtable-nopud.h>
66299 +
66300 +/*
66301 + * Intel Physical Address Extension (PAE) Mode - three-level page
66302 + * tables on PPro+ CPUs.
66303 + *
66304 + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
66305 + */
66306 +
66307 +#define pte_ERROR(e) \
66308 +       printk("%s:%d: bad pte %p(%08lx%08lx).\n", __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
66309 +#define pmd_ERROR(e) \
66310 +       printk("%s:%d: bad pmd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
66311 +#define pgd_ERROR(e) \
66312 +       printk("%s:%d: bad pgd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
66313 +
66314 +#define pud_none(pud)                          0
66315 +#define pud_bad(pud)                           0
66316 +#define pud_present(pud)                       1
66317 +
66318 +/*
66319 + * Is the pte executable?
66320 + */
66321 +static inline int pte_x(pte_t pte)
66322 +{
66323 +       return !(pte_val(pte) & _PAGE_NX);
66324 +}
66325 +
66326 +/*
66327 + * All present user-pages with !NX bit are user-executable:
66328 + */
66329 +static inline int pte_exec(pte_t pte)
66330 +{
66331 +       return pte_user(pte) && pte_x(pte);
66332 +}
66333 +/*
66334 + * All present pages with !NX bit are kernel-executable:
66335 + */
66336 +static inline int pte_exec_kernel(pte_t pte)
66337 +{
66338 +       return pte_x(pte);
66339 +}
66340 +
66341 +/* Rules for using set_pte: the pte being assigned *must* be
66342 + * either not present or in a state where the hardware will
66343 + * not attempt to update the pte.  In places where this is
66344 + * not possible, use pte_get_and_clear to obtain the old pte
66345 + * value and then use set_pte to update it.  -ben
66346 + */
66347 +#define __HAVE_ARCH_SET_PTE_ATOMIC
66348 +
66349 +#if 1
66350 +/* use writable pagetables */
66351 +static inline void set_pte(pte_t *ptep, pte_t pte)
66352 +{
66353 +       ptep->pte_high = pte.pte_high;
66354 +       smp_wmb();
66355 +       ptep->pte_low = pte.pte_low;
66356 +}
66357 +# define set_pte_atomic(pteptr,pteval) \
66358 +               set_64bit((unsigned long long *)(pteptr),pte_val_ma(pteval))
66359 +#else
66360 +/* no writable pagetables */
66361 +# define set_pte(pteptr,pteval)                                \
66362 +               xen_l1_entry_update((pteptr), (pteval))
66363 +# define set_pte_atomic(pteptr,pteval) set_pte(pteptr,pteval)
66364 +#endif
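[editor's note — illustrative sketch] The writable-pagetables set_pte() above stores the high word first and relies on smp_wmb() so the present bit in pte_low never becomes visible before the rest of the entry. A C11 analogue using a release store in place of smp_wmb() (toy_* names assumed):

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

struct toy_pte { _Atomic uint32_t pte_low; uint32_t pte_high; };

static void toy_set_pte(struct toy_pte *p, uint64_t v)
{
        /* Publish the high word first, then make the entry live by
         * storing the low word (which holds the present bit) with
         * release ordering, the C11 counterpart of smp_wmb(). */
        p->pte_high = (uint32_t)(v >> 32);
        atomic_store_explicit(&p->pte_low, (uint32_t)v, memory_order_release);
}

int main(void)
{
        struct toy_pte p = { 0, 0 };
        toy_set_pte(&p, 0x8000000f12345067ull);
        assert(p.pte_low == 0x12345067u && p.pte_high == 0x8000000fu);
        return 0;
}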
66365 +
66366 +#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
66367 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
66368 +           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
66369 +               set_pte((ptep), (pteval));                              \
66370 +} while (0)
66371 +
66372 +#define set_pte_at_sync(_mm,addr,ptep,pteval) do {                     \
66373 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
66374 +           HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
66375 +               set_pte((ptep), (pteval));                              \
66376 +               xen_invlpg((addr));                                     \
66377 +       }                                                               \
66378 +} while (0)
66379 +
66380 +#define set_pmd(pmdptr,pmdval)                         \
66381 +               xen_l2_entry_update((pmdptr), (pmdval))
66382 +#define set_pud(pudptr,pudval) \
66383 +               xen_l3_entry_update((pudptr), (pudval))
66384 +
66385 +/*
66386 + * Pentium-II erratum A13: in PAE mode we explicitly have to flush
66387 + * the TLB via cr3 if the top-level pgd is changed...
66388 + * We do not let the generic code free and clear pgd entries due to
66389 + * this erratum.
66390 + */
66391 +static inline void pud_clear (pud_t * pud) { }
66392 +
66393 +#define pud_page(pud) \
66394 +((struct page *) __va(pud_val(pud) & PAGE_MASK))
66395 +
66396 +#define pud_page_kernel(pud) \
66397 +((unsigned long) __va(pud_val(pud) & PAGE_MASK))
66398 +
66399 +
66400 +/* Find an entry in the second-level page table.. */
66401 +#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
66402 +                       pmd_index(address))
66403 +
66404 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
66405 +{
66406 +       pte_t res;
66407 +
66408 +       /* xchg acts as a barrier before the setting of the high bits */
66409 +       res.pte_low = xchg(&ptep->pte_low, 0);
66410 +       res.pte_high = ptep->pte_high;
66411 +       ptep->pte_high = 0;
66412 +
66413 +       return res;
66414 +}
66415 +
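[editor's note — illustrative sketch] ptep_get_and_clear() above relies on xchg() acting as a barrier: the low word, which carries the present bit, must be cleared atomically before the high word is read and cleared, so no walker ever sees a half-torn-down entry as present. A C11 analogue (toy_* names assumed):

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

struct toy_pte { _Atomic uint32_t pte_low; uint32_t pte_high; };

static uint64_t toy_get_and_clear(struct toy_pte *p)
{
        /* The atomic exchange of the low word doubles as the barrier,
         * just as xchg() does in ptep_get_and_clear() above. */
        uint32_t lo = atomic_exchange(&p->pte_low, 0);
        uint32_t hi = p->pte_high;
        p->pte_high = 0;
        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        struct toy_pte p = { 0x1067, 0x8 };
        assert(toy_get_and_clear(&p) == 0x800001067ull);
        assert(p.pte_low == 0 && p.pte_high == 0);
        return 0;
}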
66416 +static inline int pte_same(pte_t a, pte_t b)
66417 +{
66418 +       return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
66419 +}
66420 +
66421 +#define pte_page(x)    pfn_to_page(pte_pfn(x))
66422 +
66423 +static inline int pte_none(pte_t pte)
66424 +{
66425 +       return !pte.pte_low && !pte.pte_high;
66426 +}
66427 +
66428 +#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |\
66429 +                      (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
66430 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
66431 +
66432 +extern unsigned long long __supported_pte_mask;
66433 +
66434 +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
66435 +{
66436 +       pte_t pte;
66437 +
66438 +       pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
66439 +                                       (pgprot_val(pgprot) >> 32);
66440 +       pte.pte_high &= (__supported_pte_mask >> 32);
66441 +       pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
66442 +                                                       __supported_pte_mask;
66443 +       return pte;
66444 +}
66445 +
66446 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
66447 +{
66448 +       return pfn_pte_ma(pfn_to_mfn(page_nr), pgprot);
66449 +}
66450 +
66451 +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
66452 +{
66453 +       BUG(); panic("needs review");
66454 +       return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | \
66455 +                       pgprot_val(pgprot)) & __supported_pte_mask);
66456 +}
66457 +
66458 +/*
66459 + * Bits 0, 6 and 7 are taken in the low part of the pte, so the
66460 + * 32 bits of offset are put into the high part.
66461 + */
66462 +#define pte_to_pgoff(pte) ((pte).pte_high)
66463 +#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
66464 +#define PTE_FILE_MAX_BITS       32
66465 +
66466 +/* Encode and de-code a swap entry */
66467 +#define __swp_type(x)                  (((x).val) & 0x1f)
66468 +#define __swp_offset(x)                        ((x).val >> 5)
66469 +#define __swp_entry(type, offset)      ((swp_entry_t){(type) | (offset) << 5})
66470 +#define __pte_to_swp_entry(pte)                ((swp_entry_t){ (pte).pte_high })
66471 +#define __swp_entry_to_pte(x)          ((pte_t){ 0, (x).val })
66472 +
66473 +#define __pmd_free_tlb(tlb, x)         do { } while (0)
66474 +
66475 +#endif /* _I386_PGTABLE_3LEVEL_H */
66476 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pgtable.h linux-2.6.16/include/asm-i386/mach-xen/asm/pgtable.h
66477 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/pgtable.h   1970-01-01 01:00:00.000000000 +0100
66478 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/pgtable.h        2006-06-26 09:51:32.000000000 +0200
66479 @@ -0,0 +1,521 @@
66480 +#ifndef _I386_PGTABLE_H
66481 +#define _I386_PGTABLE_H
66482 +
66483 +#include <linux/config.h>
66484 +#include <asm/hypervisor.h>
66485 +
66486 +/*
66487 + * The Linux memory management assumes a three-level page table setup. On
66488 + * the i386, we use that, but "fold" the mid level into the top-level page
66489 + * table, so that we physically have the same two-level page table as the
66490 + * i386 mmu expects.
66491 + *
66492 + * This file contains the functions and defines necessary to modify and use
66493 + * the i386 page table tree.
66494 + */
66495 +#ifndef __ASSEMBLY__
66496 +#include <asm/processor.h>
66497 +#include <asm/fixmap.h>
66498 +#include <linux/threads.h>
66499 +
66500 +#ifndef _I386_BITOPS_H
66501 +#include <asm/bitops.h>
66502 +#endif
66503 +
66504 +#include <linux/slab.h>
66505 +#include <linux/list.h>
66506 +#include <linux/spinlock.h>
66507 +
66508 +struct mm_struct;
66509 +struct vm_area_struct;
66510 +
66511 +/*
66512 + * ZERO_PAGE is a global shared page that is always zero: used
66513 + * for zero-mapped memory areas etc..
66514 + */
66515 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
66516 +extern unsigned long empty_zero_page[1024];
66517 +extern pgd_t *swapper_pg_dir;
66518 +extern kmem_cache_t *pgd_cache;
66519 +extern kmem_cache_t *pmd_cache;
66520 +extern spinlock_t pgd_lock;
66521 +extern struct page *pgd_list;
66522 +
66523 +void pmd_ctor(void *, kmem_cache_t *, unsigned long);
66524 +void pgd_ctor(void *, kmem_cache_t *, unsigned long);
66525 +void pgd_dtor(void *, kmem_cache_t *, unsigned long);
66526 +void pgtable_cache_init(void);
66527 +void paging_init(void);
66528 +
66529 +/*
66530 + * The Linux x86 paging architecture is 'compile-time dual-mode', it
66531 + * implements both the traditional 2-level x86 page tables and the
66532 + * newer 3-level PAE-mode page tables.
66533 + */
66534 +#ifdef CONFIG_X86_PAE
66535 +# include <asm/pgtable-3level-defs.h>
66536 +# define PMD_SIZE      (1UL << PMD_SHIFT)
66537 +# define PMD_MASK      (~(PMD_SIZE-1))
66538 +#else
66539 +# include <asm/pgtable-2level-defs.h>
66540 +#endif
66541 +
66542 +#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
66543 +#define PGDIR_MASK     (~(PGDIR_SIZE-1))
66544 +
66545 +#define USER_PTRS_PER_PGD      (TASK_SIZE/PGDIR_SIZE)
66546 +#define FIRST_USER_ADDRESS     0
66547 +
66548 +#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
66549 +#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
66550 +
66551 +#define TWOLEVEL_PGDIR_SHIFT   22
66552 +#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT)
66553 +#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS)
66554 +
66555 +/* Just any arbitrary offset to the start of the vmalloc VM area: the
66556 + * current 8MB value just means that there will be an 8MB "hole" after the
66557 + * physical memory until the kernel virtual memory starts.  That means that
66558 + * any out-of-bounds memory accesses will hopefully be caught.
66559 + * The vmalloc() routines leave a hole of 4kB between each vmalloced
66560 + * area for the same reason. ;)
66561 + */
66562 +#define VMALLOC_OFFSET (8*1024*1024)
66563 +#define VMALLOC_START  (((unsigned long) high_memory + vmalloc_earlyreserve + \
66564 +                       2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
66565 +#ifdef CONFIG_HIGHMEM
66566 +# define VMALLOC_END   (PKMAP_BASE-2*PAGE_SIZE)
66567 +#else
66568 +# define VMALLOC_END   (FIXADDR_START-2*PAGE_SIZE)
66569 +#endif
66570 +
66571 +/*
66572 + * _PAGE_PSE set in the page directory entry just means that
66573 + * the page directory entry points directly to a 4MB-aligned block of
66574 + * memory. 
66575 + */
66576 +#define _PAGE_BIT_PRESENT      0
66577 +#define _PAGE_BIT_RW           1
66578 +#define _PAGE_BIT_USER         2
66579 +#define _PAGE_BIT_PWT          3
66580 +#define _PAGE_BIT_PCD          4
66581 +#define _PAGE_BIT_ACCESSED     5
66582 +#define _PAGE_BIT_DIRTY                6
66583 +#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page, Pentium+, if present.. */
66584 +#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
66585 +#define _PAGE_BIT_UNUSED1      9       /* available for programmer */
66586 +#define _PAGE_BIT_UNUSED2      10
66587 +#define _PAGE_BIT_UNUSED3      11
66588 +#define _PAGE_BIT_NX           63
66589 +
66590 +#define _PAGE_PRESENT  0x001
66591 +#define _PAGE_RW       0x002
66592 +#define _PAGE_USER     0x004
66593 +#define _PAGE_PWT      0x008
66594 +#define _PAGE_PCD      0x010
66595 +#define _PAGE_ACCESSED 0x020
66596 +#define _PAGE_DIRTY    0x040
66597 +#define _PAGE_PSE      0x080   /* 4 MB (or 2MB) page, Pentium+, if present.. */
66598 +#define _PAGE_GLOBAL   0x100   /* Global TLB entry PPro+ */
66599 +#define _PAGE_UNUSED1  0x200   /* available for programmer */
66600 +#define _PAGE_UNUSED2  0x400
66601 +#define _PAGE_UNUSED3  0x800
66602 +
66603 +/* If _PAGE_PRESENT is clear, we use these: */
66604 +#define _PAGE_FILE     0x040   /* nonlinear file mapping, saved PTE; unset:swap */
66605 +#define _PAGE_PROTNONE 0x080   /* if the user mapped it with PROT_NONE;
66606 +                                  pte_present gives true */
66607 +#ifdef CONFIG_X86_PAE
66608 +#define _PAGE_NX       (1ULL<<_PAGE_BIT_NX)
66609 +#else
66610 +#define _PAGE_NX       0
66611 +#endif
66612 +
66613 +#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
66614 +#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
66615 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
66616 +
66617 +#define PAGE_NONE \
66618 +       __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
66619 +#define PAGE_SHARED \
66620 +       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
66621 +
66622 +#define PAGE_SHARED_EXEC \
66623 +       __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
66624 +#define PAGE_COPY_NOEXEC \
66625 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
66626 +#define PAGE_COPY_EXEC \
66627 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
66628 +#define PAGE_COPY \
66629 +       PAGE_COPY_NOEXEC
66630 +#define PAGE_READONLY \
66631 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
66632 +#define PAGE_READONLY_EXEC \
66633 +       __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
66634 +
66635 +#define _PAGE_KERNEL \
66636 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
66637 +#define _PAGE_KERNEL_EXEC \
66638 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
66639 +
66640 +extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
66641 +#define __PAGE_KERNEL_RO               (__PAGE_KERNEL & ~_PAGE_RW)
66642 +#define __PAGE_KERNEL_NOCACHE          (__PAGE_KERNEL | _PAGE_PCD)
66643 +#define __PAGE_KERNEL_LARGE            (__PAGE_KERNEL | _PAGE_PSE)
66644 +#define __PAGE_KERNEL_LARGE_EXEC       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
66645 +
66646 +#define PAGE_KERNEL            __pgprot(__PAGE_KERNEL)
66647 +#define PAGE_KERNEL_RO         __pgprot(__PAGE_KERNEL_RO)
66648 +#define PAGE_KERNEL_EXEC       __pgprot(__PAGE_KERNEL_EXEC)
66649 +#define PAGE_KERNEL_NOCACHE    __pgprot(__PAGE_KERNEL_NOCACHE)
66650 +#define PAGE_KERNEL_LARGE      __pgprot(__PAGE_KERNEL_LARGE)
66651 +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
66652 +
66653 +/*
66654 + * The i386 can't do page protection for execute, and considers that
66655 + * the same as read. Also, write permissions imply read permissions.
66656 + * This is the closest we can get..
66657 + */
66658 +#define __P000 PAGE_NONE
66659 +#define __P001 PAGE_READONLY
66660 +#define __P010 PAGE_COPY
66661 +#define __P011 PAGE_COPY
66662 +#define __P100 PAGE_READONLY_EXEC
66663 +#define __P101 PAGE_READONLY_EXEC
66664 +#define __P110 PAGE_COPY_EXEC
66665 +#define __P111 PAGE_COPY_EXEC
66666 +
66667 +#define __S000 PAGE_NONE
66668 +#define __S001 PAGE_READONLY
66669 +#define __S010 PAGE_SHARED
66670 +#define __S011 PAGE_SHARED
66671 +#define __S100 PAGE_READONLY_EXEC
66672 +#define __S101 PAGE_READONLY_EXEC
66673 +#define __S110 PAGE_SHARED_EXEC
66674 +#define __S111 PAGE_SHARED_EXEC
66675 +
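[editor's note — illustrative sketch] The __P/__S tables above are indexed by the mmap prot bit triple (bit 0 = read, bit 1 = write, bit 2 = exec): a private writable mapping becomes copy-on-write, a shared one stays shared. A toy lookup using string stand-ins for the pgprot values (an assumption made for illustration):

#include <stdio.h>

static const char *toy_P[8] = {      /* private (copy-on-write) mappings */
        "NONE", "READONLY", "COPY", "COPY",
        "READONLY_EXEC", "READONLY_EXEC", "COPY_EXEC", "COPY_EXEC",
};
static const char *toy_S[8] = {      /* shared mappings */
        "NONE", "READONLY", "SHARED", "SHARED",
        "READONLY_EXEC", "READONLY_EXEC", "SHARED_EXEC", "SHARED_EXEC",
};

int main(void)
{
        int prot = 0x3;   /* read|write, like __P011/__S011 above */
        printf("private rw -> PAGE_%s\n", toy_P[prot]);  /* COPY   */
        printf("shared  rw -> PAGE_%s\n", toy_S[prot]);  /* SHARED */
        return 0;
}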
66676 +/*
66677 + * Define this if things work differently on an i386 and an i486:
66678 + * it will (on an i486) warn about kernel memory accesses that are
66679 + * done without an 'access_ok(VERIFY_WRITE,..)'
66680 + */
66681 +#undef TEST_ACCESS_OK
66682 +
66683 +/* The boot page tables (all created as a single array) */
66684 +extern unsigned long pg0[];
66685 +
66686 +#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
66687 +#define pte_clear(mm,addr,xp)  do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
66688 +
66689 +/* To avoid harmful races, pmd_none(x) should check only the lower word when PAE */
66690 +#define pmd_none(x)    (!(unsigned long)pmd_val(x))
66691 +/* pmd_present doesn't just test the _PAGE_PRESENT bit, since writable
66692 +   page tables (wr.p.t.) can temporarily clear it. */
66693 +#define pmd_present(x) (pmd_val(x))
66694 +#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
66695 +#define pmd_bad(x)     ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
66696 +
66697 +
66698 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
66699 +
66700 +/*
66701 + * The following only work if pte_present() is true.
66702 + * Undefined behaviour if not..
66703 + */
66704 +#define __LARGE_PTE (_PAGE_PSE | _PAGE_PRESENT)
66705 +static inline int pte_user(pte_t pte)          { return (pte).pte_low & _PAGE_USER; }
66706 +static inline int pte_read(pte_t pte)          { return (pte).pte_low & _PAGE_USER; }
66707 +static inline int pte_dirty(pte_t pte)         { return (pte).pte_low & _PAGE_DIRTY; }
66708 +static inline int pte_young(pte_t pte)         { return (pte).pte_low & _PAGE_ACCESSED; }
66709 +static inline int pte_write(pte_t pte)         { return (pte).pte_low & _PAGE_RW; }
66710 +static inline int pte_huge(pte_t pte)          { return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; }
66711 +
66712 +/*
66713 + * The following only works if pte_present() is not true.
66714 + */
66715 +static inline int pte_file(pte_t pte)          { return (pte).pte_low & _PAGE_FILE; }
66716 +
66717 +static inline pte_t pte_rdprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_USER; return pte; }
66718 +static inline pte_t pte_exprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_USER; return pte; }
66719 +static inline pte_t pte_mkclean(pte_t pte)     { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
66720 +static inline pte_t pte_mkold(pte_t pte)       { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
66721 +static inline pte_t pte_wrprotect(pte_t pte)   { (pte).pte_low &= ~_PAGE_RW; return pte; }
66722 +static inline pte_t pte_mkread(pte_t pte)      { (pte).pte_low |= _PAGE_USER; return pte; }
66723 +static inline pte_t pte_mkexec(pte_t pte)      { (pte).pte_low |= _PAGE_USER; return pte; }
66724 +static inline pte_t pte_mkdirty(pte_t pte)     { (pte).pte_low |= _PAGE_DIRTY; return pte; }
66725 +static inline pte_t pte_mkyoung(pte_t pte)     { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
66726 +static inline pte_t pte_mkwrite(pte_t pte)     { (pte).pte_low |= _PAGE_RW; return pte; }
66727 +static inline pte_t pte_mkhuge(pte_t pte)      { (pte).pte_low |= __LARGE_PTE; return pte; }
66728 +
66729 +#ifdef CONFIG_X86_PAE
66730 +# include <asm/pgtable-3level.h>
66731 +#else
66732 +# include <asm/pgtable-2level.h>
66733 +#endif
66734 +
66735 +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
66736 +{
66737 +       if (!pte_dirty(*ptep))
66738 +               return 0;
66739 +       return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
66740 +}
66741 +
66742 +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
66743 +{
66744 +       if (!pte_young(*ptep))
66745 +               return 0;
66746 +       return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low);
66747 +}
66748 +
66749 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
66750 +{
66751 +       pte_t pte;
66752 +       if (full) {
66753 +               pte = *ptep;
66754 +#ifdef CONFIG_X86_PAE
66755 +               /* Cannot do this in a single step, as the compiler may
66756 +                  issue the two stores in either order, but the hypervisor
66757 +                  must not see the high part before the low one. */
66758 +               ptep->pte_low = 0;
66759 +               barrier();
66760 +               ptep->pte_high = 0;
66761 +#else
66762 +               *ptep = __pte(0);
66763 +#endif
66764 +       } else {
66765 +               pte = ptep_get_and_clear(mm, addr, ptep);
66766 +       }
66767 +       return pte;
66768 +}
66769 +
66770 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
66771 +{
66772 +       if (pte_write(*ptep))
66773 +               clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
66774 +}
66775 +
66776 +/*
66777 + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
66778 + *
66779 + *  dst - pointer to pgd range anywhere on a pgd page
66780 + *  src - ""
66781 + *  count - the number of pgds to copy.
66782 + *
66783 + * dst and src can be on the same page, but the range must not overlap,
66784 + * and must not cross a page boundary.
66785 + */
66786 +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
66787 +{
66788 +       memcpy(dst, src, count * sizeof(pgd_t));
66789 +}
66790 +
66791 +/*
66792 + * Macro to mark a page protection value as "uncacheable".  On processors which do not support
66793 + * it, this is a no-op.
66794 + */
66795 +#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3)                                          \
66796 +                                ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
66797 +
66798 +/*
66799 + * Conversion functions: convert a page and protection to a page entry,
66800 + * and a page entry and page directory to the page they refer to.
66801 + */
66802 +
66803 +#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
66804 +
66805 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
66806 +{
66807 +       pte.pte_low &= _PAGE_CHG_MASK;
66808 +       pte.pte_low |= pgprot_val(newprot);
66809 +#ifdef CONFIG_X86_PAE
66810 +       /*
66811 +        * Chop off the NX bit (if present), and add the NX portion of
66812 +        * the newprot (if present):
66813 +        */
66814 +       pte.pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
66815 +       pte.pte_high |= (pgprot_val(newprot) >> 32) & \
66816 +                                       (__supported_pte_mask >> 32);
66817 +#endif
66818 +       return pte;
66819 +}
66820 +
66821 +#define pmd_large(pmd) \
66822 +((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
66823 +
66824 +/*
66825 + * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
66826 + *
66827 + * this macro returns the index of the entry in the pgd page which would
66828 + * control the given virtual address
66829 + */
66830 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
66831 +#define pgd_index_k(addr) pgd_index(addr)
66832 +
66833 +/*
66834 + * pgd_offset() returns a (pgd_t *)
66835 + * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
66836 + */
66837 +#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
66838 +
66839 +/*
66840 + * a shortcut which implies the use of the kernel's pgd, instead
66841 + * of a process's
66842 + */
66843 +#define pgd_offset_k(address) pgd_offset(&init_mm, address)
66844 +
66845 +/*
66846 + * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
66847 + *
66848 + * this macro returns the index of the entry in the pmd page which would
66849 + * control the given virtual address
66850 + */
66851 +#define pmd_index(address) \
66852 +               (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
66853 +
66854 +/*
66855 + * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
66856 + *
66857 + * this macro returns the index of the entry in the pte page which would
66858 + * control the given virtual address
66859 + */
66860 +#define pte_index(address) \
66861 +               (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
66862 +#define pte_offset_kernel(dir, address) \
66863 +       ((pte_t *) pmd_page_kernel(*(dir)) +  pte_index(address))
66864 +
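[editor's note — illustrative sketch] pgd_index()/pte_index() above are pure shift-and-mask arithmetic; with the non-PAE constants (PGDIR_SHIFT 22, 1024 entries per level) a virtual address decomposes as below. The toy_* constants mirror pgtable-2level-defs.h as assumptions:

#include <assert.h>

#define TOY_PAGE_SHIFT   12
#define TOY_PGDIR_SHIFT  22
#define TOY_PTRS_PER_PGD 1024
#define TOY_PTRS_PER_PTE 1024

int main(void)
{
        unsigned long addr = 0xC0123456UL;   /* some kernel virtual address */
        /* Same arithmetic as pgd_index()/pte_index() above. */
        unsigned long pgd_i = (addr >> TOY_PGDIR_SHIFT) & (TOY_PTRS_PER_PGD - 1);
        unsigned long pte_i = (addr >> TOY_PAGE_SHIFT) & (TOY_PTRS_PER_PTE - 1);
        assert(pgd_i == 0x300);   /* top 10 bits: 0xC0123456 >> 22 */
        assert(pte_i == 0x123);   /* bits 12..21 */
        return 0;
}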
66865 +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
66866 +
66867 +#define pmd_page_kernel(pmd) \
66868 +               ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
66869 +
66870 +/*
66871 + * Helper function that returns the kernel pagetable entry controlling
66872 + * the virtual address 'address'. NULL means no pagetable entry present.
66873 + * NOTE: the return type is pte_t but if the pmd is PSE then we return it
66874 + * as a pte too.
66875 + */
66876 +extern pte_t *lookup_address(unsigned long address);
66877 +
66878 +/*
66879 + * Make a given kernel text page executable/non-executable.
66880 + * Returns the previous executability setting of that page (which
66881 + * is used to restore the previous state). Used by the SMP bootup code.
66882 + * NOTE: this is an __init function for security reasons.
66883 + */
66884 +#ifdef CONFIG_X86_PAE
66885 + extern int set_kernel_exec(unsigned long vaddr, int enable);
66886 +#else
66887 + static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
66888 +#endif
66889 +
66890 +extern void noexec_setup(const char *str);
66891 +
66892 +#if defined(CONFIG_HIGHPTE)
66893 +#define pte_offset_map(dir, address) \
66894 +       ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
66895 +        pte_index(address))
66896 +#define pte_offset_map_nested(dir, address) \
66897 +       ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \
66898 +        pte_index(address))
66899 +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
66900 +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
66901 +#else
66902 +#define pte_offset_map(dir, address) \
66903 +       ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
66904 +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
66905 +#define pte_unmap(pte) do { } while (0)
66906 +#define pte_unmap_nested(pte) do { } while (0)
66907 +#endif
66908 +
66909 +/*
66910 + * The i386 doesn't have any external MMU info: the kernel page
66911 + * tables contain all the necessary information.
66912 + *
66913 + * Also, we only update the dirty/accessed state if we set
66914 + * the dirty bit by hand in the kernel, since the hardware
66915 + * will do the accessed bit for us, and we don't want to
66916 + * race with other CPUs that might be updating the dirty
66917 + * bit at the same time.
66918 + */
66919 +#define update_mmu_cache(vma,address,pte) do { } while (0)
66920 +#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
66921 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
66922 +       do {                                                              \
66923 +               if (__dirty) {                                            \
66924 +                       if ( likely((__vma)->vm_mm == current->mm) ) {    \
66925 +                           BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \
66926 +                       } else {                                          \
66927 +                            xen_l1_entry_update((__ptep), (__entry)); \
66928 +                           flush_tlb_page((__vma), (__address));         \
66929 +                       }                                                 \
66930 +               }                                                         \
66931 +       } while (0)
66932 +
66933 +#define __HAVE_ARCH_PTEP_ESTABLISH
66934 +#define ptep_establish(__vma, __address, __ptep, __entry)              \
66935 +do {                                                                   \
66936 +       ptep_set_access_flags(__vma, __address, __ptep, __entry, 1);    \
66937 +} while (0)
66938 +
66939 +#include <xen/features.h>
66940 +void make_lowmem_page_readonly(void *va, unsigned int feature);
66941 +void make_lowmem_page_writable(void *va, unsigned int feature);
66942 +void make_page_readonly(void *va, unsigned int feature);
66943 +void make_page_writable(void *va, unsigned int feature);
66944 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
66945 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
66946 +
66947 +#define virt_to_ptep(__va)                                             \
66948 +({                                                                     \
66949 +       pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));             \
66950 +       pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));        \
66951 +       pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));        \
66952 +       pte_offset_kernel(__pmd, (unsigned long)(__va));                \
66953 +})
66954 +
66955 +#define arbitrary_virt_to_machine(__va)                                        \
66956 +({                                                                     \
66957 +       maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
66958 +       m | ((unsigned long)(__va) & (PAGE_SIZE-1));                    \
66959 +})
66960 +
66961 +#endif /* !__ASSEMBLY__ */
66962 +
66963 +#ifdef CONFIG_FLATMEM
66964 +#define kern_addr_valid(addr)  (1)
66965 +#endif /* CONFIG_FLATMEM */
66966 +
66967 +int direct_remap_pfn_range(struct vm_area_struct *vma,
66968 +                           unsigned long address, 
66969 +                           unsigned long mfn,
66970 +                           unsigned long size, 
66971 +                           pgprot_t prot,
66972 +                           domid_t  domid);
66973 +int direct_kernel_remap_pfn_range(unsigned long address, 
66974 +                                 unsigned long mfn,
66975 +                                 unsigned long size, 
66976 +                                 pgprot_t prot,
66977 +                                 domid_t  domid);
66978 +int create_lookup_pte_addr(struct mm_struct *mm,
66979 +                           unsigned long address,
66980 +                           uint64_t *ptep);
66981 +int touch_pte_range(struct mm_struct *mm,
66982 +                    unsigned long address,
66983 +                    unsigned long size);
66984 +
66985 +#define io_remap_pfn_range(vma,from,pfn,size,prot) \
66986 +direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
66987 +
66988 +#define MK_IOSPACE_PFN(space, pfn)     (pfn)
66989 +#define GET_IOSPACE(pfn)               0
66990 +#define GET_PFN(pfn)                   (pfn)
66991 +
66992 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
66993 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
66994 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
66995 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
66996 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
66997 +#define __HAVE_ARCH_PTE_SAME
66998 +#include <asm-generic/pgtable.h>
66999 +
67000 +#endif /* _I386_PGTABLE_H */
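For reference, a minimal user-space sketch of the address arithmetic behind arbitrary_virt_to_machine() above: the machine frame number is shifted up by PAGE_SHIFT and OR'd with the in-page offset. lookup_mfn() is a hypothetical stand-in for the pgd/pud/pmd walk done by virt_to_ptep(); the frame number and address are invented for illustration.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Hypothetical stand-in for pte_mfn(*virt_to_ptep(va)); in the patch the
 * frame comes from walking pgd/pud/pmd down to the kernel PTE. */
static unsigned long lookup_mfn(unsigned long va)
{
        return 0x1234;          /* assumed machine frame number */
}

int main(void)
{
        unsigned long va = 0xc0102abcUL;
        unsigned long m  = (lookup_mfn(va) << PAGE_SHIFT)
                         | (va & (PAGE_SIZE - 1));
        printf("machine address: %#lx\n", m);   /* prints 0x1234abc */
        return 0;
}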
67001 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/processor.h linux-2.6.16/include/asm-i386/mach-xen/asm/processor.h
67002 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/processor.h 1970-01-01 01:00:00.000000000 +0100
67003 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/processor.h      2006-06-26 09:51:32.000000000 +0200
67004 @@ -0,0 +1,753 @@
67005 +/*
67006 + * include/asm-i386/processor.h
67007 + *
67008 + * Copyright (C) 1994 Linus Torvalds
67009 + */
67010 +
67011 +#ifndef __ASM_I386_PROCESSOR_H
67012 +#define __ASM_I386_PROCESSOR_H
67013 +
67014 +#include <asm/vm86.h>
67015 +#include <asm/math_emu.h>
67016 +#include <asm/segment.h>
67017 +#include <asm/page.h>
67018 +#include <asm/types.h>
67019 +#include <asm/sigcontext.h>
67020 +#include <asm/cpufeature.h>
67021 +#include <asm/msr.h>
67022 +#include <asm/system.h>
67023 +#include <linux/cache.h>
67024 +#include <linux/config.h>
67025 +#include <linux/threads.h>
67026 +#include <asm/percpu.h>
67027 +#include <xen/interface/physdev.h>
67028 +
67029 +/* flag for disabling the tsc */
67030 +extern int tsc_disable;
67031 +
67032 +struct desc_struct {
67033 +       unsigned long a,b;
67034 +};
67035 +
67036 +#define desc_empty(desc) \
67037 +               (!((desc)->a | (desc)->b))
67038 +
67039 +#define desc_equal(desc1, desc2) \
67040 +               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
67041 +/*
67042 + * Default implementation of macro that returns current
67043 + * instruction pointer ("program counter").
67044 + */
67045 +#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
67046 +
67047 +/*
67048 + *  CPU type and hardware bug flags. Kept separately for each CPU.
67049 + *  Members of this structure are referenced in head.S, so think twice
67050 + *  before touching them. [mj]
67051 + */
67052 +
67053 +struct cpuinfo_x86 {
67054 +       __u8    x86;            /* CPU family */
67055 +       __u8    x86_vendor;     /* CPU vendor */
67056 +       __u8    x86_model;
67057 +       __u8    x86_mask;
67058 +       char    wp_works_ok;    /* It doesn't on 386's */
67059 +       char    hlt_works_ok;   /* Problems on some 486Dx4's and old 386's */
67060 +       char    hard_math;
67061 +       char    rfu;
67062 +       int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
67063 +       unsigned long   x86_capability[NCAPINTS];
67064 +       char    x86_vendor_id[16];
67065 +       char    x86_model_id[64];
67066 +       int     x86_cache_size;  /* in KB - valid for CPUS which support this
67067 +                                   call  */
67068 +       int     x86_cache_alignment;    /* In bytes */
67069 +       char    fdiv_bug;
67070 +       char    f00f_bug;
67071 +       char    coma_bug;
67072 +       char    pad0;
67073 +       int     x86_power;
67074 +       unsigned long loops_per_jiffy;
67075 +       unsigned char x86_max_cores;    /* cpuid returned max cores value */
67076 +       unsigned char booted_cores;     /* number of cores as seen by OS */
67077 +       unsigned char apicid;
67078 +} __attribute__((__aligned__(SMP_CACHE_BYTES)));
67079 +
67080 +#define X86_VENDOR_INTEL 0
67081 +#define X86_VENDOR_CYRIX 1
67082 +#define X86_VENDOR_AMD 2
67083 +#define X86_VENDOR_UMC 3
67084 +#define X86_VENDOR_NEXGEN 4
67085 +#define X86_VENDOR_CENTAUR 5
67086 +#define X86_VENDOR_RISE 6
67087 +#define X86_VENDOR_TRANSMETA 7
67088 +#define X86_VENDOR_NSC 8
67089 +#define X86_VENDOR_NUM 9
67090 +#define X86_VENDOR_UNKNOWN 0xff
67091 +
67092 +/*
67093 + * capabilities of CPUs
67094 + */
67095 +
67096 +extern struct cpuinfo_x86 boot_cpu_data;
67097 +extern struct cpuinfo_x86 new_cpu_data;
67098 +#ifndef CONFIG_X86_NO_TSS
67099 +extern struct tss_struct doublefault_tss;
67100 +DECLARE_PER_CPU(struct tss_struct, init_tss);
67101 +#endif
67102 +
67103 +#ifdef CONFIG_SMP
67104 +extern struct cpuinfo_x86 cpu_data[];
67105 +#define current_cpu_data cpu_data[smp_processor_id()]
67106 +#else
67107 +#define cpu_data (&boot_cpu_data)
67108 +#define current_cpu_data boot_cpu_data
67109 +#endif
67110 +
67111 +extern int phys_proc_id[NR_CPUS];
67112 +extern int cpu_core_id[NR_CPUS];
67113 +extern char ignore_fpu_irq;
67114 +
67115 +extern void identify_cpu(struct cpuinfo_x86 *);
67116 +extern void print_cpu_info(struct cpuinfo_x86 *);
67117 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
67118 +
67119 +#ifdef CONFIG_X86_HT
67120 +extern void detect_ht(struct cpuinfo_x86 *c);
67121 +#else
67122 +static inline void detect_ht(struct cpuinfo_x86 *c) {}
67123 +#endif
67124 +
67125 +/*
67126 + * EFLAGS bits
67127 + */
67128 +#define X86_EFLAGS_CF  0x00000001 /* Carry Flag */
67129 +#define X86_EFLAGS_PF  0x00000004 /* Parity Flag */
67130 +#define X86_EFLAGS_AF  0x00000010 /* Auxiliary carry Flag */
67131 +#define X86_EFLAGS_ZF  0x00000040 /* Zero Flag */
67132 +#define X86_EFLAGS_SF  0x00000080 /* Sign Flag */
67133 +#define X86_EFLAGS_TF  0x00000100 /* Trap Flag */
67134 +#define X86_EFLAGS_IF  0x00000200 /* Interrupt Flag */
67135 +#define X86_EFLAGS_DF  0x00000400 /* Direction Flag */
67136 +#define X86_EFLAGS_OF  0x00000800 /* Overflow Flag */
67137 +#define X86_EFLAGS_IOPL        0x00003000 /* IOPL mask */
67138 +#define X86_EFLAGS_NT  0x00004000 /* Nested Task */
67139 +#define X86_EFLAGS_RF  0x00010000 /* Resume Flag */
67140 +#define X86_EFLAGS_VM  0x00020000 /* Virtual Mode */
67141 +#define X86_EFLAGS_AC  0x00040000 /* Alignment Check */
67142 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
67143 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
67144 +#define X86_EFLAGS_ID  0x00200000 /* CPUID detection flag */
67145 +
67146 +/*
67147 + * Generic CPUID function
67148 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx,
67149 + * resulting in stale register contents being returned.
67150 + */
67151 +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
67152 +{
67153 +       __asm__(XEN_CPUID
67154 +               : "=a" (*eax),
67155 +                 "=b" (*ebx),
67156 +                 "=c" (*ecx),
67157 +                 "=d" (*edx)
67158 +               : "0" (op), "c"(0));
67159 +}
67160 +
67161 +/* Some CPUID calls want 'count' to be placed in ecx */
67162 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
67163 +               int *edx)
67164 +{
67165 +       __asm__(XEN_CPUID
67166 +               : "=a" (*eax),
67167 +                 "=b" (*ebx),
67168 +                 "=c" (*ecx),
67169 +                 "=d" (*edx)
67170 +               : "0" (op), "c" (count));
67171 +}
67172 +
67173 +/*
67174 + * CPUID functions returning a single datum
67175 + */
67176 +static inline unsigned int cpuid_eax(unsigned int op)
67177 +{
67178 +       unsigned int eax;
67179 +
67180 +       __asm__(XEN_CPUID
67181 +               : "=a" (eax)
67182 +               : "0" (op)
67183 +               : "bx", "cx", "dx");
67184 +       return eax;
67185 +}
67186 +static inline unsigned int cpuid_ebx(unsigned int op)
67187 +{
67188 +       unsigned int eax, ebx;
67189 +
67190 +       __asm__(XEN_CPUID
67191 +               : "=a" (eax), "=b" (ebx)
67192 +               : "0" (op)
67193 +               : "cx", "dx" );
67194 +       return ebx;
67195 +}
67196 +static inline unsigned int cpuid_ecx(unsigned int op)
67197 +{
67198 +       unsigned int eax, ecx;
67199 +
67200 +       __asm__(XEN_CPUID
67201 +               : "=a" (eax), "=c" (ecx)
67202 +               : "0" (op)
67203 +               : "bx", "dx" );
67204 +       return ecx;
67205 +}
67206 +static inline unsigned int cpuid_edx(unsigned int op)
67207 +{
67208 +       unsigned int eax, edx;
67209 +
67210 +       __asm__(XEN_CPUID
67211 +               : "=a" (eax), "=d" (edx)
67212 +               : "0" (op)
67213 +               : "bx", "cx");
67214 +       return edx;
67215 +}
67216 +
67217 +#define load_cr3(pgdir) write_cr3(__pa(pgdir))
67218 +
67219 +/*
67220 + * Intel CPU features in CR4
67221 + */
67222 +#define X86_CR4_VME            0x0001  /* enable vm86 extensions */
67223 +#define X86_CR4_PVI            0x0002  /* virtual interrupts flag enable */
67224 +#define X86_CR4_TSD            0x0004  /* disable time stamp at ipl 3 */
67225 +#define X86_CR4_DE             0x0008  /* enable debugging extensions */
67226 +#define X86_CR4_PSE            0x0010  /* enable page size extensions */
67227 +#define X86_CR4_PAE            0x0020  /* enable physical address extensions */
67228 +#define X86_CR4_MCE            0x0040  /* Machine check enable */
67229 +#define X86_CR4_PGE            0x0080  /* enable global pages */
67230 +#define X86_CR4_PCE            0x0100  /* enable performance counters at ipl 3 */
67231 +#define X86_CR4_OSFXSR         0x0200  /* enable fast FPU save and restore */
67232 +#define X86_CR4_OSXMMEXCPT     0x0400  /* enable unmasked SSE exceptions */
67233 +
67234 +/*
67235 + * Save the cr4 feature set we're using (i.e.
67236 + * Pentium 4MB enable and PPro Global page
67237 + * enable), so that any CPUs that boot up
67238 + * after us can get the correct flags.
67239 + */
67240 +extern unsigned long mmu_cr4_features;
67241 +
67242 +static inline void set_in_cr4 (unsigned long mask)
67243 +{
67244 +       unsigned cr4;
67245 +       mmu_cr4_features |= mask;
67246 +       cr4 = read_cr4();
67247 +       cr4 |= mask;
67248 +       write_cr4(cr4);
67249 +}
67250 +
67251 +static inline void clear_in_cr4 (unsigned long mask)
67252 +{
67253 +       unsigned cr4;
67254 +       mmu_cr4_features &= ~mask;
67255 +       cr4 = read_cr4();
67256 +       cr4 &= ~mask;
67257 +       write_cr4(cr4);
67258 +}
67259 +
67260 +/*
67261 + *      NSC/Cyrix CPU configuration register indexes
67262 + */
67263 +
67264 +#define CX86_PCR0 0x20
67265 +#define CX86_GCR  0xb8
67266 +#define CX86_CCR0 0xc0
67267 +#define CX86_CCR1 0xc1
67268 +#define CX86_CCR2 0xc2
67269 +#define CX86_CCR3 0xc3
67270 +#define CX86_CCR4 0xe8
67271 +#define CX86_CCR5 0xe9
67272 +#define CX86_CCR6 0xea
67273 +#define CX86_CCR7 0xeb
67274 +#define CX86_PCR1 0xf0
67275 +#define CX86_DIR0 0xfe
67276 +#define CX86_DIR1 0xff
67277 +#define CX86_ARR_BASE 0xc4
67278 +#define CX86_RCR_BASE 0xdc
67279 +
67280 +/*
67281 + *      NSC/Cyrix CPU indexed register access macros
67282 + */
67283 +
67284 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
67285 +
67286 +#define setCx86(reg, data) do { \
67287 +       outb((reg), 0x22); \
67288 +       outb((data), 0x23); \
67289 +} while (0)
67290 +
67291 +/* Stop speculative execution */
67292 +static inline void sync_core(void)
67293 +{
67294 +       int tmp;
67295 +       asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
67296 +}
67297 +
67298 +static inline void __monitor(const void *eax, unsigned long ecx,
67299 +               unsigned long edx)
67300 +{
67301 +       /* "monitor %eax,%ecx,%edx;" */
67302 +       asm volatile(
67303 +               ".byte 0x0f,0x01,0xc8;"
67304 +               : :"a" (eax), "c" (ecx), "d"(edx));
67305 +}
67306 +
67307 +static inline void __mwait(unsigned long eax, unsigned long ecx)
67308 +{
67309 +       /* "mwait %eax,%ecx;" */
67310 +       asm volatile(
67311 +               ".byte 0x0f,0x01,0xc9;"
67312 +               : :"a" (eax), "c" (ecx));
67313 +}
67314 +
67315 +/* From the system description table in the BIOS.  Mostly for MCA use, but
67316 +others may find it useful. */
67317 +extern unsigned int machine_id;
67318 +extern unsigned int machine_submodel_id;
67319 +extern unsigned int BIOS_revision;
67320 +extern unsigned int mca_pentium_flag;
67321 +
67322 +/* Boot loader type from the setup header */
67323 +extern int bootloader_type;
67324 +
67325 +/*
67326 + * User space process size: 3GB (default).
67327 + */
67328 +#define TASK_SIZE      (PAGE_OFFSET)
67329 +
67330 +/* This decides where the kernel will search for a free chunk of vm
67331 + * space during mmap's.
67332 + */
67333 +#define TASK_UNMAPPED_BASE     (PAGE_ALIGN(TASK_SIZE / 3))
67334 +
67335 +#define HAVE_ARCH_PICK_MMAP_LAYOUT
67336 +
67337 +/*
67338 + * Size of io_bitmap.
67339 + */
67340 +#define IO_BITMAP_BITS  65536
67341 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
67342 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
67343 +#ifndef CONFIG_X86_NO_TSS
67344 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
67345 +#endif
67346 +#define INVALID_IO_BITMAP_OFFSET 0x8000
67347 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
67348 +
67349 +struct i387_fsave_struct {
67350 +       long    cwd;
67351 +       long    swd;
67352 +       long    twd;
67353 +       long    fip;
67354 +       long    fcs;
67355 +       long    foo;
67356 +       long    fos;
67357 +       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
67358 +       long    status;         /* software status information */
67359 +};
67360 +
67361 +struct i387_fxsave_struct {
67362 +       unsigned short  cwd;
67363 +       unsigned short  swd;
67364 +       unsigned short  twd;
67365 +       unsigned short  fop;
67366 +       long    fip;
67367 +       long    fcs;
67368 +       long    foo;
67369 +       long    fos;
67370 +       long    mxcsr;
67371 +       long    mxcsr_mask;
67372 +       long    st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
67373 +       long    xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
67374 +       long    padding[56];
67375 +} __attribute__ ((aligned (16)));
67376 +
67377 +struct i387_soft_struct {
67378 +       long    cwd;
67379 +       long    swd;
67380 +       long    twd;
67381 +       long    fip;
67382 +       long    fcs;
67383 +       long    foo;
67384 +       long    fos;
67385 +       long    st_space[20];   /* 8*10 bytes for each FP-reg = 80 bytes */
67386 +       unsigned char   ftop, changed, lookahead, no_update, rm, alimit;
67387 +       struct info     *info;
67388 +       unsigned long   entry_eip;
67389 +};
67390 +
67391 +union i387_union {
67392 +       struct i387_fsave_struct        fsave;
67393 +       struct i387_fxsave_struct       fxsave;
67394 +       struct i387_soft_struct soft;
67395 +};
67396 +
67397 +typedef struct {
67398 +       unsigned long seg;
67399 +} mm_segment_t;
67400 +
67401 +struct thread_struct;
67402 +
67403 +#ifndef CONFIG_X86_NO_TSS
67404 +struct tss_struct {
67405 +       unsigned short  back_link,__blh;
67406 +       unsigned long   esp0;
67407 +       unsigned short  ss0,__ss0h;
67408 +       unsigned long   esp1;
67409 +       unsigned short  ss1,__ss1h;     /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
67410 +       unsigned long   esp2;
67411 +       unsigned short  ss2,__ss2h;
67412 +       unsigned long   __cr3;
67413 +       unsigned long   eip;
67414 +       unsigned long   eflags;
67415 +       unsigned long   eax,ecx,edx,ebx;
67416 +       unsigned long   esp;
67417 +       unsigned long   ebp;
67418 +       unsigned long   esi;
67419 +       unsigned long   edi;
67420 +       unsigned short  es, __esh;
67421 +       unsigned short  cs, __csh;
67422 +       unsigned short  ss, __ssh;
67423 +       unsigned short  ds, __dsh;
67424 +       unsigned short  fs, __fsh;
67425 +       unsigned short  gs, __gsh;
67426 +       unsigned short  ldt, __ldth;
67427 +       unsigned short  trace, io_bitmap_base;
67428 +       /*
67429 +        * The extra 1 is there because the CPU will access an
67430 +        * additional byte beyond the end of the IO permission
67431 +        * bitmap. The extra byte must be all 1 bits, and must
67432 +        * be within the limit.
67433 +        */
67434 +       unsigned long   io_bitmap[IO_BITMAP_LONGS + 1];
67435 +       /*
67436 +        * Cache the current maximum and the last task that used the bitmap:
67437 +        */
67438 +       unsigned long io_bitmap_max;
67439 +       struct thread_struct *io_bitmap_owner;
67440 +       /*
67441 +        * pads the TSS to be cacheline-aligned (size is 0x100)
67442 +        */
67443 +       unsigned long __cacheline_filler[35];
67444 +       /*
67445 +        * .. and then another 0x100 bytes for emergency kernel stack
67446 +        */
67447 +       unsigned long stack[64];
67448 +} __attribute__((packed));
67449 +#endif
67450 +
67451 +#define ARCH_MIN_TASKALIGN     16
67452 +
67453 +struct thread_struct {
67454 +/* cached TLS descriptors. */
67455 +       struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
67456 +       unsigned long   esp0;
67457 +       unsigned long   sysenter_cs;
67458 +       unsigned long   eip;
67459 +       unsigned long   esp;
67460 +       unsigned long   fs;
67461 +       unsigned long   gs;
67462 +/* Hardware debugging registers */
67463 +       unsigned long   debugreg[8];  /* %%db0-7 debug registers */
67464 +/* fault info */
67465 +       unsigned long   cr2, trap_no, error_code;
67466 +/* floating point info */
67467 +       union i387_union        i387;
67468 +/* virtual 86 mode info */
67469 +       struct vm86_struct __user * vm86_info;
67470 +       unsigned long           screen_bitmap;
67471 +       unsigned long           v86flags, v86mask, saved_esp0;
67472 +       unsigned int            saved_fs, saved_gs;
67473 +/* IO permissions */
67474 +       unsigned long   *io_bitmap_ptr;
67475 +       unsigned long   iopl;
67476 +/* max allowed port in the bitmap, in bytes: */
67477 +       unsigned long   io_bitmap_max;
67478 +};
67479 +
67480 +#define INIT_THREAD  {                                                 \
67481 +       .vm86_info = NULL,                                              \
67482 +       .sysenter_cs = __KERNEL_CS,                                     \
67483 +       .io_bitmap_ptr = NULL,                                          \
67484 +}
67485 +
67486 +#ifndef CONFIG_X86_NO_TSS
67487 +/*
67488 + * Note that the .io_bitmap member must be extra-big. This is because
67489 + * the CPU will access an additional byte beyond the end of the IO
67490 + * permission bitmap. The extra byte must be all 1 bits, and must
67491 + * be within the limit.
67492 + */
67493 +#define INIT_TSS  {                                                    \
67494 +       .esp0           = sizeof(init_stack) + (long)&init_stack,       \
67495 +       .ss0            = __KERNEL_DS,                                  \
67496 +       .ss1            = __KERNEL_CS,                                  \
67497 +       .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,                     \
67498 +       .io_bitmap      = { [ 0 ... IO_BITMAP_LONGS] = ~0 },            \
67499 +}
67500 +
67501 +static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
67502 +{
67503 +       tss->esp0 = thread->esp0;
67504 +#ifdef CONFIG_X86_SYSENTER
67505 +       /* This can only happen when SEP is enabled, no need to test "SEP"arately */
67506 +       if (unlikely(tss->ss1 != thread->sysenter_cs)) {
67507 +               tss->ss1 = thread->sysenter_cs;
67508 +               wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
67509 +       }
67510 +#endif
67511 +}
67512 +#define load_esp0(tss, thread) \
67513 +       __load_esp0(tss, thread)
67514 +#else
67515 +#define load_esp0(tss, thread) \
67516 +       HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)
67517 +#endif
67518 +
67519 +#define start_thread(regs, new_eip, new_esp) do {              \
67520 +       __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0));       \
67521 +       set_fs(USER_DS);                                        \
67522 +       regs->xds = __USER_DS;                                  \
67523 +       regs->xes = __USER_DS;                                  \
67524 +       regs->xss = __USER_DS;                                  \
67525 +       regs->xcs = __USER_CS;                                  \
67526 +       regs->eip = new_eip;                                    \
67527 +       regs->esp = new_esp;                                    \
67528 +} while (0)
67529 +
67530 +/*
67531 + * These special macros can be used to get or set a debugging register
67532 + */
67533 +#define get_debugreg(var, register)                            \
67534 +               (var) = HYPERVISOR_get_debugreg((register))
67535 +#define set_debugreg(value, register)                  \
67536 +               HYPERVISOR_set_debugreg((register), (value))
67537 +
67538 +/*
67539 + * Set IOPL bits in EFLAGS from given mask
67540 + */
67541 +static inline void set_iopl_mask(unsigned mask)
67542 +{
67543 +       physdev_op_t op;
67544 +
67545 +       /* Force the change at ring 0. */
67546 +       op.cmd = PHYSDEVOP_SET_IOPL;
67547 +       op.u.set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
67548 +       HYPERVISOR_physdev_op(&op);
67549 +}
67550 +
67551 +/* Forward declaration, a strange C thing */
67552 +struct task_struct;
67553 +struct mm_struct;
67554 +
67555 +/* Free all resources held by a thread. */
67556 +extern void release_thread(struct task_struct *);
67557 +
67558 +/* Prepare to copy thread state - unlazy all lazy status */
67559 +extern void prepare_to_copy(struct task_struct *tsk);
67560 +
67561 +/*
67562 + * create a kernel thread without removing it from tasklists
67563 + */
67564 +extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
67565 +
67566 +extern unsigned long thread_saved_pc(struct task_struct *tsk);
67567 +void show_trace(struct task_struct *task, unsigned long *stack);
67568 +
67569 +unsigned long get_wchan(struct task_struct *p);
67570 +
67571 +#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
67572 +#define KSTK_TOP(info)                                                 \
67573 +({                                                                     \
67574 +       unsigned long *__ptr = (unsigned long *)(info);                 \
67575 +       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
67576 +})
67577 +
67578 +/*
67579 + * The below -8 is to reserve 8 bytes on top of the ring0 stack.
67580 + * This is necessary to guarantee that the entire "struct pt_regs"
67581 + * is accessible even if the CPU hasn't stored the SS/ESP registers
67582 + * on the stack (interrupt gate does not save these registers
67583 + * when switching to the same priv ring).
67584 + * Therefore beware: accessing the xss/esp fields of the
67585 + * "struct pt_regs" is possible, but they may contain the
67586 + * completely wrong values.
67587 + */
67588 +#define task_pt_regs(task)                                             \
67589 +({                                                                     \
67590 +       struct pt_regs *__regs__;                                       \
67591 +       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
67592 +       __regs__ - 1;                                                   \
67593 +})
67594 +
67595 +#define KSTK_EIP(task) (task_pt_regs(task)->eip)
67596 +#define KSTK_ESP(task) (task_pt_regs(task)->esp)
67597 +
67598 +
67599 +struct microcode_header {
67600 +       unsigned int hdrver;
67601 +       unsigned int rev;
67602 +       unsigned int date;
67603 +       unsigned int sig;
67604 +       unsigned int cksum;
67605 +       unsigned int ldrver;
67606 +       unsigned int pf;
67607 +       unsigned int datasize;
67608 +       unsigned int totalsize;
67609 +       unsigned int reserved[3];
67610 +};
67611 +
67612 +struct microcode {
67613 +       struct microcode_header hdr;
67614 +       unsigned int bits[0];
67615 +};
67616 +
67617 +typedef struct microcode microcode_t;
67618 +typedef struct microcode_header microcode_header_t;
67619 +
67620 +/* microcode format is extended from Prescott processors */
67621 +struct extended_signature {
67622 +       unsigned int sig;
67623 +       unsigned int pf;
67624 +       unsigned int cksum;
67625 +};
67626 +
67627 +struct extended_sigtable {
67628 +       unsigned int count;
67629 +       unsigned int cksum;
67630 +       unsigned int reserved[3];
67631 +       struct extended_signature sigs[0];
67632 +};
67633 +/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */
67634 +#define MICROCODE_IOCFREE      _IO('6',0)
67635 +
67636 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
67637 +static inline void rep_nop(void)
67638 +{
67639 +       __asm__ __volatile__("rep;nop": : :"memory");
67640 +}
67641 +
67642 +#define cpu_relax()    rep_nop()
67643 +
67644 +/* generic versions from gas */
67645 +#define GENERIC_NOP1   ".byte 0x90\n"
67646 +#define GENERIC_NOP2           ".byte 0x89,0xf6\n"
67647 +#define GENERIC_NOP3        ".byte 0x8d,0x76,0x00\n"
67648 +#define GENERIC_NOP4        ".byte 0x8d,0x74,0x26,0x00\n"
67649 +#define GENERIC_NOP5        GENERIC_NOP1 GENERIC_NOP4
67650 +#define GENERIC_NOP6   ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
67651 +#define GENERIC_NOP7   ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
67652 +#define GENERIC_NOP8   GENERIC_NOP1 GENERIC_NOP7
67653 +
67654 +/* Opteron nops */
67655 +#define K8_NOP1 GENERIC_NOP1
67656 +#define K8_NOP2        ".byte 0x66,0x90\n" 
67657 +#define K8_NOP3        ".byte 0x66,0x66,0x90\n" 
67658 +#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n" 
67659 +#define K8_NOP5        K8_NOP3 K8_NOP2 
67660 +#define K8_NOP6        K8_NOP3 K8_NOP3
67661 +#define K8_NOP7        K8_NOP4 K8_NOP3
67662 +#define K8_NOP8        K8_NOP4 K8_NOP4
67663 +
67664 +/* K7 nops */
67665 +/* uses eax dependencies (arbitrary choice) */
67666 +#define K7_NOP1  GENERIC_NOP1
67667 +#define K7_NOP2        ".byte 0x8b,0xc0\n" 
67668 +#define K7_NOP3        ".byte 0x8d,0x04,0x20\n"
67669 +#define K7_NOP4        ".byte 0x8d,0x44,0x20,0x00\n"
67670 +#define K7_NOP5        K7_NOP4 ASM_NOP1
67671 +#define K7_NOP6        ".byte 0x8d,0x80,0,0,0,0\n"
67672 +#define K7_NOP7        ".byte 0x8D,0x04,0x05,0,0,0,0\n"
67673 +#define K7_NOP8        K7_NOP7 ASM_NOP1
67674 +
67675 +#ifdef CONFIG_MK8
67676 +#define ASM_NOP1 K8_NOP1
67677 +#define ASM_NOP2 K8_NOP2
67678 +#define ASM_NOP3 K8_NOP3
67679 +#define ASM_NOP4 K8_NOP4
67680 +#define ASM_NOP5 K8_NOP5
67681 +#define ASM_NOP6 K8_NOP6
67682 +#define ASM_NOP7 K8_NOP7
67683 +#define ASM_NOP8 K8_NOP8
67684 +#elif defined(CONFIG_MK7)
67685 +#define ASM_NOP1 K7_NOP1
67686 +#define ASM_NOP2 K7_NOP2
67687 +#define ASM_NOP3 K7_NOP3
67688 +#define ASM_NOP4 K7_NOP4
67689 +#define ASM_NOP5 K7_NOP5
67690 +#define ASM_NOP6 K7_NOP6
67691 +#define ASM_NOP7 K7_NOP7
67692 +#define ASM_NOP8 K7_NOP8
67693 +#else
67694 +#define ASM_NOP1 GENERIC_NOP1
67695 +#define ASM_NOP2 GENERIC_NOP2
67696 +#define ASM_NOP3 GENERIC_NOP3
67697 +#define ASM_NOP4 GENERIC_NOP4
67698 +#define ASM_NOP5 GENERIC_NOP5
67699 +#define ASM_NOP6 GENERIC_NOP6
67700 +#define ASM_NOP7 GENERIC_NOP7
67701 +#define ASM_NOP8 GENERIC_NOP8
67702 +#endif
67703 +
67704 +#define ASM_NOP_MAX 8
67705 +
67706 +/* Prefetch instructions for Pentium III and AMD Athlon */
67707 +/* It's not worth caring about 3dnow! prefetches for the K6
67708 +   because they are microcoded there and very slow.
67709 +   However, we currently don't do prefetches for pre-XP Athlons;
67710 +   that should be fixed. */
67711 +#define ARCH_HAS_PREFETCH
67712 +static inline void prefetch(const void *x)
67713 +{
67714 +       alternative_input(ASM_NOP4,
67715 +                         "prefetchnta (%1)",
67716 +                         X86_FEATURE_XMM,
67717 +                         "r" (x));
67718 +}
67719 +
67720 +#define ARCH_HAS_PREFETCH
67721 +#define ARCH_HAS_PREFETCHW
67722 +#define ARCH_HAS_SPINLOCK_PREFETCH
67723 +
67724 +/* 3dnow! prefetch to get an exclusive cache line. Useful for 
67725 +   spinlocks to avoid one state transition in the cache coherency protocol. */
67726 +static inline void prefetchw(const void *x)
67727 +{
67728 +       alternative_input(ASM_NOP4,
67729 +                         "prefetchw (%1)",
67730 +                         X86_FEATURE_3DNOW,
67731 +                         "r" (x));
67732 +}
67733 +#define spin_lock_prefetch(x)  prefetchw(x)
67734 +
67735 +extern void select_idle_routine(const struct cpuinfo_x86 *c);
67736 +
67737 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
67738 +
67739 +extern unsigned long boot_option_idle_override;
67740 +extern void enable_sep_cpu(void);
67741 +extern int sysenter_setup(void);
67742 +
67743 +#ifdef CONFIG_MTRR
67744 +extern void mtrr_ap_init(void);
67745 +extern void mtrr_bp_init(void);
67746 +#else
67747 +#define mtrr_ap_init() do {} while (0)
67748 +#define mtrr_bp_init() do {} while (0)
67749 +#endif
67750 +
67751 +#ifdef CONFIG_X86_MCE
67752 +extern void mcheck_init(struct cpuinfo_x86 *c);
67753 +#else
67754 +#define mcheck_init(c) do {} while(0)
67755 +#endif
67756 +
67757 +#endif /* __ASM_I386_PROCESSOR_H */
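The cpuid_*() helpers above wrap the (paravirtualised) CPUID instruction; as a rough user-space analogue, GCC's <cpuid.h> exposes the native instruction directly. A hedged sketch reading leaf 0, with the vendor string assembled in the usual EBX/EDX/ECX order:

#include <stdio.h>
#include <string.h>
#include <cpuid.h>      /* GCC/Clang helper around the CPUID instruction */

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        char vendor[13];

        /* Leaf 0: maximum basic leaf in EAX, vendor string in EBX/EDX/ECX. */
        if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
                return 1;

        memcpy(vendor + 0, &ebx, 4);
        memcpy(vendor + 4, &edx, 4);
        memcpy(vendor + 8, &ecx, 4);
        vendor[12] = '\0';
        printf("max leaf %u, vendor %s\n", eax, vendor);
        return 0;
}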
67758 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/ptrace.h linux-2.6.16/include/asm-i386/mach-xen/asm/ptrace.h
67759 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/ptrace.h    1970-01-01 01:00:00.000000000 +0100
67760 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/ptrace.h 2006-06-26 09:51:32.000000000 +0200
67761 @@ -0,0 +1,90 @@
67762 +#ifndef _I386_PTRACE_H
67763 +#define _I386_PTRACE_H
67764 +
67765 +#define EBX 0
67766 +#define ECX 1
67767 +#define EDX 2
67768 +#define ESI 3
67769 +#define EDI 4
67770 +#define EBP 5
67771 +#define EAX 6
67772 +#define DS 7
67773 +#define ES 8
67774 +#define FS 9
67775 +#define GS 10
67776 +#define ORIG_EAX 11
67777 +#define EIP 12
67778 +#define CS  13
67779 +#define EFL 14
67780 +#define UESP 15
67781 +#define SS   16
67782 +#define FRAME_SIZE 17
67783 +
67784 +/* this struct defines the way the registers are stored on the 
67785 +   stack during a system call. */
67786 +
67787 +struct pt_regs {
67788 +       long ebx;
67789 +       long ecx;
67790 +       long edx;
67791 +       long esi;
67792 +       long edi;
67793 +       long ebp;
67794 +       long eax;
67795 +       int  xds;
67796 +       int  xes;
67797 +       long orig_eax;
67798 +       long eip;
67799 +       int  xcs;
67800 +       long eflags;
67801 +       long esp;
67802 +       int  xss;
67803 +};
67804 +
67805 +/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
67806 +#define PTRACE_GETREGS            12
67807 +#define PTRACE_SETREGS            13
67808 +#define PTRACE_GETFPREGS          14
67809 +#define PTRACE_SETFPREGS          15
67810 +#define PTRACE_GETFPXREGS         18
67811 +#define PTRACE_SETFPXREGS         19
67812 +
67813 +#define PTRACE_OLDSETOPTIONS         21
67814 +
67815 +#define PTRACE_GET_THREAD_AREA    25
67816 +#define PTRACE_SET_THREAD_AREA    26
67817 +
67818 +#define PTRACE_SYSEMU            31
67819 +#define PTRACE_SYSEMU_SINGLESTEP  32
67820 +
67821 +#ifdef __KERNEL__
67822 +
67823 +#include <asm/vm86.h>
67824 +
67825 +struct task_struct;
67826 +extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code);
67827 +
67828 +/*
67829 + * user_mode_vm(regs) determines whether a register set came from user mode.
67830 + * This is true if V8086 mode was enabled OR if the register set was from
67831 + * protected mode with RPL-3 CS value.  This tricky test checks that with
67832 + * one comparison.  Many places in the kernel can bypass this full check
67833 + * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
67834 + */
67835 +static inline int user_mode(struct pt_regs *regs)
67836 +{
67837 +       return (regs->xcs & 2) != 0;
67838 +}
67839 +static inline int user_mode_vm(struct pt_regs *regs)
67840 +{
67841 +       return ((regs->xcs & 2) | (regs->eflags & VM_MASK)) != 0;
67842 +}
67843 +#define instruction_pointer(regs) ((regs)->eip)
67844 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
67845 +extern unsigned long profile_pc(struct pt_regs *regs);
67846 +#else
67847 +#define profile_pc(regs) instruction_pointer(regs)
67848 +#endif
67849 +#endif /* __KERNEL__ */
67850 +
67851 +#endif
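The user_mode() test above works because the low two bits of a code-segment selector carry the requested privilege level (RPL); under Xen the guest kernel runs in ring 1, so bit 1 being set distinguishes rings 2-3 (user) from rings 0-1 (kernel). A small sketch of that bit test, with the selector values assumed for illustration:

#include <stdio.h>

/* A selector is (GDT index << 3) | table-indicator | RPL (low two bits). */
static int rpl(unsigned int selector) { return selector & 3; }

int main(void)
{
        unsigned int kernel_cs = (12 << 3) | 1;  /* ring-1 guest kernel */
        unsigned int user_cs   = (14 << 3) | 3;  /* ring-3 user space   */

        printf("kernel: rpl=%d user-mode=%d\n", rpl(kernel_cs), !!(kernel_cs & 2));
        printf("user:   rpl=%d user-mode=%d\n", rpl(user_cs),   !!(user_cs & 2));
        return 0;
}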
67852 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/scatterlist.h linux-2.6.16/include/asm-i386/mach-xen/asm/scatterlist.h
67853 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/scatterlist.h       1970-01-01 01:00:00.000000000 +0100
67854 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/scatterlist.h    2006-06-26 09:51:32.000000000 +0200
67855 @@ -0,0 +1,22 @@
67856 +#ifndef _I386_SCATTERLIST_H
67857 +#define _I386_SCATTERLIST_H
67858 +
67859 +struct scatterlist {
67860 +    struct page                *page;
67861 +    unsigned int       offset;
67862 +    unsigned int       length;
67863 +    dma_addr_t         dma_address;
67864 +    unsigned int       dma_length;
67865 +};
67866 +
67867 +/* These macros should be used after a pci_map_sg call has been done
67868 + * to get bus addresses of each of the SG entries and their lengths.
67869 + * You should only work with the number of sg entries pci_map_sg
67870 + * returns.
67871 + */
67872 +#define sg_dma_address(sg)     ((sg)->dma_address)
67873 +#define sg_dma_len(sg)         ((sg)->dma_length)
67874 +
67875 +#define ISA_DMA_THRESHOLD (0x00ffffff)
67876 +
67877 +#endif /* !(_I386_SCATTERLIST_H) */
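As the comment above says, sg_dma_address()/sg_dma_len() are only meaningful on the entries returned by pci_map_sg(); a user-space mock of that iteration pattern follows (structure trimmed to the fields the macros touch, bus addresses and lengths invented):

#include <stdio.h>

typedef unsigned long dma_addr_t;

struct scatterlist {            /* only the fields the macros use */
        dma_addr_t dma_address;
        unsigned int dma_length;
};

#define sg_dma_address(sg)  ((sg)->dma_address)
#define sg_dma_len(sg)      ((sg)->dma_length)

int main(void)
{
        struct scatterlist sg[2] = {
                { 0x10000, 4096 },      /* assumed bus address/length */
                { 0x23000, 512  },
        };
        int i, nents = 2;               /* pretend pci_map_sg() returned 2 */

        for (i = 0; i < nents; i++)
                printf("entry %d: bus %#lx len %u\n",
                       i, sg_dma_address(&sg[i]), sg_dma_len(&sg[i]));
        return 0;
}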
67878 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/segment.h linux-2.6.16/include/asm-i386/mach-xen/asm/segment.h
67879 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/segment.h   1970-01-01 01:00:00.000000000 +0100
67880 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/segment.h        2006-06-26 09:51:32.000000000 +0200
67881 @@ -0,0 +1,117 @@
67882 +#ifndef _ASM_SEGMENT_H
67883 +#define _ASM_SEGMENT_H
67884 +
67885 +/*
67886 + * The layout of the per-CPU GDT under Linux:
67887 + *
67888 + *   0 - null
67889 + *   1 - reserved
67890 + *   2 - reserved
67891 + *   3 - reserved
67892 + *
67893 + *   4 - unused                        <==== new cacheline
67894 + *   5 - unused
67895 + *
67896 + *  ------- start of TLS (Thread-Local Storage) segments:
67897 + *
67898 + *   6 - TLS segment #1                        [ glibc's TLS segment ]
67899 + *   7 - TLS segment #2                        [ Wine's %fs Win32 segment ]
67900 + *   8 - TLS segment #3
67901 + *   9 - reserved
67902 + *  10 - reserved
67903 + *  11 - reserved
67904 + *
67905 + *  ------- start of kernel segments:
67906 + *
67907 + *  12 - kernel code segment           <==== new cacheline
67908 + *  13 - kernel data segment
67909 + *  14 - default user CS
67910 + *  15 - default user DS
67911 + *  16 - TSS
67912 + *  17 - LDT
67913 + *  18 - PNPBIOS support (16->32 gate)
67914 + *  19 - PNPBIOS support
67915 + *  20 - PNPBIOS support
67916 + *  21 - PNPBIOS support
67917 + *  22 - PNPBIOS support
67918 + *  23 - APM BIOS support
67919 + *  24 - APM BIOS support
67920 + *  25 - APM BIOS support 
67921 + *
67922 + *  26 - ESPFIX small SS
67923 + *  27 - unused
67924 + *  28 - unused
67925 + *  29 - unused
67926 + *  30 - unused
67927 + *  31 - TSS for double fault handler
67928 + */
67929 +#define GDT_ENTRY_TLS_ENTRIES  3
67930 +#define GDT_ENTRY_TLS_MIN      6
67931 +#define GDT_ENTRY_TLS_MAX      (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
67932 +
67933 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
67934 +
67935 +#define GDT_ENTRY_DEFAULT_USER_CS      14
67936 +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3)
67937 +
67938 +#define GDT_ENTRY_DEFAULT_USER_DS      15
67939 +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3)
67940 +
67941 +#define GDT_ENTRY_KERNEL_BASE  12
67942 +
67943 +#define GDT_ENTRY_KERNEL_CS            (GDT_ENTRY_KERNEL_BASE + 0)
67944 +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
67945 +#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
67946 +
67947 +#define GDT_ENTRY_KERNEL_DS            (GDT_ENTRY_KERNEL_BASE + 1)
67948 +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
67949 +#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
67950 +
67951 +#define GDT_ENTRY_TSS                  (GDT_ENTRY_KERNEL_BASE + 4)
67952 +#define GDT_ENTRY_LDT                  (GDT_ENTRY_KERNEL_BASE + 5)
67953 +
67954 +#define GDT_ENTRY_PNPBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 6)
67955 +#define GDT_ENTRY_APMBIOS_BASE         (GDT_ENTRY_KERNEL_BASE + 11)
67956 +
67957 +#define GDT_ENTRY_ESPFIX_SS            (GDT_ENTRY_KERNEL_BASE + 14)
67958 +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
67959 +
67960 +#define GDT_ENTRY_DOUBLEFAULT_TSS      31
67961 +
67962 +/*
67963 + * The GDT has 32 entries
67964 + */
67965 +#define GDT_ENTRIES 32
67966 +
67967 +#define GDT_SIZE (GDT_ENTRIES * 8)
67968 +
67969 +/* Simple and small GDT entries for booting only */
67970 +
67971 +#define GDT_ENTRY_BOOT_CS              2
67972 +#define __BOOT_CS      (GDT_ENTRY_BOOT_CS * 8)
67973 +
67974 +#define GDT_ENTRY_BOOT_DS              (GDT_ENTRY_BOOT_CS + 1)
67975 +#define __BOOT_DS      (GDT_ENTRY_BOOT_DS * 8)
67976 +
67977 +/* The PnP BIOS entries in the GDT */
67978 +#define GDT_ENTRY_PNPBIOS_CS32         (GDT_ENTRY_PNPBIOS_BASE + 0)
67979 +#define GDT_ENTRY_PNPBIOS_CS16         (GDT_ENTRY_PNPBIOS_BASE + 1)
67980 +#define GDT_ENTRY_PNPBIOS_DS           (GDT_ENTRY_PNPBIOS_BASE + 2)
67981 +#define GDT_ENTRY_PNPBIOS_TS1          (GDT_ENTRY_PNPBIOS_BASE + 3)
67982 +#define GDT_ENTRY_PNPBIOS_TS2          (GDT_ENTRY_PNPBIOS_BASE + 4)
67983 +
67984 +/* The PnP BIOS selectors */
67985 +#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8)        /* segment for calling fn */
67986 +#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8)        /* code segment for BIOS */
67987 +#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)  /* data segment for BIOS */
67988 +#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
67989 +#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
67990 +
67991 +/*
67992 + * The interrupt descriptor table has room for 256 entries;
67993 + * the size of the global descriptor table depends on the
67994 + * number of tasks we can have.
67995 + */
67996 +#define IDT_ENTRIES 256
67997 +
67998 +#endif
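A quick sketch of the selector arithmetic used throughout this header: a selector is the GDT index times 8 with the RPL in the low two bits, which is why GET_KERNEL_CS() ORs in 1 when the kernel runs in ring 1 rather than ring 0. The indices are taken from the definitions above:

#include <stdio.h>

#define GDT_ENTRY_KERNEL_CS        12
#define GDT_ENTRY_DEFAULT_USER_CS  14

int main(void)
{
        /* selector = index * 8, with the RPL in the low two bits */
        printf("__KERNEL_CS = %#x\n", GDT_ENTRY_KERNEL_CS * 8);           /* 0x60 */
        printf("__USER_CS   = %#x\n", GDT_ENTRY_DEFAULT_USER_CS * 8 + 3); /* 0x73 */
        /* Without XENFEAT_supervisor_mode_kernel the guest kernel runs in
         * ring 1, so GET_KERNEL_CS() ORs in RPL 1: */
        printf("ring-1 CS   = %#x\n", (GDT_ENTRY_KERNEL_CS * 8) | 1);     /* 0x61 */
        return 0;
}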
67999 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/setup.h linux-2.6.16/include/asm-i386/mach-xen/asm/setup.h
68000 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/setup.h     1970-01-01 01:00:00.000000000 +0100
68001 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/setup.h  2006-06-26 09:51:32.000000000 +0200
68002 @@ -0,0 +1,66 @@
68003 +/*
68004 + *     Just a placeholder. We don't want to have to test for x86 before
68005 + *     we include things.
68006 + */
68007 +
68008 +#ifndef _i386_SETUP_H
68009 +#define _i386_SETUP_H
68010 +
68011 +#define PFN_UP(x)      (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
68012 +#define PFN_DOWN(x)    ((x) >> PAGE_SHIFT)
68013 +#define PFN_PHYS(x)    ((unsigned long long)(x) << PAGE_SHIFT)
68014 +
68015 +/*
68016 + * Reserved space for vmalloc and iomap - defined in asm/page.h
68017 + */
68018 +#define MAXMEM_PFN     PFN_DOWN(MAXMEM)
68019 +#define MAX_NONPAE_PFN (1 << 20)
68020 +
68021 +#define PARAM_SIZE 4096
68022 +#define COMMAND_LINE_SIZE 256
68023 +
68024 +#define OLD_CL_MAGIC_ADDR      0x90020
68025 +#define OLD_CL_MAGIC           0xA33F
68026 +#define OLD_CL_BASE_ADDR       0x90000
68027 +#define OLD_CL_OFFSET          0x90022
68028 +#define NEW_CL_POINTER         0x228   /* Relative to real mode data */
68029 +
68030 +#ifndef __ASSEMBLY__
68031 +/*
68032 + * This is set up by the setup-routine at boot-time
68033 + */
68034 +extern unsigned char boot_params[PARAM_SIZE];
68035 +
68036 +#define PARAM  (boot_params)
68037 +#define SCREEN_INFO (*(struct screen_info *) (PARAM+0))
68038 +#define EXT_MEM_K (*(unsigned short *) (PARAM+2))
68039 +#define ALT_MEM_K (*(unsigned long *) (PARAM+0x1e0))
68040 +#define E820_MAP_NR (*(char*) (PARAM+E820NR))
68041 +#define E820_MAP    ((struct e820entry *) (PARAM+E820MAP))
68042 +#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40))
68043 +#define IST_INFO   (*(struct ist_info *) (PARAM+0x60))
68044 +#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80))
68045 +#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0))
68046 +#define EFI_SYSTAB ((efi_system_table_t *) *((unsigned long *)(PARAM+0x1c4)))
68047 +#define EFI_MEMDESC_SIZE (*((unsigned long *) (PARAM+0x1c8)))
68048 +#define EFI_MEMDESC_VERSION (*((unsigned long *) (PARAM+0x1cc)))
68049 +#define EFI_MEMMAP ((void *) *((unsigned long *)(PARAM+0x1d0)))
68050 +#define EFI_MEMMAP_SIZE (*((unsigned long *) (PARAM+0x1d4)))
68051 +#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
68052 +#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
68053 +#define VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
68054 +#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
68055 +#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
68056 +#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
68057 +#define KERNEL_START (*(unsigned long *) (PARAM+0x214))
68058 +#define INITRD_START (__pa(xen_start_info->mod_start))
68059 +#define INITRD_SIZE (xen_start_info->mod_len)
68060 +#define EDID_INFO   (*(struct edid_info *) (PARAM+0x440))
68061 +#define EDD_NR     (*(unsigned char *) (PARAM+EDDNR))
68062 +#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
68063 +#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF))
68064 +#define EDD_BUF     ((struct edd_info *) (PARAM+EDDBUF))
68065 +
68066 +#endif /* __ASSEMBLY__ */
68067 +
68068 +#endif /* _i386_SETUP_H */
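Every accessor above follows the same pattern: cast a fixed offset into the boot_params byte array to the field's type and dereference it. A user-space sketch of that pattern, reusing the MOUNT_ROOT_RDONLY offset from this header against a mock parameter block (the stored value is invented):

#include <stdio.h>
#include <string.h>

#define PARAM_SIZE 4096
static unsigned char boot_params[PARAM_SIZE];
#define PARAM (boot_params)

/* Same shape as the header's accessors. */
#define MOUNT_ROOT_RDONLY (*(unsigned short *)(PARAM + 0x1F2))

int main(void)
{
        unsigned short v = 1;
        memcpy(boot_params + 0x1F2, &v, sizeof(v)); /* pretend the boot loader wrote it */
        printf("root mounted read-only: %u\n", MOUNT_ROOT_RDONLY);
        return 0;
}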
68069 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/smp.h linux-2.6.16/include/asm-i386/mach-xen/asm/smp.h
68070 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/smp.h       1970-01-01 01:00:00.000000000 +0100
68071 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/smp.h    2006-06-26 09:51:32.000000000 +0200
68072 @@ -0,0 +1,103 @@
68073 +#ifndef __ASM_SMP_H
68074 +#define __ASM_SMP_H
68075 +
68076 +/*
68077 + * We need the APIC definitions automatically as part of 'smp.h'
68078 + */
68079 +#ifndef __ASSEMBLY__
68080 +#include <linux/config.h>
68081 +#include <linux/kernel.h>
68082 +#include <linux/threads.h>
68083 +#include <linux/cpumask.h>
68084 +#endif
68085 +
68086 +#ifdef CONFIG_X86_LOCAL_APIC
68087 +#ifndef __ASSEMBLY__
68088 +#include <asm/fixmap.h>
68089 +#include <asm/bitops.h>
68090 +#include <asm/mpspec.h>
68091 +#ifdef CONFIG_X86_IO_APIC
68092 +#include <asm/io_apic.h>
68093 +#endif
68094 +#include <asm/apic.h>
68095 +#endif
68096 +#endif
68097 +
68098 +#define BAD_APICID 0xFFu
68099 +#ifdef CONFIG_SMP
68100 +#ifndef __ASSEMBLY__
68101 +
68102 +/*
68103 + * Private routines/data
68104 + */
68105 +
68106 +extern void smp_alloc_memory(void);
68107 +extern int pic_mode;
68108 +extern int smp_num_siblings;
68109 +extern cpumask_t cpu_sibling_map[];
68110 +extern cpumask_t cpu_core_map[];
68111 +
68112 +extern void (*mtrr_hook) (void);
68113 +extern void zap_low_mappings (void);
68114 +extern void lock_ipi_call_lock(void);
68115 +extern void unlock_ipi_call_lock(void);
68116 +
68117 +#define MAX_APICID 256
68118 +extern u8 x86_cpu_to_apicid[];
68119 +
68120 +#define cpu_physical_id(cpu)   x86_cpu_to_apicid[cpu]
68121 +
68122 +#ifdef CONFIG_HOTPLUG_CPU
68123 +extern void cpu_exit_clear(void);
68124 +extern void cpu_uninit(void);
68125 +#endif
68126 +
68127 +/*
68128 + * This function is needed by all SMP systems. It must _always_ be valid
68129 + * from the initial startup. We map APIC_BASE very early in page_setup(),
68130 + * so this is correct in the x86 case.
68131 + */
68132 +#define raw_smp_processor_id() (current_thread_info()->cpu)
68133 +
68134 +extern cpumask_t cpu_possible_map;
68135 +#define cpu_callin_map cpu_possible_map
68136 +
68137 +/* We don't mark CPUs online until __cpu_up(), so we need another measure */
68138 +static inline int num_booting_cpus(void)
68139 +{
68140 +       return cpus_weight(cpu_possible_map);
68141 +}
68142 +
68143 +#ifdef CONFIG_X86_LOCAL_APIC
68144 +
68145 +#ifdef APIC_DEFINITION
68146 +extern int hard_smp_processor_id(void);
68147 +#else
68148 +#include <mach_apicdef.h>
68149 +static inline int hard_smp_processor_id(void)
68150 +{
68151 +       /* we don't want to mark this access volatile - bad code generation */
68152 +       return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
68153 +}
68154 +#endif
68155 +
68156 +static __inline int logical_smp_processor_id(void)
68157 +{
68158 +       /* we don't want to mark this access volatile - bad code generation */
68159 +       return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
68160 +}
68161 +
68162 +#endif
68163 +
68164 +extern int __cpu_disable(void);
68165 +extern void __cpu_die(unsigned int cpu);
68166 +#endif /* !__ASSEMBLY__ */
68167 +
68168 +#else /* CONFIG_SMP */
68169 +
68170 +#define cpu_physical_id(cpu)           boot_cpu_physical_apicid
68171 +
68172 +#define NO_PROC_ID             0xFF            /* No processor magic marker */
68173 +
68174 +#endif
68175 +#endif
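num_booting_cpus() above is just a population count over the possible-CPU mask; a minimal sketch with a plain integer standing in for cpumask_t (the mask value is assumed):

#include <stdio.h>

/* cpus_weight() reduces to a population count of the possible-CPU mask. */
static int num_booting_cpus(unsigned int cpu_possible_map)
{
        return __builtin_popcount(cpu_possible_map);
}

int main(void)
{
        unsigned int mask = 0x0b;       /* CPUs 0, 1 and 3 possible (assumed) */
        printf("booting cpus: %d\n", num_booting_cpus(mask));   /* 3 */
        return 0;
}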
68176 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/spinlock.h linux-2.6.16/include/asm-i386/mach-xen/asm/spinlock.h
68177 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/spinlock.h  1970-01-01 01:00:00.000000000 +0100
68178 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/spinlock.h       2006-06-26 09:51:32.000000000 +0200
68179 @@ -0,0 +1,217 @@
68180 +#ifndef __ASM_SPINLOCK_H
68181 +#define __ASM_SPINLOCK_H
68182 +
68183 +#include <asm/atomic.h>
68184 +#include <asm/rwlock.h>
68185 +#include <asm/page.h>
68186 +#include <linux/config.h>
68187 +#include <linux/compiler.h>
68188 +#include <asm/smp_alt.h>
68189 +
68190 +/*
68191 + * Your basic SMP spinlocks, allowing only a single CPU anywhere
68192 + *
68193 + * Simple spin lock operations.  There are two variants, one clears IRQ's
68194 + * on the local processor, one does not.
68195 + *
68196 + * We make no fairness assumptions. They have a cost.
68197 + *
68198 + * (the type definitions are in asm/spinlock_types.h)
68199 + */
68200 +
68201 +#define __raw_spin_is_locked(x) \
68202 +               (*(volatile signed char *)(&(x)->slock) <= 0)
68203 +
68204 +#define __raw_spin_lock_string \
68205 +       "\n1:\n" \
68206 +       LOCK \
68207 +       "decb %0\n\t" \
68208 +       "jns 3f\n" \
68209 +       "2:\t" \
68210 +       "rep;nop\n\t" \
68211 +       "cmpb $0,%0\n\t" \
68212 +       "jle 2b\n\t" \
68213 +       "jmp 1b\n" \
68214 +       "3:\n\t"
68215 +
68216 +#define __raw_spin_lock_string_flags \
68217 +       "\n1:\n" \
68218 +       LOCK \
68219 +       "decb %0\n\t" \
68220 +       "jns 4f\n\t" \
68221 +       "2:\t" \
68222 +       "testl $0x200, %1\n\t" \
68223 +       "jz 3f\n\t" \
68224 +       "#sti\n\t" \
68225 +       "3:\t" \
68226 +       "rep;nop\n\t" \
68227 +       "cmpb $0, %0\n\t" \
68228 +       "jle 3b\n\t" \
68229 +       "#cli\n\t" \
68230 +       "jmp 1b\n" \
68231 +       "4:\n\t"
68232 +
68233 +static inline void __raw_spin_lock(raw_spinlock_t *lock)
68234 +{
68235 +       __asm__ __volatile__(
68236 +               __raw_spin_lock_string
68237 +               :"=m" (lock->slock) : : "memory");
68238 +}
68239 +
68240 +static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
68241 +{
68242 +       __asm__ __volatile__(
68243 +               __raw_spin_lock_string_flags
68244 +               :"=m" (lock->slock) : "r" (flags) : "memory");
68245 +}
68246 +
68247 +static inline int __raw_spin_trylock(raw_spinlock_t *lock)
68248 +{
68249 +       char oldval;
68250 +#ifdef CONFIG_SMP_ALTERNATIVES
68251 +       __asm__ __volatile__(
68252 +               "1:movb %1,%b0\n"
68253 +               "movb $0,%1\n"
68254 +               "2:"
68255 +               ".section __smp_alternatives,\"a\"\n"
68256 +               ".long 1b\n"
68257 +               ".long 3f\n"
68258 +               ".previous\n"
68259 +               ".section __smp_replacements,\"a\"\n"
68260 +               "3: .byte 2b - 1b\n"
68261 +               ".byte 5f-4f\n"
68262 +               ".byte 0\n"
68263 +               ".byte 6f-5f\n"
68264 +               ".byte -1\n"
68265 +               "4: xchgb %b0,%1\n"
68266 +               "5: movb %1,%b0\n"
68267 +               "movb $0,%1\n"
68268 +               "6:\n"
68269 +               ".previous\n"
68270 +               :"=q" (oldval), "=m" (lock->slock)
68271 +               :"0" (0) : "memory");
68272 +#else
68273 +       __asm__ __volatile__(
68274 +               "xchgb %b0,%1"
68275 +               :"=q" (oldval), "=m" (lock->slock)
68276 +               :"0" (0) : "memory");
68277 +#endif
68278 +       return oldval > 0;
68279 +}
68280 +
68281 +/*
68282 + * __raw_spin_unlock based on writing $1 to the low byte.
68283 + * This method works despite all the confusion
68284 + * (except on PPro SMP or if we are using OOSTORE, where we use xchgb)
68285 + * (PPro errata 66, 92)
68286 + */
68287 +
68288 +#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
68289 +
68290 +#define __raw_spin_unlock_string \
68291 +       "movb $1,%0" \
68292 +               :"=m" (lock->slock) : : "memory"
68293 +
68294 +
68295 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
68296 +{
68297 +       __asm__ __volatile__(
68298 +               __raw_spin_unlock_string
68299 +       );
68300 +}
68301 +
68302 +#else
68303 +
68304 +#define __raw_spin_unlock_string \
68305 +       "xchgb %b0, %1" \
68306 +               :"=q" (oldval), "=m" (lock->slock) \
68307 +               :"0" (oldval) : "memory"
68308 +
68309 +static inline void __raw_spin_unlock(raw_spinlock_t *lock)
68310 +{
68311 +       char oldval = 1;
68312 +
68313 +       __asm__ __volatile__(
68314 +               __raw_spin_unlock_string
68315 +       );
68316 +}
68317 +
68318 +#endif
68319 +
68320 +#define __raw_spin_unlock_wait(lock) \
68321 +       do { while (__raw_spin_is_locked(lock)) cpu_relax(); } while (0)
68322 +
68323 +/*
68324 + * Read-write spinlocks, allowing multiple readers
68325 + * but only one writer.
68326 + *
68327 + * NOTE! it is quite common to have readers in interrupts
68328 + * but no interrupt writers. For those circumstances we
68329 + * can "mix" irq-safe locks - any writer needs to get an
68330 + * irq-safe write-lock, but readers can get non-irqsafe
68331 + * read-locks.
68332 + *
68333 + * On x86, we implement read-write locks as a 32-bit counter
68334 + * with the high bit (sign) being the "contended" bit.
68335 + *
68336 + * The inline assembly is non-obvious. Think about it.
68337 + *
68338 + * Changed to use the same technique as rw semaphores.  See
68339 + * semaphore.h for details.  -ben
68340 + *
68341 + * the helpers are in arch/i386/kernel/semaphore.c
68342 + */
68343 +
68344 +/**
68345 + * read_can_lock - would read_trylock() succeed?
68346 + * @lock: the rwlock in question.
68347 + */
68348 +#define __raw_read_can_lock(x)         ((int)(x)->lock > 0)
68349 +
68350 +/**
68351 + * write_can_lock - would write_trylock() succeed?
68352 + * @lock: the rwlock in question.
68353 + */
68354 +#define __raw_write_can_lock(x)                ((x)->lock == RW_LOCK_BIAS)
68355 +
68356 +static inline void __raw_read_lock(raw_rwlock_t *rw)
68357 +{
68358 +       __build_read_lock(rw, "__read_lock_failed");
68359 +}
68360 +
68361 +static inline void __raw_write_lock(raw_rwlock_t *rw)
68362 +{
68363 +       __build_write_lock(rw, "__write_lock_failed");
68364 +}
68365 +
68366 +static inline int __raw_read_trylock(raw_rwlock_t *lock)
68367 +{
68368 +       atomic_t *count = (atomic_t *)lock;
68369 +       atomic_dec(count);
68370 +       if (atomic_read(count) >= 0)
68371 +               return 1;
68372 +       atomic_inc(count);
68373 +       return 0;
68374 +}
68375 +
68376 +static inline int __raw_write_trylock(raw_rwlock_t *lock)
68377 +{
68378 +       atomic_t *count = (atomic_t *)lock;
68379 +       if (atomic_sub_and_test(RW_LOCK_BIAS, count))
68380 +               return 1;
68381 +       atomic_add(RW_LOCK_BIAS, count);
68382 +       return 0;
68383 +}
68384 +
68385 +static inline void __raw_read_unlock(raw_rwlock_t *rw)
68386 +{
68387 +       asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
68388 +}
68389 +
68390 +static inline void __raw_write_unlock(raw_rwlock_t *rw)
68391 +{
68392 +       asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
68393 +                                : "=m" (rw->lock) : : "memory");
68394 +}
68395 +
68396 +#endif /* __ASM_SPINLOCK_H */
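The rwlock comment above describes a biased 32-bit counter: each reader subtracts 1, a writer subtracts the whole bias, and a negative result means contention. A hedged user-space sketch of the trylock paths using C11 atomics instead of the LOCK-prefixed assembly; RW_LOCK_BIAS is taken as the conventional 0x01000000 and the writer path is simplified to a compare-and-swap rather than the sub-and-test used in the header:

#include <stdatomic.h>
#include <stdio.h>

#define RW_LOCK_BIAS 0x01000000

static atomic_int lock = RW_LOCK_BIAS;

static int read_trylock(void)
{
        if (atomic_fetch_sub(&lock, 1) - 1 >= 0)
                return 1;               /* got it: no writer present */
        atomic_fetch_add(&lock, 1);     /* undo and fail */
        return 0;
}

static int write_trylock(void)
{
        int expected = RW_LOCK_BIAS;
        /* Only succeeds when no readers and no writer hold the lock. */
        return atomic_compare_exchange_strong(&lock, &expected, 0);
}

int main(void)
{
        printf("read:  %d\n", read_trylock());   /* 1 */
        printf("write: %d\n", write_trylock());  /* 0: a reader holds it */
        atomic_fetch_add(&lock, 1);              /* read_unlock */
        printf("write: %d\n", write_trylock());  /* 1 */
        return 0;
}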
68397 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/swiotlb.h linux-2.6.16/include/asm-i386/mach-xen/asm/swiotlb.h
68398 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/swiotlb.h   1970-01-01 01:00:00.000000000 +0100
68399 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/swiotlb.h        2006-06-26 09:51:32.000000000 +0200
68400 @@ -0,0 +1,43 @@
68401 +#ifndef _ASM_SWIOTLB_H
68402 +#define _ASM_SWIOTLB_H 1
68403 +
68404 +#include <linux/config.h>
68405 +
68406 +/* SWIOTLB interface */
68407 +
68408 +extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
68409 +                                     int dir);
68410 +extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
68411 +                                 size_t size, int dir);
68412 +extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
68413 +                                        dma_addr_t dev_addr,
68414 +                                        size_t size, int dir);
68415 +extern void swiotlb_sync_single_for_device(struct device *hwdev,
68416 +                                           dma_addr_t dev_addr,
68417 +                                           size_t size, int dir);
68418 +extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
68419 +                                    struct scatterlist *sg, int nelems,
68420 +                                    int dir);
68421 +extern void swiotlb_sync_sg_for_device(struct device *hwdev,
68422 +                                       struct scatterlist *sg, int nelems,
68423 +                                       int dir);
68424 +extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
68425 +                     int nents, int direction);
68426 +extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
68427 +                        int nents, int direction);
68428 +extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
68429 +extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page,
68430 +                                   unsigned long offset, size_t size,
68431 +                                   enum dma_data_direction direction);
68432 +extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address,
68433 +                               size_t size, enum dma_data_direction direction);
68434 +extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
68435 +extern void swiotlb_init(void);
68436 +
68437 +#ifdef CONFIG_SWIOTLB
68438 +extern int swiotlb;
68439 +#else
68440 +#define swiotlb 0
68441 +#endif
68442 +
68443 +#endif
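The swiotlb interface declared above is a bounce-buffer scheme: map copies data into a DMA-reachable staging area and returns that address, and unmap/sync copy device results back. A user-space analogue of map_single/unmap_single under that assumption (the pool, sizes and direction constants are invented for illustration):

#include <stdio.h>
#include <string.h>

#define DMA_TO_DEVICE   1
#define DMA_FROM_DEVICE 2

static char bounce[4096];               /* stand-in for the DMA-safe pool */

/* "Map": stage the buffer and return an address the device could use. */
static void *map_single(void *ptr, size_t size, int dir)
{
        if (dir == DMA_TO_DEVICE)
                memcpy(bounce, ptr, size);
        return bounce;
}

/* "Unmap": for device-to-CPU transfers, copy the result back out. */
static void unmap_single(void *ptr, size_t size, int dir)
{
        if (dir == DMA_FROM_DEVICE)
                memcpy(ptr, bounce, size);
}

int main(void)
{
        char msg[16] = "hello";
        void *dev_addr = map_single(msg, sizeof(msg), DMA_TO_DEVICE);
        printf("staged at %p: %s\n", dev_addr, (char *)dev_addr);
        unmap_single(msg, sizeof(msg), DMA_TO_DEVICE);
        return 0;
}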
68444 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/synch_bitops.h linux-2.6.16/include/asm-i386/mach-xen/asm/synch_bitops.h
68445 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/synch_bitops.h      1970-01-01 01:00:00.000000000 +0100
68446 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/synch_bitops.h   2006-06-26 09:51:32.000000000 +0200
68447 @@ -0,0 +1,141 @@
68448 +#ifndef __XEN_SYNCH_BITOPS_H__
68449 +#define __XEN_SYNCH_BITOPS_H__
68450 +
68451 +/*
68452 + * Copyright 1992, Linus Torvalds.
68453 + * Heavily modified to provide guaranteed strong synchronisation
68454 + * when communicating with Xen or other guest OSes running on other CPUs.
68455 + */
68456 +
68457 +#include <linux/config.h>
68458 +
68459 +#define ADDR (*(volatile long *) addr)
68460 +
68461 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
68462 +{
68463 +    __asm__ __volatile__ ( 
68464 +        "lock btsl %1,%0"
68465 +        : "+m" (ADDR) : "Ir" (nr) : "memory" );
68466 +}
68467 +
68468 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
68469 +{
68470 +    __asm__ __volatile__ (
68471 +        "lock btrl %1,%0"
68472 +        : "+m" (ADDR) : "Ir" (nr) : "memory" );
68473 +}
68474 +
68475 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
68476 +{
68477 +    __asm__ __volatile__ (
68478 +        "lock btcl %1,%0"
68479 +        : "+m" (ADDR) : "Ir" (nr) : "memory" );
68480 +}
68481 +
68482 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
68483 +{
68484 +    int oldbit;
68485 +    __asm__ __volatile__ (
68486 +        "lock btsl %2,%1\n\tsbbl %0,%0"
68487 +        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
68488 +    return oldbit;
68489 +}
68490 +
68491 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
68492 +{
68493 +    int oldbit;
68494 +    __asm__ __volatile__ (
68495 +        "lock btrl %2,%1\n\tsbbl %0,%0"
68496 +        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
68497 +    return oldbit;
68498 +}
68499 +
68500 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
68501 +{
68502 +    int oldbit;
68503 +
68504 +    __asm__ __volatile__ (
68505 +        "lock btcl %2,%1\n\tsbbl %0,%0"
68506 +        : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory");
68507 +    return oldbit;
68508 +}
68509 +
68510 +struct __synch_xchg_dummy { unsigned long a[100]; };
68511 +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x))
68512 +
68513 +#define synch_cmpxchg(ptr, old, new) \
68514 +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
68515 +                                     (unsigned long)(old), \
68516 +                                     (unsigned long)(new), \
68517 +                                     sizeof(*(ptr))))
68518 +
68519 +static inline unsigned long __synch_cmpxchg(volatile void *ptr,
68520 +                                           unsigned long old,
68521 +                                           unsigned long new, int size)
68522 +{
68523 +       unsigned long prev;
68524 +       switch (size) {
68525 +       case 1:
68526 +               __asm__ __volatile__("lock; cmpxchgb %b1,%2"
68527 +                                    : "=a"(prev)
68528 +                                    : "q"(new), "m"(*__synch_xg(ptr)),
68529 +                                      "0"(old)
68530 +                                    : "memory");
68531 +               return prev;
68532 +       case 2:
68533 +               __asm__ __volatile__("lock; cmpxchgw %w1,%2"
68534 +                                    : "=a"(prev)
68535 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
68536 +                                      "0"(old)
68537 +                                    : "memory");
68538 +               return prev;
68539 +#ifdef CONFIG_X86_64
68540 +       case 4:
68541 +               __asm__ __volatile__("lock; cmpxchgl %k1,%2"
68542 +                                    : "=a"(prev)
68543 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
68544 +                                      "0"(old)
68545 +                                    : "memory");
68546 +               return prev;
68547 +       case 8:
68548 +               __asm__ __volatile__("lock; cmpxchgq %1,%2"
68549 +                                    : "=a"(prev)
68550 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
68551 +                                      "0"(old)
68552 +                                    : "memory");
68553 +               return prev;
68554 +#else
68555 +       case 4:
68556 +               __asm__ __volatile__("lock; cmpxchgl %1,%2"
68557 +                                    : "=a"(prev)
68558 +                                    : "r"(new), "m"(*__synch_xg(ptr)),
68559 +                                      "0"(old)
68560 +                                    : "memory");
68561 +               return prev;
68562 +#endif
68563 +       }
68564 +       return old;
68565 +}
68566 +
68567 +static __always_inline int synch_const_test_bit(int nr,
68568 +                                               const volatile void * addr)
68569 +{
68570 +    return ((1UL << (nr & 31)) & 
68571 +            (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
68572 +}
68573 +
68574 +static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
68575 +{
68576 +    int oldbit;
68577 +    __asm__ __volatile__ (
68578 +        "btl %2,%1\n\tsbbl %0,%0"
68579 +        : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) );
68580 +    return oldbit;
68581 +}
68582 +
68583 +#define synch_test_bit(nr,addr) \
68584 +(__builtin_constant_p(nr) ? \
68585 + synch_const_test_bit((nr),(addr)) : \
68586 + synch_var_test_bit((nr),(addr)))
68587 +
68588 +#endif /* __XEN_SYNCH_BITOPS_H__ */
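The locked bit operations above are what the guest uses on memory shared with the hypervisor, where the lock prefix must never be optimised or patched away. The same "lock btsl ... sbbl" pattern, lifted into a standalone x86 program so the return convention is visible (sbb turns the carry flag into 0 or -1, matching the nonzero-if-set contract):

#include <stdio.h>

/* Returns -1 if the bit was already set, 0 otherwise. */
static int test_and_set(int nr, volatile void *addr)
{
        int oldbit;

        __asm__ __volatile__(
                "lock btsl %2,%1\n\tsbbl %0,%0"
                : "=r" (oldbit), "+m" (*(volatile long *)addr)
                : "Ir" (nr) : "memory");
        return oldbit;
}

int main(void)
{
        long word = 0;

        printf("first set:  %d\n", test_and_set(3, &word));  /* 0  */
        printf("second set: %d\n", test_and_set(3, &word));  /* -1 */
        printf("word = %#lx\n", word);                       /* 0x8 */
        return 0;
}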
68589 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/system.h linux-2.6.16/include/asm-i386/mach-xen/asm/system.h
68590 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/system.h    1970-01-01 01:00:00.000000000 +0100
68591 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/system.h 2006-06-26 09:51:32.000000000 +0200
68592 @@ -0,0 +1,679 @@
68593 +#ifndef __ASM_SYSTEM_H
68594 +#define __ASM_SYSTEM_H
68595 +
68596 +#include <linux/config.h>
68597 +#include <linux/kernel.h>
68598 +#include <linux/bitops.h>
68599 +#include <asm/synch_bitops.h>
68600 +#include <asm/segment.h>
68601 +#include <asm/cpufeature.h>
68602 +#include <asm/hypervisor.h>
68603 +#include <asm/smp_alt.h>
68604 +
68605 +#ifdef __KERNEL__
68606 +
68607 +#ifdef CONFIG_SMP
68608 +#define __vcpu_id smp_processor_id()
68609 +#else
68610 +#define __vcpu_id 0
68611 +#endif
68612 +
68613 +struct task_struct;    /* one of the stranger aspects of C forward declarations.. */
68614 +extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
68615 +
68616 +#define switch_to(prev,next,last) do {                                 \
68617 +       unsigned long esi,edi;                                          \
68618 +       asm volatile("pushl %%ebp\n\t"                                  \
68619 +                    "movl %%esp,%0\n\t"        /* save ESP */          \
68620 +                    "movl %5,%%esp\n\t"        /* restore ESP */       \
68621 +                    "movl $1f,%1\n\t"          /* save EIP */          \
68622 +                    "pushl %6\n\t"             /* restore EIP */       \
68623 +                    "jmp __switch_to\n"                                \
68624 +                    "1:\t"                                             \
68625 +                    "popl %%ebp\n\t"                                   \
68626 +                    :"=m" (prev->thread.esp),"=m" (prev->thread.eip),  \
68627 +                     "=a" (last),"=S" (esi),"=D" (edi)                 \
68628 +                    :"m" (next->thread.esp),"m" (next->thread.eip),    \
68629 +                     "2" (prev), "d" (next));                          \
68630 +} while (0)
68631 +
68632 +#define _set_base(addr,base) do { unsigned long __pr; \
68633 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
68634 +       "rorl $16,%%edx\n\t" \
68635 +       "movb %%dl,%2\n\t" \
68636 +       "movb %%dh,%3" \
68637 +       :"=&d" (__pr) \
68638 +       :"m" (*((addr)+2)), \
68639 +        "m" (*((addr)+4)), \
68640 +        "m" (*((addr)+7)), \
68641 +         "0" (base) \
68642 +        ); } while(0)
68643 +
68644 +#define _set_limit(addr,limit) do { unsigned long __lr; \
68645 +__asm__ __volatile__ ("movw %%dx,%1\n\t" \
68646 +       "rorl $16,%%edx\n\t" \
68647 +       "movb %2,%%dh\n\t" \
68648 +       "andb $0xf0,%%dh\n\t" \
68649 +       "orb %%dh,%%dl\n\t" \
68650 +       "movb %%dl,%2" \
68651 +       :"=&d" (__lr) \
68652 +       :"m" (*(addr)), \
68653 +        "m" (*((addr)+6)), \
68654 +        "0" (limit) \
68655 +        ); } while(0)
68656 +
68657 +#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
68658 +#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
68659 +
68660 +/*
68661 + * Load a segment. Fall back on loading the zero
68662 + * segment if something goes wrong..
68663 + */
68664 +#define loadsegment(seg,value)                 \
68665 +       asm volatile("\n"                       \
68666 +               "1:\t"                          \
68667 +               "mov %0,%%" #seg "\n"           \
68668 +               "2:\n"                          \
68669 +               ".section .fixup,\"ax\"\n"      \
68670 +               "3:\t"                          \
68671 +               "pushl $0\n\t"                  \
68672 +               "popl %%" #seg "\n\t"           \
68673 +               "jmp 2b\n"                      \
68674 +               ".previous\n"                   \
68675 +               ".section __ex_table,\"a\"\n\t" \
68676 +               ".align 4\n\t"                  \
68677 +               ".long 1b,3b\n"                 \
68678 +               ".previous"                     \
68679 +               : :"rm" (value))
68680 +
68681 +/*
68682 + * Save a segment register away
68683 + */
68684 +#define savesegment(seg, value) \
68685 +       asm volatile("mov %%" #seg ",%0":"=rm" (value))
68686 +
68687 +/*
68688 + * Clear and set 'TS' bit respectively
68689 + */
68690 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
68691 +#define read_cr0() ({ \
68692 +       unsigned int __dummy; \
68693 +       __asm__ __volatile__( \
68694 +               "movl %%cr0,%0\n\t" \
68695 +               :"=r" (__dummy)); \
68696 +       __dummy; \
68697 +})
68698 +#define write_cr0(x) \
68699 +       __asm__ __volatile__("movl %0,%%cr0": :"r" (x));
68700 +
68701 +#define read_cr2() \
68702 +       (HYPERVISOR_shared_info->vcpu_info[smp_processor_id()].arch.cr2)
68703 +#define write_cr2(x) \
68704 +       __asm__ __volatile__("movl %0,%%cr2": :"r" (x));
68705 +
68706 +#define read_cr3() ({ \
68707 +       unsigned int __dummy; \
68708 +       __asm__ ( \
68709 +               "movl %%cr3,%0\n\t" \
68710 +               :"=r" (__dummy)); \
68711 +       machine_to_phys(__dummy); \
68712 +})
68713 +#define write_cr3(x) ({                                                \
68714 +       maddr_t __dummy = phys_to_machine(x);                   \
68715 +       __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy));  \
68716 +})
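Under Xen, %cr3 holds a machine (host) address while the guest reasons in pseudo-physical addresses, so read_cr3()/write_cr3() above translate on every access. A toy model of that frame-number translation; the table contents are invented purely for illustration:

#include <stdio.h>

#define NR_FRAMES 4
static unsigned long phys_to_machine_map[NR_FRAMES] = { 7, 2, 9, 5 };
static unsigned long machine_to_phys_map[16];

int main(void)
{
        /* build the inverse table, as the hypervisor/tools do for a guest */
        for (unsigned long p = 0; p < NR_FRAMES; p++)
                machine_to_phys_map[phys_to_machine_map[p]] = p;

        unsigned long cr3_phys = 2UL << 12;                 /* frame 2 */
        unsigned long cr3_mach =
                phys_to_machine_map[cr3_phys >> 12] << 12;

        printf("write_cr3: phys %#lx -> machine %#lx\n", cr3_phys, cr3_mach);
        printf("read_cr3:  machine %#lx -> phys %#lx\n", cr3_mach,
               machine_to_phys_map[cr3_mach >> 12] << 12);
        return 0;
}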
68717 +
68718 +#define read_cr4() ({ \
68719 +       unsigned int __dummy; \
68720 +       __asm__( \
68721 +               "movl %%cr4,%0\n\t" \
68722 +               :"=r" (__dummy)); \
68723 +       __dummy; \
68724 +})
68725 +
68726 +#define read_cr4_safe() ({                           \
68727 +       unsigned int __dummy;                         \
68728 +       /* This could fault if %cr4 does not exist */ \
68729 +       __asm__("1: movl %%cr4, %0              \n"   \
68730 +               "2:                             \n"   \
68731 +               ".section __ex_table,\"a\"      \n"   \
68732 +               ".long 1b,2b                    \n"   \
68733 +               ".previous                      \n"   \
68734 +               : "=r" (__dummy): "0" (0));           \
68735 +       __dummy;                                      \
68736 +})
68737 +
68738 +#define write_cr4(x) \
68739 +       __asm__ __volatile__("movl %0,%%cr4": :"r" (x));
68740 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
68741 +
68742 +#endif /* __KERNEL__ */
68743 +
68744 +#define wbinvd() \
68745 +       __asm__ __volatile__ ("wbinvd": : :"memory");
68746 +
68747 +static inline unsigned long get_limit(unsigned long segment)
68748 +{
68749 +       unsigned long __limit;
68750 +       __asm__("lsll %1,%0"
68751 +               :"=r" (__limit):"r" (segment));
68752 +       return __limit+1;
68753 +}
68754 +
68755 +#define nop() __asm__ __volatile__ ("nop")
68756 +
68757 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
68758 +
68759 +#define tas(ptr) (xchg((ptr),1))
68760 +
68761 +struct __xchg_dummy { unsigned long a[100]; };
68762 +#define __xg(x) ((struct __xchg_dummy *)(x))
68763 +
68764 +
68765 +#ifdef CONFIG_X86_CMPXCHG64
68766 +
68767 +/*
68768 + * The semantics of XCHGCMP8B are a bit strange, this is why
68769 + * there is a loop and the loading of %%eax and %%edx has to
68770 + * be inside. This inlines well in most cases, the cached
68771 + * cost is around ~38 cycles. (in the future we might want
68772 + * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
68773 + * might have an implicit FPU-save as a cost, so it's not
68774 + * clear which path to go.)
68775 + *
68776 + * cmpxchg8b must be used with the lock prefix here to allow
68777 + * the instruction to be executed atomically, see page 3-102
68778 + * of the instruction set reference 24319102.pdf. We need
68779 + * the reader side to see the coherent 64bit value.
68780 + */
68781 +static inline void __set_64bit (unsigned long long * ptr,
68782 +               unsigned int low, unsigned int high)
68783 +{
68784 +       __asm__ __volatile__ (
68785 +               "\n1:\t"
68786 +               "movl (%0), %%eax\n\t"
68787 +               "movl 4(%0), %%edx\n\t"
68788 +               "lock cmpxchg8b (%0)\n\t"
68789 +               "jnz 1b"
68790 +               : /* no outputs */
68791 +               :       "D"(ptr),
68792 +                       "b"(low),
68793 +                       "c"(high)
68794 +               :       "ax","dx","memory");
68795 +}
68796 +
68797 +static inline void __set_64bit_constant (unsigned long long *ptr,
68798 +                                                unsigned long long value)
68799 +{
68800 +       __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
68801 +}
68802 +#define ll_low(x)      *(((unsigned int*)&(x))+0)
68803 +#define ll_high(x)     *(((unsigned int*)&(x))+1)
68804 +
68805 +static inline void __set_64bit_var (unsigned long long *ptr,
68806 +                        unsigned long long value)
68807 +{
68808 +       __set_64bit(ptr,ll_low(value), ll_high(value));
68809 +}
68810 +
68811 +#define set_64bit(ptr,value) \
68812 +(__builtin_constant_p(value) ? \
68813 + __set_64bit_constant(ptr, value) : \
68814 + __set_64bit_var(ptr, value) )
68815 +
68816 +#define _set_64bit(ptr,value) \
68817 +(__builtin_constant_p(value) ? \
68818 + __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
68819 + __set_64bit(ptr, ll_low(value), ll_high(value)) )
68820 +
68821 +#endif
68822 +
68823 +/*
68824 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
68825 + * Note 2: xchg has side effect, so that attribute volatile is necessary,
68826 + *       but generally the primitive is invalid, *ptr is output argument. --ANK
68827 + */
68828 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
68829 +{
68830 +       switch (size) {
68831 +               case 1:
68832 +                       __asm__ __volatile__("xchgb %b0,%1"
68833 +                               :"=q" (x)
68834 +                               :"m" (*__xg(ptr)), "0" (x)
68835 +                               :"memory");
68836 +                       break;
68837 +               case 2:
68838 +                       __asm__ __volatile__("xchgw %w0,%1"
68839 +                               :"=r" (x)
68840 +                               :"m" (*__xg(ptr)), "0" (x)
68841 +                               :"memory");
68842 +                       break;
68843 +               case 4:
68844 +                       __asm__ __volatile__("xchgl %0,%1"
68845 +                               :"=r" (x)
68846 +                               :"m" (*__xg(ptr)), "0" (x)
68847 +                               :"memory");
68848 +                       break;
68849 +       }
68850 +       return x;
68851 +}
68852 +
68853 +/*
68854 + * Atomic compare and exchange.  Compare OLD with MEM, if identical,
68855 + * store NEW in MEM.  Return the initial value in MEM.  Success is
68856 + * indicated by comparing RETURN with OLD.
68857 + */
68858 +
68859 +#ifdef CONFIG_X86_CMPXCHG
68860 +#define __HAVE_ARCH_CMPXCHG 1
68861 +#define cmpxchg(ptr,o,n)\
68862 +       ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
68863 +                                       (unsigned long)(n),sizeof(*(ptr))))
68864 +#endif
68865 +
68866 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
68867 +                                     unsigned long new, int size)
68868 +{
68869 +       unsigned long prev;
68870 +       switch (size) {
68871 +       case 1:
68872 +               __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
68873 +                                    : "=a"(prev)
68874 +                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
68875 +                                    : "memory");
68876 +               return prev;
68877 +       case 2:
68878 +               __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
68879 +                                    : "=a"(prev)
68880 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
68881 +                                    : "memory");
68882 +               return prev;
68883 +       case 4:
68884 +               __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
68885 +                                    : "=a"(prev)
68886 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
68887 +                                    : "memory");
68888 +               return prev;
68889 +       }
68890 +       return old;
68891 +}
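The usual way to consume cmpxchg() is a read-compute-retry loop; as the comment above says, success is detected by comparing the returned value with the expected old value. A compilable sketch with GCC's builtin standing in for cmpxchg():

#include <stdio.h>

static unsigned long counter;

/* Retry until no other CPU changed *p between the load and the swap. */
static void add_via_cmpxchg(unsigned long *p, unsigned long n)
{
        unsigned long old, seen;

        do {
                old  = *p;
                seen = __sync_val_compare_and_swap(p, old, old + n);
        } while (seen != old);          /* success iff return value == old */
}

int main(void)
{
        add_via_cmpxchg(&counter, 5);
        add_via_cmpxchg(&counter, 7);
        printf("%lu\n", counter);       /* 12 */
        return 0;
}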
68892 +
68893 +#ifndef CONFIG_X86_CMPXCHG
68894 +/*
68895 + * Building a kernel capable of running on an 80386. It may be necessary to
68896 + * simulate cmpxchg on the 80386 CPU. For that purpose we define
68897 + * a function for each of the sizes we support.
68898 + */
68899 +
68900 +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
68901 +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
68902 +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
68903 +
68904 +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
68905 +                                     unsigned long new, int size)
68906 +{
68907 +       switch (size) {
68908 +       case 1:
68909 +               return cmpxchg_386_u8(ptr, old, new);
68910 +       case 2:
68911 +               return cmpxchg_386_u16(ptr, old, new);
68912 +       case 4:
68913 +               return cmpxchg_386_u32(ptr, old, new);
68914 +       }
68915 +       return old;
68916 +}
68917 +
68918 +#define cmpxchg(ptr,o,n)                                               \
68919 +({                                                                     \
68920 +       __typeof__(*(ptr)) __ret;                                       \
68921 +       if (likely(boot_cpu_data.x86 > 3))                              \
68922 +               __ret = __cmpxchg((ptr), (unsigned long)(o),            \
68923 +                                       (unsigned long)(n), sizeof(*(ptr))); \
68924 +       else                                                            \
68925 +               __ret = cmpxchg_386((ptr), (unsigned long)(o),          \
68926 +                                       (unsigned long)(n), sizeof(*(ptr))); \
68927 +       __ret;                                                          \
68928 +})
68929 +#endif
68930 +
68931 +#ifdef CONFIG_X86_CMPXCHG64
68932 +
68933 +static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
68934 +                                     unsigned long long new)
68935 +{
68936 +       unsigned long long prev;
68937 +       __asm__ __volatile__(LOCK "cmpxchg8b %3"
68938 +                            : "=A"(prev)
68939 +                            : "b"((unsigned long)new),
68940 +                              "c"((unsigned long)(new >> 32)),
68941 +                              "m"(*__xg(ptr)),
68942 +                              "0"(old)
68943 +                            : "memory");
68944 +       return prev;
68945 +}
68946 +
68947 +#define cmpxchg64(ptr,o,n)\
68948 +       ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
68949 +                                       (unsigned long long)(n)))
68950 +
68951 +#endif
68952 +    
68953 +#ifdef __KERNEL__
68954 +struct alt_instr { 
68955 +       __u8 *instr;            /* original instruction */
68956 +       __u8 *replacement;
68957 +       __u8  cpuid;            /* cpuid bit set for replacement */
68958 +       __u8  instrlen;         /* length of original instruction */
68959 +       __u8  replacementlen;   /* length of new instruction, <= instrlen */ 
68960 +       __u8  pad;
68961 +}; 
68962 +#endif
68963 +
68964 +/* 
68965 + * Alternative instructions for different CPU types or capabilities.
68966 + * 
68967 + * This allows the use of optimized instructions even on generic binary
68968 + * kernels.
68969 + *
68970 + * The length of oldinstr must be greater than or equal to the length of
68971 + * newinstr; it can be padded with nops as needed.
68972 + *
68973 + * For non-barrier-like inlines, please define new variants
68974 + * without volatile and memory clobber.
68975 + */
68976 +#define alternative(oldinstr, newinstr, feature)       \
68977 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                 \
68978 +                     ".section .altinstructions,\"a\"\n"            \
68979 +                     "  .align 4\n"                                   \
68980 +                     "  .long 661b\n"            /* label */          \
68981 +                     "  .long 663f\n"            /* new instruction */         \
68982 +                     "  .byte %c0\n"             /* feature bit */    \
68983 +                     "  .byte 662b-661b\n"       /* sourcelen */      \
68984 +                     "  .byte 664f-663f\n"       /* replacementlen */ \
68985 +                     ".previous\n"                                             \
68986 +                     ".section .altinstr_replacement,\"ax\"\n"                 \
68987 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */    \
68988 +                     ".previous" :: "i" (feature) : "memory")  
68989 +
68990 +/*
68991 + * Alternative inline assembly with input.
68992 + * 
68993 + * Peculiarities:
68994 + * No memory clobber here.
68995 + * Argument numbers start with 1.
68996 + * It is best to use constraints that are fixed size (like "r" for (%1)).
68997 + * If you use variable-sized constraints like "m" or "g" in the
68998 + * replacement, make sure to pad to the worst-case length.
68999 + */
69000 +#define alternative_input(oldinstr, newinstr, feature, input...)               \
69001 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                            \
69002 +                     ".section .altinstructions,\"a\"\n"                       \
69003 +                     "  .align 4\n"                                            \
69004 +                     "  .long 661b\n"            /* label */                   \
69005 +                     "  .long 663f\n"            /* new instruction */         \
69006 +                     "  .byte %c0\n"             /* feature bit */             \
69007 +                     "  .byte 662b-661b\n"       /* sourcelen */               \
69008 +                     "  .byte 664f-663f\n"       /* replacementlen */          \
69009 +                     ".previous\n"                                             \
69010 +                     ".section .altinstr_replacement,\"ax\"\n"                 \
69011 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */        \
69012 +                     ".previous" :: "i" (feature), ##input)
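Both macros only emit records into .altinstructions; the actual patching happens once at boot. A simplified, compilable sketch of what that fixup pass does with struct alt_instr records (boot_cpu_has() is faked here, and the real code has more NOP-padding machinery):

#include <stdio.h>
#include <string.h>

struct alt_instr_rec {                 /* mirrors struct alt_instr above */
        unsigned char *instr;          /* original instruction */
        unsigned char *replacement;
        unsigned char  cpuid;          /* feature bit required */
        unsigned char  instrlen;
        unsigned char  replacementlen; /* <= instrlen */
};

static unsigned long fake_cpu_caps = 1UL << 26;          /* pretend SSE2 */
static int boot_cpu_has(int bit) { return (fake_cpu_caps >> bit) & 1; }

static void apply_alternatives_sketch(struct alt_instr_rec *a, int n)
{
        for (int i = 0; i < n; i++) {
                if (!boot_cpu_has(a[i].cpuid))
                        continue;
                memcpy(a[i].instr, a[i].replacement, a[i].replacementlen);
                /* keep the patched site the same length: pad with NOPs */
                memset(a[i].instr + a[i].replacementlen, 0x90,
                       a[i].instrlen - a[i].replacementlen);
        }
}

int main(void)
{
        unsigned char site[] = { 0xA0, 0xA1, 0xA2, 0xA3 };  /* fake code */
        unsigned char repl[] = { 0xB0, 0xB1 };
        struct alt_instr_rec a = { site, repl, 26, 4, 2 };

        apply_alternatives_sketch(&a, 1);
        for (int i = 0; i < 4; i++)
                printf("%02x ", site[i]);       /* b0 b1 90 90 */
        printf("\n");
        return 0;
}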
69013 +
69014 +/*
69015 + * Force strict CPU ordering.
69016 + * And yes, this is required on UP too when we're talking
69017 + * to devices.
69018 + *
69019 + * For now, "wmb()" doesn't actually do anything, as all
69020 + * Intel CPUs follow what Intel calls a *Processor Order*,
69021 + * in which all writes are seen in the program order even
69022 + * outside the CPU.
69023 + *
69024 + * I expect future Intel CPUs to have a weaker ordering,
69025 + * but I'd also expect them to finally get their act together
69026 + * and add some real memory barriers if so.
69027 + *
69028 + * Some non-Intel clones support out-of-order stores. wmb() ceases to be a
69029 + * nop for these.
69030 + */
69031 +
69032 +
69033 +/* 
69034 + * Actually only lfence would be needed for mb() because all stores done 
69035 + * by the kernel should be already ordered. But keep a full barrier for now. 
69036 + */
69037 +
69038 +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
69039 +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
69040 +
69041 +/**
69042 + * read_barrier_depends - Flush all pending reads that subsequent reads
69043 + * depend on.
69044 + *
69045 + * No data-dependent reads from memory-like regions are ever reordered
69046 + * over this barrier.  All reads preceding this primitive are guaranteed
69047 + * to access memory (but not necessarily other CPUs' caches) before any
69048 + * reads following this primitive that depend on the data returned by
69049 + * any of the preceding reads.  This primitive is much lighter weight than
69050 + * rmb() on most CPUs, and is never heavier weight than
69051 + * rmb().
69052 + *
69053 + * These ordering constraints are respected by both the local CPU
69054 + * and the compiler.
69055 + *
69056 + * Ordering is not guaranteed by anything other than these primitives,
69057 + * not even by data dependencies.  See the documentation for
69058 + * memory_barrier() for examples and URLs to more information.
69059 + *
69060 + * For example, the following code would force ordering (the initial
69061 + * value of "a" is zero, "b" is one, and "p" is "&a"):
69062 + *
69063 + * <programlisting>
69064 + *     CPU 0                           CPU 1
69065 + *
69066 + *     b = 2;
69067 + *     memory_barrier();
69068 + *     p = &b;                         q = p;
69069 + *                                     read_barrier_depends();
69070 + *                                     d = *q;
69071 + * </programlisting>
69072 + *
69073 + * because the read of "*q" depends on the read of "p" and these
69074 + * two reads are separated by a read_barrier_depends().  However,
69075 + * the following code, with the same initial values for "a" and "b":
69076 + *
69077 + * <programlisting>
69078 + *     CPU 0                           CPU 1
69079 + *
69080 + *     a = 2;
69081 + *     memory_barrier();
69082 + *     b = 3;                          y = b;
69083 + *                                     read_barrier_depends();
69084 + *                                     x = a;
69085 + * </programlisting>
69086 + *
69087 + * does not enforce ordering, since there is no data dependency between
69088 + * the read of "a" and the read of "b".  Therefore, on some CPUs, such
69089 + * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
69090 + * in cases like this, where there are no data dependencies.
69091 + **/
69092 +
69093 +#define read_barrier_depends() do { } while(0)
69094 +
69095 +#ifdef CONFIG_X86_OOSTORE
69096 +/* Actually there are no OOO-store-capable CPUs for now that do SSE,
69097 +   but allow for the possibility already. */
69098 +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
69099 +#else
69100 +#define wmb()  __asm__ __volatile__ ("": : :"memory")
69101 +#endif
69102 +
69103 +#ifdef CONFIG_SMP
69104 +#define smp_wmb()      wmb()
69105 +#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
69106 +#define smp_alt_mb(instr)                                           \
69107 +__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
69108 +                    ".section __smp_alternatives,\"a\"\n"          \
69109 +                    ".long 6667b\n"                                \
69110 +                     ".long 6673f\n"                                \
69111 +                    ".previous\n"                                  \
69112 +                    ".section __smp_replacements,\"a\"\n"          \
69113 +                    "6673:.byte 6668b-6667b\n"                     \
69114 +                    ".byte 6670f-6669f\n"                          \
69115 +                    ".byte 6671f-6670f\n"                          \
69116 +                     ".byte 0\n"                                    \
69117 +                    ".byte %c0\n"                                  \
69118 +                    "6669:lock;addl $0,0(%%esp)\n"                 \
69119 +                    "6670:" instr "\n"                             \
69120 +                    "6671:\n"                                      \
69121 +                    ".previous\n"                                  \
69122 +                    :                                              \
69123 +                    : "i" (X86_FEATURE_XMM2)                       \
69124 +                    : "memory")
69125 +#define smp_rmb() smp_alt_mb("lfence")
69126 +#define smp_mb()  smp_alt_mb("mfence")
69127 +#define set_mb(var, value) do {                                     \
69128 +unsigned long __set_mb_temp;                                        \
69129 +__asm__ __volatile__("6667:movl %1, %0\n6668:\n"                    \
69130 +                    ".section __smp_alternatives,\"a\"\n"          \
69131 +                    ".long 6667b\n"                                \
69132 +                    ".long 6673f\n"                                \
69133 +                    ".previous\n"                                  \
69134 +                    ".section __smp_replacements,\"a\"\n"          \
69135 +                    "6673: .byte 6668b-6667b\n"                    \
69136 +                    ".byte 6670f-6669f\n"                          \
69137 +                    ".byte 0\n"                                    \
69138 +                    ".byte 6671f-6670f\n"                          \
69139 +                    ".byte -1\n"                                   \
69140 +                    "6669: xchg %1, %0\n"                          \
69141 +                    "6670:movl %1, %0\n"                           \
69142 +                    "6671:\n"                                      \
69143 +                    ".previous\n"                                  \
69144 +                    : "=m" (var), "=r" (__set_mb_temp)             \
69145 +                    : "1" (value)                                  \
69146 +                    : "memory"); } while (0)
69147 +#else
69148 +#define smp_rmb()      rmb()
69149 +#define smp_mb()       mb()
69150 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
69151 +#endif
69152 +#define smp_read_barrier_depends()     read_barrier_depends()
69153 +#else
69154 +#define smp_mb()       barrier()
69155 +#define smp_rmb()      barrier()
69156 +#define smp_wmb()      barrier()
69157 +#define smp_read_barrier_depends()     do { } while(0)
69158 +#define set_mb(var, value) do { var = value; barrier(); } while (0)
69159 +#endif
69160 +
69161 +#define set_wmb(var, value) do { var = value; wmb(); } while (0)
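The smp_* barriers above exist for exactly the producer/consumer pairing sketched in the read_barrier_depends() comment: order the payload store before the flag store, and the flag load before the payload load. A user-space analogue (a model, not kernel code) with GCC's full fence standing in for wmb()/rmb():

#include <pthread.h>
#include <stdio.h>

static volatile int payload, ready;

static void *producer(void *arg)
{
        (void)arg;
        payload = 42;
        __sync_synchronize();   /* smp_wmb(): payload before flag */
        ready = 1;
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, producer, NULL);
        while (!ready)
                ;               /* spin until the flag is observed */
        __sync_synchronize();   /* smp_rmb(): flag before payload */
        printf("%d\n", payload);        /* guaranteed 42 */
        pthread_join(t, NULL);
        return 0;
}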
69162 +
69163 +/* interrupt control.. */
69164 +
69165 +/* 
69166 + * The use of 'barrier' in the following reflects their use as local-lock
69167 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
69168 + * critical operations are executed. All critical operations must complete
69169 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
69170 + * includes these barriers, for example.
69171 + */
69172 +
69173 +#define __cli()                                                                \
69174 +do {                                                                   \
69175 +       vcpu_info_t *_vcpu;                                             \
69176 +       preempt_disable();                                              \
69177 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
69178 +       _vcpu->evtchn_upcall_mask = 1;                                  \
69179 +       preempt_enable_no_resched();                                    \
69180 +       barrier();                                                      \
69181 +} while (0)
69182 +
69183 +#define __sti()                                                                \
69184 +do {                                                                   \
69185 +       vcpu_info_t *_vcpu;                                             \
69186 +       barrier();                                                      \
69187 +       preempt_disable();                                              \
69188 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
69189 +       _vcpu->evtchn_upcall_mask = 0;                                  \
69190 +       barrier(); /* unmask then check (avoid races) */                \
69191 +       if (unlikely(_vcpu->evtchn_upcall_pending))                     \
69192 +               force_evtchn_callback();                                \
69193 +       preempt_enable();                                               \
69194 +} while (0)
69195 +
69196 +#define __save_flags(x)                                                        \
69197 +do {                                                                   \
69198 +       vcpu_info_t *_vcpu;                                             \
69199 +       preempt_disable();                                              \
69200 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
69201 +       (x) = _vcpu->evtchn_upcall_mask;                                \
69202 +       preempt_enable();                                               \
69203 +} while (0)
69204 +
69205 +#define __restore_flags(x)                                             \
69206 +do {                                                                   \
69207 +       vcpu_info_t *_vcpu;                                             \
69208 +       barrier();                                                      \
69209 +       preempt_disable();                                              \
69210 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
69211 +       if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {                   \
69212 +               barrier(); /* unmask then check (avoid races) */        \
69213 +               if (unlikely(_vcpu->evtchn_upcall_pending))             \
69214 +                       force_evtchn_callback();                        \
69215 +               preempt_enable();                                       \
69216 +       } else                                                          \
69217 +               preempt_enable_no_resched();                            \
69218 +} while (0)
69219 +
69220 +#define safe_halt()            ((void)0)
69221 +#define halt()                 ((void)0)
69222 +
69223 +#define __save_and_cli(x)                                              \
69224 +do {                                                                   \
69225 +       vcpu_info_t *_vcpu;                                             \
69226 +       preempt_disable();                                              \
69227 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
69228 +       (x) = _vcpu->evtchn_upcall_mask;                                \
69229 +       _vcpu->evtchn_upcall_mask = 1;                                  \
69230 +       preempt_enable_no_resched();                                    \
69231 +       barrier();                                                      \
69232 +} while (0)
69233 +
69234 +#define local_irq_save(x)      __save_and_cli(x)
69235 +#define local_irq_restore(x)   __restore_flags(x)
69236 +#define local_save_flags(x)    __save_flags(x)
69237 +#define local_irq_disable()    __cli()
69238 +#define local_irq_enable()     __sti()
69239 +
69240 +/* Cannot use preempt_enable() here as we would recurse in preempt_sched(). */
69241 +#define irqs_disabled()                                                        \
69242 +({     int ___x;                                                       \
69243 +       vcpu_info_t *_vcpu;                                             \
69244 +       preempt_disable();                                              \
69245 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
69246 +       ___x = (_vcpu->evtchn_upcall_mask != 0);                        \
69247 +       preempt_enable_no_resched();                                    \
69248 +       ___x; })
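Since a Xen guest cannot toggle EFLAGS.IF, "interrupts" here are the per-vcpu evtchn_upcall_mask, and re-enabling must re-check evtchn_upcall_pending because an event may have arrived while masked. A toy model of that save/restore protocol (names are illustrative, not the hypervisor ABI):

#include <stdio.h>

struct vcpu_model {
        unsigned char upcall_mask;
        unsigned char upcall_pending;
};

static struct vcpu_model vcpu;

static void deliver_pending(void)
{
        vcpu.upcall_pending = 0;
        puts("event delivered");        /* like force_evtchn_callback() */
}

static unsigned long model_irq_save(void)       /* cf. __save_and_cli() */
{
        unsigned long flags = vcpu.upcall_mask;

        vcpu.upcall_mask = 1;
        return flags;
}

static void model_irq_restore(unsigned long flags) /* cf. __restore_flags() */
{
        vcpu.upcall_mask = flags;
        if (!flags && vcpu.upcall_pending)      /* unmask, then re-check */
                deliver_pending();
}

int main(void)
{
        unsigned long flags = model_irq_save();

        vcpu.upcall_pending = 1;        /* event arrives while masked */
        model_irq_restore(flags);       /* prints "event delivered" */
        return 0;
}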
69249 +
69250 +/*
69251 + * disable hlt during certain critical i/o operations
69252 + */
69253 +#define HAVE_DISABLE_HLT
69254 +void disable_hlt(void);
69255 +void enable_hlt(void);
69256 +
69257 +extern int es7000_plat;
69258 +void cpu_idle_wait(void);
69259 +
69260 +/*
69261 + * On SMP systems, when the scheduler does migration-cost autodetection,
69262 + * it needs a way to flush as much of the CPU's caches as possible:
69263 + */
69264 +static inline void sched_cacheflush(void)
69265 +{
69266 +       wbinvd();
69267 +}
69268 +
69269 +extern unsigned long arch_align_stack(unsigned long sp);
69270 +
69271 +#endif
69272 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/tlbflush.h linux-2.6.16/include/asm-i386/mach-xen/asm/tlbflush.h
69273 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/tlbflush.h  1970-01-01 01:00:00.000000000 +0100
69274 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/tlbflush.h       2006-06-26 09:51:32.000000000 +0200
69275 @@ -0,0 +1,102 @@
69276 +#ifndef _I386_TLBFLUSH_H
69277 +#define _I386_TLBFLUSH_H
69278 +
69279 +#include <linux/config.h>
69280 +#include <linux/mm.h>
69281 +#include <asm/processor.h>
69282 +
69283 +#define __flush_tlb() xen_tlb_flush()
69284 +#define __flush_tlb_global() xen_tlb_flush()
69285 +#define __flush_tlb_all() xen_tlb_flush()
69286 +
69287 +extern unsigned long pgkern_mask;
69288 +
69289 +#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
69290 +
69291 +#define __flush_tlb_single(addr) xen_invlpg(addr)
69292 +
69293 +#define __flush_tlb_one(addr) __flush_tlb_single(addr)
69294 +
69295 +/*
69296 + * TLB flushing:
69297 + *
69298 + *  - flush_tlb() flushes the current mm struct TLBs
69299 + *  - flush_tlb_all() flushes all processes' TLBs
69300 + *  - flush_tlb_mm(mm) flushes the specified mm context TLBs
69301 + *  - flush_tlb_page(vma, vmaddr) flushes one page
69302 + *  - flush_tlb_range(vma, start, end) flushes a range of pages
69303 + *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
69304 + *  - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
69305 + *
69306 + * ...but the i386 has somewhat limited TLB flushing capabilities,
69307 + * and page-granular flushes are available only on i486 and up.
69308 + */
69309 +
69310 +#ifndef CONFIG_SMP
69311 +
69312 +#define flush_tlb() __flush_tlb()
69313 +#define flush_tlb_all() __flush_tlb_all()
69314 +#define local_flush_tlb() __flush_tlb()
69315 +
69316 +static inline void flush_tlb_mm(struct mm_struct *mm)
69317 +{
69318 +       if (mm == current->active_mm)
69319 +               __flush_tlb();
69320 +}
69321 +
69322 +static inline void flush_tlb_page(struct vm_area_struct *vma,
69323 +       unsigned long addr)
69324 +{
69325 +       if (vma->vm_mm == current->active_mm)
69326 +               __flush_tlb_one(addr);
69327 +}
69328 +
69329 +static inline void flush_tlb_range(struct vm_area_struct *vma,
69330 +       unsigned long start, unsigned long end)
69331 +{
69332 +       if (vma->vm_mm == current->active_mm)
69333 +               __flush_tlb();
69334 +}
69335 +
69336 +#else
69337 +
69338 +#include <asm/smp.h>
69339 +
69340 +#define local_flush_tlb() \
69341 +       __flush_tlb()
69342 +
69343 +extern void flush_tlb_all(void);
69344 +extern void flush_tlb_current_task(void);
69345 +extern void flush_tlb_mm(struct mm_struct *);
69346 +extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
69347 +
69348 +#define flush_tlb()    flush_tlb_current_task()
69349 +
69350 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
69351 +{
69352 +       flush_tlb_mm(vma->vm_mm);
69353 +}
69354 +
69355 +#define TLBSTATE_OK    1
69356 +#define TLBSTATE_LAZY  2
69357 +
69358 +struct tlb_state
69359 +{
69360 +       struct mm_struct *active_mm;
69361 +       int state;
69362 +       char __cacheline_padding[L1_CACHE_BYTES-8];
69363 +};
69364 +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
69365 +
69366 +
69367 +#endif
69368 +
69369 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
69370 +
69371 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
69372 +                                     unsigned long start, unsigned long end)
69373 +{
69374 +       /* i386 does not keep any page table caches in TLB */
69375 +}
69376 +
69377 +#endif /* _I386_TLBFLUSH_H */
69378 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/asm/vga.h linux-2.6.16/include/asm-i386/mach-xen/asm/vga.h
69379 --- linux-2.6.16.orig/include/asm-i386/mach-xen/asm/vga.h       1970-01-01 01:00:00.000000000 +0100
69380 +++ linux-2.6.16/include/asm-i386/mach-xen/asm/vga.h    2006-06-26 09:51:32.000000000 +0200
69381 @@ -0,0 +1,20 @@
69382 +/*
69383 + *     Access to VGA videoram
69384 + *
69385 + *     (c) 1998 Martin Mares <mj@ucw.cz>
69386 + */
69387 +
69388 +#ifndef _LINUX_ASM_VGA_H_
69389 +#define _LINUX_ASM_VGA_H_
69390 +
69391 +/*
69392 + *     On the PC, we can just recalculate addresses and then
69393 + *     access the videoram directly without any black magic.
69394 + */
69395 +
69396 +#define VGA_MAP_MEM(x) (unsigned long)isa_bus_to_virt(x)
69397 +
69398 +#define vga_readb(x) (*(x))
69399 +#define vga_writeb(x,y) (*(y) = (x))
69400 +
69401 +#endif
69402 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/irq_vectors.h linux-2.6.16/include/asm-i386/mach-xen/irq_vectors.h
69403 --- linux-2.6.16.orig/include/asm-i386/mach-xen/irq_vectors.h   1970-01-01 01:00:00.000000000 +0100
69404 +++ linux-2.6.16/include/asm-i386/mach-xen/irq_vectors.h        2006-06-26 09:51:32.000000000 +0200
69405 @@ -0,0 +1,125 @@
69406 +/*
69407 + * This file should contain #defines for all of the interrupt vector
69408 + * numbers used by this architecture.
69409 + *
69410 + * In addition, there are some standard defines:
69411 + *
69412 + *     FIRST_EXTERNAL_VECTOR:
69413 + *             The first free place for external interrupts
69414 + *
69415 + *     SYSCALL_VECTOR:
69416 + *             The IRQ vector under which a syscall makes the
69417 + *             user-to-kernel transition.
69418 + *
69419 + *     TIMER_IRQ:
69420 + *             The IRQ number the timer interrupt comes in at.
69421 + *
69422 + *     NR_IRQS:
69423 + *             The total number of interrupt vectors (including all the
69424 + *             architecture specific interrupts) needed.
69425 + *
69426 + */                    
69427 +#ifndef _ASM_IRQ_VECTORS_H
69428 +#define _ASM_IRQ_VECTORS_H
69429 +
69430 +/*
69431 + * IDT vectors usable for external interrupt sources start
69432 + * at 0x20:
69433 + */
69434 +#define FIRST_EXTERNAL_VECTOR  0x20
69435 +
69436 +#define SYSCALL_VECTOR         0x80
69437 +
69438 +/*
69439 + * Vectors 0x20-0x2f are used for ISA interrupts.
69440 + */
69441 +
69442 +#if 0
69443 +/*
69444 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
69445 + *
69446 + *  some of the following vectors are 'rare', they are merged
69447 + *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
69448 + *  TLB, reschedule and local APIC vectors are performance-critical.
69449 + *
69450 + *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
69451 + */
69452 +#define SPURIOUS_APIC_VECTOR   0xff
69453 +#define ERROR_APIC_VECTOR      0xfe
69454 +#define INVALIDATE_TLB_VECTOR  0xfd
69455 +#define RESCHEDULE_VECTOR      0xfc
69456 +#define CALL_FUNCTION_VECTOR   0xfb
69457 +
69458 +#define THERMAL_APIC_VECTOR    0xf0
69459 +/*
69460 + * Local APIC timer IRQ vector is on a different priority level,
69461 + * to work around the 'lost local interrupt if more than 2 IRQ
69462 + * sources per level' errata.
69463 + */
69464 +#define LOCAL_TIMER_VECTOR     0xef
69465 +#endif
69466 +
69467 +#define SPURIOUS_APIC_VECTOR   0xff
69468 +#define ERROR_APIC_VECTOR      0xfe
69469 +
69470 +/*
69471 + * First APIC vector available to drivers: (vectors 0x30-0xee)
69472 + * we start at 0x31 to spread out vectors evenly between priority
69473 + * levels. (0x80 is the syscall vector)
69474 + */
69475 +#define FIRST_DEVICE_VECTOR    0x31
69476 +#define FIRST_SYSTEM_VECTOR    0xef
69477 +
69478 +/*
69479 + * 16 8259A IRQs, 208 potential APIC interrupt sources.
69480 + * Right now the APIC is mostly only used for SMP.
69481 + * 256 vectors is an architectural limit. (we can have
69482 + * more than 256 devices theoretically, but they will
69483 + * have to use shared interrupts)
69484 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
69485 + * the usable vector space is 0x20-0xff (224 vectors)
69486 + */
69487 +
69488 +#define RESCHEDULE_VECTOR      0
69489 +#define CALL_FUNCTION_VECTOR   1
69490 +#define NR_IPIS                        2
69491 +
69492 +/*
69493 + * The maximum number of vectors supported by i386 processors
69494 + * is limited to 256. For processors other than i386, NR_VECTORS
69495 + * should be changed accordingly.
69496 + */
69497 +#define NR_VECTORS 256
69498 +
69499 +#define FPU_IRQ                        13
69500 +
69501 +#define        FIRST_VM86_IRQ          3
69502 +#define LAST_VM86_IRQ          15
69503 +#define invalid_vm86_irq(irq)  ((irq) < 3 || (irq) > 15)
69504 +
69505 +/*
69506 + * The flat IRQ space is divided into two regions:
69507 + *  1. A one-to-one mapping of real physical IRQs. This space is only used
69508 + *     if we have physical device-access privilege. This region is at the 
69509 + *     start of the IRQ space so that existing device drivers do not need
69510 + *     to be modified to translate physical IRQ numbers into our IRQ space.
69511 + *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
69512 + *     are bound using the provided bind/unbind functions.
69513 + */
69514 +
69515 +#define PIRQ_BASE              0
69516 +#define NR_PIRQS               256
69517 +
69518 +#define DYNIRQ_BASE            (PIRQ_BASE + NR_PIRQS)
69519 +#define NR_DYNIRQS             256
69520 +
69521 +#define NR_IRQS                        (NR_PIRQS + NR_DYNIRQS)
69522 +#define NR_IRQ_VECTORS         NR_IRQS
69523 +
69524 +#define pirq_to_irq(_x)                ((_x) + PIRQ_BASE)
69525 +#define irq_to_pirq(_x)                ((_x) - PIRQ_BASE)
69526 +
69527 +#define dynirq_to_irq(_x)      ((_x) + DYNIRQ_BASE)
69528 +#define irq_to_dynirq(_x)      ((_x) - DYNIRQ_BASE)
69529 +
69530 +#endif /* _ASM_IRQ_VECTORS_H */
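The pirq/dynirq helpers above are plain offset arithmetic over the flat IRQ space: physical IRQs occupy the first NR_PIRQS slots and dynamically bound event-channel IRQs follow. For example:

#include <stdio.h>

#define PIRQ_BASE        0
#define NR_PIRQS         256
#define DYNIRQ_BASE      (PIRQ_BASE + NR_PIRQS)

#define pirq_to_irq(x)   ((x) + PIRQ_BASE)
#define dynirq_to_irq(x) ((x) + DYNIRQ_BASE)
#define irq_to_dynirq(x) ((x) - DYNIRQ_BASE)

int main(void)
{
        printf("pirq 14  -> irq %d\n", pirq_to_irq(14));        /* 14  */
        printf("dynirq 3 -> irq %d\n", dynirq_to_irq(3));       /* 259 */
        printf("irq 259  -> dynirq %d\n", irq_to_dynirq(259));  /* 3   */
        return 0;
}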
69531 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/mach_traps.h linux-2.6.16/include/asm-i386/mach-xen/mach_traps.h
69532 --- linux-2.6.16.orig/include/asm-i386/mach-xen/mach_traps.h    1970-01-01 01:00:00.000000000 +0100
69533 +++ linux-2.6.16/include/asm-i386/mach-xen/mach_traps.h 2006-06-26 09:51:32.000000000 +0200
69534 @@ -0,0 +1,33 @@
69535 +/*
69536 + *  include/asm-xen/asm-i386/mach-xen/mach_traps.h
69537 + *
69538 + *  Machine specific NMI handling for Xen
69539 + */
69540 +#ifndef _MACH_TRAPS_H
69541 +#define _MACH_TRAPS_H
69542 +
69543 +#include <linux/bitops.h>
69544 +#include <xen/interface/nmi.h>
69545 +
69546 +static inline void clear_mem_error(unsigned char reason) {}
69547 +static inline void clear_io_check_error(unsigned char reason) {}
69548 +
69549 +static inline unsigned char get_nmi_reason(void)
69550 +{
69551 +       shared_info_t *s = HYPERVISOR_shared_info;
69552 +       unsigned char reason = 0;
69553 +
69554 +       /* construct a value which looks like it came from
69555 +        * port 0x61.
69556 +        */
69557 +       if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
69558 +               reason |= 0x40;
69559 +       if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
69560 +               reason |= 0x80;
69561 +
69562 +	return reason;
69563 +}
69564 +
69565 +static inline void reassert_nmi(void) {}
69566 +
69567 +#endif /* !_MACH_TRAPS_H */
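Because get_nmi_reason() synthesises a PC-AT style port-0x61 byte, the generic i386 NMI code can decode it unchanged. A minimal decode of the two bits set above:

#include <stdio.h>

int main(void)
{
        unsigned char reason = 0x40 | 0x80;     /* both bits from above */

        if (reason & 0x80)
                puts("NMI: memory parity error (port 0x61 bit 7)");
        if (reason & 0x40)
                puts("NMI: I/O check error (port 0x61 bit 6)");
        return 0;
}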
69568 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/setup_arch_post.h linux-2.6.16/include/asm-i386/mach-xen/setup_arch_post.h
69569 --- linux-2.6.16.orig/include/asm-i386/mach-xen/setup_arch_post.h       1970-01-01 01:00:00.000000000 +0100
69570 +++ linux-2.6.16/include/asm-i386/mach-xen/setup_arch_post.h    2006-06-26 09:51:32.000000000 +0200
69571 @@ -0,0 +1,45 @@
69572 +/**
69573 + * machine_specific_memory_setup - Hook for machine specific memory setup.
69574 + *
69575 + * Description:
69576 + *     This is included late in kernel/setup.c so that it can make
69577 + *     use of all of the static functions.
69578 + **/
69579 +
69580 +static char * __init machine_specific_memory_setup(void)
69581 +{
69582 +       unsigned long max_pfn = xen_start_info->nr_pages;
69583 +
69584 +       e820.nr_map = 0;
69585 +       add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
69586 +
69587 +       return "Xen";
69588 +}
69589 +
69590 +extern void hypervisor_callback(void);
69591 +extern void failsafe_callback(void);
69592 +extern void nmi(void);
69593 +
69594 +static void __init machine_specific_arch_setup(void)
69595 +{
69596 +       struct xen_platform_parameters pp;
69597 +       struct xennmi_callback cb;
69598 +
69599 +       if (xen_feature(XENFEAT_auto_translated_physmap) &&
69600 +           xen_start_info->shared_info < xen_start_info->nr_pages) {
69601 +               HYPERVISOR_shared_info =
69602 +                       (shared_info_t *)__va(xen_start_info->shared_info);
69603 +               memset(empty_zero_page, 0, sizeof(empty_zero_page));
69604 +       }
69605 +
69606 +       HYPERVISOR_set_callbacks(
69607 +           __KERNEL_CS, (unsigned long)hypervisor_callback,
69608 +           __KERNEL_CS, (unsigned long)failsafe_callback);
69609 +
69610 +       cb.handler_address = (unsigned long)&nmi;
69611 +       HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
69612 +
69613 +       if (HYPERVISOR_xen_version(XENVER_platform_parameters,
69614 +                                  &pp) == 0)
69615 +               set_fixaddr_top(pp.virt_start - PAGE_SIZE);
69616 +}
69617 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/mach-xen/setup_arch_pre.h linux-2.6.16/include/asm-i386/mach-xen/setup_arch_pre.h
69618 --- linux-2.6.16.orig/include/asm-i386/mach-xen/setup_arch_pre.h        1970-01-01 01:00:00.000000000 +0100
69619 +++ linux-2.6.16/include/asm-i386/mach-xen/setup_arch_pre.h     2006-06-26 09:51:32.000000000 +0200
69620 @@ -0,0 +1,5 @@
69621 +/* Hook to call BIOS initialisation function */
69622 +
69623 +#define ARCH_SETUP machine_specific_arch_setup();
69624 +
69625 +static void __init machine_specific_arch_setup(void);
69626 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/page.h linux-2.6.16/include/asm-i386/page.h
69627 --- linux-2.6.16.orig/include/asm-i386/page.h   2006-06-26 09:49:46.000000000 +0200
69628 +++ linux-2.6.16/include/asm-i386/page.h        2006-06-26 09:56:49.000000000 +0200
69629 @@ -112,7 +112,7 @@
69630  #define __PAGE_OFFSET          CONFIG_PAGE_OFFSET
69631  #define __PHYSICAL_START       CONFIG_PHYSICAL_START
69632  #define __KERNEL_START         (__PAGE_OFFSET + __PHYSICAL_START)
69633 -#define __MAXMEM               (-__PAGE_OFFSET-__VMALLOC_RESERVE)
69634 +#define __MAXMEM               (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
69635  
69636  #define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
69637  #define PHYSICAL_START         ((unsigned long)__PHYSICAL_START)
69638 @@ -135,6 +135,8 @@
69639         ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
69640                  VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
69641  
69642 +#define __HAVE_ARCH_GATE_AREA 1
69643 +
69644  #endif /* __KERNEL__ */
69645  
69646  #include <asm-generic/page.h>
69647 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/rwsem.h linux-2.6.16/include/asm-i386/rwsem.h
69648 --- linux-2.6.16.orig/include/asm-i386/rwsem.h  2006-03-20 06:53:29.000000000 +0100
69649 +++ linux-2.6.16/include/asm-i386/rwsem.h       2006-06-26 09:51:32.000000000 +0200
69650 @@ -40,6 +40,7 @@
69651  
69652  #include <linux/list.h>
69653  #include <linux/spinlock.h>
69654 +#include <asm/smp_alt.h>
69655  
69656  struct rwsem_waiter;
69657  
69658 @@ -99,7 +100,7 @@
69659  {
69660         __asm__ __volatile__(
69661                 "# beginning down_read\n\t"
69662 -LOCK_PREFIX    "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
69663 +LOCK           "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
69664                 "  js        2f\n\t" /* jump if we weren't granted the lock */
69665                 "1:\n\t"
69666                 LOCK_SECTION_START("")
69667 @@ -130,7 +131,7 @@
69668                 "  movl      %1,%2\n\t"
69669                 "  addl      %3,%2\n\t"
69670                 "  jle       2f\n\t"
69671 -LOCK_PREFIX    "  cmpxchgl  %2,%0\n\t"
69672 +LOCK           "  cmpxchgl  %2,%0\n\t"
69673                 "  jnz       1b\n\t"
69674                 "2:\n\t"
69675                 "# ending __down_read_trylock\n\t"
69676 @@ -150,7 +151,7 @@
69677         tmp = RWSEM_ACTIVE_WRITE_BIAS;
69678         __asm__ __volatile__(
69679                 "# beginning down_write\n\t"
69680 -LOCK_PREFIX    "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
69681 +LOCK           "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
69682                 "  testl     %%edx,%%edx\n\t" /* was the count 0 before? */
69683                 "  jnz       2f\n\t" /* jump if we weren't granted the lock */
69684                 "1:\n\t"
69685 @@ -188,7 +189,7 @@
69686         __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
69687         __asm__ __volatile__(
69688                 "# beginning __up_read\n\t"
69689 -LOCK_PREFIX    "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
69690 +LOCK           "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
69691                 "  js        2f\n\t" /* jump if the lock is being waited upon */
69692                 "1:\n\t"
69693                 LOCK_SECTION_START("")
69694 @@ -214,7 +215,7 @@
69695         __asm__ __volatile__(
69696                 "# beginning __up_write\n\t"
69697                 "  movl      %2,%%edx\n\t"
69698 -LOCK_PREFIX    "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
69699 +LOCK           "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
69700                 "  jnz       2f\n\t" /* jump if the lock is being waited upon */
69701                 "1:\n\t"
69702                 LOCK_SECTION_START("")
69703 @@ -239,7 +240,7 @@
69704  {
69705         __asm__ __volatile__(
69706                 "# beginning __downgrade_write\n\t"
69707 -LOCK_PREFIX    "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
69708 +LOCK           "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
69709                 "  js        2f\n\t" /* jump if the lock is being waited upon */
69710                 "1:\n\t"
69711                 LOCK_SECTION_START("")
69712 @@ -263,7 +264,7 @@
69713  static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
69714  {
69715         __asm__ __volatile__(
69716 -LOCK_PREFIX    "addl %1,%0"
69717 +LOCK             "addl %1,%0"
69718                 : "=m"(sem->count)
69719                 : "ir"(delta), "m"(sem->count));
69720  }
69721 @@ -276,7 +277,7 @@
69722         int tmp = delta;
69723  
69724         __asm__ __volatile__(
69725 -LOCK_PREFIX    "xadd %0,(%2)"
69726 +LOCK             "xadd %0,(%2)"
69727                 : "+r"(tmp), "=m"(sem->count)
69728                 : "r"(sem), "m"(sem->count)
69729                 : "memory");
69730 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/smp_alt.h linux-2.6.16/include/asm-i386/smp_alt.h
69731 --- linux-2.6.16.orig/include/asm-i386/smp_alt.h        1970-01-01 01:00:00.000000000 +0100
69732 +++ linux-2.6.16/include/asm-i386/smp_alt.h     2006-06-26 09:51:32.000000000 +0200
69733 @@ -0,0 +1,32 @@
69734 +#ifndef __ASM_SMP_ALT_H__
69735 +#define __ASM_SMP_ALT_H__
69736 +
69737 +#include <linux/config.h>
69738 +
69739 +#ifdef CONFIG_SMP
69740 +#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
69741 +#define LOCK \
69742 +        "6677: nop\n" \
69743 +       ".section __smp_alternatives,\"a\"\n" \
69744 +       ".long 6677b\n" \
69745 +       ".long 6678f\n" \
69746 +       ".previous\n" \
69747 +       ".section __smp_replacements,\"a\"\n" \
69748 +       "6678: .byte 1\n" \
69749 +       ".byte 1\n" \
69750 +       ".byte 0\n" \
69751 +        ".byte 1\n" \
69752 +       ".byte -1\n" \
69753 +       "lock\n" \
69754 +       "nop\n" \
69755 +       ".previous\n"
69756 +void prepare_for_smp(void);
69757 +void unprepare_for_smp(void);
69758 +#else
69759 +#define LOCK "lock ; "
69760 +#endif
69761 +#else
69762 +#define LOCK ""
69763 +#endif
69764 +
69765 +#endif /* __ASM_SMP_ALT_H__ */
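
The LOCK macro above is the heart of the SMP-alternatives scheme used throughout the x86 hunks of this patch: each former `lock` prefix is compiled as a single nop, plus a record in __smp_alternatives pointing at that nop and at a descriptor in __smp_replacements. The descriptor's five leading bytes give the size of the patch site, the sizes of up to two SMP replacements and one UP replacement, and an optional CPU-feature gate (-1 for none), followed by the replacement bytes themselves (`lock` and `nop` here). prepare_for_smp() and unprepare_for_smp() walk these tables and rewrite the instruction stream in place whenever the number of online CPUs crosses one. A minimal user-space sketch of such a walker, assuming the descriptor layout implied by the .byte directives above (struct and field names are hypothetical):

    #include <string.h>

    /* Hypothetical C view of the records emitted by the LOCK macro. */
    struct smp_replacement_record {
            unsigned char targ_size;   /* bytes available at the patch site  */
            unsigned char smp1_size;   /* default SMP replacement            */
            unsigned char smp2_size;   /* feature-gated SMP replacement      */
            unsigned char up_size;     /* uniprocessor replacement           */
            unsigned char feature;     /* required CPU feature, 0xff if none */
            unsigned char data[];      /* smp1 bytes, smp2 bytes, up bytes   */
    };

    struct smp_alternative_record {
            void *targ_start;                     /* the 6677: label  */
            struct smp_replacement_record *repl;  /* the 6678: record */
    };

    #define NOP_BYTE 0x90

    /* Rewrite one site for SMP or UP operation; pad the rest with nops. */
    static void patch_site(struct smp_alternative_record *r,
                           int want_smp, int have_feature)
    {
            const unsigned char *src = r->repl->data;
            unsigned char size;

            if (want_smp && r->repl->feature != 0xff && have_feature) {
                    src += r->repl->smp1_size;     /* skip to the smp2 bytes */
                    size = r->repl->smp2_size;
            } else if (want_smp) {
                    size = r->repl->smp1_size;
            } else {
                    src += r->repl->smp1_size + r->repl->smp2_size;
                    size = r->repl->up_size;
            }
            memcpy(r->targ_start, src, size);
            memset((unsigned char *)r->targ_start + size, NOP_BYTE,
                   r->repl->targ_size - size);
    }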
69766 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/spinlock.h linux-2.6.16/include/asm-i386/spinlock.h
69767 --- linux-2.6.16.orig/include/asm-i386/spinlock.h       2006-03-20 06:53:29.000000000 +0100
69768 +++ linux-2.6.16/include/asm-i386/spinlock.h    2006-06-26 09:51:32.000000000 +0200
69769 @@ -6,6 +6,7 @@
69770  #include <asm/page.h>
69771  #include <linux/config.h>
69772  #include <linux/compiler.h>
69773 +#include <asm/smp_alt.h>
69774  
69775  /*
69776   * Your basic SMP spinlocks, allowing only a single CPU anywhere
69777 @@ -22,8 +23,9 @@
69778                 (*(volatile signed char *)(&(x)->slock) <= 0)
69779  
69780  #define __raw_spin_lock_string \
69781 -       "\n1:\t" \
69782 -       "lock ; decb %0\n\t" \
69783 +       "\n1:\n" \
69784 +       LOCK \
69785 +       "decb %0\n\t" \
69786         "jns 3f\n" \
69787         "2:\t" \
69788         "rep;nop\n\t" \
69789 @@ -33,8 +35,9 @@
69790         "3:\n\t"
69791  
69792  #define __raw_spin_lock_string_flags \
69793 -       "\n1:\t" \
69794 -       "lock ; decb %0\n\t" \
69795 +       "\n1:\n" \
69796 +       LOCK \
69797 +       "decb %0\n\t" \
69798         "jns 4f\n\t" \
69799         "2:\t" \
69800         "testl $0x200, %1\n\t" \
69801 @@ -65,10 +68,34 @@
69802  static inline int __raw_spin_trylock(raw_spinlock_t *lock)
69803  {
69804         char oldval;
69805 +#ifdef CONFIG_SMP_ALTERNATIVES
69806 +       __asm__ __volatile__(
69807 +               "1:movb %1,%b0\n"
69808 +               "movb $0,%1\n"
69809 +               "2:"
69810 +               ".section __smp_alternatives,\"a\"\n"
69811 +               ".long 1b\n"
69812 +               ".long 3f\n"
69813 +               ".previous\n"
69814 +               ".section __smp_replacements,\"a\"\n"
69815 +               "3: .byte 2b - 1b\n"
69816 +               ".byte 5f-4f\n"
69817 +               ".byte 0\n"
69818 +               ".byte 6f-5f\n"
69819 +               ".byte -1\n"
69820 +               "4: xchgb %b0,%1\n"
69821 +               "5: movb %1,%b0\n"
69822 +               "movb $0,%1\n"
69823 +               "6:\n"
69824 +               ".previous\n"
69825 +               :"=q" (oldval), "=m" (lock->slock)
69826 +               :"0" (0) : "memory");
69827 +#else
69828         __asm__ __volatile__(
69829                 "xchgb %b0,%1"
69830                 :"=q" (oldval), "=m" (lock->slock)
69831                 :"0" (0) : "memory");
69832 +#endif
69833         return oldval > 0;
69834  }
69835  
69836 @@ -178,12 +205,12 @@
69837  
69838  static inline void __raw_read_unlock(raw_rwlock_t *rw)
69839  {
69840 -       asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
69841 +       asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
69842  }
69843  
69844  static inline void __raw_write_unlock(raw_rwlock_t *rw)
69845  {
69846 -       asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
69847 +       asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
69848                                  : "=m" (rw->lock) : : "memory");
69849  }
69850  
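
In the CONFIG_SMP_ALTERNATIVES branch of __raw_spin_trylock above, one site carries two interchangeable bodies: the atomic xchgb used while more than one CPU is online, and a plain load/store pair (labels 5:/6:) installed for uniprocessor operation, where no other CPU can interleave between the load and the store. In C terms the two variants behave as follows (a sketch only; the GCC __sync builtin stands in for xchgb and is not what the kernel uses):

    /* SMP body: atomically swap 0 in and observe the previous value. */
    static int trylock_smp(volatile char *slock)
    {
            char old = __sync_lock_test_and_set(slock, 0);  /* ~ xchgb */
            return old > 0;
    }

    /* UP body: with a single CPU no other processor can slip in between
     * the load and the store, so the locked bus cycle is avoided. */
    static int trylock_up(volatile char *slock)
    {
            char old = *slock;
            *slock = 0;
            return old > 0;
    }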
69851 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-i386/system.h linux-2.6.16/include/asm-i386/system.h
69852 --- linux-2.6.16.orig/include/asm-i386/system.h 2006-03-20 06:53:29.000000000 +0100
69853 +++ linux-2.6.16/include/asm-i386/system.h      2006-06-26 09:51:32.000000000 +0200
69854 @@ -5,7 +5,7 @@
69855  #include <linux/kernel.h>
69856  #include <asm/segment.h>
69857  #include <asm/cpufeature.h>
69858 -#include <linux/bitops.h> /* for LOCK_PREFIX */
69859 +#include <asm/smp_alt.h>
69860  
69861  #ifdef __KERNEL__
69862  
69863 @@ -271,19 +271,19 @@
69864         unsigned long prev;
69865         switch (size) {
69866         case 1:
69867 -               __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
69868 +               __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
69869                                      : "=a"(prev)
69870                                      : "q"(new), "m"(*__xg(ptr)), "0"(old)
69871                                      : "memory");
69872                 return prev;
69873         case 2:
69874 -               __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
69875 +               __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
69876                                      : "=a"(prev)
69877                                      : "r"(new), "m"(*__xg(ptr)), "0"(old)
69878                                      : "memory");
69879                 return prev;
69880         case 4:
69881 -               __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
69882 +               __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
69883                                      : "=a"(prev)
69884                                      : "r"(new), "m"(*__xg(ptr)), "0"(old)
69885                                      : "memory");
69886 @@ -336,7 +336,7 @@
69887                                       unsigned long long new)
69888  {
69889         unsigned long long prev;
69890 -       __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
69891 +       __asm__ __volatile__(LOCK "cmpxchg8b %3"
69892                              : "=A"(prev)
69893                              : "b"((unsigned long)new),
69894                                "c"((unsigned long)(new >> 32)),
69895 @@ -503,11 +503,55 @@
69896  #endif
69897  
69898  #ifdef CONFIG_SMP
69899 +#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
69900 +#define smp_alt_mb(instr)                                           \
69901 +__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
69902 +                    ".section __smp_alternatives,\"a\"\n"          \
69903 +                    ".long 6667b\n"                                \
69904 +                     ".long 6673f\n"                                \
69905 +                    ".previous\n"                                  \
69906 +                    ".section __smp_replacements,\"a\"\n"          \
69907 +                    "6673:.byte 6668b-6667b\n"                     \
69908 +                    ".byte 6670f-6669f\n"                          \
69909 +                    ".byte 6671f-6670f\n"                          \
69910 +                     ".byte 0\n"                                    \
69911 +                    ".byte %c0\n"                                  \
69912 +                    "6669:lock;addl $0,0(%%esp)\n"                 \
69913 +                    "6670:" instr "\n"                             \
69914 +                    "6671:\n"                                      \
69915 +                    ".previous\n"                                  \
69916 +                    :                                              \
69917 +                    : "i" (X86_FEATURE_XMM2)                       \
69918 +                    : "memory")
69919 +#define smp_mb()  smp_alt_mb("mfence")
69920 +#define smp_rmb() smp_alt_mb("lfence")
69921 +#define set_mb(var, value) do {                                     \
69922 +unsigned long __set_mb_temp;                                        \
69923 +__asm__ __volatile__("6667:movl %1, %0\n6668:\n"                    \
69924 +                    ".section __smp_alternatives,\"a\"\n"          \
69925 +                    ".long 6667b\n"                                \
69926 +                    ".long 6673f\n"                                \
69927 +                    ".previous\n"                                  \
69928 +                    ".section __smp_replacements,\"a\"\n"          \
69929 +                    "6673: .byte 6668b-6667b\n"                    \
69930 +                    ".byte 6670f-6669f\n"                          \
69931 +                    ".byte 0\n"                                    \
69932 +                    ".byte 6671f-6670f\n"                          \
69933 +                    ".byte -1\n"                                   \
69934 +                    "6669: xchg %1, %0\n"                          \
69935 +                    "6670:movl %1, %0\n"                           \
69936 +                    "6671:\n"                                      \
69937 +                    ".previous\n"                                  \
69938 +                    : "=m" (var), "=r" (__set_mb_temp)             \
69939 +                    : "1" (value)                                  \
69940 +                    : "memory"); } while (0)
69941 +#else
69942  #define smp_mb()       mb()
69943  #define smp_rmb()      rmb()
69944 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
69945 +#endif
69946  #define smp_wmb()      wmb()
69947  #define smp_read_barrier_depends()     read_barrier_depends()
69948 -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
69949  #else
69950  #define smp_mb()       barrier()
69951  #define smp_rmb()      barrier()
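
The barrier changes above follow the same scheme: with CONFIG_SMP_ALTERNATIVES each smp_mb()/smp_rmb() site is compiled as six nops, later patched to either `lock; addl $0,0(%esp)` or, when the CPU advertises X86_FEATURE_XMM2, the cheaper mfence/lfence; after a transition to UP the nops remain and the barrier costs nothing. set_mb() likewise toggles between an implicitly locked xchg and a plain movl. A sketch of what the two set_mb() bodies amount to (not kernel code; the __sync builtin models xchg):

    /* SMP body: xchg is implicitly locked, so the store and a full
     * memory barrier happen in a single step. */
    static void set_mb_smp(volatile unsigned long *var, unsigned long value)
    {
            __sync_lock_test_and_set(var, value);
    }

    /* UP body: a plain store suffices when only one CPU exists. */
    static void set_mb_up(volatile unsigned long *var, unsigned long value)
    {
            *var = value;
    }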
69952 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-ia64/fixmap.h linux-2.6.16/include/asm-ia64/fixmap.h
69953 --- linux-2.6.16.orig/include/asm-ia64/fixmap.h 1970-01-01 01:00:00.000000000 +0100
69954 +++ linux-2.6.16/include/asm-ia64/fixmap.h      2006-06-26 09:51:32.000000000 +0200
69955 @@ -0,0 +1,2 @@
69956 +#define clear_fixmap(x)        do {} while (0)
69957 +#define        set_fixmap(x,y) do {} while (0)
69958 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-ia64/gcc_intrin.h linux-2.6.16/include/asm-ia64/gcc_intrin.h
69959 --- linux-2.6.16.orig/include/asm-ia64/gcc_intrin.h     2006-03-20 06:53:29.000000000 +0100
69960 +++ linux-2.6.16/include/asm-ia64/gcc_intrin.h  2006-06-26 09:51:32.000000000 +0200
69961 @@ -26,7 +26,7 @@
69962  
69963  register unsigned long ia64_r13 asm ("r13") __attribute_used__;
69964  
69965 -#define ia64_setreg(regnum, val)                                               \
69966 +#define __ia64_setreg(regnum, val)                                             \
69967  ({                                                                             \
69968         switch (regnum) {                                                       \
69969             case _IA64_REG_PSR_L:                                               \
69970 @@ -55,7 +55,7 @@
69971         }                                                                       \
69972  })
69973  
69974 -#define ia64_getreg(regnum)                                                    \
69975 +#define __ia64_getreg(regnum)                                                  \
69976  ({                                                                             \
69977         __u64 ia64_intri_res;                                                   \
69978                                                                                 \
69979 @@ -92,7 +92,7 @@
69980  
69981  #define ia64_hint_pause 0
69982  
69983 -#define ia64_hint(mode)                                                \
69984 +#define __ia64_hint(mode)                                              \
69985  ({                                                             \
69986         switch (mode) {                                         \
69987         case ia64_hint_pause:                                   \
69988 @@ -374,7 +374,7 @@
69989  
69990  #define ia64_invala() asm volatile ("invala" ::: "memory")
69991  
69992 -#define ia64_thash(addr)                                                       \
69993 +#define __ia64_thash(addr)                                                     \
69994  ({                                                                             \
69995         __u64 ia64_intri_res;                                                   \
69996         asm volatile ("thash %0=%1" : "=r"(ia64_intri_res) : "r" (addr));       \
69997 @@ -394,18 +394,18 @@
69998  
69999  #define ia64_nop(x)    asm volatile ("nop %0"::"i"(x));
70000  
70001 -#define ia64_itci(addr)        asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
70002 +#define __ia64_itci(addr)      asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
70003  
70004 -#define ia64_itcd(addr)        asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
70005 +#define __ia64_itcd(addr)      asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
70006  
70007  
70008 -#define ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1"                                \
70009 +#define __ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1"                      \
70010                                              :: "r"(trnum), "r"(addr) : "memory")
70011  
70012 -#define ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1"                                \
70013 +#define __ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1"                      \
70014                                              :: "r"(trnum), "r"(addr) : "memory")
70015  
70016 -#define ia64_tpa(addr)                                                         \
70017 +#define __ia64_tpa(addr)                                                       \
70018  ({                                                                             \
70019         __u64 ia64_pa;                                                          \
70020         asm volatile ("tpa %0 = %1" : "=r"(ia64_pa) : "r"(addr) : "memory");    \
70021 @@ -415,22 +415,22 @@
70022  #define __ia64_set_dbr(index, val)                                             \
70023         asm volatile ("mov dbr[%0]=%1" :: "r"(index), "r"(val) : "memory")
70024  
70025 -#define ia64_set_ibr(index, val)                                               \
70026 +#define __ia64_set_ibr(index, val)                                             \
70027         asm volatile ("mov ibr[%0]=%1" :: "r"(index), "r"(val) : "memory")
70028  
70029 -#define ia64_set_pkr(index, val)                                               \
70030 +#define __ia64_set_pkr(index, val)                                             \
70031         asm volatile ("mov pkr[%0]=%1" :: "r"(index), "r"(val) : "memory")
70032  
70033 -#define ia64_set_pmc(index, val)                                               \
70034 +#define __ia64_set_pmc(index, val)                                             \
70035         asm volatile ("mov pmc[%0]=%1" :: "r"(index), "r"(val) : "memory")
70036  
70037 -#define ia64_set_pmd(index, val)                                               \
70038 +#define __ia64_set_pmd(index, val)                                             \
70039         asm volatile ("mov pmd[%0]=%1" :: "r"(index), "r"(val) : "memory")
70040  
70041 -#define ia64_set_rr(index, val)                                                        \
70042 +#define __ia64_set_rr(index, val)                                                      \
70043         asm volatile ("mov rr[%0]=%1" :: "r"(index), "r"(val) : "memory");
70044  
70045 -#define ia64_get_cpuid(index)                                                          \
70046 +#define __ia64_get_cpuid(index)                                                                \
70047  ({                                                                                     \
70048         __u64 ia64_intri_res;                                                           \
70049         asm volatile ("mov %0=cpuid[%r1]" : "=r"(ia64_intri_res) : "rO"(index));        \
70050 @@ -444,21 +444,21 @@
70051         ia64_intri_res;                                                         \
70052  })
70053  
70054 -#define ia64_get_ibr(index)                                                    \
70055 +#define __ia64_get_ibr(index)                                                  \
70056  ({                                                                             \
70057         __u64 ia64_intri_res;                                                   \
70058         asm volatile ("mov %0=ibr[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
70059         ia64_intri_res;                                                         \
70060  })
70061  
70062 -#define ia64_get_pkr(index)                                                    \
70063 +#define __ia64_get_pkr(index)                                                  \
70064  ({                                                                             \
70065         __u64 ia64_intri_res;                                                   \
70066         asm volatile ("mov %0=pkr[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
70067         ia64_intri_res;                                                         \
70068  })
70069  
70070 -#define ia64_get_pmc(index)                                                    \
70071 +#define __ia64_get_pmc(index)                                                  \
70072  ({                                                                             \
70073         __u64 ia64_intri_res;                                                   \
70074         asm volatile ("mov %0=pmc[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
70075 @@ -466,48 +466,48 @@
70076  })
70077  
70078  
70079 -#define ia64_get_pmd(index)                                                    \
70080 +#define __ia64_get_pmd(index)                                                  \
70081  ({                                                                             \
70082         __u64 ia64_intri_res;                                                   \
70083         asm volatile ("mov %0=pmd[%1]" : "=r"(ia64_intri_res) : "r"(index));    \
70084         ia64_intri_res;                                                         \
70085  })
70086  
70087 -#define ia64_get_rr(index)                                                     \
70088 +#define __ia64_get_rr(index)                                                   \
70089  ({                                                                             \
70090         __u64 ia64_intri_res;                                                   \
70091         asm volatile ("mov %0=rr[%1]" : "=r"(ia64_intri_res) : "r" (index));    \
70092         ia64_intri_res;                                                         \
70093  })
70094  
70095 -#define ia64_fc(addr)  asm volatile ("fc %0" :: "r"(addr) : "memory")
70096 +#define __ia64_fc(addr)        asm volatile ("fc %0" :: "r"(addr) : "memory")
70097  
70098  
70099  #define ia64_sync_i()  asm volatile (";; sync.i" ::: "memory")
70100  
70101 -#define ia64_ssm(mask) asm volatile ("ssm %0":: "i"((mask)) : "memory")
70102 -#define ia64_rsm(mask) asm volatile ("rsm %0":: "i"((mask)) : "memory")
70103 +#define __ia64_ssm(mask)       asm volatile ("ssm %0":: "i"((mask)) : "memory")
70104 +#define __ia64_rsm(mask)       asm volatile ("rsm %0":: "i"((mask)) : "memory")
70105  #define ia64_sum(mask) asm volatile ("sum %0":: "i"((mask)) : "memory")
70106  #define ia64_rum(mask) asm volatile ("rum %0":: "i"((mask)) : "memory")
70107  
70108 -#define ia64_ptce(addr)        asm volatile ("ptc.e %0" :: "r"(addr))
70109 +#define __ia64_ptce(addr)      asm volatile ("ptc.e %0" :: "r"(addr))
70110  
70111 -#define ia64_ptcga(addr, size)                                                 \
70112 +#define __ia64_ptcga(addr, size)                                                       \
70113  do {                                                                           \
70114         asm volatile ("ptc.ga %0,%1" :: "r"(addr), "r"(size) : "memory");       \
70115         ia64_dv_serialize_data();                                               \
70116  } while (0)
70117  
70118 -#define ia64_ptcl(addr, size)                                                  \
70119 +#define __ia64_ptcl(addr, size)                                                        \
70120  do {                                                                           \
70121         asm volatile ("ptc.l %0,%1" :: "r"(addr), "r"(size) : "memory");        \
70122         ia64_dv_serialize_data();                                               \
70123  } while (0)
70124  
70125 -#define ia64_ptri(addr, size)                                          \
70126 +#define __ia64_ptri(addr, size)                                                \
70127         asm volatile ("ptr.i %0,%1" :: "r"(addr), "r"(size) : "memory")
70128  
70129 -#define ia64_ptrd(addr, size)                                          \
70130 +#define __ia64_ptrd(addr, size)                                                \
70131         asm volatile ("ptr.d %0,%1" :: "r"(addr), "r"(size) : "memory")
70132  
70133  /* Values for lfhint in ia64_lfetch and ia64_lfetch_fault */
70134 @@ -589,7 +589,7 @@
70135          }                                                              \
70136  })
70137  
70138 -#define ia64_intrin_local_irq_restore(x)                       \
70139 +#define __ia64_intrin_local_irq_restore(x)                     \
70140  do {                                                           \
70141         asm volatile (";;   cmp.ne p6,p7=%0,r0;;"               \
70142                       "(p6) ssm psr.i;"                         \
70143 @@ -598,4 +598,6 @@
70144                       :: "r"((x)) : "p6", "p7", "memory");      \
70145  } while (0)
70146  
70147 +#define __ia64_get_psr_i()     (__ia64_getreg(_IA64_REG_PSR) & 0x4000UL)
70148 +
70149  #endif /* _ASM_IA64_GCC_INTRIN_H */
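
The renamings in this file give the raw, trapping implementations a double-underscore prefix so that asm/privop.h (added later in this patch) can route the public ia64_* names either back here or to Xen-aware replacements. The one genuinely new definition is __ia64_get_psr_i(), which masks out bit 14 of the PSR, the interruption-enable bit psr.i. A trivial standalone check that the 0x4000UL mask is exactly that bit:

    #include <assert.h>

    #define IA64_PSR_I_BIT 14   /* psr.i: interruption enable, bit 14 */

    int main(void)
    {
            /* The 0x4000UL mask in __ia64_get_psr_i() is bit 14. */
            assert((1UL << IA64_PSR_I_BIT) == 0x4000UL);
            return 0;
    }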
70150 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-ia64/hypercall.h linux-2.6.16/include/asm-ia64/hypercall.h
70151 --- linux-2.6.16.orig/include/asm-ia64/hypercall.h      1970-01-01 01:00:00.000000000 +0100
70152 +++ linux-2.6.16/include/asm-ia64/hypercall.h   2006-06-26 09:51:32.000000000 +0200
70153 @@ -0,0 +1,274 @@
70154 +/******************************************************************************
70155 + * hypercall.h
70156 + * 
70157 + * Linux-specific hypervisor handling.
70158 + * 
70159 + * Copyright (c) 2002-2004, K A Fraser
70160 + * 
70161 + * This program is free software; you can redistribute it and/or
70162 + * modify it under the terms of the GNU General Public License version 2
70163 + * as published by the Free Software Foundation; or, when distributed
70164 + * separately from the Linux kernel or incorporated into other
70165 + * software packages, subject to the following license:
70166 + * 
70167 + * Permission is hereby granted, free of charge, to any person obtaining a copy
70168 + * of this source file (the "Software"), to deal in the Software without
70169 + * restriction, including without limitation the rights to use, copy, modify,
70170 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
70171 + * and to permit persons to whom the Software is furnished to do so, subject to
70172 + * the following conditions:
70173 + * 
70174 + * The above copyright notice and this permission notice shall be included in
70175 + * all copies or substantial portions of the Software.
70176 + * 
70177 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
70178 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
70179 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
70180 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
70181 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
70182 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
70183 + * IN THE SOFTWARE.
70184 + */
70185 +
70186 +#ifndef __HYPERCALL_H__
70187 +#define __HYPERCALL_H__
70188 +
70189 +#ifndef __HYPERVISOR_H__
70190 +# error "please don't include this file directly"
70191 +#endif
70192 +
70193 +/* FIXME: temp place to hold these page related macros */
70194 +#include <asm/page.h>
70195 +#define virt_to_machine(v) __pa(v)
70196 +#define machine_to_virt(m) __va(m)
70197 +#define virt_to_mfn(v) ((__pa(v)) >> PAGE_SHIFT)
70198 +#define mfn_to_virt(m) (__va((m) << PAGE_SHIFT))
70199 +
70200 +/*
70201 + * Assembler stubs for hyper-calls.
70202 + */
70203 +
70204 +#define _hypercall0(type, name)                                        \
70205 +({                                                             \
70206 +       long __res;                                             \
70207 +       __asm__ __volatile__ (";;\n"                            \
70208 +                             "mov r2=%1\n"                     \
70209 +                             "break 0x1000 ;;\n"               \
70210 +                             "mov %0=r8 ;;\n"                  \
70211 +                             : "=r" (__res)                    \
70212 +                             : "i" (__HYPERVISOR_##name)       \
70213 +                             : "r2","r8",                      \
70214 +                               "memory" );                     \
70215 +       (type)__res;                                            \
70216 +})
70217 +
70218 +#define _hypercall1(type, name, a1)                            \
70219 +({                                                             \
70220 +       long __res;                                             \
70221 +       __asm__ __volatile__ (";;\n"                            \
70222 +                             "mov r14=%2\n"                    \
70223 +                             "mov r2=%1\n"                     \
70224 +                             "break 0x1000 ;;\n"               \
70225 +                             "mov %0=r8 ;;\n"                  \
70226 +                             : "=r" (__res)                    \
70227 +                             : "i" (__HYPERVISOR_##name),      \
70228 +                               "r" ((unsigned long)(a1))       \
70229 +                             : "r14","r2","r8",                \
70230 +                               "memory" );                     \
70231 +       (type)__res;                                            \
70232 +})
70233 +
70234 +#define _hypercall2(type, name, a1, a2)                                \
70235 +({                                                             \
70236 +       long __res;                                             \
70237 +       __asm__ __volatile__ (";;\n"                            \
70238 +                             "mov r14=%2\n"                    \
70239 +                             "mov r15=%3\n"                    \
70240 +                             "mov r2=%1\n"                     \
70241 +                             "break 0x1000 ;;\n"               \
70242 +                             "mov %0=r8 ;;\n"                  \
70243 +                             : "=r" (__res)                    \
70244 +                             : "i" (__HYPERVISOR_##name),      \
70245 +                               "r" ((unsigned long)(a1)),      \
70246 +                               "r" ((unsigned long)(a2))       \
70247 +                             : "r14","r15","r2","r8",          \
70248 +                               "memory" );                     \
70249 +       (type)__res;                                            \
70250 +})
70251 +
70252 +#define _hypercall3(type, name, a1, a2, a3)                    \
70253 +({                                                             \
70254 +       long __res;                                             \
70255 +       __asm__ __volatile__ (";;\n"                            \
70256 +                             "mov r14=%2\n"                    \
70257 +                             "mov r15=%3\n"                    \
70258 +                             "mov r16=%4\n"                    \
70259 +                             "mov r2=%1\n"                     \
70260 +                             "break 0x1000 ;;\n"               \
70261 +                             "mov %0=r8 ;;\n"                  \
70262 +                             : "=r" (__res)                    \
70263 +                             : "i" (__HYPERVISOR_##name),      \
70264 +                               "r" ((unsigned long)(a1)),      \
70265 +                               "r" ((unsigned long)(a2)),      \
70266 +                               "r" ((unsigned long)(a3))       \
70267 +                             : "r14","r15","r16","r2","r8",    \
70268 +                               "memory" );                     \
70269 +       (type)__res;                                            \
70270 +})
70271 +
70272 +#define _hypercall4(type, name, a1, a2, a3, a4)                        \
70273 +({                                                             \
70274 +       long __res;                                             \
70275 +       __asm__ __volatile__ (";;\n"                            \
70276 +                             "mov r14=%2\n"                    \
70277 +                             "mov r15=%3\n"                    \
70278 +                             "mov r16=%4\n"                    \
70279 +                             "mov r17=%5\n"                    \
70280 +                             "mov r2=%1\n"                     \
70281 +                             "break 0x1000 ;;\n"               \
70282 +                             "mov %0=r8 ;;\n"                  \
70283 +                             : "=r" (__res)                    \
70284 +                             : "i" (__HYPERVISOR_##name),      \
70285 +                               "r" ((unsigned long)(a1)),      \
70286 +                               "r" ((unsigned long)(a2)),      \
70287 +                               "r" ((unsigned long)(a3)),      \
70288 +                               "r" ((unsigned long)(a4))       \
70289 +                             : "r14","r15","r16","r2","r8",    \
70290 +                               "r17","memory" );               \
70291 +       (type)__res;                                            \
70292 +})
70293 +
70294 +#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
70295 +({                                                             \
70296 +       long __res;                                             \
70297 +       __asm__ __volatile__ (";;\n"                            \
70298 +                             "mov r14=%2\n"                    \
70299 +                             "mov r15=%3\n"                    \
70300 +                             "mov r16=%4\n"                    \
70301 +                             "mov r17=%5\n"                    \
70302 +                             "mov r18=%6\n"                    \
70303 +                             "mov r2=%1\n"                     \
70304 +                             "break 0x1000 ;;\n"               \
70305 +                             "mov %0=r8 ;;\n"                  \
70306 +                             : "=r" (__res)                    \
70307 +                             : "i" (__HYPERVISOR_##name),      \
70308 +                               "r" ((unsigned long)(a1)),      \
70309 +                               "r" ((unsigned long)(a2)),      \
70310 +                               "r" ((unsigned long)(a3)),      \
70311 +                               "r" ((unsigned long)(a4)),      \
70312 +                               "r" ((unsigned long)(a5))       \
70313 +                             : "r14","r15","r16","r2","r8",    \
70314 +                               "r17","r18","memory" );         \
70315 +       (type)__res;                                            \
70316 +})
70317 +
70318 +static inline int
70319 +HYPERVISOR_sched_op_compat(
70320 +    int cmd, unsigned long arg)
70321 +{
70322 +       return _hypercall2(int, sched_op_compat, cmd, arg);
70323 +}
70324 +
70325 +static inline int
70326 +HYPERVISOR_sched_op(
70327 +       int cmd, void *arg)
70328 +{
70329 +       return _hypercall2(int, sched_op, cmd, arg);
70330 +}
70331 +
70332 +static inline long
70333 +HYPERVISOR_set_timer_op(
70334 +    u64 timeout)
70335 +{
70336 +    unsigned long timeout_hi = (unsigned long)(timeout>>32);
70337 +    unsigned long timeout_lo = (unsigned long)timeout;
70338 +    return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
70339 +}
70340 +
70341 +static inline int
70342 +HYPERVISOR_dom0_op(
70343 +    dom0_op_t *dom0_op)
70344 +{
70345 +    dom0_op->interface_version = DOM0_INTERFACE_VERSION;
70346 +    return _hypercall1(int, dom0_op, dom0_op);
70347 +}
70348 +
70349 +static inline int
70350 +HYPERVISOR_multicall(
70351 +    void *call_list, int nr_calls)
70352 +{
70353 +    return _hypercall2(int, multicall, call_list, nr_calls);
70354 +}
70355 +
70356 +static inline int
70357 +HYPERVISOR_memory_op(
70358 +    unsigned int cmd, void *arg)
70359 +{
70360 +    return _hypercall2(int, memory_op, cmd, arg);
70361 +}
70362 +
70363 +static inline int
70364 +HYPERVISOR_event_channel_op(
70365 +    void *op)
70366 +{
70367 +    return _hypercall1(int, event_channel_op, op);
70368 +}
70369 +
70370 +static inline int
70371 +HYPERVISOR_xen_version(
70372 +    int cmd, void *arg)
70373 +{
70374 +    return _hypercall2(int, xen_version, cmd, arg);
70375 +}
70376 +
70377 +static inline int
70378 +HYPERVISOR_console_io(
70379 +    int cmd, int count, char *str)
70380 +{
70381 +    return _hypercall3(int, console_io, cmd, count, str);
70382 +}
70383 +
70384 +static inline int
70385 +HYPERVISOR_physdev_op(
70386 +    void *physdev_op)
70387 +{
70388 +    return _hypercall1(int, physdev_op, physdev_op);
70389 +}
70390 +
70391 +static inline int
70392 +HYPERVISOR_grant_table_op(
70393 +    unsigned int cmd, void *uop, unsigned int count)
70394 +{
70395 +    return _hypercall3(int, grant_table_op, cmd, uop, count);
70396 +}
70397 +
70398 +static inline int
70399 +HYPERVISOR_vcpu_op(
70400 +       int cmd, int vcpuid, void *extra_args)
70401 +{
70402 +    return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
70403 +}
70404 +
70405 +static inline int
70406 +HYPERVISOR_suspend(
70407 +       unsigned long srec)
70408 +{
70409 +       struct sched_shutdown sched_shutdown = {
70410 +               .reason = SHUTDOWN_suspend
70411 +       };
70412 +
70413 +       int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
70414 +                            &sched_shutdown, srec);
70415 +
70416 +       if (rc == -ENOSYS)
70417 +               rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
70418 +                                SHUTDOWN_suspend, srec);
70419 +
70420 +       return rc;
70421 +}
70422 +
70423 +extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
70424 +static inline void exit_idle(void) {}
70425 +#define do_IRQ(irq, regs) __do_IRQ((irq), (regs))
70426 +
70427 +#endif /* __HYPERCALL_H__ */
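
The stubs above fix the Xen/ia64 hypercall ABI: the hypercall number is loaded into r2, up to five arguments into r14-r18, `break 0x1000` traps into the hypervisor, and the result comes back in r8. For one concrete case, _hypercall2(int, sched_op, cmd, arg) expands to roughly the following (__HYPERVISOR_sched_op being the constant from xen/interface/xen.h):

    static inline int HYPERVISOR_sched_op_expanded(int cmd, void *arg)
    {
            long __res;
            __asm__ __volatile__ (";;\n"
                                  "mov r14=%2\n"        /* argument 1       */
                                  "mov r15=%3\n"        /* argument 2       */
                                  "mov r2=%1\n"         /* hypercall number */
                                  "break 0x1000 ;;\n"   /* enter Xen        */
                                  "mov %0=r8 ;;\n"      /* result           */
                                  : "=r" (__res)
                                  : "i" (__HYPERVISOR_sched_op),
                                    "r" ((unsigned long)cmd),
                                    "r" ((unsigned long)arg)
                                  : "r14", "r15", "r2", "r8", "memory");
            return (int)__res;
    }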
70428 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-ia64/hypervisor.h linux-2.6.16/include/asm-ia64/hypervisor.h
70429 --- linux-2.6.16.orig/include/asm-ia64/hypervisor.h     1970-01-01 01:00:00.000000000 +0100
70430 +++ linux-2.6.16/include/asm-ia64/hypervisor.h  2006-06-26 09:51:32.000000000 +0200
70431 @@ -0,0 +1,138 @@
70432 +/******************************************************************************
70433 + * hypervisor.h
70434 + * 
70435 + * Linux-specific hypervisor handling.
70436 + * 
70437 + * Copyright (c) 2002-2004, K A Fraser
70438 + * 
70439 + * This program is free software; you can redistribute it and/or
70440 + * modify it under the terms of the GNU General Public License version 2
70441 + * as published by the Free Software Foundation; or, when distributed
70442 + * separately from the Linux kernel or incorporated into other
70443 + * software packages, subject to the following license:
70444 + * 
70445 + * Permission is hereby granted, free of charge, to any person obtaining a copy
70446 + * of this source file (the "Software"), to deal in the Software without
70447 + * restriction, including without limitation the rights to use, copy, modify,
70448 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
70449 + * and to permit persons to whom the Software is furnished to do so, subject to
70450 + * the following conditions:
70451 + * 
70452 + * The above copyright notice and this permission notice shall be included in
70453 + * all copies or substantial portions of the Software.
70454 + * 
70455 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
70456 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
70457 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
70458 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
70459 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
70460 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
70461 + * IN THE SOFTWARE.
70462 + */
70463 +
70464 +#ifndef __HYPERVISOR_H__
70465 +#define __HYPERVISOR_H__
70466 +
70467 +#include <linux/config.h>
70468 +#include <linux/types.h>
70469 +#include <linux/kernel.h>
70470 +#include <linux/version.h>
70471 +#include <linux/errno.h>
70472 +#include <xen/interface/xen.h>
70473 +#include <xen/interface/dom0_ops.h>
70474 +#include <xen/interface/sched.h>
70475 +#include <asm/ptrace.h>
70476 +#include <asm/page.h>
70477 +
70478 +extern shared_info_t *HYPERVISOR_shared_info;
70479 +extern start_info_t *xen_start_info;
70480 +
70481 +void force_evtchn_callback(void);
70482 +
70483 +int xen_init(void);
70484 +
70485 +/* Turn jiffies into Xen system time. XXX Implement me. */
70486 +#define jiffies_to_st(j)       0
70487 +
70488 +#include <asm/hypercall.h>
70489 +
70490 +static inline int
70491 +HYPERVISOR_yield(
70492 +       void)
70493 +{
70494 +       int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
70495 +
70496 +       if (rc == -ENOSYS)
70497 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
70498 +
70499 +       return rc;
70500 +}
70501 +
70502 +static inline int
70503 +HYPERVISOR_block(
70504 +       void)
70505 +{
70506 +       int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
70507 +
70508 +       if (rc == -ENOSYS)
70509 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
70510 +
70511 +       return rc;
70512 +}
70513 +
70514 +static inline int
70515 +HYPERVISOR_shutdown(
70516 +       unsigned int reason)
70517 +{
70518 +       struct sched_shutdown sched_shutdown = {
70519 +               .reason = reason
70520 +       };
70521 +
70522 +       int rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown);
70523 +
70524 +       if (rc == -ENOSYS)
70525 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason);
70526 +
70527 +       return rc;
70528 +}
70529 +
70530 +static inline int
70531 +HYPERVISOR_poll(
70532 +       evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
70533 +{
70534 +       struct sched_poll sched_poll = {
70535 +               .ports = ports,
70536 +               .nr_ports = nr_ports,
70537 +               .timeout = jiffies_to_st(timeout)
70538 +       };
70539 +
70540 +       int rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
70541 +
70542 +       if (rc == -ENOSYS)
70543 +               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
70544 +
70545 +       return rc;
70546 +}
70547 +
70548 +// for drivers/xen/privcmd/privcmd.c
70549 +#define direct_remap_pfn_range(a,b,c,d,e,f) remap_pfn_range(a,b,c,d,e)
70550 +#define        pfn_to_mfn(x)   (x)
70551 +#define        mfn_to_pfn(x)   (x)
70552 +#define machine_to_phys_mapping 0
70553 +
70554 +// for drivers/xen/balloon/balloon.c
70555 +#ifdef CONFIG_XEN_SCRUB_PAGES
70556 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
70557 +#else
70558 +#define scrub_pages(_p,_n) ((void)0)
70559 +#endif
70560 +#define        pte_mfn(_x)     pte_pfn(_x)
70561 +#define INVALID_P2M_ENTRY      (~0UL)
70562 +#define __pte_ma(_x)   ((pte_t) {(_x)})
70563 +#define phys_to_machine_mapping_valid(_x)      (1)
70564 +#define        kmap_flush_unused()     do {} while (0)
70565 +#define set_phys_to_machine(_x,_y)     do {} while (0)
70566 +#define xen_machphys_update(_x,_y)     do {} while (0)
70567 +#define pfn_pte_ma(_x,_y)      __pte_ma(0)
70568 +
70569 +#endif /* __HYPERVISOR_H__ */
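
Every scheduling wrapper in this header uses the same compatibility idiom: issue the Xen 3.0 multiplexed sched_op hypercall first and, if an older hypervisor rejects it with -ENOSYS, retry through the pre-3.0 sched_op_compat entry point. (HYPERVISOR_poll has no compat equivalent, so its fallback degrades to a plain yield.) The idiom in isolation, as a sketch:

    /* Try the current multiplexed hypercall; if the hypervisor is too
     * old to know it, fall back to the compat entry point. */
    static inline int sched_op_with_fallback(int cmd, void *arg,
                                             unsigned long compat_arg)
    {
            int rc = HYPERVISOR_sched_op(cmd, arg);        /* Xen >= 3.0 */

            if (rc == -ENOSYS)                             /* older Xen  */
                    rc = HYPERVISOR_sched_op_compat(cmd, compat_arg);
            return rc;
    }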
70570 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-ia64/intel_intrin.h linux-2.6.16/include/asm-ia64/intel_intrin.h
70571 --- linux-2.6.16.orig/include/asm-ia64/intel_intrin.h   2006-03-20 06:53:29.000000000 +0100
70572 +++ linux-2.6.16/include/asm-ia64/intel_intrin.h        2006-06-26 09:51:32.000000000 +0200
70573 @@ -119,10 +119,10 @@
70574                          * intrinsic
70575                          */
70576  
70577 -#define ia64_getreg            __getReg
70578 -#define ia64_setreg            __setReg
70579 +#define __ia64_getreg          __getReg
70580 +#define __ia64_setreg          __setReg
70581  
70582 -#define ia64_hint(x)
70583 +#define __ia64_hint(x)
70584  
70585  #define ia64_mux1_brcst         0
70586  #define ia64_mux1_mix           8
70587 @@ -135,16 +135,16 @@
70588  #define ia64_getf_exp          __getf_exp
70589  #define ia64_shrp              _m64_shrp
70590  
70591 -#define ia64_tpa               __tpa
70592 +#define __ia64_tpa             __tpa
70593  #define ia64_invala            __invala
70594  #define ia64_invala_gr         __invala_gr
70595  #define ia64_invala_fr         __invala_fr
70596  #define ia64_nop               __nop
70597  #define ia64_sum               __sum
70598 -#define ia64_ssm               __ssm
70599 +#define __ia64_ssm             __ssm
70600  #define ia64_rum               __rum
70601 -#define ia64_rsm               __rsm
70602 -#define ia64_fc                __fc
70603 +#define __ia64_rsm             __rsm
70604 +#define __ia64_fc              __fc
70605  
70606  #define ia64_ldfs              __ldfs
70607  #define ia64_ldfd              __ldfd
70608 @@ -182,24 +182,24 @@
70609  
70610  #define __ia64_set_dbr(index, val)     \
70611                 __setIndReg(_IA64_REG_INDR_DBR, index, val)
70612 -#define ia64_set_ibr(index, val)       \
70613 +#define __ia64_set_ibr(index, val)     \
70614                 __setIndReg(_IA64_REG_INDR_IBR, index, val)
70615 -#define ia64_set_pkr(index, val)       \
70616 +#define __ia64_set_pkr(index, val)     \
70617                 __setIndReg(_IA64_REG_INDR_PKR, index, val)
70618 -#define ia64_set_pmc(index, val)       \
70619 +#define __ia64_set_pmc(index, val)     \
70620                 __setIndReg(_IA64_REG_INDR_PMC, index, val)
70621 -#define ia64_set_pmd(index, val)       \
70622 +#define __ia64_set_pmd(index, val)     \
70623                 __setIndReg(_IA64_REG_INDR_PMD, index, val)
70624 -#define ia64_set_rr(index, val)        \
70625 +#define __ia64_set_rr(index, val)      \
70626                 __setIndReg(_IA64_REG_INDR_RR, index, val)
70627  
70628 -#define ia64_get_cpuid(index)  __getIndReg(_IA64_REG_INDR_CPUID, index)
70629 +#define __ia64_get_cpuid(index)        __getIndReg(_IA64_REG_INDR_CPUID, index)
70630  #define __ia64_get_dbr(index)  __getIndReg(_IA64_REG_INDR_DBR, index)
70631 -#define ia64_get_ibr(index)    __getIndReg(_IA64_REG_INDR_IBR, index)
70632 -#define ia64_get_pkr(index)    __getIndReg(_IA64_REG_INDR_PKR, index)
70633 -#define ia64_get_pmc(index)    __getIndReg(_IA64_REG_INDR_PMC, index)
70634 -#define ia64_get_pmd(index)    __getIndReg(_IA64_REG_INDR_PMD, index)
70635 -#define ia64_get_rr(index)     __getIndReg(_IA64_REG_INDR_RR, index)
70636 +#define __ia64_get_ibr(index)  __getIndReg(_IA64_REG_INDR_IBR, index)
70637 +#define __ia64_get_pkr(index)  __getIndReg(_IA64_REG_INDR_PKR, index)
70638 +#define __ia64_get_pmc(index)  __getIndReg(_IA64_REG_INDR_PMC, index)
70639 +#define __ia64_get_pmd(index)          __getIndReg(_IA64_REG_INDR_PMD, index)
70640 +#define __ia64_get_rr(index)   __getIndReg(_IA64_REG_INDR_RR, index)
70641  
70642  #define ia64_srlz_d            __dsrlz
70643  #define ia64_srlz_i            __isrlz
70644 @@ -218,18 +218,18 @@
70645  #define ia64_ld8_acq           __ld8_acq
70646  
70647  #define ia64_sync_i            __synci
70648 -#define ia64_thash             __thash
70649 -#define ia64_ttag              __ttag
70650 -#define ia64_itcd              __itcd
70651 -#define ia64_itci              __itci
70652 -#define ia64_itrd              __itrd
70653 -#define ia64_itri              __itri
70654 -#define ia64_ptce              __ptce
70655 -#define ia64_ptcl              __ptcl
70656 -#define ia64_ptcg              __ptcg
70657 -#define ia64_ptcga             __ptcga
70658 -#define ia64_ptri              __ptri
70659 -#define ia64_ptrd              __ptrd
70660 +#define __ia64_thash           __thash
70661 +#define __ia64_ttag            __ttag
70662 +#define __ia64_itcd            __itcd
70663 +#define __ia64_itci            __itci
70664 +#define __ia64_itrd            __itrd
70665 +#define __ia64_itri            __itri
70666 +#define __ia64_ptce            __ptce
70667 +#define __ia64_ptcl            __ptcl
70668 +#define __ia64_ptcg            __ptcg
70669 +#define __ia64_ptcga           __ptcga
70670 +#define __ia64_ptri            __ptri
70671 +#define __ia64_ptrd            __ptrd
70672  #define ia64_dep_mi            _m64_dep_mi
70673  
70674  /* Values for lfhint in __lfetch and __lfetch_fault */
70675 @@ -244,14 +244,16 @@
70676  #define ia64_lfetch_fault      __lfetch_fault
70677  #define ia64_lfetch_fault_excl __lfetch_fault_excl
70678  
70679 -#define ia64_intrin_local_irq_restore(x)               \
70680 +#define __ia64_intrin_local_irq_restore(x)             \
70681  do {                                                   \
70682         if ((x) != 0) {                                 \
70683 -               ia64_ssm(IA64_PSR_I);                   \
70684 +               __ia64_ssm(IA64_PSR_I);                 \
70685                 ia64_srlz_d();                          \
70686         } else {                                        \
70687 -               ia64_rsm(IA64_PSR_I);                   \
70688 +               __ia64_rsm(IA64_PSR_I);                 \
70689         }                                               \
70690  } while (0)
70691  
70692 +#define __ia64_get_psr_i()     (__ia64_getreg(_IA64_REG_PSR) & 0x4000UL)
70693 +
70694  #endif /* _ASM_IA64_INTEL_INTRIN_H */
70695 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-ia64/meminit.h linux-2.6.16/include/asm-ia64/meminit.h
70696 --- linux-2.6.16.orig/include/asm-ia64/meminit.h        2006-03-20 06:53:29.000000000 +0100
70697 +++ linux-2.6.16/include/asm-ia64/meminit.h     2006-06-26 09:51:32.000000000 +0200
70698 @@ -17,10 +17,15 @@
70699   *     - command line string
70700   *     - kernel code & data
70701   *     - Kernel memory map built from EFI memory map
70702 + *     - xen start info
70703   *
70704   * More could be added if necessary
70705   */
70706 +#ifndef CONFIG_XEN
70707  #define IA64_MAX_RSVD_REGIONS 6
70708 +#else
70709 +#define IA64_MAX_RSVD_REGIONS 7
70710 +#endif
70711  
70712  struct rsvd_region {
70713         unsigned long start;    /* virtual address of beginning of element */
70714 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-ia64/pal.h linux-2.6.16/include/asm-ia64/pal.h
70715 --- linux-2.6.16.orig/include/asm-ia64/pal.h    2006-03-20 06:53:29.000000000 +0100
70716 +++ linux-2.6.16/include/asm-ia64/pal.h 2006-06-26 09:51:32.000000000 +0200
70717 @@ -81,6 +81,7 @@
70718  #ifndef __ASSEMBLY__
70719  
70720  #include <linux/types.h>
70721 +#include <asm/processor.h>
70722  #include <asm/fpu.h>
70723  
70724  /*
70725 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-ia64/privop.h linux-2.6.16/include/asm-ia64/privop.h
70726 --- linux-2.6.16.orig/include/asm-ia64/privop.h 1970-01-01 01:00:00.000000000 +0100
70727 +++ linux-2.6.16/include/asm-ia64/privop.h      2006-06-26 09:51:32.000000000 +0200
70728 @@ -0,0 +1,59 @@
70729 +#ifndef _ASM_IA64_PRIVOP_H
70730 +#define _ASM_IA64_PRIVOP_H
70731 +
70732 +/*
70733 + * Copyright (C) 2005 Hewlett-Packard Co
70734 + *     Dan Magenheimer <dan.magenheimer@hp.com>
70735 + *
70736 + */
70737 +
70738 +#include <linux/config.h>
70739 +#ifdef CONFIG_XEN
70740 +#include <asm/xen/privop.h>
70741 +#endif
70742 +
70743 +#ifndef __ASSEMBLY
70744 +
70745 +#ifndef IA64_PARAVIRTUALIZED
70746 +
70747 +#define ia64_getreg                    __ia64_getreg
70748 +#define ia64_setreg                    __ia64_setreg
70749 +#define ia64_hint                      __ia64_hint
70750 +#define ia64_thash                     __ia64_thash
70751 +#define ia64_itci                      __ia64_itci
70752 +#define ia64_itcd                      __ia64_itcd
70753 +#define ia64_itri                      __ia64_itri
70754 +#define ia64_itrd                      __ia64_itrd
70755 +#define ia64_tpa                       __ia64_tpa
70756 +#define ia64_set_ibr                   __ia64_set_ibr
70757 +#define ia64_set_pkr                   __ia64_set_pkr
70758 +#define ia64_set_pmc                   __ia64_set_pmc
70759 +#define ia64_set_pmd                   __ia64_set_pmd
70760 +#define ia64_set_rr                    __ia64_set_rr
70761 +#define ia64_get_cpuid                 __ia64_get_cpuid
70762 +#define ia64_get_ibr                   __ia64_get_ibr
70763 +#define ia64_get_pkr                   __ia64_get_pkr
70764 +#define ia64_get_pmc                   __ia64_get_pmc
70765 +#define ia64_get_pmd                   __ia64_get_pmd
70766 +#define ia64_get_rr                    __ia64_get_rr
70767 +#define ia64_fc                                __ia64_fc
70768 +#define ia64_ssm                       __ia64_ssm
70769 +#define ia64_rsm                       __ia64_rsm
70770 +#define ia64_ptce                      __ia64_ptce
70771 +#define ia64_ptcga                     __ia64_ptcga
70772 +#define ia64_ptcl                      __ia64_ptcl
70773 +#define ia64_ptri                      __ia64_ptri
70774 +#define ia64_ptrd                      __ia64_ptrd
70775 +#define        ia64_get_psr_i                  __ia64_get_psr_i
70776 +#define ia64_intrin_local_irq_restore  __ia64_intrin_local_irq_restore
70777 +#define ia64_pal_halt_light            __ia64_pal_halt_light
70778 +#define        ia64_leave_kernel               __ia64_leave_kernel
70779 +#define        ia64_leave_syscall              __ia64_leave_syscall
70780 +#define        ia64_switch_to                  __ia64_switch_to
70781 +#define        ia64_pal_call_static            __ia64_pal_call_static
70782 +
70783 +#endif /* !IA64_PARAVIRTUALIZED */
70784 +
70785 +#endif /* !__ASSEMBLY */
70786 +
70787 +#endif /* _ASM_IA64_PRIVOP_H */
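
This header is the switchboard for the renamings made in gcc_intrin.h and intel_intrin.h: when IA64_PARAVIRTUALIZED is not defined, every public ia64_* name maps straight back to the raw __ia64_* intrinsic, while under CONFIG_XEN the inclusion of asm/xen/privop.h defines IA64_PARAVIRTUALIZED and supplies xen_* replacements instead. Call sites are oblivious either way; the same line compiles to a raw instruction or a hypervisor transition:

    /* Oblivious caller: resolves to __ia64_thash(va) natively,
     * or to the xen_thash(va) hypercall under CONFIG_XEN. */
    unsigned long vhpt_entry_for(unsigned long va)
    {
            return ia64_thash(va);
    }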
70788 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-ia64/processor.h linux-2.6.16/include/asm-ia64/processor.h
70789 --- linux-2.6.16.orig/include/asm-ia64/processor.h      2006-03-20 06:53:29.000000000 +0100
70790 +++ linux-2.6.16/include/asm-ia64/processor.h   2006-06-26 09:51:32.000000000 +0200
70791 @@ -19,6 +19,7 @@
70792  #include <asm/kregs.h>
70793  #include <asm/ptrace.h>
70794  #include <asm/ustack.h>
70795 +#include <asm/privop.h>
70796  
70797  #define IA64_NUM_DBG_REGS      8
70798  /*
70799 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-ia64/synch_bitops.h linux-2.6.16/include/asm-ia64/synch_bitops.h
70800 --- linux-2.6.16.orig/include/asm-ia64/synch_bitops.h   1970-01-01 01:00:00.000000000 +0100
70801 +++ linux-2.6.16/include/asm-ia64/synch_bitops.h        2006-06-26 09:51:32.000000000 +0200
70802 @@ -0,0 +1,61 @@
70803 +#ifndef __XEN_SYNCH_BITOPS_H__
70804 +#define __XEN_SYNCH_BITOPS_H__
70805 +
70806 +/*
70807 + * Copyright 1992, Linus Torvalds.
70808 + * Heavily modified to provide guaranteed strong synchronisation
70809 + * when communicating with Xen or other guest OSes running on other CPUs.
70810 + */
70811 +
70812 +#include <linux/config.h>
70813 +
70814 +#define ADDR (*(volatile long *) addr)
70815 +
70816 +static __inline__ void synch_set_bit(int nr, volatile void * addr)
70817 +{
70818 +       set_bit(nr, addr);
70819 +}
70820 +
70821 +static __inline__ void synch_clear_bit(int nr, volatile void * addr)
70822 +{
70823 +       clear_bit(nr, addr);
70824 +}
70825 +
70826 +static __inline__ void synch_change_bit(int nr, volatile void * addr)
70827 +{
70828 +       change_bit(nr, addr);
70829 +}
70830 +
70831 +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
70832 +{
70833 +    return test_and_set_bit(nr, addr);
70834 +}
70835 +
70836 +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
70837 +{
70838 +    return test_and_clear_bit(nr, addr);
70839 +}
70840 +
70841 +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
70842 +{
70843 +    return test_and_change_bit(nr, addr);
70844 +}
70845 +
70846 +static __inline__ int synch_const_test_bit(int nr, const volatile void * addr)
70847 +{
70848 +    return test_bit(nr, addr);
70849 +}
70850 +
70851 +static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
70852 +{
70853 +    return test_bit(nr, addr);
70854 +}
70855 +
70856 +#define synch_cmpxchg  ia64_cmpxchg4_acq
70857 +
70858 +#define synch_test_bit(nr,addr) \
70859 +(__builtin_constant_p(nr) ? \
70860 + synch_const_test_bit((nr),(addr)) : \
70861 + synch_var_test_bit((nr),(addr)))
70862 +
70863 +#endif /* __XEN_SYNCH_BITOPS_H__ */
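
On x86, separate synch_* bitops exist because the ordinary ones may omit the lock prefix on UP kernels, which is unsafe for memory shared with the hypervisor or other guests. The ia64 versions of test_and_set_bit() and friends are built on cmpxchg and are fully atomic regardless of SMP, so this header can simply alias them, adding only the acquire-semantics ia64_cmpxchg4_acq for synch_cmpxchg. A hedged usage sketch (shared_flags stands in for a word in a page shared with Xen):

    static volatile unsigned long shared_flags[1];

    /* Atomically consume a flag; safe against the hypervisor setting
     * it concurrently from another physical CPU. */
    static int ack_event(int bit)
    {
            return synch_test_and_clear_bit(bit, shared_flags);
    }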
70864 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-ia64/system.h linux-2.6.16/include/asm-ia64/system.h
70865 --- linux-2.6.16.orig/include/asm-ia64/system.h 2006-03-20 06:53:29.000000000 +0100
70866 +++ linux-2.6.16/include/asm-ia64/system.h      2006-06-26 09:51:32.000000000 +0200
70867 @@ -125,7 +125,7 @@
70868  #define __local_irq_save(x)                    \
70869  do {                                           \
70870         ia64_stop();                            \
70871 -       (x) = ia64_getreg(_IA64_REG_PSR);       \
70872 +       (x) = ia64_get_psr_i();                 \
70873         ia64_stop();                            \
70874         ia64_rsm(IA64_PSR_I);                   \
70875  } while (0)
70876 @@ -173,7 +173,7 @@
70877  #endif /* !CONFIG_IA64_DEBUG_IRQ */
70878  
70879  #define local_irq_enable()     ({ ia64_stop(); ia64_ssm(IA64_PSR_I); ia64_srlz_d(); })
70880 -#define local_save_flags(flags)        ({ ia64_stop(); (flags) = ia64_getreg(_IA64_REG_PSR); })
70881 +#define local_save_flags(flags)        ({ ia64_stop(); (flags) = ia64_get_psr_i(); })
70882  
70883  #define irqs_disabled()                                \
70884  ({                                             \
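
The change above narrows __local_irq_save() and local_save_flags() from reading the whole PSR to reading just psr.i via the ia64_get_psr_i() indirection; a Xen guest cannot simply read the real PSR, and the only use ever made of the saved value is deciding whether to re-enable interrupts on restore. A sketch of that restore-side consumer (helper names are hypothetical):

    /* Only the truth value of psr.i matters on restore, per
     * __ia64_intrin_local_irq_restore() earlier in this patch. */
    static void irq_restore(unsigned long flags)
    {
            if (flags)
                    local_irq_enable_hw();   /* psr.i was set: ssm psr.i   */
            else
                    local_irq_disable_hw();  /* psr.i was clear: rsm psr.i */
    }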
70885 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-ia64/xen/privop.h linux-2.6.16/include/asm-ia64/xen/privop.h
70886 --- linux-2.6.16.orig/include/asm-ia64/xen/privop.h     1970-01-01 01:00:00.000000000 +0100
70887 +++ linux-2.6.16/include/asm-ia64/xen/privop.h  2006-06-26 09:51:32.000000000 +0200
70888 @@ -0,0 +1,277 @@
70889 +#ifndef _ASM_IA64_XEN_PRIVOP_H
70890 +#define _ASM_IA64_XEN_PRIVOP_H
70891 +
70892 +/*
70893 + * Copyright (C) 2005 Hewlett-Packard Co
70894 + *     Dan Magenheimer <dan.magenheimer@hp.com>
70895 + *
70896 + * Paravirtualizations of privileged operations for Xen/ia64
70897 + *
70898 + */
70899 +
70900 +
70901 +#include <asm/xen/asm-xsi-offsets.h>
70902 +
70903 +#define IA64_PARAVIRTUALIZED
70904 +
70905 +#ifdef __ASSEMBLY__
70906 +#define        XEN_HYPER_RFI                   break 0x1
70907 +#define        XEN_HYPER_RSM_PSR_DT            break 0x2
70908 +#define        XEN_HYPER_SSM_PSR_DT            break 0x3
70909 +#define        XEN_HYPER_COVER                 break 0x4
70910 +#define        XEN_HYPER_ITC_D                 break 0x5
70911 +#define        XEN_HYPER_ITC_I                 break 0x6
70912 +#define        XEN_HYPER_SSM_I                 break 0x7
70913 +#define        XEN_HYPER_GET_IVR               break 0x8
70914 +#define        XEN_HYPER_GET_TPR               break 0x9
70915 +#define        XEN_HYPER_SET_TPR               break 0xa
70916 +#define        XEN_HYPER_EOI                   break 0xb
70917 +#define        XEN_HYPER_SET_ITM               break 0xc
70918 +#define        XEN_HYPER_THASH                 break 0xd
70919 +#define        XEN_HYPER_PTC_GA                break 0xe
70920 +#define        XEN_HYPER_ITR_D                 break 0xf
70921 +#define        XEN_HYPER_GET_RR                break 0x10
70922 +#define        XEN_HYPER_SET_RR                break 0x11
70923 +#define        XEN_HYPER_SET_KR                break 0x12
70924 +#define        XEN_HYPER_FC                    break 0x13
70925 +#define        XEN_HYPER_GET_CPUID             break 0x14
70926 +#define        XEN_HYPER_GET_PMD               break 0x15
70927 +#define        XEN_HYPER_GET_EFLAG             break 0x16
70928 +#define        XEN_HYPER_SET_EFLAG             break 0x17
70929 +#endif
70930 +
70931 +#ifndef __ASSEMBLY__
70932 +#ifdef MODULE
70933 +extern int is_running_on_xen(void);
70934 +#define running_on_xen (is_running_on_xen())
70935 +#else
70936 +extern int running_on_xen;
70937 +#endif
70938 +
70939 +#define        XEN_HYPER_SSM_I                 asm("break 0x7");
70940 +#define        XEN_HYPER_GET_IVR               asm("break 0x8");
70941 +
70942 +/************************************************/
70943 +/* Instructions paravirtualized for correctness */
70944 +/************************************************/
70945 +
70946 +/* "fc" and "thash" are privilege-sensitive instructions, meaning they
70947 + *  may have different semantics depending on whether they are executed
70948 + *  at PL0 vs PL!=0.  When paravirtualized, these instructions mustn't
70949 + *  be allowed to execute directly, lest incorrect semantics result. */
70950 +extern unsigned long xen_fc(unsigned long addr);
70951 +#define ia64_fc(addr)                  xen_fc((unsigned long)(addr))
70952 +extern unsigned long xen_thash(unsigned long addr);
70953 +#define ia64_thash(addr)               xen_thash((unsigned long)(addr))
70954 +/* Note that "ttag" and "cover" are also privilege-sensitive; "ttag"
70955 + * is not currently used (though it may be in a long-format VHPT system!)
70956 + * and the semantics of cover only change if psr.ic is off, which is very
70957 + * rare (and currently non-existent outside of assembly code). */
70958 +
70959 +/* There are also privilege-sensitive registers.  These registers are
70960 + * readable at any privilege level but only writable at PL0. */
70961 +extern unsigned long xen_get_cpuid(int index);
70962 +#define        ia64_get_cpuid(i)               xen_get_cpuid(i)
70963 +extern unsigned long xen_get_pmd(int index);
70964 +#define        ia64_get_pmd(i)                 xen_get_pmd(i)
70965 +extern unsigned long xen_get_eflag(void);      /* see xen_ia64_getreg */
70966 +extern void xen_set_eflag(unsigned long);      /* see xen_ia64_setreg */
70967 +
70968 +/************************************************/
70969 +/* Instructions paravirtualized for performance */
70970 +/************************************************/
70971 +
70972 +/* Xen uses memory-mapped virtual privileged registers for access to many
70973 + * performance-sensitive privileged registers.  Some, like the processor
70974 + * status register (psr), are broken up into multiple memory locations.
70975 + * Others, like "pend", are abstractions based on privileged registers.
70976 + * "Pend" is guaranteed to be set if reading cr.ivr would return a
70977 + * (non-spurious) interrupt. */
70978 +#define xen_get_virtual_psr_i()                (*(int *)(XSI_PSR_I))
70979 +#define xen_set_virtual_psr_i(_val)    ({ *(int *)(XSI_PSR_I) = _val ? 1:0; })
70980 +#define xen_set_virtual_psr_ic(_val)   ({ *(int *)(XSI_PSR_IC) = _val ? 1:0; })
70981 +#define xen_get_virtual_pend()         (*(int *)(XSI_PEND))
70982 +
70983 +/* Hyperprivops are "break" instructions with a well-defined API.
70984 + * In particular, the virtual psr.ic bit must be off; in this way
70985 + * it is guaranteed to never conflict with a linux break instruction.
70986 + * Normally, this is done in a Xen stub, but this one is frequent enough
70987 + * that we inline it. */
70988 +#define xen_hyper_ssm_i()                                              \
70989 +({                                                                     \
70990 +       xen_set_virtual_psr_i(0);                                       \
70991 +       xen_set_virtual_psr_ic(0);                                      \
70992 +       XEN_HYPER_SSM_I;                                                \
70993 +})
70994 +
70995 +/* turning off interrupts can be paravirtualized simply by writing
70996 + * to a memory-mapped virtual psr.i bit (implemented as a 16-bit bool) */
70997 +#define xen_rsm_i()    xen_set_virtual_psr_i(0)
70998 +
70999 +/* turning on interrupts is a bit more complicated: write to the
71000 + * memory-mapped virtual psr.i bit first (to avoid a race condition),
71001 + * then if any interrupts were pending, we have to execute a hyperprivop
71002 + * to ensure the pending interrupt gets delivered; else we're done! */
71003 +#define xen_ssm_i()                                                    \
71004 +({                                                                     \
71005 +       int old = xen_get_virtual_psr_i();                              \
71006 +       xen_set_virtual_psr_i(1);                                       \
71007 +       if (!old && xen_get_virtual_pend()) xen_hyper_ssm_i();          \
71008 +})
71009 +
71010 +#define xen_ia64_intrin_local_irq_restore(x)                           \
71011 +{                                                                      \
71012 +     if (running_on_xen) {                                             \
71013 +       if ((x) & IA64_PSR_I) { xen_ssm_i(); }                          \
71014 +       else { xen_rsm_i(); }                                           \
71015 +    }                                                                  \
71016 +    else __ia64_intrin_local_irq_restore((x));                         \
71017 +}
71018 +
71019 +#define        xen_get_psr_i()                                                 \
71020 +(                                                                      \
71021 +       (running_on_xen) ?                                              \
71022 +               (xen_get_virtual_psr_i() ? IA64_PSR_I : 0)              \
71023 +               : __ia64_get_psr_i()                                    \
71024 +)
71025 +
71026 +#define xen_ia64_ssm(mask)                                             \
71027 +{                                                                      \
71028 +       if ((mask)==IA64_PSR_I) {                                       \
71029 +               if (running_on_xen) { xen_ssm_i(); }                    \
71030 +               else { __ia64_ssm(mask); }                              \
71031 +       }                                                               \
71032 +       else { __ia64_ssm(mask); }                                      \
71033 +}
71034 +
71035 +#define xen_ia64_rsm(mask)                                             \
71036 +{                                                                      \
71037 +       if ((mask)==IA64_PSR_I) {                                       \
71038 +               if (running_on_xen) { xen_rsm_i(); }                    \
71039 +               else { __ia64_rsm(mask); }                              \
71040 +       }                                                               \
71041 +       else { __ia64_rsm(mask); }                                      \
71042 +}
71043 +
71044 +
71045 +/* Although all privileged operations can be left to trap and will
71046 + * be properly handled by Xen, some are frequent enough that we use
71047 + * hyperprivops for performance. */
71048 +
71049 +extern unsigned long xen_get_ivr(void);
71050 +extern unsigned long xen_get_tpr(void);
71051 +extern void xen_set_itm(unsigned long);
71052 +extern void xen_set_tpr(unsigned long);
71053 +extern void xen_eoi(void);
71054 +extern void xen_set_rr(unsigned long index, unsigned long val);
71055 +extern unsigned long xen_get_rr(unsigned long index);
71056 +extern void xen_set_kr(unsigned long index, unsigned long val);
71057 +
71058 +/* Note: It may look wrong to test for running_on_xen in each case.
71059 + * However, regnum is always a constant so, as written, the compiler
71060 + * eliminates the switch statement, whereas running_on_xen must be
71061 + * tested dynamically. */
71062 +#define xen_ia64_getreg(regnum)                                                \
71063 +({                                                                     \
71064 +       __u64 ia64_intri_res;                                           \
71065 +                                                                       \
71066 +       switch(regnum) {                                                \
71067 +       case _IA64_REG_CR_IVR:                                          \
71068 +               ia64_intri_res = (running_on_xen) ?                     \
71069 +                       xen_get_ivr() :                                 \
71070 +                       __ia64_getreg(regnum);                          \
71071 +               break;                                                  \
71072 +       case _IA64_REG_CR_TPR:                                          \
71073 +               ia64_intri_res = (running_on_xen) ?                     \
71074 +                       xen_get_tpr() :                                 \
71075 +                       __ia64_getreg(regnum);                          \
71076 +               break;                                                  \
71077 +       case _IA64_REG_AR_EFLAG:                                        \
71078 +               ia64_intri_res = (running_on_xen) ?                     \
71079 +                       xen_get_eflag() :                               \
71080 +                       __ia64_getreg(regnum);                          \
71081 +               break;                                                  \
71082 +       default:                                                        \
71083 +               ia64_intri_res = __ia64_getreg(regnum);                 \
71084 +               break;                                                  \
71085 +       }                                                               \
71086 +       ia64_intri_res;                                                 \
71087 +})
71088 +
71089 +#define xen_ia64_setreg(regnum,val)                                    \
71090 +({                                                                     \
71091 +       switch(regnum) {                                                \
71092 +       case _IA64_REG_AR_KR0 ... _IA64_REG_AR_KR7:                     \
71093 +               (running_on_xen) ?                                      \
71094 +                       xen_set_kr((regnum-_IA64_REG_AR_KR0), val) :    \
71095 +                       __ia64_setreg(regnum,val);                      \
71096 +               break;                                                  \
71097 +       case _IA64_REG_CR_ITM:                                          \
71098 +               (running_on_xen) ?                                      \
71099 +                       xen_set_itm(val) :                              \
71100 +                       __ia64_setreg(regnum,val);                      \
71101 +               break;                                                  \
71102 +       case _IA64_REG_CR_TPR:                                          \
71103 +               (running_on_xen) ?                                      \
71104 +                       xen_set_tpr(val) :                              \
71105 +                       __ia64_setreg(regnum,val);                      \
71106 +               break;                                                  \
71107 +       case _IA64_REG_CR_EOI:                                          \
71108 +               (running_on_xen) ?                                      \
71109 +                       xen_eoi() :                                     \
71110 +                       __ia64_setreg(regnum,val);                      \
71111 +               break;                                                  \
71112 +       case _IA64_REG_AR_EFLAG:                                        \
71113 +               (running_on_xen) ?                                      \
71114 +                       xen_set_eflag(val) :                            \
71115 +                       __ia64_setreg(regnum,val);                      \
71116 +               break;                                                  \
71117 +       default:                                                        \
71118 +               __ia64_setreg(regnum,val);                              \
71119 +               break;                                                  \
71120 +       }                                                               \
71121 +})
71122 +
71123 +#define ia64_ssm                       xen_ia64_ssm
71124 +#define ia64_rsm                       xen_ia64_rsm
71125 +#define ia64_intrin_local_irq_restore  xen_ia64_intrin_local_irq_restore
71126 +#define        ia64_ptcga                      xen_ptcga
71127 +#define        ia64_set_rr(index,val)          xen_set_rr(index,val)
71128 +#define        ia64_get_rr(index)              xen_get_rr(index)
71129 +#define ia64_getreg                    xen_ia64_getreg
71130 +#define ia64_setreg                    xen_ia64_setreg
71131 +#define        ia64_get_psr_i                  xen_get_psr_i
71132 +
71133 +/* the remainder of these are not performance-sensitive, so it's
71134 + * OK to not paravirtualize and just take a privop trap and emulate */
71135 +#define ia64_hint                      __ia64_hint
71136 +#define ia64_set_pmd                   __ia64_set_pmd
71137 +#define ia64_itci                      __ia64_itci
71138 +#define ia64_itcd                      __ia64_itcd
71139 +#define ia64_itri                      __ia64_itri
71140 +#define ia64_itrd                      __ia64_itrd
71141 +#define ia64_tpa                       __ia64_tpa
71142 +#define ia64_set_ibr                   __ia64_set_ibr
71143 +#define ia64_set_pkr                   __ia64_set_pkr
71144 +#define ia64_set_pmc                   __ia64_set_pmc
71145 +#define ia64_get_ibr                   __ia64_get_ibr
71146 +#define ia64_get_pkr                   __ia64_get_pkr
71147 +#define ia64_get_pmc                   __ia64_get_pmc
71148 +#define ia64_ptce                      __ia64_ptce
71149 +#define ia64_ptcl                      __ia64_ptcl
71150 +#define ia64_ptri                      __ia64_ptri
71151 +#define ia64_ptrd                      __ia64_ptrd
71152 +
71153 +#endif /* !__ASSEMBLY__ */
71154 +
71155 +/* these routines utilize privilege-sensitive or performance-sensitive
71156 + * privileged instructions so the code must be replaced with
71157 + * paravirtualized versions */
71158 +#define ia64_pal_halt_light            xen_pal_halt_light
71159 +#define        ia64_leave_kernel               xen_leave_kernel
71160 +#define        ia64_leave_syscall              xen_leave_syscall
71161 +#define        ia64_trace_syscall              xen_trace_syscall
71162 +#define        ia64_switch_to                  xen_switch_to
71163 +#define        ia64_pal_call_static            xen_pal_call_static
71164 +
71165 +#endif /* _ASM_IA64_XEN_PRIVOP_H */
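The most delicate definition in the header above is xen_ssm_i(): the virtual psr.i bit is set before the pending check, so an interrupt that arrives in the window is never lost, and the relatively expensive hyperprivop is only issued when something was already waiting. A runnable toy model of that ordering (all names invented; the real XSI words are shared-memory locations, modeled here as plain globals):

    #include <stdio.h>

    static int vpsr_i;          /* models the XSI_PSR_I word              */
    static int vpend = 1;       /* models XSI_PEND: one interrupt pending */

    static void hyper_ssm_i(void) { printf("hyperprivop: deliver pending irq\n"); }

    static void model_ssm_i(void)
    {
            int old = vpsr_i;
            vpsr_i = 1;             /* enable first, closing the race window */
            if (!old && vpend)      /* were off, and work was waiting?       */
                    hyper_ssm_i();  /* only now pay for the hypercall        */
    }

    int main(void)
    {
            model_ssm_i();          /* delivers the pending interrupt        */
            model_ssm_i();          /* already enabled: no hypercall         */
            return 0;
    }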
71166 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-um/page.h linux-2.6.16/include/asm-um/page.h
71167 --- linux-2.6.16.orig/include/asm-um/page.h     2006-03-20 06:53:29.000000000 +0100
71168 +++ linux-2.6.16/include/asm-um/page.h  2006-06-26 09:51:32.000000000 +0200
71169 @@ -118,7 +118,7 @@
71170  extern struct page *arch_validate(struct page *page, gfp_t mask, int order);
71171  #define HAVE_ARCH_VALIDATE
71172  
71173 -extern void arch_free_page(struct page *page, int order);
71174 +extern int arch_free_page(struct page *page, int order);
71175  #define HAVE_ARCH_FREE_PAGE
71176  
71177  #include <asm-generic/page.h>
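The single UML change above widens arch_free_page() to return int. The likely intent, an assumption on my part but consistent with how the Xen tree treats foreign pages, is to let the hook report that it consumed the page entirely (for instance a mapping with no local backing) so the generic free path can stop early. A self-contained sketch of that calling convention:

    #include <stdio.h>

    struct page { int foreign; };

    /* Stub standing in for the arch hook: nonzero means "I took care of it". */
    static int arch_free_page(struct page *page, int order)
    {
            (void)order;
            return page->foreign;
    }

    static void sketch_free_pages(struct page *page, int order)
    {
            if (arch_free_page(page, order))
                    return;         /* nothing left for the buddy allocator */
            printf("freeing order-%d page normally\n", order);
    }

    int main(void)
    {
            struct page normal = { 0 }, foreign = { 1 };
            sketch_free_pages(&normal, 0);
            sketch_free_pages(&foreign, 0);
            return 0;
    }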
71178 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/apic.h linux-2.6.16/include/asm-x86_64/apic.h
71179 --- linux-2.6.16.orig/include/asm-x86_64/apic.h 2006-03-20 06:53:29.000000000 +0100
71180 +++ linux-2.6.16/include/asm-x86_64/apic.h      2006-06-26 09:51:32.000000000 +0200
71181 @@ -105,11 +105,13 @@
71182  
71183  extern void setup_threshold_lvt(unsigned long lvt_off);
71184  
71185 +#ifndef CONFIG_XEN
71186  void smp_send_timer_broadcast_ipi(void);
71187  void switch_APIC_timer_to_ipi(void *cpumask);
71188  void switch_ipi_to_APIC_timer(void *cpumask);
71189  
71190  #define ARCH_APICTIMER_STOPS_ON_C3     1
71191 +#endif
71192  
71193  #endif /* CONFIG_X86_LOCAL_APIC */
71194  
71195 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/arch_hooks.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/arch_hooks.h
71196 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/arch_hooks.h      1970-01-01 01:00:00.000000000 +0100
71197 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/arch_hooks.h   2006-06-26 09:51:32.000000000 +0200
71198 @@ -0,0 +1,27 @@
71199 +#ifndef _ASM_ARCH_HOOKS_H
71200 +#define _ASM_ARCH_HOOKS_H
71201 +
71202 +#include <linux/interrupt.h>
71203 +
71204 +/*
71205 + *     linux/include/asm/arch_hooks.h
71206 + *
71207 + *     define the architecture-specific hooks
71208 + */
71209 +
71210 +/* these aren't arch hooks, they are generic routines
71211 + * that can be used by the hooks */
71212 +extern void init_ISA_irqs(void);
71213 +extern void apic_intr_init(void);
71214 +extern void smp_intr_init(void);
71215 +extern irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs);
71216 +
71217 +/* these are the defined hooks */
71218 +extern void intr_init_hook(void);
71219 +extern void pre_intr_init_hook(void);
71220 +extern void pre_setup_arch_hook(void);
71221 +extern void trap_init_hook(void);
71222 +extern void time_init_hook(void);
71223 +extern void mca_nmi_hook(void);
71224 +
71225 +#endif
71226 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/bootsetup.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/bootsetup.h
71227 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/bootsetup.h       1970-01-01 01:00:00.000000000 +0100
71228 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/bootsetup.h    2006-06-26 09:51:32.000000000 +0200
71229 @@ -0,0 +1,42 @@
71230 +
71231 +#ifndef _X86_64_BOOTSETUP_H
71232 +#define _X86_64_BOOTSETUP_H 1
71233 +
71234 +#define BOOT_PARAM_SIZE                4096
71235 +extern char x86_boot_params[BOOT_PARAM_SIZE];
71236 +
71237 +/*
71238 + * This is set up by the setup-routine at boot-time
71239 + */
71240 +#define PARAM  ((unsigned char *)x86_boot_params)
71241 +#define SCREEN_INFO (*(struct screen_info *) (PARAM+0))
71242 +#define EXT_MEM_K (*(unsigned short *) (PARAM+2))
71243 +#define ALT_MEM_K (*(unsigned int *) (PARAM+0x1e0))
71244 +#define E820_MAP_NR (*(char*) (PARAM+E820NR))
71245 +#define E820_MAP    ((struct e820entry *) (PARAM+E820MAP))
71246 +#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40))
71247 +#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80))
71248 +#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0))
71249 +#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
71250 +#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
71251 +#define SAVED_VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
71252 +#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
71253 +#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
71254 +#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
71255 +#define KERNEL_START (*(unsigned int *) (PARAM+0x214))
71256 +
71257 +#define INITRD_START (__pa(xen_start_info->mod_start))
71258 +#define INITRD_SIZE (xen_start_info->mod_len)
71259 +#define EDID_INFO   (*(struct edid_info *) (PARAM+0x440))
71260 +
71261 +#define EDD_NR     (*(unsigned char *) (PARAM+EDDNR))
71262 +#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
71263 +#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF))
71264 +#define EDD_BUF     ((struct edd_info *) (PARAM+EDDBUF))
71265 +#define COMMAND_LINE saved_command_line
71266 +
71267 +#define RAMDISK_IMAGE_START_MASK       0x07FF
71268 +#define RAMDISK_PROMPT_FLAG            0x8000
71269 +#define RAMDISK_LOAD_FLAG              0x4000  
71270 +
71271 +#endif
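The PARAM block above is nothing more than a map of byte offsets into the 4 KiB x86_boot_params blob; under Xen the interesting exceptions are INITRD_START/INITRD_SIZE, which come from xen_start_info rather than the legacy setup area. The compilable snippet below decodes two of the documented offsets (0x210 and 0x214, taken directly from the header); the pointer-cast style mirrors the kernel's, and the stored values are made up:

    #include <stdio.h>
    #include <string.h>

    #define BOOT_PARAM_SIZE 4096
    static unsigned char x86_boot_params[BOOT_PARAM_SIZE];

    #define PARAM        ((unsigned char *)x86_boot_params)
    #define LOADER_TYPE  (*(unsigned char *)(PARAM + 0x210))
    #define KERNEL_START (*(unsigned int *) (PARAM + 0x214))

    int main(void)
    {
            unsigned int start = 0x100000;            /* pretend load address */
            x86_boot_params[0x210] = 0x71;            /* pretend loader id    */
            memcpy(&x86_boot_params[0x214], &start, sizeof(start));
            printf("loader %#x, kernel at %#x\n", LOADER_TYPE, KERNEL_START);
            return 0;
    }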
71272 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/desc.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/desc.h
71273 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/desc.h    1970-01-01 01:00:00.000000000 +0100
71274 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/desc.h 2006-06-26 09:51:32.000000000 +0200
71275 @@ -0,0 +1,263 @@
71276 +/* Written 2000 by Andi Kleen */ 
71277 +#ifndef __ARCH_DESC_H
71278 +#define __ARCH_DESC_H
71279 +
71280 +#include <linux/threads.h>
71281 +#include <asm/ldt.h>
71282 +
71283 +#ifndef __ASSEMBLY__
71284 +
71285 +#include <linux/string.h>
71286 +#include <linux/smp.h>
71287 +
71288 +#include <asm/segment.h>
71289 +#include <asm/mmu.h>
71290 +
71291 +// 8 byte segment descriptor
71292 +struct desc_struct { 
71293 +       u16 limit0;
71294 +       u16 base0;
71295 +       unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
71296 +       unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
71297 +} __attribute__((packed)); 
71298 +
71299 +struct n_desc_struct { 
71300 +       unsigned int a,b;
71301 +};     
71302 +
71303 +enum { 
71304 +       GATE_INTERRUPT = 0xE, 
71305 +       GATE_TRAP = 0xF,        
71306 +       GATE_CALL = 0xC,
71307 +};     
71308 +
71309 +// 16-byte gate
71310 +struct gate_struct {          
71311 +       u16 offset_low;
71312 +       u16 segment; 
71313 +       unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
71314 +       u16 offset_middle;
71315 +       u32 offset_high;
71316 +       u32 zero1; 
71317 +} __attribute__((packed));
71318 +
71319 +#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF) 
71320 +#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
71321 +#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
71322 +
71323 +enum { 
71324 +       DESC_TSS = 0x9,
71325 +       DESC_LDT = 0x2,
71326 +}; 
71327 +
71328 +// LDT or TSS descriptor in the GDT. 16 bytes.
71329 +struct ldttss_desc { 
71330 +       u16 limit0;
71331 +       u16 base0;
71332 +       unsigned base1 : 8, type : 5, dpl : 2, p : 1;
71333 +       unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
71334 +       u32 base3;
71335 +       u32 zero1; 
71336 +} __attribute__((packed)); 
71337 +
71338 +struct desc_ptr {
71339 +       unsigned short size;
71340 +       unsigned long address;
71341 +} __attribute__((packed)) ;
71342 +
71343 +extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
71344 +
71345 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
71346 +
71347 +#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
71348 +#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
71349 +
71350 +static inline void clear_LDT(void)
71351 +{
71352 +       int cpu = get_cpu();
71353 +
71354 +       /*
71355 +        * NB. We load the default_ldt for lcall7/27 handling on demand, as
71356 +        * it slows down context switching. No one uses it anyway.
71357 +        */
71358 +       cpu = cpu;              /* XXX avoid compiler warning */
71359 +       xen_set_ldt(0UL, 0);
71360 +       put_cpu();
71361 +}
71362 +
71363 +/*
71364 + * This is the ldt that every process will get unless we need
71365 + * something other than this.
71366 + */
71367 +extern struct desc_struct default_ldt[];
71368 +#ifndef CONFIG_X86_NO_IDT
71369 +extern struct gate_struct idt_table[]; 
71370 +#endif
71371 +extern struct desc_ptr cpu_gdt_descr[];
71372 +
71373 +/* the cpu gdt accessor */
71374 +#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)
71375 +
71376 +static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)  
71377 +{
71378 +       struct gate_struct s;   
71379 +       s.offset_low = PTR_LOW(func); 
71380 +       s.segment = __KERNEL_CS;
71381 +       s.ist = ist; 
71382 +       s.p = 1;
71383 +       s.dpl = dpl; 
71384 +       s.zero0 = 0;
71385 +       s.zero1 = 0; 
71386 +       s.type = type; 
71387 +       s.offset_middle = PTR_MIDDLE(func); 
71388 +       s.offset_high = PTR_HIGH(func); 
71389 +       /* does not need to be atomic because it is only done once at setup time */ 
71390 +       memcpy(adr, &s, 16); 
71391 +} 
71392 +
71393 +#ifndef CONFIG_X86_NO_IDT
71394 +static inline void set_intr_gate(int nr, void *func) 
71395 +{ 
71396 +       BUG_ON((unsigned)nr > 0xFF);
71397 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); 
71398 +} 
71399 +
71400 +static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) 
71401 +{ 
71402 +       BUG_ON((unsigned)nr > 0xFF);
71403 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); 
71404 +} 
71405 +
71406 +static inline void set_system_gate(int nr, void *func) 
71407 +{ 
71408 +       BUG_ON((unsigned)nr > 0xFF);
71409 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); 
71410 +} 
71411 +
71412 +static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
71413 +{
71414 +       _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
71415 +}
71416 +#endif
71417 +
71418 +static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type, 
71419 +                                        unsigned size) 
71420 +{ 
71421 +       struct ldttss_desc d;
71422 +       memset(&d,0,sizeof(d)); 
71423 +       d.limit0 = size & 0xFFFF;
71424 +       d.base0 = PTR_LOW(tss); 
71425 +       d.base1 = PTR_MIDDLE(tss) & 0xFF; 
71426 +       d.type = type;
71427 +       d.p = 1; 
71428 +       d.limit1 = (size >> 16) & 0xF;
71429 +       d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF; 
71430 +       d.base3 = PTR_HIGH(tss); 
71431 +       memcpy(ptr, &d, 16); 
71432 +}
71433 +
71434 +#ifndef CONFIG_X86_NO_TSS
71435 +static inline void set_tss_desc(unsigned cpu, void *addr)
71436 +{ 
71437 +       /*
71438 +        * sizeof(unsigned long) coming from an extra "long" at the end
71439 +        * of the iobitmap. See tss_struct definition in processor.h
71440 +        *
71441 +        * Why -1?  The segment base+limit pair must point at the
71442 +        * address of the last valid byte.
71443 +        */
71444 +       set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS], 
71445 +               (unsigned long)addr, DESC_TSS,
71446 +               IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1);
71447 +} 
71448 +#endif
71449 +
71450 +static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
71451 +{ 
71452 +       set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr,
71453 +                             DESC_LDT, size * 8 - 1);
71454 +}
71455 +
71456 +static inline void set_seg_base(unsigned cpu, int entry, void *base)
71457 +{ 
71458 +       struct desc_struct *d = &cpu_gdt(cpu)[entry];
71459 +       u32 addr = (u32)(u64)base;
71460 +       BUG_ON((u64)base >> 32); 
71461 +       d->base0 = addr & 0xffff;
71462 +       d->base1 = (addr >> 16) & 0xff;
71463 +       d->base2 = (addr >> 24) & 0xff;
71464 +} 
71465 +
71466 +#define LDT_entry_a(info) \
71467 +       ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
71468 +/* Don't allow setting of the lm bit. It is useless anyway because
71469 +   64-bit system calls require __USER_CS. */
71470 +#define LDT_entry_b(info) \
71471 +       (((info)->base_addr & 0xff000000) | \
71472 +       (((info)->base_addr & 0x00ff0000) >> 16) | \
71473 +       ((info)->limit & 0xf0000) | \
71474 +       (((info)->read_exec_only ^ 1) << 9) | \
71475 +       ((info)->contents << 10) | \
71476 +       (((info)->seg_not_present ^ 1) << 15) | \
71477 +       ((info)->seg_32bit << 22) | \
71478 +       ((info)->limit_in_pages << 23) | \
71479 +       ((info)->useable << 20) | \
71480 +       /* ((info)->lm << 21) | */ \
71481 +       0x7000)
71482 +
71483 +#define LDT_empty(info) (\
71484 +       (info)->base_addr       == 0    && \
71485 +       (info)->limit           == 0    && \
71486 +       (info)->contents        == 0    && \
71487 +       (info)->read_exec_only  == 1    && \
71488 +       (info)->seg_32bit       == 0    && \
71489 +       (info)->limit_in_pages  == 0    && \
71490 +       (info)->seg_not_present == 1    && \
71491 +       (info)->useable         == 0    && \
71492 +       (info)->lm              == 0)
71493 +
71494 +#if TLS_SIZE != 24
71495 +# error update this code.
71496 +#endif
71497 +
71498 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
71499 +{
71500 +#if 0
71501 +       u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
71502 +       gdt[0] = t->tls_array[0];
71503 +       gdt[1] = t->tls_array[1];
71504 +       gdt[2] = t->tls_array[2];
71505 +#endif
71506 +#define C(i) \
71507 +       HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), t->tls_array[i])
71508 +
71509 +       C(0); C(1); C(2);
71510 +#undef C
71511 +} 
71512 +
71513 +/*
71514 + * load one particular LDT into the current CPU
71515 + */
71516 +static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
71517 +{
71518 +       void *segments = pc->ldt;
71519 +       int count = pc->size;
71520 +
71521 +       if (likely(!count))
71522 +               segments = NULL;
71523 +
71524 +       xen_set_ldt((unsigned long)segments, count);
71525 +}
71526 +
71527 +static inline void load_LDT(mm_context_t *pc)
71528 +{
71529 +       int cpu = get_cpu();
71530 +       load_LDT_nolock(pc, cpu);
71531 +       put_cpu();
71532 +}
71533 +
71534 +extern struct desc_ptr idt_descr;
71535 +
71536 +#endif /* !__ASSEMBLY__ */
71537 +
71538 +#endif
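load_TLS() above is the clearest hint of why this desc.h variant exists: a Xen guest's GDT frames are registered with the hypervisor and kept read-only, so each TLS slot must be updated through HYPERVISOR_update_descriptor rather than by a direct store (the commented-out direct writes are left in the source as the native reference). A toy model of that indirection; validation is only gestured at and every name is invented:

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t fake_gdt[3];    /* stands in for the three TLS slots */

    /* Real Xen validates the descriptor before installing it. */
    static int hypervisor_update_descriptor(unsigned slot, uint64_t desc)
    {
            if (slot >= 3)
                    return -1;      /* reject, as the hypervisor would */
            fake_gdt[slot] = desc;
            return 0;
    }

    int main(void)
    {
            uint64_t tls_array[3] = { 0x00cff3000000ffffULL, 0, 0 };
            for (unsigned i = 0; i < 3; i++)
                    hypervisor_update_descriptor(i, tls_array[i]);
            printf("slot0 = %#llx\n", (unsigned long long)fake_gdt[0]);
            return 0;
    }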
71539 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/dma-mapping.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/dma-mapping.h
71540 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/dma-mapping.h     1970-01-01 01:00:00.000000000 +0100
71541 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/dma-mapping.h  2006-06-26 09:51:32.000000000 +0200
71542 @@ -0,0 +1,191 @@
71543 +#ifndef _X8664_DMA_MAPPING_H
71544 +#define _X8664_DMA_MAPPING_H 1
71545 +
71546 +/*
71547 + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
71548 + * documentation.
71549 + */
71550 +
71551 +#include <linux/config.h>
71552 +
71553 +#include <asm/scatterlist.h>
71554 +#include <asm/io.h>
71555 +#include <asm/swiotlb.h>
71556 +
71557 +struct dma_mapping_ops {
71558 +       int             (*mapping_error)(dma_addr_t dma_addr);
71559 +       void*           (*alloc_coherent)(struct device *dev, size_t size,
71560 +                                dma_addr_t *dma_handle, gfp_t gfp);
71561 +       void            (*free_coherent)(struct device *dev, size_t size,
71562 +                                void *vaddr, dma_addr_t dma_handle);
71563 +       dma_addr_t      (*map_single)(struct device *hwdev, void *ptr,
71564 +                                size_t size, int direction);
71565 +       /* like map_single, but doesn't check the device mask */
71566 +       dma_addr_t      (*map_simple)(struct device *hwdev, char *ptr,
71567 +                                size_t size, int direction);
71568 +       void            (*unmap_single)(struct device *dev, dma_addr_t addr,
71569 +                               size_t size, int direction);
71570 +       void            (*sync_single_for_cpu)(struct device *hwdev,
71571 +                               dma_addr_t dma_handle, size_t size,
71572 +                               int direction);
71573 +       void            (*sync_single_for_device)(struct device *hwdev,
71574 +                                dma_addr_t dma_handle, size_t size,
71575 +                               int direction);
71576 +       void            (*sync_single_range_for_cpu)(struct device *hwdev,
71577 +                                dma_addr_t dma_handle, unsigned long offset,
71578 +                               size_t size, int direction);
71579 +       void            (*sync_single_range_for_device)(struct device *hwdev,
71580 +                               dma_addr_t dma_handle, unsigned long offset,
71581 +                               size_t size, int direction);
71582 +       void            (*sync_sg_for_cpu)(struct device *hwdev,
71583 +                                struct scatterlist *sg, int nelems,
71584 +                               int direction);
71585 +       void            (*sync_sg_for_device)(struct device *hwdev,
71586 +                               struct scatterlist *sg, int nelems,
71587 +                               int direction);
71588 +       int             (*map_sg)(struct device *hwdev, struct scatterlist *sg,
71589 +                               int nents, int direction);
71590 +       void            (*unmap_sg)(struct device *hwdev,
71591 +                               struct scatterlist *sg, int nents,
71592 +                               int direction);
71593 +       int             (*dma_supported)(struct device *hwdev, u64 mask);
71594 +       int             is_phys;
71595 +};
71596 +
71597 +extern dma_addr_t bad_dma_address;
71598 +extern struct dma_mapping_ops* dma_ops;
71599 +extern int iommu_merge;
71600 +
71601 +#if 0
71602 +static inline int dma_mapping_error(dma_addr_t dma_addr)
71603 +{
71604 +       if (dma_ops->mapping_error)
71605 +               return dma_ops->mapping_error(dma_addr);
71606 +
71607 +       return (dma_addr == bad_dma_address);
71608 +}
71609 +
71610 +extern void *dma_alloc_coherent(struct device *dev, size_t size,
71611 +                               dma_addr_t *dma_handle, gfp_t gfp);
71612 +extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
71613 +                             dma_addr_t dma_handle);
71614 +
71615 +static inline dma_addr_t
71616 +dma_map_single(struct device *hwdev, void *ptr, size_t size,
71617 +              int direction)
71618 +{
71619 +       return dma_ops->map_single(hwdev, ptr, size, direction);
71620 +}
71621 +
71622 +static inline void
71623 +dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
71624 +                int direction)
71625 +{
71626 +       dma_ops->unmap_single(dev, addr, size, direction);
71627 +}
71628 +
71629 +#define dma_map_page(dev,page,offset,size,dir) \
71630 +       dma_map_single((dev), page_address(page)+(offset), (size), (dir))
71631 +
71632 +#define dma_unmap_page dma_unmap_single
71633 +
71634 +static inline void
71635 +dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
71636 +                       size_t size, int direction)
71637 +{
71638 +       if (dma_ops->sync_single_for_cpu)
71639 +               dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
71640 +                                            direction);
71641 +       flush_write_buffers();
71642 +}
71643 +
71644 +static inline void
71645 +dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
71646 +                          size_t size, int direction)
71647 +{
71648 +       if (dma_ops->sync_single_for_device)
71649 +               dma_ops->sync_single_for_device(hwdev, dma_handle, size,
71650 +                                               direction);
71651 +       flush_write_buffers();
71652 +}
71653 +
71654 +static inline void
71655 +dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
71656 +                             unsigned long offset, size_t size, int direction)
71657 +{
71658 +       if (dma_ops->sync_single_range_for_cpu) {
71659 +               dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
71660 +       }
71661 +
71662 +       flush_write_buffers();
71663 +}
71664 +
71665 +static inline void
71666 +dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
71667 +                                unsigned long offset, size_t size, int direction)
71668 +{
71669 +       if (dma_ops->sync_single_range_for_device)
71670 +               dma_ops->sync_single_range_for_device(hwdev, dma_handle,
71671 +                                                     offset, size, direction);
71672 +
71673 +       flush_write_buffers();
71674 +}
71675 +
71676 +static inline void
71677 +dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
71678 +                   int nelems, int direction)
71679 +{
71680 +       if (dma_ops->sync_sg_for_cpu)
71681 +               dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
71682 +       flush_write_buffers();
71683 +}
71684 +
71685 +static inline void
71686 +dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
71687 +                      int nelems, int direction)
71688 +{
71689 +       if (dma_ops->sync_sg_for_device) {
71690 +               dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
71691 +       }
71692 +
71693 +       flush_write_buffers();
71694 +}
71695 +
71696 +static inline int
71697 +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
71698 +{
71699 +       return dma_ops->map_sg(hwdev, sg, nents, direction);
71700 +}
71701 +
71702 +static inline void
71703 +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
71704 +            int direction)
71705 +{
71706 +       dma_ops->unmap_sg(hwdev, sg, nents, direction);
71707 +}
71708 +
71709 +extern int dma_supported(struct device *hwdev, u64 mask);
71710 +
71711 +/* same for gart, swiotlb, and nommu */
71712 +static inline int dma_get_cache_alignment(void)
71713 +{
71714 +       return boot_cpu_data.x86_clflush_size;
71715 +}
71716 +
71717 +#define dma_is_consistent(h) 1
71718 +
71719 +extern int dma_set_mask(struct device *dev, u64 mask);
71720 +
71721 +static inline void
71722 +dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
71723 +{
71724 +       flush_write_buffers();
71725 +}
71726 +
71727 +extern struct device fallback_dev;
71728 +extern int panic_on_overflow;
71729 +#endif
71730 +
71731 +#endif /* _X8664_DMA_MAPPING_H */
71732 +
71733 +#include <asm-i386/mach-xen/asm/dma-mapping.h>
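Everything between the '#if 0' and its '#endif' above is deliberately dead: the stock x86_64 DMA API is preserved for reference while the live definitions come from the asm-i386 mach-xen header included on the final line. The pattern being kept is a per-platform ops table, one function pointer per DMA entry point; a minimal runnable sketch of that dispatch, with invented names:

    #include <stdio.h>

    typedef unsigned long sketch_dma_addr_t;

    struct sketch_dma_ops {
            sketch_dma_addr_t (*map_single)(void *ptr, unsigned long size, int dir);
    };

    static sketch_dma_addr_t swiotlb_like_map(void *ptr, unsigned long size, int dir)
    {
            (void)dir;
            printf("bounce-buffering %lu bytes\n", size);
            return (sketch_dma_addr_t)ptr;  /* pretend identity mapping */
    }

    static struct sketch_dma_ops ops = { .map_single = swiotlb_like_map };

    int main(void)
    {
            char buf[64];
            ops.map_single(buf, sizeof(buf), 1);
            return 0;
    }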
71734 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/fixmap.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/fixmap.h
71735 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/fixmap.h  1970-01-01 01:00:00.000000000 +0100
71736 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/fixmap.h       2006-06-26 09:51:32.000000000 +0200
71737 @@ -0,0 +1,109 @@
71738 +/*
71739 + * fixmap.h: compile-time virtual memory allocation
71740 + *
71741 + * This file is subject to the terms and conditions of the GNU General Public
71742 + * License.  See the file "COPYING" in the main directory of this archive
71743 + * for more details.
71744 + *
71745 + * Copyright (C) 1998 Ingo Molnar
71746 + */
71747 +
71748 +#ifndef _ASM_FIXMAP_H
71749 +#define _ASM_FIXMAP_H
71750 +
71751 +#include <linux/config.h>
71752 +#include <linux/kernel.h>
71753 +#include <asm/apicdef.h>
71754 +#include <xen/gnttab.h>
71755 +#include <asm/page.h>
71756 +#include <asm/vsyscall.h>
71757 +#include <asm/vsyscall32.h>
71758 +#include <asm/acpi.h>
71759 +
71760 +/*
71761 + * Here we define all the compile-time 'special' virtual
71762 + * addresses. The point is to have a constant address at
71763 + * compile time, but to set the physical address only
71764 + * in the boot process.
71765 + *
71766 + * these 'compile-time allocated' memory buffers are
71767 + * fixed-size 4k pages (or larger if used with an increment
71768 + * higher than 1). Use set_fixmap(idx,phys) to associate
71769 + * physical memory with fixmap indices.
71770 + *
71771 + * TLB entries of such buffers will not be flushed across
71772 + * task switches.
71773 + */
71774 +
71775 +enum fixed_addresses {
71776 +       VSYSCALL_LAST_PAGE,
71777 +       VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
71778 +       VSYSCALL_HPET,
71779 +       FIX_HPET_BASE,
71780 +#ifdef CONFIG_X86_LOCAL_APIC
71781 +       FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
71782 +#endif
71783 +#ifdef CONFIG_X86_IO_APIC
71784 +       FIX_IO_APIC_BASE_0,
71785 +       FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
71786 +#endif
71787 +#ifdef CONFIG_ACPI
71788 +       FIX_ACPI_BEGIN,
71789 +       FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
71790 +#endif
71791 +       FIX_SHARED_INFO,
71792 +#define NR_FIX_ISAMAPS 256
71793 +       FIX_ISAMAP_END,
71794 +       FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
71795 +       __end_of_fixed_addresses
71796 +};
71797 +
71798 +extern void __set_fixmap (enum fixed_addresses idx,
71799 +                                       unsigned long phys, pgprot_t flags);
71800 +
71801 +#define set_fixmap(idx, phys) \
71802 +               __set_fixmap(idx, phys, PAGE_KERNEL)
71803 +/*
71804 + * Some hardware wants to get fixmapped without caching.
71805 + */
71806 +#define set_fixmap_nocache(idx, phys) \
71807 +               __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
71808 +
71809 +#define clear_fixmap(idx) \
71810 +                __set_fixmap(idx, 0, __pgprot(0))
71811 +
71812 +#define FIXADDR_TOP    (VSYSCALL_END-PAGE_SIZE)
71813 +#define FIXADDR_SIZE   (__end_of_fixed_addresses << PAGE_SHIFT)
71814 +#define FIXADDR_START  (FIXADDR_TOP - FIXADDR_SIZE)
71815 +
71816 +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
71817 +#define FIXADDR_USER_START     ((unsigned long)VSYSCALL32_VSYSCALL)
71818 +#define FIXADDR_USER_END       (FIXADDR_USER_START + PAGE_SIZE)
71819 +
71820 +#define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
71821 +
71822 +extern void __this_fixmap_does_not_exist(void);
71823 +
71824 +/*
71825 + * 'index to address' translation. If anyone tries to use the idx
71826 + * directly without translation, we catch the bug with a NULL-dereference
71827 + * kernel oops. Illegal ranges of incoming indices are caught too.
71828 + */
71829 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
71830 +{
71831 +       /*
71832 +        * this branch gets completely eliminated after inlining,
71833 +        * except when someone tries to use fixaddr indices in an
71834 +        * illegal way. (such as mixing up address types or using
71835 +        * out-of-range indices).
71836 +        *
71837 +        * If it doesn't get removed, the linker will complain
71838 +        * loudly with a reasonably clear error message.
71839 +        */
71840 +       if (idx >= __end_of_fixed_addresses)
71841 +               __this_fixmap_does_not_exist();
71842 +
71843 +       return __fix_to_virt(idx);
71844 +}
71845 +
71846 +#endif
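fix_to_virt() above computes the address entirely at compile time and uses the linker as its assertion mechanism: the call to __this_fixmap_does_not_exist() has no definition anywhere, so it only survives, and fails the build, when the compiler cannot prove the index valid. A reduced, runnable model (build with gcc -O2, as the kernel effectively always is; at -O0 the dead branch is kept and the link fails, which is exactly the failure mode the idiom relies on; all names here are illustrative):

    #include <stdio.h>

    extern void this_fixmap_does_not_exist(void);  /* deliberately undefined */

    #define SKETCH_TOP     0xfffff000UL
    #define SKETCH_NR_IDX  8

    static inline unsigned long sketch_fix_to_virt(unsigned int idx)
    {
            if (idx >= SKETCH_NR_IDX)
                    this_fixmap_does_not_exist();  /* link-time assertion */
            return SKETCH_TOP - ((unsigned long)idx << 12);
    }

    int main(void)
    {
            printf("%#lx\n", sketch_fix_to_virt(3));  /* valid: links and runs */
            return 0;
    }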
71847 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/floppy.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/floppy.h
71848 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/floppy.h  1970-01-01 01:00:00.000000000 +0100
71849 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/floppy.h       2006-06-26 09:51:32.000000000 +0200
71850 @@ -0,0 +1,206 @@
71851 +/*
71852 + * Architecture specific parts of the Floppy driver
71853 + *
71854 + * This file is subject to the terms and conditions of the GNU General Public
71855 + * License.  See the file "COPYING" in the main directory of this archive
71856 + * for more details.
71857 + *
71858 + * Copyright (C) 1995
71859 + *
71860 + * Modifications for Xen are Copyright (c) 2004, Keir Fraser.
71861 + */
71862 +#ifndef __ASM_XEN_X86_64_FLOPPY_H
71863 +#define __ASM_XEN_X86_64_FLOPPY_H
71864 +
71865 +#include <linux/vmalloc.h>
71866 +
71867 +/*
71868 + * The DMA channel used by the floppy controller cannot access data at
71869 + * addresses >= 16MB
71870 + *
71871 + * Went back to the 1MB limit, as some people had problems with the floppy
71872 + * driver otherwise. It doesn't matter much for performance anyway, as most
71873 + * floppy accesses go through the track buffer.
71874 + */
71875 +#define _CROSS_64KB(a,s,vdma) \
71876 +(!(vdma) && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
71877 +
71878 +/* XEN: Hit DMA paths on the head. This trick is from asm-m68k/floppy.h. */
71879 +#include <asm/dma.h>
71880 +#undef MAX_DMA_ADDRESS
71881 +#define MAX_DMA_ADDRESS 0
71882 +#define CROSS_64KB(a,s) (0)
71883 +
71884 +#define fd_inb(port)                   inb_p(port)
71885 +#define fd_outb(value,port)            outb_p(value,port)
71886 +
71887 +#define fd_request_dma()        (0)
71888 +#define fd_free_dma()           ((void)0)
71889 +#define fd_enable_irq()         enable_irq(FLOPPY_IRQ)
71890 +#define fd_disable_irq()        disable_irq(FLOPPY_IRQ)
71891 +#define fd_free_irq()          free_irq(FLOPPY_IRQ, NULL)
71892 +#define fd_get_dma_residue()    vdma_get_dma_residue(FLOPPY_DMA)
71893 +/*
71894 + * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from
71895 + * softirq context via motor_off_callback. This is a generic bug that we happen to trigger.
71896 + */
71897 +#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL, get_order(size))
71898 +#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
71899 +#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io)
71900 +
71901 +static int virtual_dma_count;
71902 +static int virtual_dma_residue;
71903 +static char *virtual_dma_addr;
71904 +static int virtual_dma_mode;
71905 +static int doing_pdma;
71906 +
71907 +static irqreturn_t floppy_hardint(int irq, void *dev_id, struct pt_regs * regs)
71908 +{
71909 +       register unsigned char st;
71910 +
71911 +#undef TRACE_FLPY_INT
71912 +
71913 +#ifdef TRACE_FLPY_INT
71914 +       static int calls=0;
71915 +       static int bytes=0;
71916 +       static int dma_wait=0;
71917 +#endif
71918 +       if (!doing_pdma)
71919 +               return floppy_interrupt(irq, dev_id, regs);
71920 +
71921 +#ifdef TRACE_FLPY_INT
71922 +       if(!calls)
71923 +               bytes = virtual_dma_count;
71924 +#endif
71925 +
71926 +       {
71927 +               register int lcount;
71928 +               register char *lptr;
71929 +
71930 +               st = 1;
71931 +               for(lcount=virtual_dma_count, lptr=virtual_dma_addr; 
71932 +                   lcount; lcount--, lptr++) {
71933 +                       st=inb(virtual_dma_port+4) & 0xa0 ;
71934 +                       if(st != 0xa0) 
71935 +                               break;
71936 +                       if(virtual_dma_mode)
71937 +                               outb_p(*lptr, virtual_dma_port+5);
71938 +                       else
71939 +                               *lptr = inb_p(virtual_dma_port+5);
71940 +               }
71941 +               virtual_dma_count = lcount;
71942 +               virtual_dma_addr = lptr;
71943 +               st = inb(virtual_dma_port+4);
71944 +       }
71945 +
71946 +#ifdef TRACE_FLPY_INT
71947 +       calls++;
71948 +#endif
71949 +       if(st == 0x20)
71950 +               return IRQ_HANDLED;
71951 +       if(!(st & 0x20)) {
71952 +               virtual_dma_residue += virtual_dma_count;
71953 +               virtual_dma_count=0;
71954 +#ifdef TRACE_FLPY_INT
71955 +               printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", 
71956 +                      virtual_dma_count, virtual_dma_residue, calls, bytes,
71957 +                      dma_wait);
71958 +               calls = 0;
71959 +               dma_wait=0;
71960 +#endif
71961 +               doing_pdma = 0;
71962 +               floppy_interrupt(irq, dev_id, regs);
71963 +               return IRQ_HANDLED;
71964 +       }
71965 +#ifdef TRACE_FLPY_INT
71966 +       if(!virtual_dma_count)
71967 +               dma_wait++;
71968 +#endif
71969 +       return IRQ_HANDLED;
71970 +}
71971 +
71972 +static void fd_disable_dma(void)
71973 +{
71974 +       doing_pdma = 0;
71975 +       virtual_dma_residue += virtual_dma_count;
71976 +       virtual_dma_count=0;
71977 +}
71978 +
71979 +static int vdma_get_dma_residue(unsigned int dummy)
71980 +{
71981 +       return virtual_dma_count + virtual_dma_residue;
71982 +}
71983 +
71984 +
71985 +static int fd_request_irq(void)
71986 +{
71987 +       return request_irq(FLOPPY_IRQ, floppy_hardint,SA_INTERRUPT,
71988 +                                          "floppy", NULL);
71989 +}
71990 +
71991 +#if 0
71992 +static unsigned long vdma_mem_alloc(unsigned long size)
71993 +{
71994 +       return (unsigned long) vmalloc(size);
71995 +
71996 +}
71997 +
71998 +static void vdma_mem_free(unsigned long addr, unsigned long size)
71999 +{
72000 +       vfree((void *)addr);
72001 +}
72002 +#endif
72003 +
72004 +static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
72005 +{
72006 +       doing_pdma = 1;
72007 +       virtual_dma_port = io;
72008 +       virtual_dma_mode = (mode  == DMA_MODE_WRITE);
72009 +       virtual_dma_addr = addr;
72010 +       virtual_dma_count = size;
72011 +       virtual_dma_residue = 0;
72012 +       return 0;
72013 +}
72014 +
72015 +/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */
72016 +#define FDC1 xen_floppy_init()
72017 +static int FDC2 = -1;
72018 +
72019 +static int xen_floppy_init(void)
72020 +{
72021 +       use_virtual_dma = 1;
72022 +       can_use_virtual_dma = 1;
72023 +       return 0x3f0;
72024 +}
72025 +
72026 +/*
72027 + * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
72028 + * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
72029 + * coincides with another rtc CMOS user.               Paul G.
72030 + */
72031 +#define FLOPPY0_TYPE   ({                              \
72032 +       unsigned long flags;                            \
72033 +       unsigned char val;                              \
72034 +       spin_lock_irqsave(&rtc_lock, flags);            \
72035 +       val = (CMOS_READ(0x10) >> 4) & 15;              \
72036 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
72037 +       val;                                            \
72038 +})
72039 +
72040 +#define FLOPPY1_TYPE   ({                              \
72041 +       unsigned long flags;                            \
72042 +       unsigned char val;                              \
72043 +       spin_lock_irqsave(&rtc_lock, flags);            \
72044 +       val = CMOS_READ(0x10) & 15;                     \
72045 +       spin_unlock_irqrestore(&rtc_lock, flags);       \
72046 +       val;                                            \
72047 +})
72048 +
72049 +#define N_FDC 2
72050 +#define N_DRIVE 8
72051 +
72052 +#define FLOPPY_MOTOR_MASK 0xf0
72053 +
72054 +#define EXTRA_FLOPPY_PARAMS
72055 +
72056 +#endif /* __ASM_XEN_X86_64_FLOPPY_H */
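Because a guest's pseudo-physical pages are meaningless to the ISA DMA controller, this header pins MAX_DMA_ADDRESS to 0 and forces the 'virtual DMA' path: floppy_hardint() above drains the controller by programmed I/O, one byte per status poll, instead of letting hardware DMA touch memory. A toy model of that inner copy loop with the I/O ports faked out (invented names, runnable as-is):

    #include <stdio.h>

    static unsigned char fifo[4] = { 'd', 'a', 't', 'a' };
    static int fifo_pos;

    static unsigned char inb_status(void) { return fifo_pos < 4 ? 0xa0 : 0x20; }
    static unsigned char inb_data(void)   { return fifo[fifo_pos++]; }

    int main(void)
    {
            char buf[4];
            int count = sizeof(buf);
            char *p = buf;

            while (count && inb_status() == 0xa0) {  /* drive has a byte ready */
                    *p++ = inb_data();
                    count--;
            }
            printf("copied %d bytes: %.4s\n", (int)(p - buf), buf);
            return 0;
    }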
72057 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/hw_irq.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/hw_irq.h
72058 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/hw_irq.h  1970-01-01 01:00:00.000000000 +0100
72059 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/hw_irq.h       2006-06-26 09:51:32.000000000 +0200
72060 @@ -0,0 +1,141 @@
72061 +#ifndef _ASM_HW_IRQ_H
72062 +#define _ASM_HW_IRQ_H
72063 +
72064 +/*
72065 + *     linux/include/asm/hw_irq.h
72066 + *
72067 + *     (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
72068 + *
72069 + *     moved some of the old arch/i386/kernel/irq.h to here. VY
72070 + *
72071 + *     IRQ/IPI changes taken from work by Thomas Radke
72072 + *     <tomsoft@informatik.tu-chemnitz.de>
72073 + *
72074 + *     hacked by Andi Kleen for x86-64.
72075 + * 
72076 + *  $Id$
72077 + */
72078 +
72079 +#ifndef __ASSEMBLY__
72080 +#include <linux/config.h>
72081 +#include <asm/atomic.h>
72082 +#include <asm/irq.h>
72083 +#include <linux/profile.h>
72084 +#include <linux/smp.h>
72085 +
72086 +struct hw_interrupt_type;
72087 +#endif
72088 +
72089 +#define NMI_VECTOR             0x02
72090 +/*
72091 + * IDT vectors usable for external interrupt sources start
72092 + * at 0x20:
72093 + */
72094 +#define FIRST_EXTERNAL_VECTOR  0x20
72095 +
72096 +#define IA32_SYSCALL_VECTOR    0x80
72097 +
72098 +
72099 +/*
72100 + * Vectors 0x20-0x2f are used for ISA interrupts.
72101 + */
72102 +
72103 +/*
72104 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
72105 + *
72106 + *  some of the following vectors are 'rare'; they are merged
72107 + *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
72108 + *  TLB, reschedule and local APIC vectors are performance-critical.
72109 + */
72110 +#ifndef CONFIG_XEN
72111 +#define SPURIOUS_APIC_VECTOR   0xff
72112 +#define ERROR_APIC_VECTOR      0xfe
72113 +#define RESCHEDULE_VECTOR      0xfd
72114 +#define CALL_FUNCTION_VECTOR   0xfc
72115 +/* fb free - please don't re-add KDB here because it's useless
72116 +   (hint - think what an NMI bit does to a vector) */
72117 +#define THERMAL_APIC_VECTOR    0xfa
72118 +#define THRESHOLD_APIC_VECTOR   0xf9
72119 +/* f8 free */
72120 +#define INVALIDATE_TLB_VECTOR_END      0xf7
72121 +#define INVALIDATE_TLB_VECTOR_START    0xf0    /* f0-f7 used for TLB flush */
72122 +
72123 +#define NUM_INVALIDATE_TLB_VECTORS     8
72124 +#endif
72125 +
72126 +/*
72127 + * Local APIC timer IRQ vector is on a different priority level,
72128 + * to work around the 'lost local interrupt if more than 2 IRQ
72129 + * sources per level' errata.
72130 + */
72131 +#define LOCAL_TIMER_VECTOR     0xef
72132 +
72133 +/*
72134 + * First APIC vector available to drivers (vectors 0x30-0xee):
72135 + * we start at 0x31 to spread out vectors evenly between priority
72136 + * levels. (0x80 is the syscall vector)
72137 + */
72138 +#define FIRST_DEVICE_VECTOR    0x31
72139 +#define FIRST_SYSTEM_VECTOR    0xef   /* duplicated in irq.h */
72140 +
72141 +
72142 +#ifndef __ASSEMBLY__
72143 +extern u8 irq_vector[NR_IRQ_VECTORS];
72144 +#define IO_APIC_VECTOR(irq)    (irq_vector[irq])
72145 +#define AUTO_ASSIGN            -1
72146 +
72147 +/*
72148 + * Various low-level irq details needed by irq.c, process.c,
72149 + * time.c, io_apic.c and smp.c
72150 + *
72151 + * Interrupt entry/exit code at both C and assembly level
72152 + */
72153 +
72154 +extern void disable_8259A_irq(unsigned int irq);
72155 +extern void enable_8259A_irq(unsigned int irq);
72156 +extern int i8259A_irq_pending(unsigned int irq);
72157 +extern void make_8259A_irq(unsigned int irq);
72158 +extern void init_8259A(int aeoi);
72159 +extern void FASTCALL(send_IPI_self(int vector));
72160 +extern void init_VISWS_APIC_irqs(void);
72161 +extern void setup_IO_APIC(void);
72162 +extern void disable_IO_APIC(void);
72163 +extern void print_IO_APIC(void);
72164 +extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
72165 +extern void send_IPI(int dest, int vector);
72166 +extern void setup_ioapic_dest(void);
72167 +
72168 +extern unsigned long io_apic_irqs;
72169 +
72170 +extern atomic_t irq_err_count;
72171 +extern atomic_t irq_mis_count;
72172 +
72173 +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
72174 +
72175 +#define __STR(x) #x
72176 +#define STR(x) __STR(x)
72177 +
72178 +#include <asm/ptrace.h>
72179 +
72180 +#define IRQ_NAME2(nr) nr##_interrupt(void)
72181 +#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
72182 +
72183 +/*
72184 + *     SMP has a few special interrupts for IPI messages
72185 + */
72186 +
72187 +#define BUILD_IRQ(nr) \
72188 +asmlinkage void IRQ_NAME(nr); \
72189 +__asm__( \
72190 +"\n.p2align\n" \
72191 +"IRQ" #nr "_interrupt:\n\t" \
72192 +       "push $" #nr "-256 ; " \
72193 +       "jmp common_interrupt");
72194 +
72195 +extern void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i);
72196 +
72197 +#define platform_legacy_irq(irq)       ((irq) < 16)
72198 +
72199 +#endif
72200 +
72201 +#endif /* _ASM_HW_IRQ_H */
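
For reference, a sketch of what the BUILD_IRQ() macro above expands to for one vector; the stub name IRQ0x31_interrupt is illustrative and not taken from the patch:

/* Illustrative expansion of BUILD_IRQ(0x31). The emitted stub pushes
 * the vector number biased by -256 (so the common entry code can
 * distinguish external IRQs from CPU exceptions) and jumps to the
 * shared interrupt path.
 */
asmlinkage void IRQ0x31_interrupt(void);
__asm__(
"\n.p2align\n"
"IRQ0x31_interrupt:\n\t"
	"push $0x31-256 ; "
	"jmp common_interrupt");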
72202 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/hypercall.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/hypercall.h
72203 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/hypercall.h       1970-01-01 01:00:00.000000000 +0100
72204 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/hypercall.h    2006-06-26 09:51:32.000000000 +0200
72205 @@ -0,0 +1,343 @@
72206 +/******************************************************************************
72207 + * hypercall.h
72208 + * 
72209 + * Linux-specific hypervisor handling.
72210 + * 
72211 + * Copyright (c) 2002-2004, K A Fraser
72212 + * 
72213 + * 64-bit updates:
72214 + *   Benjamin Liu <benjamin.liu@intel.com>
72215 + *   Jun Nakajima <jun.nakajima@intel.com>
72216 + * 
72217 + * This program is free software; you can redistribute it and/or
72218 + * modify it under the terms of the GNU General Public License version 2
72219 + * as published by the Free Software Foundation; or, when distributed
72220 + * separately from the Linux kernel or incorporated into other
72221 + * software packages, subject to the following license:
72222 + * 
72223 + * Permission is hereby granted, free of charge, to any person obtaining a copy
72224 + * of this source file (the "Software"), to deal in the Software without
72225 + * restriction, including without limitation the rights to use, copy, modify,
72226 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
72227 + * and to permit persons to whom the Software is furnished to do so, subject to
72228 + * the following conditions:
72229 + * 
72230 + * The above copyright notice and this permission notice shall be included in
72231 + * all copies or substantial portions of the Software.
72232 + * 
72233 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
72234 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
72235 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
72236 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
72237 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
72238 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
72239 + * IN THE SOFTWARE.
72240 + */
72241 +
72242 +#ifndef __HYPERCALL_H__
72243 +#define __HYPERCALL_H__
72244 +
72245 +#ifndef __HYPERVISOR_H__
72246 +# error "please don't include this file directly"
72247 +#endif
72248 +
72249 +#define __STR(x) #x
72250 +#define STR(x) __STR(x)
72251 +
72252 +#define _hypercall0(type, name)                        \
72253 +({                                             \
72254 +       long __res;                             \
72255 +       asm volatile (                          \
72256 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
72257 +               : "=a" (__res)                  \
72258 +               :                               \
72259 +               : "memory" );                   \
72260 +       (type)__res;                            \
72261 +})
72262 +
72263 +#define _hypercall1(type, name, a1)                            \
72264 +({                                                             \
72265 +       long __res, __ign1;                                     \
72266 +       asm volatile (                                          \
72267 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
72268 +               : "=a" (__res), "=D" (__ign1)                   \
72269 +               : "1" ((long)(a1))                              \
72270 +               : "memory" );                                   \
72271 +       (type)__res;                                            \
72272 +})
72273 +
72274 +#define _hypercall2(type, name, a1, a2)                                \
72275 +({                                                             \
72276 +       long __res, __ign1, __ign2;                             \
72277 +       asm volatile (                                          \
72278 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
72279 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2)    \
72280 +               : "1" ((long)(a1)), "2" ((long)(a2))            \
72281 +               : "memory" );                                   \
72282 +       (type)__res;                                            \
72283 +})
72284 +
72285 +#define _hypercall3(type, name, a1, a2, a3)                    \
72286 +({                                                             \
72287 +       long __res, __ign1, __ign2, __ign3;                     \
72288 +       asm volatile (                                          \
72289 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
72290 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2),   \
72291 +               "=d" (__ign3)                                   \
72292 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
72293 +               "3" ((long)(a3))                                \
72294 +               : "memory" );                                   \
72295 +       (type)__res;                                            \
72296 +})
72297 +
72298 +#define _hypercall4(type, name, a1, a2, a3, a4)                        \
72299 +({                                                             \
72300 +       long __res, __ign1, __ign2, __ign3;                     \
72301 +       asm volatile (                                          \
72302 +               "movq %7,%%r10; "                               \
72303 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
72304 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2),   \
72305 +               "=d" (__ign3)                                   \
72306 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
72307 +               "3" ((long)(a3)), "g" ((long)(a4))              \
72308 +               : "memory", "r10" );                            \
72309 +       (type)__res;                                            \
72310 +})
72311 +
72312 +#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
72313 +({                                                             \
72314 +       long __res, __ign1, __ign2, __ign3;                     \
72315 +       asm volatile (                                          \
72316 +               "movq %7,%%r10; movq %8,%%r8; "                 \
72317 +               "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
72318 +               : "=a" (__res), "=D" (__ign1), "=S" (__ign2),   \
72319 +               "=d" (__ign3)                                   \
72320 +               : "1" ((long)(a1)), "2" ((long)(a2)),           \
72321 +               "3" ((long)(a3)), "g" ((long)(a4)),             \
72322 +               "g" ((long)(a5))                                \
72323 +               : "memory", "r10", "r8" );                      \
72324 +       (type)__res;                                            \
72325 +})
72326 +
72327 +static inline int
72328 +HYPERVISOR_set_trap_table(
72329 +       trap_info_t *table)
72330 +{
72331 +       return _hypercall1(int, set_trap_table, table);
72332 +}
72333 +
72334 +static inline int
72335 +HYPERVISOR_mmu_update(
72336 +       mmu_update_t *req, int count, int *success_count, domid_t domid)
72337 +{
72338 +       return _hypercall4(int, mmu_update, req, count, success_count, domid);
72339 +}
72340 +
72341 +static inline int
72342 +HYPERVISOR_mmuext_op(
72343 +       struct mmuext_op *op, int count, int *success_count, domid_t domid)
72344 +{
72345 +       return _hypercall4(int, mmuext_op, op, count, success_count, domid);
72346 +}
72347 +
72348 +static inline int
72349 +HYPERVISOR_set_gdt(
72350 +       unsigned long *frame_list, int entries)
72351 +{
72352 +       return _hypercall2(int, set_gdt, frame_list, entries);
72353 +}
72354 +
72355 +static inline int
72356 +HYPERVISOR_stack_switch(
72357 +       unsigned long ss, unsigned long esp)
72358 +{
72359 +       return _hypercall2(int, stack_switch, ss, esp);
72360 +}
72361 +
72362 +static inline int
72363 +HYPERVISOR_set_callbacks(
72364 +       unsigned long event_address, unsigned long failsafe_address, 
72365 +       unsigned long syscall_address)
72366 +{
72367 +       return _hypercall3(int, set_callbacks,
72368 +                          event_address, failsafe_address, syscall_address);
72369 +}
72370 +
72371 +static inline int
72372 +HYPERVISOR_fpu_taskswitch(
72373 +       int set)
72374 +{
72375 +       return _hypercall1(int, fpu_taskswitch, set);
72376 +}
72377 +
72378 +static inline int
72379 +HYPERVISOR_sched_op_compat(
72380 +       int cmd, unsigned long arg)
72381 +{
72382 +       return _hypercall2(int, sched_op_compat, cmd, arg);
72383 +}
72384 +
72385 +static inline int
72386 +HYPERVISOR_sched_op(
72387 +       int cmd, void *arg)
72388 +{
72389 +       return _hypercall2(int, sched_op, cmd, arg);
72390 +}
72391 +
72392 +static inline long
72393 +HYPERVISOR_set_timer_op(
72394 +       u64 timeout)
72395 +{
72396 +       return _hypercall1(long, set_timer_op, timeout);
72397 +}
72398 +
72399 +static inline int
72400 +HYPERVISOR_dom0_op(
72401 +       dom0_op_t *dom0_op)
72402 +{
72403 +       dom0_op->interface_version = DOM0_INTERFACE_VERSION;
72404 +       return _hypercall1(int, dom0_op, dom0_op);
72405 +}
72406 +
72407 +static inline int
72408 +HYPERVISOR_set_debugreg(
72409 +       int reg, unsigned long value)
72410 +{
72411 +       return _hypercall2(int, set_debugreg, reg, value);
72412 +}
72413 +
72414 +static inline unsigned long
72415 +HYPERVISOR_get_debugreg(
72416 +       int reg)
72417 +{
72418 +       return _hypercall1(unsigned long, get_debugreg, reg);
72419 +}
72420 +
72421 +static inline int
72422 +HYPERVISOR_update_descriptor(
72423 +       unsigned long ma, unsigned long word)
72424 +{
72425 +       return _hypercall2(int, update_descriptor, ma, word);
72426 +}
72427 +
72428 +static inline int
72429 +HYPERVISOR_memory_op(
72430 +       unsigned int cmd, void *arg)
72431 +{
72432 +       return _hypercall2(int, memory_op, cmd, arg);
72433 +}
72434 +
72435 +static inline int
72436 +HYPERVISOR_multicall(
72437 +       void *call_list, int nr_calls)
72438 +{
72439 +       return _hypercall2(int, multicall, call_list, nr_calls);
72440 +}
72441 +
72442 +static inline int
72443 +HYPERVISOR_update_va_mapping(
72444 +       unsigned long va, pte_t new_val, unsigned long flags)
72445 +{
72446 +       return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
72447 +}
72448 +
72449 +static inline int
72450 +HYPERVISOR_event_channel_op(
72451 +       void *op)
72452 +{
72453 +       return _hypercall1(int, event_channel_op, op);
72454 +}
72455 +
72456 +static inline int
72457 +HYPERVISOR_xen_version(
72458 +       int cmd, void *arg)
72459 +{
72460 +       return _hypercall2(int, xen_version, cmd, arg);
72461 +}
72462 +
72463 +static inline int
72464 +HYPERVISOR_console_io(
72465 +       int cmd, int count, char *str)
72466 +{
72467 +       return _hypercall3(int, console_io, cmd, count, str);
72468 +}
72469 +
72470 +static inline int
72471 +HYPERVISOR_physdev_op(
72472 +       void *physdev_op)
72473 +{
72474 +       return _hypercall1(int, physdev_op, physdev_op);
72475 +}
72476 +
72477 +static inline int
72478 +HYPERVISOR_grant_table_op(
72479 +       unsigned int cmd, void *uop, unsigned int count)
72480 +{
72481 +       return _hypercall3(int, grant_table_op, cmd, uop, count);
72482 +}
72483 +
72484 +static inline int
72485 +HYPERVISOR_update_va_mapping_otherdomain(
72486 +       unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
72487 +{
72488 +       return _hypercall4(int, update_va_mapping_otherdomain, va,
72489 +                          new_val.pte, flags, domid);
72490 +}
72491 +
72492 +static inline int
72493 +HYPERVISOR_vm_assist(
72494 +       unsigned int cmd, unsigned int type)
72495 +{
72496 +       return _hypercall2(int, vm_assist, cmd, type);
72497 +}
72498 +
72499 +static inline int
72500 +HYPERVISOR_vcpu_op(
72501 +       int cmd, int vcpuid, void *extra_args)
72502 +{
72503 +       return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
72504 +}
72505 +
72506 +static inline int
72507 +HYPERVISOR_set_segment_base(
72508 +       int reg, unsigned long value)
72509 +{
72510 +       return _hypercall2(int, set_segment_base, reg, value);
72511 +}
72512 +
72513 +static inline int
72514 +HYPERVISOR_suspend(
72515 +       unsigned long srec)
72516 +{
72517 +       struct sched_shutdown sched_shutdown = {
72518 +               .reason = SHUTDOWN_suspend
72519 +       };
72520 +
72521 +       int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
72522 +                            &sched_shutdown, srec);
72523 +
72524 +       if (rc == -ENOSYS)
72525 +               rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
72526 +                                SHUTDOWN_suspend, srec);
72527 +
72528 +       return rc;
72529 +}
72530 +
72531 +static inline int
72532 +HYPERVISOR_nmi_op(
72533 +       unsigned long op, void *arg)
72534 +{
72535 +       return _hypercall2(int, nmi_op, op, arg);
72536 +}
72537 +
72538 +#endif /* __HYPERCALL_H__ */
72539 +
72540 +/*
72541 + * Local variables:
72542 + *  c-file-style: "linux"
72543 + *  indent-tabs-mode: t
72544 + *  c-indent-level: 8
72545 + *  c-basic-offset: 8
72546 + *  tab-width: 8
72547 + * End:
72548 + */
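
As a usage sketch: each _hypercallN() above loads its arguments into %rdi, %rsi, %rdx, %r10 and %r8 (the Xen x86_64 hypercall ABI) and calls a fixed 32-byte slot in hypercall_page. Assuming a Xen kernel build with the public interface headers available, a wrapper is then invoked like any C function:

#include <xen/interface/version.h>

/* Query the hypervisor version via HYPERVISOR_xen_version(), which
 * expands to _hypercall2(int, xen_version, cmd, arg). With a NULL
 * argument, XENVER_version returns (major << 16) | minor.
 */
static void example_report_xen_version(void)
{
	int ver = HYPERVISOR_xen_version(XENVER_version, NULL);
	printk("running on Xen %d.%d\n", ver >> 16, ver & 0xffff);
}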
72549 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/hypervisor.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/hypervisor.h
72550 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/hypervisor.h      1970-01-01 01:00:00.000000000 +0100
72551 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/hypervisor.h   2006-06-26 09:51:32.000000000 +0200
72552 @@ -0,0 +1,2 @@
72553 +
72554 +#include <asm-i386/mach-xen/asm/hypervisor.h>
72555 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/io.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/io.h
72556 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/io.h      1970-01-01 01:00:00.000000000 +0100
72557 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/io.h   2006-06-26 09:51:32.000000000 +0200
72558 @@ -0,0 +1,359 @@
72559 +#ifndef _ASM_IO_H
72560 +#define _ASM_IO_H
72561 +
72562 +#include <linux/config.h>
72563 +#include <asm/fixmap.h>
72564 +
72565 +/*
72566 + * This file contains the definitions for the x86 IO instructions
72567 + * inb/inw/inl/outb/outw/outl and the "string versions" of the same
72568 + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
72569 + * versions of the single-IO instructions (inb_p/inw_p/..).
72570 + *
72571 + * This file is not meant to be obfuscating: it's just complicated
72572 + * to (a) handle it all in a way that makes gcc able to optimize it
72573 + * as well as possible and (b) avoid writing the same thing
72574 + * over and over again with slight variations and possibly making a
72575 + * mistake somewhere.
72576 + */
72577 +
72578 +/*
72579 + * Thanks to James van Artsdalen for a better timing-fix than
72580 + * the two short jumps: using outb's to a nonexistent port seems
72581 + * to guarantee better timings even on fast machines.
72582 + *
72583 + * On the other hand, I'd like to be sure of a non-existent port:
72584 + * I feel a bit unsafe about using 0x80 (should be safe, though)
72585 + *
72586 + *             Linus
72587 + */
72588 +
72589 + /*
72590 +  *  Somewhat simplified and optimized by Jan Hubicka
72591 +  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
72592 +  *
72593 +  *  isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
72594 +  *  isa_read[wl] and isa_write[wl] fixed
72595 +  *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
72596 +  */
72597 +
72598 +#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
72599 +
72600 +#ifdef REALLY_SLOW_IO
72601 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
72602 +#else
72603 +#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
72604 +#endif
72605 +
72606 +/*
72607 + * Talk about misusing macros..
72608 + */
72609 +#define __OUT1(s,x) \
72610 +static inline void out##s(unsigned x value, unsigned short port) {
72611 +
72612 +#define __OUT2(s,s1,s2) \
72613 +__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
72614 +
72615 +#define __OUT(s,s1,x) \
72616 +__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
72617 +__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
72618 +
72619 +#define __IN1(s) \
72620 +static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
72621 +
72622 +#define __IN2(s,s1,s2) \
72623 +__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
72624 +
72625 +#define __IN(s,s1,i...) \
72626 +__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
72627 +__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
72628 +
72629 +#define __INS(s) \
72630 +static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
72631 +{ __asm__ __volatile__ ("rep ; ins" #s \
72632 +: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
72633 +
72634 +#define __OUTS(s) \
72635 +static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \
72636 +{ __asm__ __volatile__ ("rep ; outs" #s \
72637 +: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); }
72638 +
72639 +#define RETURN_TYPE unsigned char
72640 +__IN(b,"")
72641 +#undef RETURN_TYPE
72642 +#define RETURN_TYPE unsigned short
72643 +__IN(w,"")
72644 +#undef RETURN_TYPE
72645 +#define RETURN_TYPE unsigned int
72646 +__IN(l,"")
72647 +#undef RETURN_TYPE
72648 +
72649 +__OUT(b,"b",char)
72650 +__OUT(w,"w",short)
72651 +__OUT(l,,int)
72652 +
72653 +__INS(b)
72654 +__INS(w)
72655 +__INS(l)
72656 +
72657 +__OUTS(b)
72658 +__OUTS(w)
72659 +__OUTS(l)
72660 +
72661 +#define IO_SPACE_LIMIT 0xffff
72662 +
72663 +#if defined(__KERNEL__) && __x86_64__
72664 +
72665 +#include <linux/vmalloc.h>
72666 +
72667 +#ifndef __i386__
72668 +/*
72669 + * Change virtual addresses to physical addresses and vv.
72670 + * These are pretty trivial
72671 + */
72672 +static inline unsigned long virt_to_phys(volatile void * address)
72673 +{
72674 +       return __pa(address);
72675 +}
72676 +
72677 +static inline void * phys_to_virt(unsigned long address)
72678 +{
72679 +       return __va(address);
72680 +}
72681 +
72682 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
72683 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
72684 +#endif
72685 +
72686 +/*
72687 + * Change "struct page" to physical address.
72688 + */
72689 +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
72690 +#define page_to_phys(page)      (phys_to_machine(page_to_pseudophys(page)))
72691 +
72692 +#define bio_to_pseudophys(bio)  (page_to_pseudophys(bio_page((bio))) + \
72693 +                                 (unsigned long) bio_offset((bio)))
72694 +#define bvec_to_pseudophys(bv)  (page_to_pseudophys((bv)->bv_page) + \
72695 +                                 (unsigned long) (bv)->bv_offset)
72696 +
72697 +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)      \
72698 +       (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \
72699 +        ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \
72700 +         bvec_to_pseudophys((vec2))))
72701 +
72702 +#include <asm-generic/iomap.h>
72703 +
72704 +extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags);
72705 +
72706 +static inline void __iomem * ioremap (unsigned long offset, unsigned long size)
72707 +{
72708 +       return __ioremap(offset, size, 0);
72709 +}
72710 +
72711 +/*
72712 + * This one maps high-address device memory and turns off caching for that area.
72713 + * It's useful if some control registers are in such an area and write combining
72714 + * or read caching is not desirable:
72715 + */
72716 +extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size);
72717 +extern void iounmap(volatile void __iomem *addr);
72718 +
72719 +/* Use normal IO mappings for DMI */
72720 +#define dmi_ioremap ioremap
72721 +#define dmi_iounmap(x,l) iounmap(x)
72722 +#define dmi_alloc(l) kmalloc(l, GFP_ATOMIC)
72723 +
72724 +/*
72725 + * ISA I/O bus memory addresses are 1:1 with the physical address.
72726 + */
72727 +
72728 +#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x
72729 +#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x
72730 +#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
72731 +
72732 +/*
72733 + * However PCI ones are not necessarily 1:1 and therefore these interfaces
72734 + * are forbidden in portable PCI drivers.
72735 + *
72736 + * Allow them on x86 for legacy drivers, though.
72737 + */
72738 +#define virt_to_bus(_x) phys_to_machine(__pa(_x))
72739 +#define bus_to_virt(_x) __va(machine_to_phys(_x))
72740 +
72741 +/*
72742 + * readX/writeX() are used to access memory mapped devices. On some
72743 + * architectures the memory mapped IO stuff needs to be accessed
72744 + * differently. On the x86 architecture, we just read/write the
72745 + * memory location directly.
72746 + */
72747 +
72748 +static inline __u8 __readb(const volatile void __iomem *addr)
72749 +{
72750 +       return *(__force volatile __u8 *)addr;
72751 +}
72752 +static inline __u16 __readw(const volatile void __iomem *addr)
72753 +{
72754 +       return *(__force volatile __u16 *)addr;
72755 +}
72756 +static inline __u32 __readl(const volatile void __iomem *addr)
72757 +{
72758 +       return *(__force volatile __u32 *)addr;
72759 +}
72760 +static inline __u64 __readq(const volatile void __iomem *addr)
72761 +{
72762 +       return *(__force volatile __u64 *)addr;
72763 +}
72764 +#define readb(x) __readb(x)
72765 +#define readw(x) __readw(x)
72766 +#define readl(x) __readl(x)
72767 +#define readq(x) __readq(x)
72768 +#define readb_relaxed(a) readb(a)
72769 +#define readw_relaxed(a) readw(a)
72770 +#define readl_relaxed(a) readl(a)
72771 +#define readq_relaxed(a) readq(a)
72772 +#define __raw_readb readb
72773 +#define __raw_readw readw
72774 +#define __raw_readl readl
72775 +#define __raw_readq readq
72776 +
72777 +#define mmiowb()
72778 +
72779 +#ifdef CONFIG_UNORDERED_IO
72780 +static inline void __writel(__u32 val, volatile void __iomem *addr)
72781 +{
72782 +       volatile __u32 __iomem *target = addr;
72783 +       asm volatile("movnti %1,%0"
72784 +                    : "=m" (*target)
72785 +                    : "r" (val) : "memory");
72786 +}
72787 +
72788 +static inline void __writeq(__u64 val, volatile void __iomem *addr)
72789 +{
72790 +       volatile __u64 __iomem *target = addr;
72791 +       asm volatile("movnti %1,%0"
72792 +                    : "=m" (*target)
72793 +                    : "r" (val) : "memory");
72794 +}
72795 +#else
72796 +static inline void __writel(__u32 b, volatile void __iomem *addr)
72797 +{
72798 +       *(__force volatile __u32 *)addr = b;
72799 +}
72800 +static inline void __writeq(__u64 b, volatile void __iomem *addr)
72801 +{
72802 +       *(__force volatile __u64 *)addr = b;
72803 +}
72804 +#endif
72805 +static inline void __writeb(__u8 b, volatile void __iomem *addr)
72806 +{
72807 +       *(__force volatile __u8 *)addr = b;
72808 +}
72809 +static inline void __writew(__u16 b, volatile void __iomem *addr)
72810 +{
72811 +       *(__force volatile __u16 *)addr = b;
72812 +}
72813 +#define writeq(val,addr) __writeq((val),(addr))
72814 +#define writel(val,addr) __writel((val),(addr))
72815 +#define writew(val,addr) __writew((val),(addr))
72816 +#define writeb(val,addr) __writeb((val),(addr))
72817 +#define __raw_writeb writeb
72818 +#define __raw_writew writew
72819 +#define __raw_writel writel
72820 +#define __raw_writeq writeq
72821 +
72822 +void __memcpy_fromio(void*,unsigned long,unsigned);
72823 +void __memcpy_toio(unsigned long,const void*,unsigned);
72824 +
72825 +static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len)
72826 +{
72827 +       __memcpy_fromio(to,(unsigned long)from,len);
72828 +}
72829 +static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len)
72830 +{
72831 +       __memcpy_toio((unsigned long)to,from,len);
72832 +}
72833 +
72834 +void memset_io(volatile void __iomem *a, int b, size_t c);
72835 +
72836 +/*
72837 + * ISA space is 'always mapped' on a typical x86 system, no need to
72838 + * explicitly ioremap() it. The fact that the ISA IO space is mapped
72839 + * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
72840 + * are physical addresses. The following constant pointer can be
72841 + * used as the IO-area pointer (it can be iounmapped as well, so the
72842 + * analogy with PCI is quite large):
72843 + */
72844 +#define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN)))
72845 +
72846 +#define isa_readb(a) readb(__ISA_IO_base + (a))
72847 +#define isa_readw(a) readw(__ISA_IO_base + (a))
72848 +#define isa_readl(a) readl(__ISA_IO_base + (a))
72849 +#define isa_writeb(b,a) writeb(b,__ISA_IO_base + (a))
72850 +#define isa_writew(w,a) writew(w,__ISA_IO_base + (a))
72851 +#define isa_writel(l,a) writel(l,__ISA_IO_base + (a))
72852 +#define isa_memset_io(a,b,c)           memset_io(__ISA_IO_base + (a),(b),(c))
72853 +#define isa_memcpy_fromio(a,b,c)       memcpy_fromio((a),__ISA_IO_base + (b),(c))
72854 +#define isa_memcpy_toio(a,b,c)         memcpy_toio(__ISA_IO_base + (a),(b),(c))
72855 +
72856 +
72857 +/*
72858 + * Again, x86-64 does not require memory-IO-specific functions.
72859 + */
72860 +
72861 +#define eth_io_copy_and_sum(a,b,c,d)           eth_copy_and_sum((a),(void *)(b),(c),(d))
72862 +#define isa_eth_io_copy_and_sum(a,b,c,d)       eth_copy_and_sum((a),(void *)(__ISA_IO_base + (b)),(c),(d))
72863 +
72864 +/**
72865 + *     check_signature         -       find BIOS signatures
72866 + *     @io_addr: mmio address to check 
72867 + *     @signature:  signature block
72868 + *     @length: length of signature
72869 + *
72870 + *     Perform a signature comparison with the mmio address io_addr. This
72871 + *     address should have been obtained by ioremap.
72872 + *     Returns 1 on a match.
72873 + */
72874 +
72875 +static inline int check_signature(void __iomem *io_addr,
72876 +       const unsigned char *signature, int length)
72877 +{
72878 +       int retval = 0;
72879 +       do {
72880 +               if (readb(io_addr) != *signature)
72881 +                       goto out;
72882 +               io_addr++;
72883 +               signature++;
72884 +               length--;
72885 +       } while (length);
72886 +       retval = 1;
72887 +out:
72888 +       return retval;
72889 +}
72890 +
72891 +/* Nothing to do */
72892 +
72893 +#define dma_cache_inv(_start,_size)            do { } while (0)
72894 +#define dma_cache_wback(_start,_size)          do { } while (0)
72895 +#define dma_cache_wback_inv(_start,_size)      do { } while (0)
72896 +
72897 +#define flush_write_buffers() 
72898 +
72899 +extern int iommu_bio_merge;
72900 +#define BIO_VMERGE_BOUNDARY iommu_bio_merge
72901 +
72902 +/*
72903 + * Convert a physical pointer to a virtual kernel pointer for /dev/mem
72904 + * access
72905 + */
72906 +#define xlate_dev_mem_ptr(p)   __va(p)
72907 +
72908 +/*
72909 + * Convert a virtual cached pointer to an uncached pointer
72910 + */
72911 +#define xlate_dev_kmem_ptr(p)  p
72912 +
72913 +#endif /* __KERNEL__ */
72914 +
72915 +#define ARCH_HAS_DEV_MEM
72916 +
72917 +#endif
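
The Xen-specific point in this io.h is that bus addresses are machine addresses: virt_to_bus(), bus_to_virt() and page_to_phys() are routed through phys_to_machine()/machine_to_phys() instead of returning the guest's pseudo-physical address. A minimal sketch of the distinction, using only helpers defined above (the buffer and function name are hypothetical):

/* Hypothetical helper: contrast the two address spaces for a kernel
 * buffer. Devices doing DMA must be given the machine address, which
 * is what virt_to_bus() now yields under Xen.
 */
static void example_show_addr_spaces(void *buf)
{
	unsigned long pseudo  = virt_to_phys(buf);  /* guest pseudo-physical */
	unsigned long machine = virt_to_bus(buf);   /* real machine address  */
	printk("pfn %#lx maps to mfn %#lx\n",
	       pseudo >> PAGE_SHIFT, machine >> PAGE_SHIFT);
}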
72918 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/irq.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/irq.h
72919 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/irq.h     1970-01-01 01:00:00.000000000 +0100
72920 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/irq.h  2006-06-26 09:51:32.000000000 +0200
72921 @@ -0,0 +1,39 @@
72922 +#ifndef _ASM_IRQ_H
72923 +#define _ASM_IRQ_H
72924 +
72925 +/*
72926 + *     linux/include/asm/irq.h
72927 + *
72928 + *     (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
72929 + *
72930 + *     IRQ/IPI changes taken from work by Thomas Radke
72931 + *     <tomsoft@informatik.tu-chemnitz.de>
72932 + */
72933 +
72934 +#include <linux/config.h>
72935 +#include <linux/sched.h>
72936 +/* this include comes from the machine-specific directory */
72937 +#include "irq_vectors.h"
72938 +#include <asm/thread_info.h>
72939 +
72940 +static __inline__ int irq_canonicalize(int irq)
72941 +{
72942 +       return ((irq == 2) ? 9 : irq);
72943 +}
72944 +
72945 +#ifdef CONFIG_X86_LOCAL_APIC
72946 +#define ARCH_HAS_NMI_WATCHDOG          /* See include/linux/nmi.h */
72947 +#endif
72948 +
72949 +#define KDB_VECTOR     0xf9
72950 +
72951 +# define irq_ctx_init(cpu) do { } while (0)
72952 +
72953 +#ifdef CONFIG_HOTPLUG_CPU
72954 +#include <linux/cpumask.h>
72955 +extern void fixup_irqs(cpumask_t map);
72956 +#endif
72957 +
72958 +#define __ARCH_HAS_DO_SOFTIRQ 1
72959 +
72960 +#endif /* _ASM_IRQ_H */
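
One detail above worth a usage note: irq_canonicalize() maps IRQ 2 to IRQ 9, because IRQ 2 is the cascade input of the second legacy 8259A PIC and devices wired there are conventionally rerouted to IRQ 9. A sketch with a hypothetical caller:

/* Sketch: canonicalize a legacy IRQ number before using it. */
static int example_canonical_irq(int irq)
{
	return irq_canonicalize(irq);  /* 2 becomes 9; others unchanged */
}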
72961 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/mmu.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/mmu.h
72962 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/mmu.h     1970-01-01 01:00:00.000000000 +0100
72963 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/mmu.h  2006-06-26 09:51:32.000000000 +0200
72964 @@ -0,0 +1,33 @@
72965 +#ifndef __x86_64_MMU_H
72966 +#define __x86_64_MMU_H
72967 +
72968 +#include <linux/spinlock.h>
72969 +#include <asm/semaphore.h>
72970 +
72971 +/*
72972 + * The x86_64 doesn't have an MMU context, but
72973 + * we put the segment information here.
72974 + *
72975 + * cpu_vm_mask is used to optimize ldt flushing.
72976 + */
72977 +typedef struct { 
72978 +       void *ldt;
72979 +       rwlock_t ldtlock; 
72980 +       int size;
72981 +       struct semaphore sem; 
72982 +#ifdef CONFIG_XEN
72983 +       unsigned pinned:1;
72984 +       struct list_head unpinned;
72985 +#endif
72986 +} mm_context_t;
72987 +
72988 +#ifdef CONFIG_XEN
72989 +extern struct list_head mm_unpinned;
72990 +extern spinlock_t mm_unpinned_lock;
72991 +
72992 +/* mm/memory.c:exit_mmap hook */
72993 +extern void _arch_exit_mmap(struct mm_struct *mm);
72994 +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
72995 +#endif
72996 +
72997 +#endif
72998 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/mmu_context.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/mmu_context.h
72999 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/mmu_context.h     1970-01-01 01:00:00.000000000 +0100
73000 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/mmu_context.h  2006-06-26 09:51:32.000000000 +0200
73001 @@ -0,0 +1,134 @@
73002 +#ifndef __X86_64_MMU_CONTEXT_H
73003 +#define __X86_64_MMU_CONTEXT_H
73004 +
73005 +#include <linux/config.h>
73006 +#include <asm/desc.h>
73007 +#include <asm/atomic.h>
73008 +#include <asm/pgalloc.h>
73009 +#include <asm/page.h>
73010 +#include <asm/pda.h>
73011 +#include <asm/pgtable.h>
73012 +#include <asm/tlbflush.h>
73013 +
73014 +/*
73015 + * possibly do the LDT unload here?
73016 + */
73017 +int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
73018 +void destroy_context(struct mm_struct *mm);
73019 +
73020 +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
73021 +{
73022 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
73023 +       if (read_pda(mmu_state) == TLBSTATE_OK) 
73024 +               write_pda(mmu_state, TLBSTATE_LAZY);
73025 +#endif
73026 +}
73027 +
73028 +#define prepare_arch_switch(next)      __prepare_arch_switch()
73029 +
73030 +static inline void __prepare_arch_switch(void)
73031 +{
73032 +       /*
73033 +        * Save away %es, %ds, %fs and %gs. Must happen before reload
73034 +        * of cr3/ldt (i.e., not in __switch_to).
73035 +        */
73036 +       __asm__ __volatile__ (
73037 +               "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
73038 +               : "=m" (current->thread.es),
73039 +                 "=m" (current->thread.ds),
73040 +                 "=m" (current->thread.fsindex),
73041 +                 "=m" (current->thread.gsindex) );
73042 +
73043 +       if (current->thread.ds)
73044 +               __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
73045 +
73046 +       if (current->thread.es)
73047 +               __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
73048 +
73049 +       if (current->thread.fsindex) {
73050 +               __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
73051 +               current->thread.fs = 0;
73052 +       }
73053 +
73054 +       if (current->thread.gsindex) {
73055 +               load_gs_index(0);
73056 +               current->thread.gs = 0;
73057 +       }
73058 +}
73059 +
73060 +extern void mm_pin(struct mm_struct *mm);
73061 +extern void mm_unpin(struct mm_struct *mm);
73062 +void mm_pin_all(void);
73063 +
73064 +static inline void load_cr3(pgd_t *pgd)
73065 +{
73066 +       asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) :
73067 +                    "memory");
73068 +}
73069 +
73070 +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 
73071 +                            struct task_struct *tsk)
73072 +{
73073 +       unsigned cpu = smp_processor_id();
73074 +       struct mmuext_op _op[3], *op = _op;
73075 +
73076 +       if (likely(prev != next)) {
73077 +               if (!next->context.pinned)
73078 +                       mm_pin(next);
73079 +
73080 +               /* stop flush ipis for the previous mm */
73081 +               clear_bit(cpu, &prev->cpu_vm_mask);
73082 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
73083 +               write_pda(mmu_state, TLBSTATE_OK);
73084 +               write_pda(active_mm, next);
73085 +#endif
73086 +               set_bit(cpu, &next->cpu_vm_mask);
73087 +
73088 +               /* load_cr3(next->pgd) */
73089 +               op->cmd = MMUEXT_NEW_BASEPTR;
73090 +               op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT);
73091 +               op++;
73092 +
73093 +               /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */
73094 +               op->cmd = MMUEXT_NEW_USER_BASEPTR;
73095 +               op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT);
73096 +               op++;
73097 +               
73098 +               if (unlikely(next->context.ldt != prev->context.ldt)) {
73099 +                       /* load_LDT_nolock(&next->context, cpu) */
73100 +                       op->cmd = MMUEXT_SET_LDT;
73101 +                       op->arg1.linear_addr = (unsigned long)next->context.ldt;
73102 +                       op->arg2.nr_ents     = next->context.size;
73103 +                       op++;
73104 +               }
73105 +
73106 +               BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
73107 +       }
73108 +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
73109 +       else {
73110 +               write_pda(mmu_state, TLBSTATE_OK);
73111 +               if (read_pda(active_mm) != next)
73112 +                       out_of_line_bug();
73113 +               if(!test_and_set_bit(cpu, &next->cpu_vm_mask)) {
73114 +                       /* We were in lazy TLB mode and leave_mm disabled
73115 +                        * TLB flush IPI delivery. We must reload CR3
73116 +                        * to ensure we do not use freed page tables.
73117 +                        */
73118 +                        load_cr3(next->pgd);
73119 +                        xen_new_user_pt(__pa(__user_pgd(next->pgd)));          
73120 +                       load_LDT_nolock(&next->context, cpu);
73121 +               }
73122 +       }
73123 +#endif
73124 +}
73125 +
73126 +#define deactivate_mm(tsk,mm)  do { \
73127 +       load_gs_index(0); \
73128 +       asm volatile("movl %0,%%fs"::"r"(0));  \
73129 +} while(0)
73130 +
73131 +#define activate_mm(prev, next) do {           \
73132 +       switch_mm((prev),(next),NULL);          \
73133 +} while (0)
73134 +
73135 +#endif
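
Note how switch_mm() above batches up to three MMU extended operations (new kernel base pointer, new user base pointer, and an optional LDT switch) into one HYPERVISOR_mmuext_op() call rather than three separate hypercalls. A reduced sketch of the same batching pattern, under the same interface assumptions (the mfn arguments are hypothetical):

/* Sketch: issue two MMU extended ops with a single hypercall. Each
 * hypercall is a ring transition into the hypervisor, so batching
 * directly cuts context-switch overhead.
 */
static void example_batched_mmuext(unsigned long pgd_mfn,
				   unsigned long user_pgd_mfn)
{
	struct mmuext_op ops[2];

	ops[0].cmd = MMUEXT_NEW_BASEPTR;
	ops[0].arg1.mfn = pgd_mfn;
	ops[1].cmd = MMUEXT_NEW_USER_BASEPTR;
	ops[1].arg1.mfn = user_pgd_mfn;

	BUG_ON(HYPERVISOR_mmuext_op(ops, 2, NULL, DOMID_SELF));
}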
73136 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/msr.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/msr.h
73137 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/msr.h     1970-01-01 01:00:00.000000000 +0100
73138 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/msr.h  2006-06-26 09:51:32.000000000 +0200
73139 @@ -0,0 +1,399 @@
73140 +#ifndef X86_64_MSR_H
73141 +#define X86_64_MSR_H 1
73142 +
73143 +#ifndef __ASSEMBLY__
73144 +/*
73145 + * Access to machine-specific registers (available on 586 and better only)
73146 + * Note: the rd* operations modify the parameters directly (without using
73147 + * pointer indirection); this allows gcc to optimize better.
73148 + */
73149 +
73150 +#define rdmsr(msr,val1,val2) \
73151 +       __asm__ __volatile__("rdmsr" \
73152 +                           : "=a" (val1), "=d" (val2) \
73153 +                           : "c" (msr))
73154 +
73155 +
73156 +#define rdmsrl(msr,val) do { unsigned long a__,b__; \
73157 +       __asm__ __volatile__("rdmsr" \
73158 +                           : "=a" (a__), "=d" (b__) \
73159 +                           : "c" (msr)); \
73160 +       val = a__ | (b__<<32); \
73161 +} while(0)
73162 +
73163 +#define wrmsr(msr,val1,val2) \
73164 +     __asm__ __volatile__("wrmsr" \
73165 +                         : /* no outputs */ \
73166 +                         : "c" (msr), "a" (val1), "d" (val2))
73167 +
73168 +#define wrmsrl(msr,val) wrmsr(msr,(__u32)((__u64)(val)),((__u64)(val))>>32) 
73169 +
73170 +/* wrmsr with exception handling */
73171 +#define wrmsr_safe(msr,a,b) ({ int ret__;                      \
73172 +       asm volatile("2: wrmsr ; xorl %0,%0\n"                  \
73173 +                    "1:\n\t"                                   \
73174 +                    ".section .fixup,\"ax\"\n\t"               \
73175 +                    "3:  movl %4,%0 ; jmp 1b\n\t"              \
73176 +                    ".previous\n\t"                            \
73177 +                    ".section __ex_table,\"a\"\n"              \
73178 +                    "   .align 8\n\t"                          \
73179 +                    "   .quad  2b,3b\n\t"                      \
73180 +                    ".previous"                                \
73181 +                    : "=a" (ret__)                             \
73182 +                    : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT)); \
73183 +       ret__; })
73184 +
73185 +#define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32))
73186 +
73187 +#define rdmsr_safe(msr,a,b) \
73188 +       ({ int ret__;                                           \
73189 +         asm volatile ("1:       rdmsr\n"                      \
73190 +                      "2:\n"                                   \
73191 +                      ".section .fixup,\"ax\"\n"               \
73192 +                      "3:       movl %4,%0\n"                  \
73193 +                      " jmp 2b\n"                              \
73194 +                      ".previous\n"                            \
73195 +                      ".section __ex_table,\"a\"\n"            \
73196 +                      " .align 8\n"                            \
73197 +                      " .quad 1b,3b\n"                         \
73198 +                      ".previous":"=&bDS" (ret__), "=a"(*(a)), "=d"(*(b))\
73199 +                      :"c"(msr), "i"(-EIO), "0"(0));           \
73200 +         ret__; })             
73201 +
73202 +#define rdtsc(low,high) \
73203 +     __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
73204 +
73205 +#define rdtscl(low) \
73206 +     __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
73207 +
73208 +#define rdtscll(val) do { \
73209 +     unsigned int __a,__d; \
73210 +     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
73211 +     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
73212 +} while(0)
73213 +
73214 +#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
73215 +
73216 +#define rdpmc(counter,low,high) \
73217 +     __asm__ __volatile__("rdpmc" \
73218 +                         : "=a" (low), "=d" (high) \
73219 +                         : "c" (counter))
73220 +
73221 +static inline void cpuid(int op, unsigned int *eax, unsigned int *ebx,
73222 +                        unsigned int *ecx, unsigned int *edx)
73223 +{
73224 +       __asm__(XEN_CPUID
73225 +               : "=a" (*eax),
73226 +                 "=b" (*ebx),
73227 +                 "=c" (*ecx),
73228 +                 "=d" (*edx)
73229 +               : "0" (op));
73230 +}
73231 +
73232 +/* Some CPUID calls want 'count' to be placed in ecx */
73233 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
73234 +               int *edx)
73235 +{
73236 +       __asm__(XEN_CPUID
73237 +               : "=a" (*eax),
73238 +                 "=b" (*ebx),
73239 +                 "=c" (*ecx),
73240 +                 "=d" (*edx)
73241 +               : "0" (op), "c" (count));
73242 +}
73243 +
73244 +/*
73245 + * CPUID functions returning a single datum
73246 + */
73247 +static inline unsigned int cpuid_eax(unsigned int op)
73248 +{
73249 +       unsigned int eax;
73250 +
73251 +       __asm__(XEN_CPUID
73252 +               : "=a" (eax)
73253 +               : "0" (op)
73254 +               : "bx", "cx", "dx");
73255 +       return eax;
73256 +}
73257 +static inline unsigned int cpuid_ebx(unsigned int op)
73258 +{
73259 +       unsigned int eax, ebx;
73260 +
73261 +       __asm__(XEN_CPUID
73262 +               : "=a" (eax), "=b" (ebx)
73263 +               : "0" (op)
73264 +               : "cx", "dx" );
73265 +       return ebx;
73266 +}
73267 +static inline unsigned int cpuid_ecx(unsigned int op)
73268 +{
73269 +       unsigned int eax, ecx;
73270 +
73271 +       __asm__(XEN_CPUID
73272 +               : "=a" (eax), "=c" (ecx)
73273 +               : "0" (op)
73274 +               : "bx", "dx" );
73275 +       return ecx;
73276 +}
73277 +static inline unsigned int cpuid_edx(unsigned int op)
73278 +{
73279 +       unsigned int eax, edx;
73280 +
73281 +       __asm__(XEN_CPUID
73282 +               : "=a" (eax), "=d" (edx)
73283 +               : "0" (op)
73284 +               : "bx", "cx");
73285 +       return edx;
73286 +}
73287 +
73288 +#define MSR_IA32_UCODE_WRITE           0x79
73289 +#define MSR_IA32_UCODE_REV             0x8b
73290 +
73291 +
73292 +#endif
73293 +
73294 +/* AMD/K8 specific MSRs */ 
73295 +#define MSR_EFER 0xc0000080            /* extended feature register */
73296 +#define MSR_STAR 0xc0000081            /* legacy mode SYSCALL target */
73297 +#define MSR_LSTAR 0xc0000082           /* long mode SYSCALL target */
73298 +#define MSR_CSTAR 0xc0000083           /* compatibility mode SYSCALL target */
73299 +#define MSR_SYSCALL_MASK 0xc0000084    /* EFLAGS mask for syscall */
73300 +#define MSR_FS_BASE 0xc0000100         /* 64bit FS base */
73301 +#define MSR_GS_BASE 0xc0000101         /* 64bit GS base */
73302 +#define MSR_KERNEL_GS_BASE  0xc0000102 /* SwapGS GS shadow (or USER_GS from kernel) */ 
73303 +/* EFER bits: */ 
73304 +#define _EFER_SCE 0  /* SYSCALL/SYSRET */
73305 +#define _EFER_LME 8  /* Long mode enable */
73306 +#define _EFER_LMA 10 /* Long mode active (read-only) */
73307 +#define _EFER_NX 11  /* No execute enable */
73308 +
73309 +#define EFER_SCE (1<<_EFER_SCE)
73310 +#define EFER_LME (1<<_EFER_LME)
73311 +#define EFER_LMA (1<<_EFER_LMA)
73312 +#define EFER_NX (1<<_EFER_NX)
73313 +
73314 +/* Intel MSRs. Some also available on other CPUs */
73315 +#define MSR_IA32_TSC           0x10
73316 +#define MSR_IA32_PLATFORM_ID   0x17
73317 +
73318 +#define MSR_IA32_PERFCTR0      0xc1
73319 +#define MSR_IA32_PERFCTR1      0xc2
73320 +
73321 +#define MSR_MTRRcap            0x0fe
73322 +#define MSR_IA32_BBL_CR_CTL        0x119
73323 +
73324 +#define MSR_IA32_SYSENTER_CS   0x174
73325 +#define MSR_IA32_SYSENTER_ESP  0x175
73326 +#define MSR_IA32_SYSENTER_EIP  0x176
73327 +
73328 +#define MSR_IA32_MCG_CAP       0x179
73329 +#define MSR_IA32_MCG_STATUS        0x17a
73330 +#define MSR_IA32_MCG_CTL       0x17b
73331 +
73332 +#define MSR_IA32_EVNTSEL0      0x186
73333 +#define MSR_IA32_EVNTSEL1      0x187
73334 +
73335 +#define MSR_IA32_DEBUGCTLMSR       0x1d9
73336 +#define MSR_IA32_LASTBRANCHFROMIP  0x1db
73337 +#define MSR_IA32_LASTBRANCHTOIP        0x1dc
73338 +#define MSR_IA32_LASTINTFROMIP     0x1dd
73339 +#define MSR_IA32_LASTINTTOIP       0x1de
73340 +
73341 +#define MSR_MTRRfix64K_00000   0x250
73342 +#define MSR_MTRRfix16K_80000   0x258
73343 +#define MSR_MTRRfix16K_A0000   0x259
73344 +#define MSR_MTRRfix4K_C0000    0x268
73345 +#define MSR_MTRRfix4K_C8000    0x269
73346 +#define MSR_MTRRfix4K_D0000    0x26a
73347 +#define MSR_MTRRfix4K_D8000    0x26b
73348 +#define MSR_MTRRfix4K_E0000    0x26c
73349 +#define MSR_MTRRfix4K_E8000    0x26d
73350 +#define MSR_MTRRfix4K_F0000    0x26e
73351 +#define MSR_MTRRfix4K_F8000    0x26f
73352 +#define MSR_MTRRdefType                0x2ff
73353 +
73354 +#define MSR_IA32_MC0_CTL       0x400
73355 +#define MSR_IA32_MC0_STATUS        0x401
73356 +#define MSR_IA32_MC0_ADDR      0x402
73357 +#define MSR_IA32_MC0_MISC      0x403
73358 +
73359 +#define MSR_P6_PERFCTR0                        0xc1
73360 +#define MSR_P6_PERFCTR1                        0xc2
73361 +#define MSR_P6_EVNTSEL0                        0x186
73362 +#define MSR_P6_EVNTSEL1                        0x187
73363 +
73364 +/* K7/K8 MSRs. Not complete. See the architecture manual for a more complete list. */
73365 +#define MSR_K7_EVNTSEL0            0xC0010000
73366 +#define MSR_K7_PERFCTR0            0xC0010004
73367 +#define MSR_K7_EVNTSEL1            0xC0010001
73368 +#define MSR_K7_PERFCTR1            0xC0010005
73369 +#define MSR_K7_EVNTSEL2            0xC0010002
73370 +#define MSR_K7_PERFCTR2            0xC0010006
73371 +#define MSR_K7_EVNTSEL3            0xC0010003
73372 +#define MSR_K7_PERFCTR3            0xC0010007
73373 +#define MSR_K8_TOP_MEM1                   0xC001001A
73374 +#define MSR_K8_TOP_MEM2                   0xC001001D
73375 +#define MSR_K8_SYSCFG             0xC0010010
73376 +#define MSR_K8_HWCR               0xC0010015
73377 +
73378 +/* K6 MSRs */
73379 +#define MSR_K6_EFER                    0xC0000080
73380 +#define MSR_K6_STAR                    0xC0000081
73381 +#define MSR_K6_WHCR                    0xC0000082
73382 +#define MSR_K6_UWCCR                   0xC0000085
73383 +#define MSR_K6_PSOR                    0xC0000087
73384 +#define MSR_K6_PFIR                    0xC0000088
73385 +
73386 +/* Centaur-Hauls/IDT defined MSRs. */
73387 +#define MSR_IDT_FCR1                   0x107
73388 +#define MSR_IDT_FCR2                   0x108
73389 +#define MSR_IDT_FCR3                   0x109
73390 +#define MSR_IDT_FCR4                   0x10a
73391 +
73392 +#define MSR_IDT_MCR0                   0x110
73393 +#define MSR_IDT_MCR1                   0x111
73394 +#define MSR_IDT_MCR2                   0x112
73395 +#define MSR_IDT_MCR3                   0x113
73396 +#define MSR_IDT_MCR4                   0x114
73397 +#define MSR_IDT_MCR5                   0x115
73398 +#define MSR_IDT_MCR6                   0x116
73399 +#define MSR_IDT_MCR7                   0x117
73400 +#define MSR_IDT_MCR_CTRL               0x120
73401 +
73402 +/* VIA Cyrix defined MSRs*/
73403 +#define MSR_VIA_FCR                    0x1107
73404 +#define MSR_VIA_LONGHAUL               0x110a
73405 +#define MSR_VIA_RNG                    0x110b
73406 +#define MSR_VIA_BCR2                   0x1147
73407 +
73408 +/* Intel defined MSRs. */
73409 +#define MSR_IA32_P5_MC_ADDR            0
73410 +#define MSR_IA32_P5_MC_TYPE            1
73411 +#define MSR_IA32_PLATFORM_ID           0x17
73412 +#define MSR_IA32_EBL_CR_POWERON                0x2a
73413 +
73414 +#define MSR_IA32_APICBASE               0x1b
73415 +#define MSR_IA32_APICBASE_BSP           (1<<8)
73416 +#define MSR_IA32_APICBASE_ENABLE        (1<<11)
73417 +#define MSR_IA32_APICBASE_BASE          (0xfffff<<12)
73418 +
73419 +/* P4/Xeon+ specific */
73420 +#define MSR_IA32_MCG_EAX               0x180
73421 +#define MSR_IA32_MCG_EBX               0x181
73422 +#define MSR_IA32_MCG_ECX               0x182
73423 +#define MSR_IA32_MCG_EDX               0x183
73424 +#define MSR_IA32_MCG_ESI               0x184
73425 +#define MSR_IA32_MCG_EDI               0x185
73426 +#define MSR_IA32_MCG_EBP               0x186
73427 +#define MSR_IA32_MCG_ESP               0x187
73428 +#define MSR_IA32_MCG_EFLAGS            0x188
73429 +#define MSR_IA32_MCG_EIP               0x189
73430 +#define MSR_IA32_MCG_RESERVED          0x18A
73431 +
73432 +#define MSR_P6_EVNTSEL0                        0x186
73433 +#define MSR_P6_EVNTSEL1                        0x187
73434 +
73435 +#define MSR_IA32_PERF_STATUS           0x198
73436 +#define MSR_IA32_PERF_CTL              0x199
73437 +
73438 +#define MSR_IA32_THERM_CONTROL         0x19a
73439 +#define MSR_IA32_THERM_INTERRUPT       0x19b
73440 +#define MSR_IA32_THERM_STATUS          0x19c
73441 +#define MSR_IA32_MISC_ENABLE           0x1a0
73442 +
73443 +#define MSR_IA32_DEBUGCTLMSR           0x1d9
73444 +#define MSR_IA32_LASTBRANCHFROMIP      0x1db
73445 +#define MSR_IA32_LASTBRANCHTOIP                0x1dc
73446 +#define MSR_IA32_LASTINTFROMIP         0x1dd
73447 +#define MSR_IA32_LASTINTTOIP           0x1de
73448 +
73449 +#define MSR_IA32_MC0_CTL               0x400
73450 +#define MSR_IA32_MC0_STATUS            0x401
73451 +#define MSR_IA32_MC0_ADDR              0x402
73452 +#define MSR_IA32_MC0_MISC              0x403
73453 +
73454 +/* Pentium IV performance counter MSRs */
73455 +#define MSR_P4_BPU_PERFCTR0            0x300
73456 +#define MSR_P4_BPU_PERFCTR1            0x301
73457 +#define MSR_P4_BPU_PERFCTR2            0x302
73458 +#define MSR_P4_BPU_PERFCTR3            0x303
73459 +#define MSR_P4_MS_PERFCTR0             0x304
73460 +#define MSR_P4_MS_PERFCTR1             0x305
73461 +#define MSR_P4_MS_PERFCTR2             0x306
73462 +#define MSR_P4_MS_PERFCTR3             0x307
73463 +#define MSR_P4_FLAME_PERFCTR0          0x308
73464 +#define MSR_P4_FLAME_PERFCTR1          0x309
73465 +#define MSR_P4_FLAME_PERFCTR2          0x30a
73466 +#define MSR_P4_FLAME_PERFCTR3          0x30b
73467 +#define MSR_P4_IQ_PERFCTR0             0x30c
73468 +#define MSR_P4_IQ_PERFCTR1             0x30d
73469 +#define MSR_P4_IQ_PERFCTR2             0x30e
73470 +#define MSR_P4_IQ_PERFCTR3             0x30f
73471 +#define MSR_P4_IQ_PERFCTR4             0x310
73472 +#define MSR_P4_IQ_PERFCTR5             0x311
73473 +#define MSR_P4_BPU_CCCR0               0x360
73474 +#define MSR_P4_BPU_CCCR1               0x361
73475 +#define MSR_P4_BPU_CCCR2               0x362
73476 +#define MSR_P4_BPU_CCCR3               0x363
73477 +#define MSR_P4_MS_CCCR0                0x364
73478 +#define MSR_P4_MS_CCCR1                0x365
73479 +#define MSR_P4_MS_CCCR2                0x366
73480 +#define MSR_P4_MS_CCCR3                0x367
73481 +#define MSR_P4_FLAME_CCCR0             0x368
73482 +#define MSR_P4_FLAME_CCCR1             0x369
73483 +#define MSR_P4_FLAME_CCCR2             0x36a
73484 +#define MSR_P4_FLAME_CCCR3             0x36b
73485 +#define MSR_P4_IQ_CCCR0                0x36c
73486 +#define MSR_P4_IQ_CCCR1                0x36d
73487 +#define MSR_P4_IQ_CCCR2                0x36e
73488 +#define MSR_P4_IQ_CCCR3                0x36f
73489 +#define MSR_P4_IQ_CCCR4                0x370
73490 +#define MSR_P4_IQ_CCCR5                0x371
73491 +#define MSR_P4_ALF_ESCR0               0x3ca
73492 +#define MSR_P4_ALF_ESCR1               0x3cb
73493 +#define MSR_P4_BPU_ESCR0               0x3b2
73494 +#define MSR_P4_BPU_ESCR1               0x3b3
73495 +#define MSR_P4_BSU_ESCR0               0x3a0
73496 +#define MSR_P4_BSU_ESCR1               0x3a1
73497 +#define MSR_P4_CRU_ESCR0               0x3b8
73498 +#define MSR_P4_CRU_ESCR1               0x3b9
73499 +#define MSR_P4_CRU_ESCR2               0x3cc
73500 +#define MSR_P4_CRU_ESCR3               0x3cd
73501 +#define MSR_P4_CRU_ESCR4               0x3e0
73502 +#define MSR_P4_CRU_ESCR5               0x3e1
73503 +#define MSR_P4_DAC_ESCR0               0x3a8
73504 +#define MSR_P4_DAC_ESCR1               0x3a9
73505 +#define MSR_P4_FIRM_ESCR0              0x3a4
73506 +#define MSR_P4_FIRM_ESCR1              0x3a5
73507 +#define MSR_P4_FLAME_ESCR0             0x3a6
73508 +#define MSR_P4_FLAME_ESCR1             0x3a7
73509 +#define MSR_P4_FSB_ESCR0               0x3a2
73510 +#define MSR_P4_FSB_ESCR1               0x3a3
73511 +#define MSR_P4_IQ_ESCR0                0x3ba
73512 +#define MSR_P4_IQ_ESCR1                0x3bb
73513 +#define MSR_P4_IS_ESCR0                0x3b4
73514 +#define MSR_P4_IS_ESCR1                0x3b5
73515 +#define MSR_P4_ITLB_ESCR0              0x3b6
73516 +#define MSR_P4_ITLB_ESCR1              0x3b7
73517 +#define MSR_P4_IX_ESCR0                0x3c8
73518 +#define MSR_P4_IX_ESCR1                0x3c9
73519 +#define MSR_P4_MOB_ESCR0               0x3aa
73520 +#define MSR_P4_MOB_ESCR1               0x3ab
73521 +#define MSR_P4_MS_ESCR0                0x3c0
73522 +#define MSR_P4_MS_ESCR1                0x3c1
73523 +#define MSR_P4_PMH_ESCR0               0x3ac
73524 +#define MSR_P4_PMH_ESCR1               0x3ad
73525 +#define MSR_P4_RAT_ESCR0               0x3bc
73526 +#define MSR_P4_RAT_ESCR1               0x3bd
73527 +#define MSR_P4_SAAT_ESCR0              0x3ae
73528 +#define MSR_P4_SAAT_ESCR1              0x3af
73529 +#define MSR_P4_SSU_ESCR0               0x3be
73530 +#define MSR_P4_SSU_ESCR1               0x3bf    /* guess: not defined in manual */
73531 +#define MSR_P4_TBPU_ESCR0              0x3c2
73532 +#define MSR_P4_TBPU_ESCR1              0x3c3
73533 +#define MSR_P4_TC_ESCR0                0x3c4
73534 +#define MSR_P4_TC_ESCR1                0x3c5
73535 +#define MSR_P4_U2L_ESCR0               0x3b0
73536 +#define MSR_P4_U2L_ESCR1               0x3b1
73537 +
73538 +#endif
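
A short usage sketch of the accessors defined above (MSR_EFER and EFER_NX come from this header; the surrounding function is hypothetical and assumes ring-0, or Xen-mediated, MSR access):

/* Sketch: read EFER with rdmsrl(), which joins %edx:%eax into one
 * 64-bit value, and write it back with wrmsrl(), which splits it.
 */
static void example_enable_nx(void)
{
	unsigned long efer;

	rdmsrl(MSR_EFER, efer);
	if (!(efer & EFER_NX))
		wrmsrl(MSR_EFER, efer | EFER_NX);  /* set no-execute enable */
}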
73539 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/nmi.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/nmi.h
73540 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/nmi.h     1970-01-01 01:00:00.000000000 +0100
73541 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/nmi.h  2006-06-26 09:51:32.000000000 +0200
73542 @@ -0,0 +1,75 @@
73543 +/*
73544 + *  linux/include/asm-i386/nmi.h
73545 + */
73546 +#ifndef ASM_NMI_H
73547 +#define ASM_NMI_H
73548 +
73549 +#include <linux/pm.h>
73550 +
73551 +#include <xen/interface/nmi.h>
73552 +
73553 +struct pt_regs;
73554 +
73555 +typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
73556 +
73557 +/** 
73558 + * set_nmi_callback
73559 + *
73560 + * Set a handler for an NMI. Only one handler may be
73561 + * set. The handler should return 1 if it handled the NMI.
73562 + */
73563 +void set_nmi_callback(nmi_callback_t callback);
73564 +
73565 +/** 
73566 + * unset_nmi_callback
73567 + *
73568 + * Remove the handler previously set.
73569 + */
73570 +void unset_nmi_callback(void);
73571 +
73572 +#ifdef CONFIG_PM
73573 +
73574 +/** Replace the PM callback routine for NMI. */
73575 +struct pm_dev * set_nmi_pm_callback(pm_callback callback);
73576 +
73577 +/** Unset the PM callback routine back to the default. */
73578 +void unset_nmi_pm_callback(struct pm_dev * dev);
73579 +
73580 +#else
73581 +
73582 +static inline struct pm_dev * set_nmi_pm_callback(pm_callback callback)
73583 +{
73584 +       return 0;
73585 +} 
73586 +
73587 +static inline void unset_nmi_pm_callback(struct pm_dev * dev)
73588 +{
73589 +}
73590 +
73591 +#endif /* CONFIG_PM */
73592 +
73593 +extern void default_do_nmi(struct pt_regs *);
73594 +extern void die_nmi(char *str, struct pt_regs *regs);
73595 +
73596 +static inline unsigned char get_nmi_reason(void)
73597 +{
73598 +        shared_info_t *s = HYPERVISOR_shared_info;
73599 +        unsigned char reason = 0;
73600 +
73601 +        /* construct a value which looks like it came from
73602 +         * port 0x61.
73603 +         */
73604 +        if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
73605 +                reason |= 0x40;
73606 +        if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
73607 +                reason |= 0x80;
73608 +
73609 +        return reason;
73610 +}
73611 +
73612 +extern int panic_on_timeout;
73613 +extern int unknown_nmi_panic;
73614 +
73615 +extern int check_nmi_watchdog(void);
73616 +
73617 +#endif /* ASM_NMI_H */
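
The get_nmi_reason() helper above folds Xen's shared-info NMI reason bits into a byte shaped like ISA port 0x61, so generic NMI code can keep testing the legacy SERR (0x40) and IOCHK/parity (0x80) bits. A minimal userspace sketch of that construction; the bit indices here are illustrative stand-ins for the _XEN_NMIREASON_* constants defined in xen/interface/nmi.h:

/* Hedged model of get_nmi_reason(): fold per-reason bits into a
 * port-0x61-style status byte.  The bit indices are assumptions, not
 * the real values from xen/interface/nmi.h. */
#include <stdio.h>

#define NMIREASON_io_error     0   /* assumed bit index */
#define NMIREASON_parity_error 1   /* assumed bit index */

static unsigned char fake_get_nmi_reason(unsigned long nmi_reason)
{
	unsigned char reason = 0;

	if (nmi_reason & (1UL << NMIREASON_io_error))
		reason |= 0x40;    /* bus/SERR error, as on port 0x61 */
	if (nmi_reason & (1UL << NMIREASON_parity_error))
		reason |= 0x80;    /* memory parity / IOCHK error */
	return reason;
}

int main(void)
{
	printf("io_error only -> 0x%02x\n",
	       fake_get_nmi_reason(1UL << NMIREASON_io_error));
	printf("both errors   -> 0x%02x\n",
	       fake_get_nmi_reason((1UL << NMIREASON_io_error) |
	                           (1UL << NMIREASON_parity_error)));
	return 0;
}
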
73618 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/page.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/page.h
73619 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/page.h    1970-01-01 01:00:00.000000000 +0100
73620 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/page.h 2006-06-26 09:51:32.000000000 +0200
73621 @@ -0,0 +1,318 @@
73622 +#ifndef _X86_64_PAGE_H
73623 +#define _X86_64_PAGE_H
73624 +
73625 +#include <linux/config.h>
73626 +/* #include <linux/string.h> */
73627 +#ifndef __ASSEMBLY__
73628 +#include <linux/kernel.h>
73629 +#include <linux/types.h>
73630 +#include <asm/bug.h>
73631 +#include <xen/features.h>
73632 +#endif
73633 +#include <xen/interface/xen.h> 
73634 +#include <xen/foreign_page.h>
73635 +
73636 +#define arch_free_page(_page,_order)                   \
73637 +({     int foreign = PageForeign(_page);               \
73638 +       if (foreign)                                    \
73639 +               (PageForeignDestructor(_page))(_page);  \
73640 +       foreign;                                        \
73641 +})
73642 +#define HAVE_ARCH_FREE_PAGE
73643 +
73644 +#ifdef CONFIG_XEN_SCRUB_PAGES
73645 +#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT)
73646 +#else
73647 +#define scrub_pages(_p,_n) ((void)0)
73648 +#endif
73649 +
73650 +/* PAGE_SHIFT determines the page size */
73651 +#define PAGE_SHIFT     12
73652 +#ifdef __ASSEMBLY__
73653 +#define PAGE_SIZE      (0x1 << PAGE_SHIFT)
73654 +#else
73655 +#define PAGE_SIZE      (1UL << PAGE_SHIFT)
73656 +#endif
73657 +#define PAGE_MASK      (~(PAGE_SIZE-1))
73658 +#define PHYSICAL_PAGE_MASK     (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
73659 +
73660 +#define THREAD_ORDER 1 
73661 +#define THREAD_SIZE  (PAGE_SIZE << THREAD_ORDER)
73662 +#define CURRENT_MASK (~(THREAD_SIZE-1))
73663 +
73664 +#define EXCEPTION_STACK_ORDER 0
73665 +#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
73666 +
73667 +#define DEBUG_STACK_ORDER EXCEPTION_STACK_ORDER
73668 +#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
73669 +
73670 +#define IRQSTACK_ORDER 2
73671 +#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
73672 +
73673 +#define STACKFAULT_STACK 1
73674 +#define DOUBLEFAULT_STACK 2
73675 +#define NMI_STACK 3
73676 +#define DEBUG_STACK 4
73677 +#define MCE_STACK 5
73678 +#define N_EXCEPTION_STACKS 5  /* hw limit: 7 */
73679 +
73680 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
73681 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
73682 +
73683 +#define HPAGE_SHIFT PMD_SHIFT
73684 +#define HPAGE_SIZE     ((1UL) << HPAGE_SHIFT)
73685 +#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
73686 +#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
73687 +
73688 +#ifdef __KERNEL__
73689 +#ifndef __ASSEMBLY__
73690 +
73691 +extern unsigned long end_pfn;
73692 +
73693 +void clear_page(void *);
73694 +void copy_page(void *, void *);
73695 +
73696 +#define clear_user_page(page, vaddr, pg)       clear_page(page)
73697 +#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
73698 +
73699 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
73700 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
73701 +
73702 +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
73703 +#define INVALID_P2M_ENTRY      (~0UL)
73704 +#define FOREIGN_FRAME_BIT      (1UL<<63)
73705 +#define FOREIGN_FRAME(m)       ((m) | FOREIGN_FRAME_BIT)
73706 +
73707 +extern unsigned long *phys_to_machine_mapping;
73708 +
73709 +static inline unsigned long pfn_to_mfn(unsigned long pfn)
73710 +{
73711 +       if (xen_feature(XENFEAT_auto_translated_physmap))
73712 +               return pfn;
73713 +       return phys_to_machine_mapping[(unsigned int)(pfn)] &
73714 +               ~FOREIGN_FRAME_BIT;
73715 +}
73716 +
73717 +static inline int phys_to_machine_mapping_valid(unsigned long pfn)
73718 +{
73719 +       if (xen_feature(XENFEAT_auto_translated_physmap))
73720 +               return 1;
73721 +       return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
73722 +}
73723 +
73724 +static inline unsigned long mfn_to_pfn(unsigned long mfn)
73725 +{
73726 +       unsigned long pfn;
73727 +
73728 +       if (xen_feature(XENFEAT_auto_translated_physmap))
73729 +               return mfn;
73730 +
73731 +       /*
73732 +        * The array access can fail (e.g., device space beyond end of RAM).
73733 +        * In such cases it doesn't matter what we return (we return garbage),
73734 +        * but we must handle the fault without crashing!
73735 +        */
73736 +       asm (
73737 +               "1:     movq %1,%0\n"
73738 +               "2:\n"
73739 +               ".section __ex_table,\"a\"\n"
73740 +               "       .align 8\n"
73741 +               "       .quad 1b,2b\n"
73742 +               ".previous"
73743 +               : "=r" (pfn) : "m" (machine_to_phys_mapping[mfn]) );
73744 +
73745 +       return pfn;
73746 +}
73747 +
73748 +/*
73749 + * We detect special mappings in one of two ways:
73750 + *  1. If the MFN is an I/O page then Xen will set the m2p entry
73751 + *     to be outside our maximum possible pseudophys range.
73752 + *  2. If the MFN belongs to a different domain then we will certainly
73753 + *     not have MFN in our p2m table. Conversely, if the page is ours,
73754 + *     then we'll have p2m(m2p(MFN))==MFN.
73755 + * If we detect a special mapping then it doesn't have a 'struct page'.
73756 + * We force !pfn_valid() by returning an out-of-range pointer.
73757 + *
73758 + * NB. These checks require that, for any MFN that is not in our reservation,
73759 + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
73760 + * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
73761 + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
73762 + *
73763 + * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
73764 + *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
73765 + *      require. In all the cases we care about, the FOREIGN_FRAME bit is
73766 + *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
73767 + */
73768 +static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
73769 +{
73770 +       unsigned long pfn = mfn_to_pfn(mfn);
73771 +       if ((pfn < end_pfn)
73772 +           && !xen_feature(XENFEAT_auto_translated_physmap)
73773 +           && (phys_to_machine_mapping[pfn] != mfn))
73774 +               return end_pfn; /* force !pfn_valid() */
73775 +       return pfn;
73776 +}
73777 +
73778 +
73779 +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
73780 +{
73781 +       if (xen_feature(XENFEAT_auto_translated_physmap)) {
73782 +               BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
73783 +               return;
73784 +       }
73785 +       phys_to_machine_mapping[pfn] = mfn;
73786 +}
73787 +
73788 +/* Definitions for machine and pseudophysical addresses. */
73789 +typedef unsigned long paddr_t;
73790 +typedef unsigned long maddr_t;
73791 +
73792 +static inline maddr_t phys_to_machine(paddr_t phys)
73793 +{
73794 +       maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
73795 +       machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
73796 +       return machine;
73797 +}
73798 +
73799 +static inline paddr_t machine_to_phys(maddr_t machine)
73800 +{
73801 +       paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
73802 +       phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
73803 +       return phys;
73804 +}
73805 +
73806 +/*
73807 + * These are used to make use of C type-checking..
73808 + */
73809 +typedef struct { unsigned long pte; } pte_t;
73810 +typedef struct { unsigned long pmd; } pmd_t;
73811 +typedef struct { unsigned long pud; } pud_t;
73812 +typedef struct { unsigned long pgd; } pgd_t;
73813 +#define PTE_MASK       PHYSICAL_PAGE_MASK
73814 +
73815 +typedef struct { unsigned long pgprot; } pgprot_t;
73816 +
73817 +#define pte_val(x)     (((x).pte & 1) ? machine_to_phys((x).pte) : \
73818 +                        (x).pte)
73819 +#define pte_val_ma(x)  ((x).pte)
73820 +
73821 +static inline unsigned long pmd_val(pmd_t x)
73822 +{
73823 +       unsigned long ret = x.pmd;
73824 +       if (ret) ret = machine_to_phys(ret);
73825 +       return ret;
73826 +}
73827 +
73828 +static inline unsigned long pud_val(pud_t x)
73829 +{
73830 +       unsigned long ret = x.pud;
73831 +       if (ret) ret = machine_to_phys(ret);
73832 +       return ret;
73833 +}
73834 +
73835 +static inline unsigned long pgd_val(pgd_t x)
73836 +{
73837 +       unsigned long ret = x.pgd;
73838 +       if (ret) ret = machine_to_phys(ret);
73839 +       return ret;
73840 +}
73841 +
73842 +#define pgprot_val(x)  ((x).pgprot)
73843 +
73844 +#define __pte_ma(x)     ((pte_t) { (x) } )
73845 +
73846 +static inline pte_t __pte(unsigned long x)
73847 +{
73848 +       if (x & 1) x = phys_to_machine(x);
73849 +       return ((pte_t) { (x) });
73850 +}
73851 +
73852 +static inline pmd_t __pmd(unsigned long x)
73853 +{
73854 +       if ((x & 1)) x = phys_to_machine(x);
73855 +       return ((pmd_t) { (x) });
73856 +}
73857 +
73858 +static inline pud_t __pud(unsigned long x)
73859 +{
73860 +       if ((x & 1)) x = phys_to_machine(x);
73861 +       return ((pud_t) { (x) });
73862 +}
73863 +
73864 +static inline pgd_t __pgd(unsigned long x)
73865 +{
73866 +       if ((x & 1)) x = phys_to_machine(x);
73867 +       return ((pgd_t) { (x) });
73868 +}
73869 +
73870 +#define __pgprot(x)    ((pgprot_t) { (x) } )
73871 +
73872 +#define __PHYSICAL_START       ((unsigned long)CONFIG_PHYSICAL_START)
73873 +#define __START_KERNEL         (__START_KERNEL_map + __PHYSICAL_START)
73874 +#define __START_KERNEL_map     0xffffffff80000000UL
73875 +#define __PAGE_OFFSET           0xffff880000000000UL   
73876 +
73877 +#else
73878 +#define __PHYSICAL_START       CONFIG_PHYSICAL_START
73879 +#define __START_KERNEL         (__START_KERNEL_map + __PHYSICAL_START)
73880 +#define __START_KERNEL_map     0xffffffff80000000
73881 +#define __PAGE_OFFSET           0xffff880000000000
73882 +#endif /* !__ASSEMBLY__ */
73883 +
73884 +#undef LOAD_OFFSET
73885 +#define LOAD_OFFSET            0
73886 +
73887 +/* to align the pointer to the (next) page boundary */
73888 +#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
73889 +
73890 +/* See Documentation/x86_64/mm.txt for a description of the memory map. */
73891 +#define __PHYSICAL_MASK_SHIFT  46
73892 +#define __PHYSICAL_MASK                ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
73893 +#define __VIRTUAL_MASK_SHIFT   48
73894 +#define __VIRTUAL_MASK         ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
73895 +
73896 +#define KERNEL_TEXT_SIZE  (40UL*1024*1024)
73897 +#define KERNEL_TEXT_START 0xffffffff80000000UL 
73898 +
73899 +#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
73900 +
73901 +/* Note: __pa(&symbol_visible_to_c) should always be replaced with __pa_symbol.
73902 +   Otherwise you risk miscompilation. */ 
73903 +#define __pa(x)                        (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
73904 +/* __pa_symbol should be used for C visible symbols.
73905 +   This seems to be the official gcc blessed way to do such arithmetic. */ 
73906 +#define __pa_symbol(x)         \
73907 +       ({unsigned long v;  \
73908 +         asm("" : "=r" (v) : "0" (x)); \
73909 +         __pa(v); })
73910 +
73911 +#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
73912 +#define __boot_va(x)           __va(x)
73913 +#define __boot_pa(x)           __pa(x)
73914 +#ifdef CONFIG_FLATMEM
73915 +#define pfn_to_page(pfn)       (mem_map + (pfn))
73916 +#define page_to_pfn(page)      ((unsigned long)((page) - mem_map))
73917 +#define pfn_valid(pfn)         ((pfn) < end_pfn)
73918 +#endif
73919 +
73920 +#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
73921 +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
73922 +#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
73923 +
73924 +/* VIRT <-> MACHINE conversion */
73925 +#define virt_to_machine(v)     (phys_to_machine(__pa(v)))
73926 +#define virt_to_mfn(v)         (pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
73927 +#define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
73928 +
73929 +#define VM_DATA_DEFAULT_FLAGS \
73930 +       (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
73931 +        VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
73932 +
73933 +#define __HAVE_ARCH_GATE_AREA 1        
73934 +
73935 +#endif /* __KERNEL__ */
73936 +
73937 +#include <asm-generic/page.h>
73938 +
73939 +#endif /* _X86_64_PAGE_H */
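
phys_to_machine() and machine_to_phys() above split an address at PAGE_SHIFT, translate the frame number through the p2m (or m2p) array, and splice the in-page offset back on. A self-contained userspace sketch of the same arithmetic, with a toy four-entry table standing in for the real phys_to_machine_mapping[]:

/* Toy model of the pseudophysical -> machine address math in page.h.
 * The p2m contents are made up; only the shift/mask logic mirrors the
 * header above. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

static const unsigned long p2m[4] = { 7, 3, 11, 2 };   /* pfn -> mfn */

static unsigned long toy_phys_to_machine(unsigned long phys)
{
	unsigned long mfn = p2m[phys >> PAGE_SHIFT];
	return (mfn << PAGE_SHIFT) | (phys & ~PAGE_MASK);
}

int main(void)
{
	unsigned long pa = (2UL << PAGE_SHIFT) | 0x123;  /* pfn 2, offset 0x123 */
	/* pfn 2 maps to mfn 11, so this prints 0xb123 */
	printf("pseudophys 0x%lx -> machine 0x%lx\n",
	       pa, toy_phys_to_machine(pa));
	return 0;
}
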
73940 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/pci.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/pci.h
73941 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/pci.h     1970-01-01 01:00:00.000000000 +0100
73942 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/pci.h  2006-06-26 09:51:32.000000000 +0200
73943 @@ -0,0 +1,174 @@
73944 +#ifndef __x8664_PCI_H
73945 +#define __x8664_PCI_H
73946 +
73947 +#include <linux/config.h>
73948 +#include <asm/io.h>
73949 +
73950 +#ifdef __KERNEL__
73951 +
73952 +#include <linux/mm.h> /* for struct page */
73953 +
73954 +/* Can be used to override the logic in pci_scan_bus for skipping
73955 +   already-configured bus numbers - to be used for buggy BIOSes
73956 +   or architectures with incomplete PCI setup by the loader */
73957 +
73958 +#ifdef CONFIG_PCI
73959 +extern unsigned int pcibios_assign_all_busses(void);
73960 +#else
73961 +#define pcibios_assign_all_busses()    0
73962 +#endif
73963 +#define pcibios_scan_all_fns(a, b)     0
73964 +
73965 +extern unsigned long pci_mem_start;
73966 +#define PCIBIOS_MIN_IO         0x1000
73967 +#define PCIBIOS_MIN_MEM                (pci_mem_start)
73968 +
73969 +#define PCIBIOS_MIN_CARDBUS_IO 0x4000
73970 +
73971 +void pcibios_config_init(void);
73972 +struct pci_bus * pcibios_scan_root(int bus);
73973 +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
73974 +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
73975 +
73976 +void pcibios_set_master(struct pci_dev *dev);
73977 +void pcibios_penalize_isa_irq(int irq, int active);
73978 +struct irq_routing_table *pcibios_get_irq_routing_table(void);
73979 +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
73980 +
73981 +#include <linux/types.h>
73982 +#include <linux/slab.h>
73983 +#include <asm/scatterlist.h>
73984 +#include <linux/string.h>
73985 +#include <asm/page.h>
73986 +#include <linux/dma-mapping.h> /* for have_iommu */
73987 +
73988 +extern int iommu_setup(char *opt);
73989 +
73990 +/* The PCI address space does equal the physical memory
73991 + * address space.  The networking and block device layers use
73992 + * this boolean for bounce buffer decisions.
73993 + *
73994 + * On AMD64 it mostly equals, but we set it to zero if a hardware
73995 + * IOMMU (gart) or software IOMMU (swiotlb) is available.
73996 + */
73997 +#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
73998 +
73999 +#ifdef CONFIG_GART_IOMMU
74000 +
74001 +/*
74002 + * x86-64 always supports DAC, but sometimes it is useful to force
74003 + * devices through the IOMMU to get automatic sg list merging.
74004 + * Optional right now.
74005 + */
74006 +extern int iommu_sac_force;
74007 +#define pci_dac_dma_supported(pci_dev, mask)   (!iommu_sac_force)
74008 +
74009 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
74010 +       dma_addr_t ADDR_NAME;
74011 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)                \
74012 +       __u32 LEN_NAME;
74013 +#define pci_unmap_addr(PTR, ADDR_NAME)                 \
74014 +       ((PTR)->ADDR_NAME)
74015 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)                \
74016 +       (((PTR)->ADDR_NAME) = (VAL))
74017 +#define pci_unmap_len(PTR, LEN_NAME)                   \
74018 +       ((PTR)->LEN_NAME)
74019 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
74020 +       (((PTR)->LEN_NAME) = (VAL))
74021 +
74022 +#elif defined(CONFIG_SWIOTLB)
74023 +
74024 +#define pci_dac_dma_supported(pci_dev, mask)    1
74025 +
74026 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
74027 +       dma_addr_t ADDR_NAME;
74028 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)                \
74029 +       __u32 LEN_NAME;
74030 +#define pci_unmap_addr(PTR, ADDR_NAME)                 \
74031 +       ((PTR)->ADDR_NAME)
74032 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)                \
74033 +       (((PTR)->ADDR_NAME) = (VAL))
74034 +#define pci_unmap_len(PTR, LEN_NAME)                   \
74035 +       ((PTR)->LEN_NAME)
74036 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
74037 +       (((PTR)->LEN_NAME) = (VAL))
74038 +
74039 +#else
74040 +/* No IOMMU */
74041 +
74042 +#define pci_dac_dma_supported(pci_dev, mask)    1
74043 +
74044 +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
74045 +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
74046 +#define pci_unmap_addr(PTR, ADDR_NAME)         (0)
74047 +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)        do { } while (0)
74048 +#define pci_unmap_len(PTR, LEN_NAME)           (0)
74049 +#define pci_unmap_len_set(PTR, LEN_NAME, VAL)  do { } while (0)
74050 +
74051 +#endif
74052 +
74053 +#include <asm-generic/pci-dma-compat.h>
74054 +
74055 +static inline dma64_addr_t
74056 +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
74057 +{
74058 +       return ((dma64_addr_t) page_to_phys(page) +
74059 +               (dma64_addr_t) offset);
74060 +}
74061 +
74062 +static inline struct page *
74063 +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
74064 +{
74065 +       return virt_to_page(__va(dma_addr));    
74066 +}
74067 +
74068 +static inline unsigned long
74069 +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
74070 +{
74071 +       return (dma_addr & ~PAGE_MASK);
74072 +}
74073 +
74074 +static inline void
74075 +pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
74076 +{
74077 +}
74078 +
74079 +static inline void
74080 +pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
74081 +{
74082 +       flush_write_buffers();
74083 +}
74084 +
74085 +#ifdef CONFIG_PCI
74086 +static inline void pci_dma_burst_advice(struct pci_dev *pdev,
74087 +                                       enum pci_dma_burst_strategy *strat,
74088 +                                       unsigned long *strategy_parameter)
74089 +{
74090 +       *strat = PCI_DMA_BURST_INFINITY;
74091 +       *strategy_parameter = ~0UL;
74092 +}
74093 +#endif
74094 +
74095 +#define HAVE_PCI_MMAP
74096 +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
74097 +                              enum pci_mmap_state mmap_state, int write_combine);
74098 +
74099 +static inline void pcibios_add_platform_entries(struct pci_dev *dev)
74100 +{
74101 +}
74102 +
74103 +#endif /* __KERNEL__ */
74104 +
74105 +/* generic pci stuff */
74106 +#ifdef CONFIG_PCI
74107 +#include <asm-generic/pci.h>
74108 +#endif
74109 +
74110 +/* On Xen we have to scan all functions since Xen hides bridges from
74111 + * us.  If a bridge is at fn=0 and that slot has a multifunction
74112 + * device, we won't find the additional devices without scanning all
74113 + * functions. */
74114 +#undef pcibios_scan_all_fns
74115 +#define pcibios_scan_all_fns(a, b)     1
74116 +
74117 +#endif /* __x8664_PCI_H */
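
The DECLARE_PCI_UNMAP_ADDR/LEN macros above expand to real struct fields when an IOMMU (GART or swiotlb) may rewrite DMA addresses, and to nothing in the no-IOMMU case, so drivers can record unmap information without #ifdef clutter. A hedged sketch of the driver-side pattern; struct my_ring and both helpers are hypothetical, while the pci_* calls are the stock 2.6-era DMA-compat API:

#include <linux/pci.h>

struct my_ring {                        /* hypothetical driver state */
	void *buf;
	DECLARE_PCI_UNMAP_ADDR(mapping) /* dma_addr_t mapping; or nothing */
	DECLARE_PCI_UNMAP_LEN(len)      /* __u32 len; or nothing */
};

static void my_map(struct pci_dev *pdev, struct my_ring *r,
		   void *buf, size_t n)
{
	dma_addr_t dma = pci_map_single(pdev, buf, n, PCI_DMA_TODEVICE);

	r->buf = buf;
	pci_unmap_addr_set(r, mapping, dma);  /* no-op with no IOMMU */
	pci_unmap_len_set(r, len, n);
}

static void my_unmap(struct pci_dev *pdev, struct my_ring *r)
{
	/* With no IOMMU both accessors yield 0 and the unmap does nothing;
	 * with GART/swiotlb they return the values stashed in my_map(). */
	pci_unmap_single(pdev, pci_unmap_addr(r, mapping),
			 pci_unmap_len(r, len), PCI_DMA_TODEVICE);
}
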
74118 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/pgalloc.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/pgalloc.h
74119 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/pgalloc.h 1970-01-01 01:00:00.000000000 +0100
74120 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/pgalloc.h      2006-06-26 09:51:32.000000000 +0200
74121 @@ -0,0 +1,198 @@
74122 +#ifndef _X86_64_PGALLOC_H
74123 +#define _X86_64_PGALLOC_H
74124 +
74125 +#include <asm/fixmap.h>
74126 +#include <asm/pda.h>
74127 +#include <linux/threads.h>
74128 +#include <linux/mm.h>
74129 +#include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
74130 +
74131 +#include <xen/features.h>
74132 +void make_page_readonly(void *va, unsigned int feature);
74133 +void make_page_writable(void *va, unsigned int feature);
74134 +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
74135 +void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
74136 +
74137 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
74138 +
74139 +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
74140 +{
74141 +       set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
74142 +}
74143 +
74144 +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
74145 +{
74146 +       if (unlikely((mm)->context.pinned)) {
74147 +               BUG_ON(HYPERVISOR_update_va_mapping(
74148 +                              (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
74149 +                              pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
74150 +               set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
74151 +       } else {
74152 +               *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
74153 +       }
74154 +}
74155 +
74156 +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
74157 +{
74158 +       if (unlikely((mm)->context.pinned)) {
74159 +               BUG_ON(HYPERVISOR_update_va_mapping(
74160 +                              (unsigned long)pmd,
74161 +                              pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, 
74162 +                                      PAGE_KERNEL_RO), 0));
74163 +               set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
74164 +       } else {
74165 +               *(pud) =  __pud(_PAGE_TABLE | __pa(pmd));
74166 +       }
74167 +}
74168 +
74169 +/*
74170 + * We need to use the batch mode here, but pgd_populate() won't
74171 + * be called frequently.
74172 + */
74173 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
74174 +{
74175 +       if (unlikely((mm)->context.pinned)) {
74176 +               BUG_ON(HYPERVISOR_update_va_mapping(
74177 +                              (unsigned long)pud,
74178 +                              pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, 
74179 +                                      PAGE_KERNEL_RO), 0));
74180 +               set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
74181 +               set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
74182 +       } else {
74183 +               *(pgd) =  __pgd(_PAGE_TABLE | __pa(pud));
74184 +               *(__user_pgd(pgd)) = *(pgd);
74185 +       }
74186 +}
74187 +
74188 +static inline void pmd_free(pmd_t *pmd)
74189 +{
74190 +       pte_t *ptep = virt_to_ptep(pmd);
74191 +
74192 +       if (!pte_write(*ptep)) {
74193 +               BUG_ON(HYPERVISOR_update_va_mapping(
74194 +                       (unsigned long)pmd,
74195 +                       pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, PAGE_KERNEL),
74196 +                       0));
74197 +       }
74198 +       free_page((unsigned long)pmd);
74199 +}
74200 +
74201 +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
74202 +{
74203 +        pmd_t *pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
74204 +        return pmd;
74205 +}
74206 +
74207 +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
74208 +{
74209 +        pud_t *pud = (pud_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
74210 +        return pud;
74211 +}
74212 +
74213 +static inline void pud_free(pud_t *pud)
74214 +{
74215 +       pte_t *ptep = virt_to_ptep(pud);
74216 +
74217 +       if (!pte_write(*ptep)) {
74218 +               BUG_ON(HYPERVISOR_update_va_mapping(
74219 +                       (unsigned long)pud,
74220 +                       pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, PAGE_KERNEL),
74221 +                       0));
74222 +       }
74223 +       free_page((unsigned long)pud);
74224 +}
74225 +
74226 +static inline pgd_t *pgd_alloc(struct mm_struct *mm)
74227 +{
74228 +        /*
74229 +         * We allocate two contiguous pages for kernel and user.
74230 +         */
74231 +        unsigned boundary;
74232 +       pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1);
74233 +
74234 +       if (!pgd)
74235 +               return NULL;
74236 +       /*
74237 +        * Copy kernel pointers in from init.
74238 +        * Could keep a freelist or slab cache of those because the kernel
74239 +        * part never changes.
74240 +        */
74241 +       boundary = pgd_index(__PAGE_OFFSET);
74242 +       memset(pgd, 0, boundary * sizeof(pgd_t));
74243 +       memcpy(pgd + boundary,
74244 +              init_level4_pgt + boundary,
74245 +              (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
74246 +
74247 +       memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
74248 +        /*
74249 +         * Set level3_user_pgt for vsyscall area
74250 +         */
74251 +       set_pgd(__user_pgd(pgd) + pgd_index(VSYSCALL_START), 
74252 +                mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
74253 +       return pgd;
74254 +}
74255 +
74256 +static inline void pgd_free(pgd_t *pgd)
74257 +{
74258 +       pte_t *ptep = virt_to_ptep(pgd);
74259 +
74260 +       if (!pte_write(*ptep)) {
74261 +               xen_pgd_unpin(__pa(pgd));
74262 +               BUG_ON(HYPERVISOR_update_va_mapping(
74263 +                              (unsigned long)pgd,
74264 +                              pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
74265 +                              0));
74266 +       }
74267 +
74268 +       ptep = virt_to_ptep(__user_pgd(pgd));
74269 +
74270 +       if (!pte_write(*ptep)) {
74271 +               xen_pgd_unpin(__pa(__user_pgd(pgd)));
74272 +               BUG_ON(HYPERVISOR_update_va_mapping(
74273 +                              (unsigned long)__user_pgd(pgd),
74274 +                              pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT, 
74275 +                                      PAGE_KERNEL),
74276 +                              0));
74277 +       }
74278 +
74279 +       free_pages((unsigned long)pgd, 1);
74280 +}
74281 +
74282 +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
74283 +{
74284 +        pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
74285 +        if (pte)
74286 +               make_page_readonly(pte, XENFEAT_writable_page_tables);
74287 +
74288 +       return pte;
74289 +}
74290 +
74291 +static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
74292 +{
74293 +       struct page *pte;
74294 +
74295 +       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
74296 +       return pte;
74297 +}
74298 +
74299 +/* Should really implement gc for free page table pages. This could be
74300 +   done with a reference count in struct page. */
74301 +
74302 +static inline void pte_free_kernel(pte_t *pte)
74303 +{
74304 +       BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
74305 +        make_page_writable(pte, XENFEAT_writable_page_tables);
74306 +       free_page((unsigned long)pte); 
74307 +}
74308 +
74309 +extern void pte_free(struct page *pte);
74310 +
74311 +//#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) 
74312 +//#define __pmd_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
74313 +//#define __pud_free_tlb(tlb,x)   tlb_remove_page((tlb),virt_to_page(x))
74314 +
74315 +#define __pte_free_tlb(tlb,x)   pte_free((x))
74316 +#define __pmd_free_tlb(tlb,x)   pmd_free((x))
74317 +#define __pud_free_tlb(tlb,x)   pud_free((x))
74318 +
74319 +#endif /* _X86_64_PGALLOC_H */
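
pgd_alloc() above takes an order-1 allocation: the first page is the kernel pgd, seeded from init_level4_pgt above the __PAGE_OFFSET slot, and the page at +PAGE_SIZE is the user pgd reached through __user_pgd(). A quick userspace check of that boundary arithmetic, with the constants copied from the headers above:

/* Verify which slice of the 512-entry pgd pgd_alloc() clears and which
 * it copies from init_level4_pgt. */
#include <stdio.h>

#define PGDIR_SHIFT   39
#define PTRS_PER_PGD  512UL
#define __PAGE_OFFSET 0xffff880000000000UL
#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

int main(void)
{
	unsigned long boundary = pgd_index(__PAGE_OFFSET);

	/* prints: boundary = 272 -> zero 272 entries, copy 240 */
	printf("boundary = %lu -> zero %lu entries, copy %lu\n",
	       boundary, boundary, PTRS_PER_PGD - boundary);
	return 0;
}
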
74320 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/pgtable.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/pgtable.h
74321 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/pgtable.h 1970-01-01 01:00:00.000000000 +0100
74322 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/pgtable.h      2006-06-26 09:51:32.000000000 +0200
74323 @@ -0,0 +1,560 @@
74324 +#ifndef _X86_64_PGTABLE_H
74325 +#define _X86_64_PGTABLE_H
74326 +
74327 +/*
74328 + * This file contains the functions and defines necessary to modify and use
74329 + * the x86-64 page table tree.
74330 + */
74331 +#include <asm/processor.h>
74332 +#include <asm/fixmap.h>
74333 +#include <asm/bitops.h>
74334 +#include <linux/threads.h>
74335 +#include <linux/sched.h>
74336 +#include <asm/pda.h>
74337 +#ifdef CONFIG_XEN
74338 +#include <asm/hypervisor.h>
74339 +
74340 +extern pud_t level3_user_pgt[512];
74341 +extern pud_t init_level4_user_pgt[];
74342 +
74343 +extern void xen_init_pt(void);
74344 +
74345 +#define virt_to_ptep(__va)                                             \
74346 +({                                                                     \
74347 +       pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));             \
74348 +       pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));        \
74349 +       pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));        \
74350 +       pte_offset_kernel(__pmd, (unsigned long)(__va));                \
74351 +})
74352 +
74353 +#define arbitrary_virt_to_machine(__va)                                        \
74354 +({                                                                     \
74355 +       maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
74356 +       m | ((unsigned long)(__va) & (PAGE_SIZE-1));                    \
74357 +})
74358 +#endif
74359 +
74360 +extern pud_t level3_kernel_pgt[512];
74361 +extern pud_t level3_physmem_pgt[512];
74362 +extern pud_t level3_ident_pgt[512];
74363 +extern pmd_t level2_kernel_pgt[512];
74364 +extern pgd_t init_level4_pgt[];
74365 +extern pgd_t boot_level4_pgt[];
74366 +extern unsigned long __supported_pte_mask;
74367 +
74368 +#define swapper_pg_dir init_level4_pgt
74369 +
74370 +extern int nonx_setup(char *str);
74371 +extern void paging_init(void);
74372 +extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
74373 +
74374 +extern unsigned long pgkern_mask;
74375 +
74376 +/*
74377 + * ZERO_PAGE is a global shared page that is always zero: used
74378 + * for zero-mapped memory areas etc..
74379 + */
74380 +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
74381 +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
74382 +
74383 +/*
74384 + * PGDIR_SHIFT determines what a top-level page table entry can map
74385 + */
74386 +#define PGDIR_SHIFT    39
74387 +#define PTRS_PER_PGD   512
74388 +
74389 +/*
74390 + * 3rd level page
74391 + */
74392 +#define PUD_SHIFT      30
74393 +#define PTRS_PER_PUD   512
74394 +
74395 +/*
74396 + * PMD_SHIFT determines the size of the area a middle-level
74397 + * page table can map
74398 + */
74399 +#define PMD_SHIFT      21
74400 +#define PTRS_PER_PMD   512
74401 +
74402 +/*
74403 + * entries per page directory level
74404 + */
74405 +#define PTRS_PER_PTE   512
74406 +
74407 +#define pte_ERROR(e) \
74408 +       printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), pte_val(e))
74409 +#define pmd_ERROR(e) \
74410 +       printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
74411 +#define pud_ERROR(e) \
74412 +       printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), pud_val(e))
74413 +#define pgd_ERROR(e) \
74414 +       printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
74415 +
74416 +#define pgd_none(x)    (!pgd_val(x))
74417 +#define pud_none(x)    (!pud_val(x))
74418 +
74419 +#define set_pte_batched(pteptr, pteval) \
74420 +       queue_l1_entry_update(pteptr, (pteval))
74421 +
74422 +extern inline int pud_present(pud_t pud)       { return !pud_none(pud); }
74423 +
74424 +static inline void set_pte(pte_t *dst, pte_t val)
74425 +{
74426 +       *dst = val;
74427 +}
74428 +
74429 +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
74430 +#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
74431 +#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval))
74432 +
74433 +static inline void pud_clear (pud_t * pud)
74434 +{
74435 +       set_pud(pud, __pud(0));
74436 +}
74437 +
74438 +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
74439 +
74440 +static inline void pgd_clear (pgd_t * pgd)
74441 +{
74442 +        set_pgd(pgd, __pgd(0));
74443 +        set_pgd(__user_pgd(pgd), __pgd(0));
74444 +}
74445 +
74446 +#define pud_page(pud) \
74447 +    ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
74448 +
74449 +/*
74450 + * A note on implementation of this atomic 'get-and-clear' operation.
74451 + * This is actually very simple because Xen Linux can only run on a single
74452 + * processor. Therefore, we cannot race other processors setting the 'accessed'
74453 + * or 'dirty' bits on a page-table entry.
74454 + * Even if pages are shared between domains, that is not a problem because
74455 + * each domain will have separate page tables, with their own versions of
74456 + * accessed & dirty state.
74457 + */
74458 +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte, 0))
74459 +
74460 +#if 0
74461 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
74462 +{
74463 +        pte_t pte = *xp;
74464 +        if (pte.pte)
74465 +                set_pte(xp, __pte_ma(0));
74466 +        return pte;
74467 +}
74468 +#endif
74469 +
74470 +struct mm_struct;
74471 +
74472 +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
74473 +{
74474 +       pte_t pte;
74475 +       if (full) {
74476 +               pte = *ptep;
74477 +               *ptep = __pte(0);
74478 +       } else {
74479 +               pte = ptep_get_and_clear(mm, addr, ptep);
74480 +       }
74481 +       return pte;
74482 +}
74483 +
74484 +#define pte_same(a, b)         ((a).pte == (b).pte)
74485 +
74486 +#define pte_pgprot(a)  (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
74487 +
74488 +#define PMD_SIZE       (1UL << PMD_SHIFT)
74489 +#define PMD_MASK       (~(PMD_SIZE-1))
74490 +#define PUD_SIZE       (1UL << PUD_SHIFT)
74491 +#define PUD_MASK       (~(PUD_SIZE-1))
74492 +#define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
74493 +#define PGDIR_MASK     (~(PGDIR_SIZE-1))
74494 +
74495 +#define USER_PTRS_PER_PGD      ((TASK_SIZE-1)/PGDIR_SIZE+1)
74496 +#define FIRST_USER_ADDRESS     0
74497 +
74498 +#ifndef __ASSEMBLY__
74499 +#define MAXMEM          0x3fffffffffffUL
74500 +#define VMALLOC_START    0xffffc20000000000UL
74501 +#define VMALLOC_END      0xffffe1ffffffffffUL
74502 +#define MODULES_VADDR    0xffffffff88000000UL
74503 +#define MODULES_END      0xfffffffffff00000UL
74504 +#define MODULES_LEN   (MODULES_END - MODULES_VADDR)
74505 +
74506 +#define _PAGE_BIT_PRESENT      0
74507 +#define _PAGE_BIT_RW           1
74508 +#define _PAGE_BIT_USER         2
74509 +#define _PAGE_BIT_PWT          3
74510 +#define _PAGE_BIT_PCD          4
74511 +#define _PAGE_BIT_ACCESSED     5
74512 +#define _PAGE_BIT_DIRTY                6
74513 +#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page */
74514 +#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
74515 +#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
74516 +
74517 +#define _PAGE_PRESENT  0x001
74518 +#define _PAGE_RW       0x002
74519 +#define _PAGE_USER     0x004
74520 +#define _PAGE_PWT      0x008
74521 +#define _PAGE_PCD      0x010
74522 +#define _PAGE_ACCESSED 0x020
74523 +#define _PAGE_DIRTY    0x040
74524 +#define _PAGE_PSE      0x080   /* 2MB page */
74525 +#define _PAGE_FILE     0x040   /* nonlinear file mapping, saved PTE; unset:swap */
74526 +#define _PAGE_GLOBAL   0x100   /* Global TLB entry */
74527 +
74528 +#define _PAGE_PROTNONE 0x080   /* If not present */
74529 +#define _PAGE_NX        (1UL<<_PAGE_BIT_NX)
74530 +
74531 +#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
74532 +#define _KERNPG_TABLE  _PAGE_TABLE
74533 +
74534 +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
74535 +
74536 +#define PAGE_NONE      __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
74537 +#define PAGE_SHARED    __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
74538 +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
74539 +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
74540 +#define PAGE_COPY PAGE_COPY_NOEXEC
74541 +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
74542 +#define PAGE_READONLY  __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
74543 +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
74544 +#define __PAGE_KERNEL \
74545 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER )
74546 +#define __PAGE_KERNEL_EXEC \
74547 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER )
74548 +#define __PAGE_KERNEL_NOCACHE \
74549 +       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER )
74550 +#define __PAGE_KERNEL_RO \
74551 +       (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER )
74552 +#define __PAGE_KERNEL_VSYSCALL \
74553 +       (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_USER )
74554 +#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
74555 +       (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD | _PAGE_USER )
74556 +#define __PAGE_KERNEL_LARGE \
74557 +       (__PAGE_KERNEL | _PAGE_PSE | _PAGE_USER )
74558 +#define __PAGE_KERNEL_LARGE_EXEC \
74559 +       (__PAGE_KERNEL_EXEC | _PAGE_PSE | _PAGE_USER )
74560 +
74561 +
74562 +/*
74563 + * We don't support GLOBAL page in xenolinux64
74564 + */
74565 +#define MAKE_GLOBAL(x) __pgprot((x))
74566 +
74567 +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL)
74568 +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC)
74569 +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO)
74570 +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE)
74571 +#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL)
74572 +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL)
74573 +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE)
74574 +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE)
74575 +
74576 +/*         xwr */
74577 +#define __P000 PAGE_NONE
74578 +#define __P001 PAGE_READONLY
74579 +#define __P010 PAGE_COPY
74580 +#define __P011 PAGE_COPY
74581 +#define __P100 PAGE_READONLY_EXEC
74582 +#define __P101 PAGE_READONLY_EXEC
74583 +#define __P110 PAGE_COPY_EXEC
74584 +#define __P111 PAGE_COPY_EXEC
74585 +
74586 +#define __S000 PAGE_NONE
74587 +#define __S001 PAGE_READONLY
74588 +#define __S010 PAGE_SHARED
74589 +#define __S011 PAGE_SHARED
74590 +#define __S100 PAGE_READONLY_EXEC
74591 +#define __S101 PAGE_READONLY_EXEC
74592 +#define __S110 PAGE_SHARED_EXEC
74593 +#define __S111 PAGE_SHARED_EXEC
74594 +
74595 +static inline unsigned long pgd_bad(pgd_t pgd)
74596 +{
74597 +       unsigned long val = pgd_val(pgd);
74598 +       val &= ~PTE_MASK;
74599 +       val &= ~(_PAGE_USER | _PAGE_DIRTY);
74600 +       return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
74601 +}
74602 +
74603 +static inline unsigned long pud_bad(pud_t pud) 
74604 +{ 
74605 +       unsigned long val = pud_val(pud);
74606 +       val &= ~PTE_MASK; 
74607 +       val &= ~(_PAGE_USER | _PAGE_DIRTY); 
74608 +       return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);      
74609 +} 
74610 +
74611 +#define set_pte_at(_mm,addr,ptep,pteval) do {                          \
74612 +       if (((_mm) != current->mm && (_mm) != &init_mm) ||              \
74613 +           HYPERVISOR_update_va_mapping((addr), (pteval), 0))          \
74614 +               set_pte((ptep), (pteval));                              \
74615 +} while (0)
74616 +
74617 +#define pte_none(x)    (!(x).pte)
74618 +#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE))
74619 +#define pte_clear(mm,addr,xp)  do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
74620 +
74621 +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
74622 +
74623 +#define pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT)
74624 +#define pte_pfn(_pte) mfn_to_local_pfn(pte_mfn(_pte))
74625 +
74626 +#define pte_page(x)    pfn_to_page(pte_pfn(x))
74627 +
74628 +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
74629 +{
74630 +       pte_t pte;
74631 +        
74632 +       (pte).pte = (pfn_to_mfn(page_nr) << PAGE_SHIFT);
74633 +       (pte).pte |= pgprot_val(pgprot);
74634 +       (pte).pte &= __supported_pte_mask;
74635 +       return pte;
74636 +}
74637 +
74638 +#define pfn_pte_ma(pfn, prot)  __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask)
74639 +/*
74640 + * The following only work if pte_present() is true.
74641 + * Undefined behaviour if not..
74642 + */
74643 +#define __pte_val(x)   ((x).pte)
74644 +
74645 +#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
74646 +static inline int pte_user(pte_t pte)          { return __pte_val(pte) & _PAGE_USER; }
74647 +static inline int pte_read(pte_t pte)          { return __pte_val(pte) & _PAGE_USER; }
74648 +static inline int pte_exec(pte_t pte)          { return __pte_val(pte) & _PAGE_USER; }
74649 +static inline int pte_dirty(pte_t pte)         { return __pte_val(pte) & _PAGE_DIRTY; }
74650 +static inline int pte_young(pte_t pte)         { return __pte_val(pte) & _PAGE_ACCESSED; }
74651 +static inline int pte_write(pte_t pte)         { return __pte_val(pte) & _PAGE_RW; }
74652 +static inline int pte_file(pte_t pte)          { return __pte_val(pte) & _PAGE_FILE; }
74653 +static inline int pte_huge(pte_t pte)          { return (__pte_val(pte) & __LARGE_PTE) == __LARGE_PTE; }
74654 +
74655 +static inline pte_t pte_rdprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_USER; return pte; }
74656 +static inline pte_t pte_exprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_USER; return pte; }
74657 +static inline pte_t pte_mkclean(pte_t pte)     { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
74658 +static inline pte_t pte_mkold(pte_t pte)       { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
74659 +static inline pte_t pte_wrprotect(pte_t pte)   { __pte_val(pte) &= ~_PAGE_RW; return pte; }
74660 +static inline pte_t pte_mkread(pte_t pte)      { __pte_val(pte) |= _PAGE_USER; return pte; }
74661 +static inline pte_t pte_mkexec(pte_t pte)      { __pte_val(pte) |= _PAGE_USER; return pte; }
74662 +static inline pte_t pte_mkdirty(pte_t pte)     { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
74663 +static inline pte_t pte_mkyoung(pte_t pte)     { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
74664 +static inline pte_t pte_mkwrite(pte_t pte)     { __pte_val(pte) |= _PAGE_RW; return pte; }
74665 +static inline pte_t pte_mkhuge(pte_t pte)      { __pte_val(pte) |= __LARGE_PTE; return pte; }
74666 +
74667 +struct vm_area_struct;
74668 +
74669 +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
74670 +{
74671 +       pte_t pte = *ptep;
74672 +       int ret = pte_dirty(pte);
74673 +       if (ret)
74674 +               set_pte(ptep, pte_mkclean(pte));
74675 +       return ret;
74676 +}
74677 +
74678 +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
74679 +{
74680 +       pte_t pte = *ptep;
74681 +       int ret = pte_young(pte);
74682 +       if (ret)
74683 +               set_pte(ptep, pte_mkold(pte));
74684 +       return ret;
74685 +}
74686 +
74687 +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
74688 +{
74689 +       pte_t pte = *ptep;
74690 +       if (pte_write(pte))
74691 +               set_pte(ptep, pte_wrprotect(pte));
74692 +}
74693 +
74694 +/*
74695 + * Macro to mark a page protection value as "uncacheable".
74696 + */
74697 +#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
74698 +
74699 +static inline int pmd_large(pmd_t pte) { 
74700 +       return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; 
74701 +}      
74702 +
74703 +
74704 +/*
74705 + * Conversion functions: convert a page and protection to a page entry,
74706 + * and a page entry and page directory to the page they refer to.
74707 + */
74708 +
74709 +/*
74710 + * Level 4 access.
74711 + * Never use these in the common code.
74712 + */
74713 +#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
74714 +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
74715 +#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
74716 +#define pgd_offset_k(address) (pgd_t *)(init_level4_pgt + pgd_index(address))
74717 +#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
74718 +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
74719 +
74720 +/* PUD - Level3 access */
74721 +/* to find an entry in a page-table-directory. */
74722 +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
74723 +#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
74724 +static inline pud_t *__pud_offset_k(pud_t *pud, unsigned long address)
74725 +{ 
74726 +       return pud + pud_index(address);
74727 +} 
74728 +
74729 +/* Find correct pud via the hidden fourth level page level: */
74730 +
74731 +/* This accesses the reference page table of the boot cpu. 
74732 +   Other CPUs get synced lazily via the page fault handler. */
74733 +static inline pud_t *pud_offset_k(pgd_t *pgd, unsigned long address)
74734 +{
74735 +       return pud_offset(pgd_offset_k(address), address);
74736 +}
74737 +
74738 +/* PMD  - Level 2 access */
74739 +#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
74740 +#define pmd_page(pmd)          (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
74741 +
74742 +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
74743 +#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
74744 +                                  pmd_index(address))
74745 +#define pmd_none(x)    (!pmd_val(x))
74746 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
74747 +   can temporarily clear it. */
74748 +#define pmd_present(x) (pmd_val(x))
74749 +#define pmd_clear(xp)  do { set_pmd(xp, __pmd(0)); } while (0)
74750 +#define        pmd_bad(x)      ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
74751 +#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
74752 +#define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
74753 +
74754 +#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
74755 +#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
74756 +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
74757 +
74758 +/* PTE - Level 1 access. */
74759 +
74760 +/* page, protection -> pte */
74761 +#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
74762 +#define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
74763 +
74764 +/* physical address -> PTE */
74765 +static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
74766 +{ 
74767 +       pte_t pte;
74768 +       (pte).pte = physpage | pgprot_val(pgprot); 
74769 +       return pte; 
74770 +}
74771 +
74772 +/* Change flags of a PTE */
74773 +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
74774 +{ 
74775 +        (pte).pte &= _PAGE_CHG_MASK;
74776 +       (pte).pte |= pgprot_val(newprot);
74777 +       (pte).pte &= __supported_pte_mask;
74778 +       return pte; 
74779 +}
74780 +
74781 +#define pte_index(address) \
74782 +               (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
74783 +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
74784 +                       pte_index(address))
74785 +
74786 +/* x86-64 always has all page tables mapped. */
74787 +#define pte_offset_map(dir,address) pte_offset_kernel(dir,address)
74788 +#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address)
74789 +#define pte_unmap(pte) /* NOP */
74790 +#define pte_unmap_nested(pte) /* NOP */ 
74791 +
74792 +#define update_mmu_cache(vma,address,pte) do { } while (0)
74793 +
74794 +/* We only update the dirty/accessed state if we set
74795 + * the dirty bit by hand in the kernel, since the hardware
74796 + * will do the accessed bit for us, and we don't want to
74797 + * race with other CPU's that might be updating the dirty
74798 + * bit at the same time. */
74799 +#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
74800 +#if 0
74801 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
74802 +       do {                                                              \
74803 +               if (__dirty) {                                            \
74804 +                       set_pte(__ptep, __entry);                         \
74805 +                       flush_tlb_page(__vma, __address);                 \
74806 +               }                                                         \
74807 +       } while (0)
74808 +#endif
74809 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
74810 +       do {                                                              \
74811 +               if (__dirty) {                                            \
74812 +                       if ( likely((__vma)->vm_mm == current->mm) ) {    \
74813 +                           BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \
74814 +                       } else {                                          \
74815 +                            xen_l1_entry_update((__ptep), (__entry)); \
74816 +                           flush_tlb_page((__vma), (__address));         \
74817 +                       }                                                 \
74818 +               }                                                         \
74819 +       } while (0)
74820 +
74821 +/* Encode and de-code a swap entry */
74822 +#define __swp_type(x)                  (((x).val >> 1) & 0x3f)
74823 +#define __swp_offset(x)                        ((x).val >> 8)
74824 +#define __swp_entry(type, offset)      ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
74825 +#define __pte_to_swp_entry(pte)                ((swp_entry_t) { pte_val(pte) })
74826 +#define __swp_entry_to_pte(x)          ((pte_t) { (x).val })
74827 +
74828 +#endif /* !__ASSEMBLY__ */
74829 +
74830 +extern int kern_addr_valid(unsigned long addr); 
74831 +
74832 +#define DOMID_LOCAL (0xFFFFU)
74833 +
74834 +int direct_remap_pfn_range(struct vm_area_struct *vma,
74835 +                            unsigned long address,
74836 +                            unsigned long mfn,
74837 +                            unsigned long size,
74838 +                            pgprot_t prot,
74839 +                            domid_t  domid);
74840 +
74841 +int direct_kernel_remap_pfn_range(unsigned long address, 
74842 +                                 unsigned long mfn,
74843 +                                 unsigned long size, 
74844 +                                 pgprot_t prot,
74845 +                                 domid_t  domid);
74846 +
74847 +int create_lookup_pte_addr(struct mm_struct *mm,
74848 +                           unsigned long address,
74849 +                           uint64_t *ptep);
74850 +
74851 +int touch_pte_range(struct mm_struct *mm,
74852 +                    unsigned long address,
74853 +                    unsigned long size);
74854 +
74855 +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)                \
74856 +               direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
74857 +
74858 +#define MK_IOSPACE_PFN(space, pfn)     (pfn)
74859 +#define GET_IOSPACE(pfn)               0
74860 +#define GET_PFN(pfn)                   (pfn)
74861 +
74862 +#define HAVE_ARCH_UNMAPPED_AREA
74863 +
74864 +#define pgtable_cache_init()   do { } while (0)
74865 +#define check_pgt_cache()      do { } while (0)
74866 +
74867 +#define PAGE_AGP    PAGE_KERNEL_NOCACHE
74868 +#define HAVE_PAGE_AGP 1
74869 +
74870 +/* fs/proc/kcore.c */
74871 +#define        kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
74872 +#define        kc_offset_to_vaddr(o) \
74873 +   (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
74874 +
74875 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
74876 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
74877 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
74878 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
74879 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
74880 +#define __HAVE_ARCH_PTE_SAME
74881 +#include <asm-generic/pgtable.h>
74882 +
74883 +#endif /* _X86_64_PGTABLE_H */
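
The swap-entry macros near the end of pgtable.h pack the swap type into bits 1-6 and the offset from bit 8 upward, keeping bit 0 (_PAGE_PRESENT) clear so a swapped-out entry can never be mistaken for a present pte. A small userspace round-trip of that encoding, mirroring __swp_type/__swp_offset/__swp_entry above:

#include <stdio.h>

#define swp_entry(type, offset) ((((unsigned long)(type)) << 1) | \
                                 (((unsigned long)(offset)) << 8))
#define swp_type(v)             (((v) >> 1) & 0x3f)
#define swp_offset(v)           ((v) >> 8)

int main(void)
{
	unsigned long e = swp_entry(5, 12345);

	/* prints: type=5 offset=12345 present-bit=0 */
	printf("type=%lu offset=%lu present-bit=%lu\n",
	       swp_type(e), swp_offset(e), e & 1UL);
	return 0;
}
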
74884 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/processor.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/processor.h
74885 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/processor.h       1970-01-01 01:00:00.000000000 +0100
74886 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/processor.h    2006-06-26 09:51:32.000000000 +0200
74887 @@ -0,0 +1,493 @@
74888 +/*
74889 + * include/asm-x86_64/processor.h
74890 + *
74891 + * Copyright (C) 1994 Linus Torvalds
74892 + */
74893 +
74894 +#ifndef __ASM_X86_64_PROCESSOR_H
74895 +#define __ASM_X86_64_PROCESSOR_H
74896 +
74897 +#include <asm/segment.h>
74898 +#include <asm/page.h>
74899 +#include <asm/types.h>
74900 +#include <asm/sigcontext.h>
74901 +#include <asm/cpufeature.h>
74902 +#include <linux/config.h>
74903 +#include <linux/threads.h>
74904 +#include <asm/msr.h>
74905 +#include <asm/current.h>
74906 +#include <asm/system.h>
74907 +#include <asm/mmsegment.h>
74908 +#include <asm/percpu.h>
74909 +#include <linux/personality.h>
74910 +
74911 +#define TF_MASK                0x00000100
74912 +#define IF_MASK                0x00000200
74913 +#define IOPL_MASK      0x00003000
74914 +#define NT_MASK                0x00004000
74915 +#define VM_MASK                0x00020000
74916 +#define AC_MASK                0x00040000
74917 +#define VIF_MASK       0x00080000      /* virtual interrupt flag */
74918 +#define VIP_MASK       0x00100000      /* virtual interrupt pending */
74919 +#define ID_MASK                0x00200000
74920 +
74921 +#define desc_empty(desc) \
74922 +               (!((desc)->a | (desc)->b))
74923 +
74924 +#define desc_equal(desc1, desc2) \
74925 +               (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b))
74926 +
74927 +/*
74928 + * Default implementation of macro that returns current
74929 + * instruction pointer ("program counter").
74930 + */
74931 +#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; })
74932 +
74933 +/*
74934 + *  CPU type and hardware bug flags. Kept separately for each CPU.
74935 + */
74936 +
74937 +struct cpuinfo_x86 {
74938 +       __u8    x86;            /* CPU family */
74939 +       __u8    x86_vendor;     /* CPU vendor */
74940 +       __u8    x86_model;
74941 +       __u8    x86_mask;
74942 +       int     cpuid_level;    /* Maximum supported CPUID level, -1=no CPUID */
74943 +       __u32   x86_capability[NCAPINTS];
74944 +       char    x86_vendor_id[16];
74945 +       char    x86_model_id[64];
74946 +       int     x86_cache_size;  /* in KB */
74947 +       int     x86_clflush_size;
74948 +       int     x86_cache_alignment;
74949 +       int     x86_tlbsize;    /* number of 4K pages in DTLB/ITLB combined */
74950 +        __u8    x86_virt_bits, x86_phys_bits;
74951 +       __u8    x86_max_cores;  /* cpuid returned max cores value */
74952 +        __u32   x86_power;     
74953 +       __u32   extended_cpuid_level;   /* Max extended CPUID function supported */
74954 +       unsigned long loops_per_jiffy;
74955 +       __u8    apicid;
74956 +       __u8    booted_cores;   /* number of cores as seen by OS */
74957 +} ____cacheline_aligned;
74958 +
74959 +#define X86_VENDOR_INTEL 0
74960 +#define X86_VENDOR_CYRIX 1
74961 +#define X86_VENDOR_AMD 2
74962 +#define X86_VENDOR_UMC 3
74963 +#define X86_VENDOR_NEXGEN 4
74964 +#define X86_VENDOR_CENTAUR 5
74965 +#define X86_VENDOR_RISE 6
74966 +#define X86_VENDOR_TRANSMETA 7
74967 +#define X86_VENDOR_NUM 8
74968 +#define X86_VENDOR_UNKNOWN 0xff
74969 +
74970 +#ifdef CONFIG_SMP
74971 +extern struct cpuinfo_x86 cpu_data[];
74972 +#define current_cpu_data cpu_data[smp_processor_id()]
74973 +#else
74974 +#define cpu_data (&boot_cpu_data)
74975 +#define current_cpu_data boot_cpu_data
74976 +#endif
74977 +
74978 +extern char ignore_irq13;
74979 +
74980 +extern void identify_cpu(struct cpuinfo_x86 *);
74981 +extern void print_cpu_info(struct cpuinfo_x86 *);
74982 +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
74983 +
74984 +/*
74985 + * EFLAGS bits
74986 + */
74987 +#define X86_EFLAGS_CF  0x00000001 /* Carry Flag */
74988 +#define X86_EFLAGS_PF  0x00000004 /* Parity Flag */
74989 +#define X86_EFLAGS_AF  0x00000010 /* Auxiliary carry Flag */
74990 +#define X86_EFLAGS_ZF  0x00000040 /* Zero Flag */
74991 +#define X86_EFLAGS_SF  0x00000080 /* Sign Flag */
74992 +#define X86_EFLAGS_TF  0x00000100 /* Trap Flag */
74993 +#define X86_EFLAGS_IF  0x00000200 /* Interrupt Flag */
74994 +#define X86_EFLAGS_DF  0x00000400 /* Direction Flag */
74995 +#define X86_EFLAGS_OF  0x00000800 /* Overflow Flag */
74996 +#define X86_EFLAGS_IOPL        0x00003000 /* IOPL mask */
74997 +#define X86_EFLAGS_NT  0x00004000 /* Nested Task */
74998 +#define X86_EFLAGS_RF  0x00010000 /* Resume Flag */
74999 +#define X86_EFLAGS_VM  0x00020000 /* Virtual Mode */
75000 +#define X86_EFLAGS_AC  0x00040000 /* Alignment Check */
75001 +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
75002 +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
75003 +#define X86_EFLAGS_ID  0x00200000 /* CPUID detection flag */
75004 +
75005 +/*
75006 + * Intel CPU features in CR4
75007 + */
75008 +#define X86_CR4_VME            0x0001  /* enable vm86 extensions */
75009 +#define X86_CR4_PVI            0x0002  /* virtual interrupts flag enable */
75010 +#define X86_CR4_TSD            0x0004  /* disable time stamp at ipl 3 */
75011 +#define X86_CR4_DE             0x0008  /* enable debugging extensions */
75012 +#define X86_CR4_PSE            0x0010  /* enable page size extensions */
75013 +#define X86_CR4_PAE            0x0020  /* enable physical address extensions */
75014 +#define X86_CR4_MCE            0x0040  /* Machine check enable */
75015 +#define X86_CR4_PGE            0x0080  /* enable global pages */
75016 +#define X86_CR4_PCE            0x0100  /* enable performance counters at ipl 3 */
75017 +#define X86_CR4_OSFXSR         0x0200  /* enable fast FPU save and restore */
75018 +#define X86_CR4_OSXMMEXCPT     0x0400  /* enable unmasked SSE exceptions */
75019 +
75020 +/*
75021 + * Save the cr4 feature set we're using (ie
75022 + * Pentium 4MB enable and PPro Global page
75023 + * enable), so that any CPU's that boot up
75024 + * after us can get the correct flags.
75025 + */
75026 +extern unsigned long mmu_cr4_features;
75027 +
75028 +static inline void set_in_cr4 (unsigned long mask)
75029 +{
75030 +       mmu_cr4_features |= mask;
75031 +       __asm__("movq %%cr4,%%rax\n\t"
75032 +               "orq %0,%%rax\n\t"
75033 +               "movq %%rax,%%cr4\n"
75034 +               : : "irg" (mask)
75035 +               :"ax");
75036 +}
75037 +
75038 +static inline void clear_in_cr4 (unsigned long mask)
75039 +{
75040 +       mmu_cr4_features &= ~mask;
75041 +       __asm__("movq %%cr4,%%rax\n\t"
75042 +               "andq %0,%%rax\n\t"
75043 +               "movq %%rax,%%cr4\n"
75044 +               : : "irg" (~mask)
75045 +               :"ax");
75046 +}
75047 +
75048 +
75049 +/*
75050 + * Bus types
75051 + */
75052 +#define MCA_bus 0
75053 +#define MCA_bus__is_a_macro
75054 +
75055 +/*
75056 + * User space process size. 47 bits minus one guard page.
75057 + */
75058 +#define TASK_SIZE64    (0x800000000000UL - 4096)
75059 +
75060 +/* This decides where the kernel will search for a free chunk of vm
75061 + * space during mmap's.
75062 + */
75063 +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
75064 +
75065 +#define TASK_SIZE              (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
75066 +#define TASK_SIZE_OF(child)    ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
75067 +
75068 +#define TASK_UNMAPPED_BASE     PAGE_ALIGN(TASK_SIZE/3)
75069 +
75070 +/*
75071 + * Size of io_bitmap.
75072 + */
75073 +#define IO_BITMAP_BITS  65536
75074 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
75075 +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
75076 +#ifndef CONFIG_X86_NO_TSS
75077 +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
75078 +#endif
75079 +#define INVALID_IO_BITMAP_OFFSET 0x8000
75080 +
75081 +struct i387_fxsave_struct {
75082 +       u16     cwd;
75083 +       u16     swd;
75084 +       u16     twd;
75085 +       u16     fop;
75086 +       u64     rip;
75087 +       u64     rdp; 
75088 +       u32     mxcsr;
75089 +       u32     mxcsr_mask;
75090 +       u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
75091 +       u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 128 bytes */
75092 +       u32     padding[24];
75093 +} __attribute__ ((aligned (16)));
75094 +
75095 +union i387_union {
75096 +       struct i387_fxsave_struct       fxsave;
75097 +};
75098 +
75099 +#ifndef CONFIG_X86_NO_TSS
75100 +struct tss_struct {
75101 +       u32 reserved1;
75102 +       u64 rsp0;       
75103 +       u64 rsp1;
75104 +       u64 rsp2;
75105 +       u64 reserved2;
75106 +       u64 ist[7];
75107 +       u32 reserved3;
75108 +       u32 reserved4;
75109 +       u16 reserved5;
75110 +       u16 io_bitmap_base;
75111 +       /*
75112 +        * The extra 1 is there because the CPU will access an
75113 +        * additional byte beyond the end of the IO permission
75114 +        * bitmap. The extra byte must be all 1 bits, and must
75115 +        * be within the limit. Thus we have:
75116 +        *
75117 +        * 128 bytes, the bitmap itself, for ports 0..0x3ff
75118 +        * 8 bytes, for an extra "long" of ~0UL
75119 +        */
75120 +       unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
75121 +} __attribute__((packed)) ____cacheline_aligned;
75122 +
75123 +DECLARE_PER_CPU(struct tss_struct,init_tss);
75124 +#endif
75125 +
75126 +extern struct cpuinfo_x86 boot_cpu_data;
75127 +
75128 +#ifdef CONFIG_X86_VSMP
75129 +#define ARCH_MIN_TASKALIGN     (1 << INTERNODE_CACHE_SHIFT)
75130 +#define ARCH_MIN_MMSTRUCT_ALIGN        (1 << INTERNODE_CACHE_SHIFT)
75131 +#else
75132 +#define ARCH_MIN_TASKALIGN     16
75133 +#define ARCH_MIN_MMSTRUCT_ALIGN        0
75134 +#endif
75135 +
75136 +struct thread_struct {
75137 +       unsigned long   rsp0;
75138 +       unsigned long   rsp;
75139 +       unsigned long   userrsp;        /* Copy from PDA */ 
75140 +       unsigned long   fs;
75141 +       unsigned long   gs;
75142 +       unsigned short  es, ds, fsindex, gsindex;       
75143 +/* Hardware debugging registers */
75144 +       unsigned long   debugreg0;  
75145 +       unsigned long   debugreg1;  
75146 +       unsigned long   debugreg2;  
75147 +       unsigned long   debugreg3;  
75148 +       unsigned long   debugreg6;  
75149 +       unsigned long   debugreg7;  
75150 +/* fault info */
75151 +       unsigned long   cr2, trap_no, error_code;
75152 +/* floating point info */
75153 +       union i387_union        i387  __attribute__((aligned(16)));
75154 +/* IO permissions. the bitmap could be moved into the GDT, that would make
75155 +   switch faster for a limited number of ioperm using tasks. -AK */
75156 +       int             ioperm;
75157 +       unsigned long   *io_bitmap_ptr;
75158 +       unsigned io_bitmap_max;
75159 +/* cached TLS descriptors. */
75160 +       u64 tls_array[GDT_ENTRY_TLS_ENTRIES];
75161 +       unsigned int    iopl;
75162 +} __attribute__((aligned(16)));
75163 +
75164 +#define INIT_THREAD  { \
75165 +       .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
75166 +}
75167 +
75168 +#ifndef CONFIG_X86_NO_TSS
75169 +#define INIT_TSS  { \
75170 +       .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
75171 +}
75172 +#endif
75173 +
75174 +#define INIT_MMAP \
75175 +{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }
75176 +
75177 +#define start_thread(regs,new_rip,new_rsp) do { \
75178 +       asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0));      \
75179 +       load_gs_index(0);                                                       \
75180 +       (regs)->rip = (new_rip);                                                 \
75181 +       (regs)->rsp = (new_rsp);                                                 \
75182 +       write_pda(oldrsp, (new_rsp));                                            \
75183 +       (regs)->cs = __USER_CS;                                                  \
75184 +       (regs)->ss = __USER_DS;                                                  \
75185 +       (regs)->eflags = 0x200;                                                  \
75186 +       set_fs(USER_DS);                                                         \
75187 +} while(0) 
75188 +
75189 +#define get_debugreg(var, register)                            \
75190 +       var = HYPERVISOR_get_debugreg(register)
75191 +#define set_debugreg(value, register)                  \
75192 +       HYPERVISOR_set_debugreg(register, value)
75193 +
75194 +struct task_struct;
75195 +struct mm_struct;
75196 +
75197 +/* Free all resources held by a thread. */
75198 +extern void release_thread(struct task_struct *);
75199 +
75200 +/* Prepare to copy thread state - unlazy all lazy status */
75201 +extern void prepare_to_copy(struct task_struct *tsk);
75202 +
75203 +/*
75204 + * create a kernel thread without removing it from tasklists
75205 + */
75206 +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
75207 +
75208 +/*
75209 + * Return saved PC of a blocked thread.
75210 + * What is this good for? It will always be the scheduler or ret_from_fork.
75211 + */
75212 +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8))
75213 +
75214 +extern unsigned long get_wchan(struct task_struct *p);
75215 +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1)
75216 +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip)
75217 +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
75218 +
75219 +
75220 +struct microcode_header {
75221 +       unsigned int hdrver;
75222 +       unsigned int rev;
75223 +       unsigned int date;
75224 +       unsigned int sig;
75225 +       unsigned int cksum;
75226 +       unsigned int ldrver;
75227 +       unsigned int pf;
75228 +       unsigned int datasize;
75229 +       unsigned int totalsize;
75230 +       unsigned int reserved[3];
75231 +};
75232 +
75233 +struct microcode {
75234 +       struct microcode_header hdr;
75235 +       unsigned int bits[0];
75236 +};
75237 +
75238 +typedef struct microcode microcode_t;
75239 +typedef struct microcode_header microcode_header_t;
75240 +
75241 +/* microcode format is extended from prescott processors */
75242 +struct extended_signature {
75243 +       unsigned int sig;
75244 +       unsigned int pf;
75245 +       unsigned int cksum;
75246 +};
75247 +
75248 +struct extended_sigtable {
75249 +       unsigned int count;
75250 +       unsigned int cksum;
75251 +       unsigned int reserved[3];
75252 +       struct extended_signature sigs[0];
75253 +};
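
(Editor's aside, not part of the patch: the zero-length "bits[0]" and
"sigs[0]" arrays above are the pre-C99 flexible-array idiom; callers
allocate the header plus trailing payload in one block. A minimal
sketch, assuming ordinary kernel allocation; "payload"/"datasize" are
hypothetical names:

        struct microcode *mc = kmalloc(sizeof(*mc) + datasize, GFP_KERNEL);
        if (mc)
                memcpy(mc->bits, payload, datasize);  /* fill trailing space */
)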
75254 +
75255 +/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */
75256 +#define MICROCODE_IOCFREE      _IO('6',0)
75257 +
75258 +
75259 +#define ASM_NOP1 K8_NOP1
75260 +#define ASM_NOP2 K8_NOP2
75261 +#define ASM_NOP3 K8_NOP3
75262 +#define ASM_NOP4 K8_NOP4
75263 +#define ASM_NOP5 K8_NOP5
75264 +#define ASM_NOP6 K8_NOP6
75265 +#define ASM_NOP7 K8_NOP7
75266 +#define ASM_NOP8 K8_NOP8
75267 +
75268 +/* Opteron nops */
75269 +#define K8_NOP1 ".byte 0x90\n"
75270 +#define K8_NOP2        ".byte 0x66,0x90\n" 
75271 +#define K8_NOP3        ".byte 0x66,0x66,0x90\n" 
75272 +#define K8_NOP4        ".byte 0x66,0x66,0x66,0x90\n" 
75273 +#define K8_NOP5        K8_NOP3 K8_NOP2 
75274 +#define K8_NOP6        K8_NOP3 K8_NOP3
75275 +#define K8_NOP7        K8_NOP4 K8_NOP3
75276 +#define K8_NOP8        K8_NOP4 K8_NOP4
75277 +
75278 +#define ASM_NOP_MAX 8
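
(Editor's aside, not part of the patch: the K8_NOP* sequences above are
single 0x90 NOPs padded with 0x66 operand-size prefixes, which the K8
decodes cheaply; ASM_NOP_MAX bounds the padding needed when, e.g., an
alternative() replacement is shorter than the instruction it replaces.)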
75279 +
75280 +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
75281 +static inline void rep_nop(void)
75282 +{
75283 +       __asm__ __volatile__("rep;nop": : :"memory");
75284 +}
75285 +
75286 +/* Stop speculative execution */
75287 +static inline void sync_core(void)
75288 +{ 
75289 +       int tmp;
75290 +       asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
75291 +} 
75292 +
75293 +#define cpu_has_fpu 1
75294 +
75295 +#define ARCH_HAS_PREFETCH
75296 +static inline void prefetch(void *x) 
75297 +{ 
75298 +       asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
75299 +} 
75300 +
75301 +#define ARCH_HAS_PREFETCHW 1
75302 +static inline void prefetchw(void *x) 
75303 +{ 
75304 +       alternative_input("prefetcht0 (%1)",
75305 +                         "prefetchw (%1)",
75306 +                         X86_FEATURE_3DNOW,
75307 +                         "r" (x));
75308 +} 
75309 +
75310 +#define ARCH_HAS_SPINLOCK_PREFETCH 1
75311 +
75312 +#define spin_lock_prefetch(x)  prefetchw(x)
75313 +
75314 +#define cpu_relax()   rep_nop()
75315 +
75316 +/*
75317 + *      NSC/Cyrix CPU configuration register indexes
75318 + */
75319 +#define CX86_CCR0 0xc0
75320 +#define CX86_CCR1 0xc1
75321 +#define CX86_CCR2 0xc2
75322 +#define CX86_CCR3 0xc3
75323 +#define CX86_CCR4 0xe8
75324 +#define CX86_CCR5 0xe9
75325 +#define CX86_CCR6 0xea
75326 +#define CX86_CCR7 0xeb
75327 +#define CX86_DIR0 0xfe
75328 +#define CX86_DIR1 0xff
75329 +#define CX86_ARR_BASE 0xc4
75330 +#define CX86_RCR_BASE 0xdc
75331 +
75332 +/*
75333 + *      NSC/Cyrix CPU indexed register access macros
75334 + */
75335 +
75336 +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); })
75337 +
75338 +#define setCx86(reg, data) do { \
75339 +       outb((reg), 0x22); \
75340 +       outb((data), 0x23); \
75341 +} while (0)
75342 +
75343 +static inline void serialize_cpu(void)
75344 +{
75345 +       __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
75346 +}
75347 +
75348 +static inline void __monitor(const void *eax, unsigned long ecx,
75349 +               unsigned long edx)
75350 +{
75351 +       /* "monitor %eax,%ecx,%edx;" */
75352 +       asm volatile(
75353 +               ".byte 0x0f,0x01,0xc8;"
75354 +               : :"a" (eax), "c" (ecx), "d"(edx));
75355 +}
75356 +
75357 +static inline void __mwait(unsigned long eax, unsigned long ecx)
75358 +{
75359 +       /* "mwait %eax,%ecx;" */
75360 +       asm volatile(
75361 +               ".byte 0x0f,0x01,0xc9;"
75362 +               : :"a" (eax), "c" (ecx));
75363 +}
75364 +
75365 +#define stack_current() \
75366 +({                                                             \
75367 +       struct thread_info *ti;                                 \
75368 +       asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));  \
75369 +       ti->task;                                       \
75370 +})
75371 +
75372 +#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
75373 +
75374 +extern unsigned long boot_option_idle_override;
75375 +/* Boot loader type from the setup header */
75376 +extern int bootloader_type;
75377 +
75378 +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
75379 +
75380 +#endif /* __ASM_X86_64_PROCESSOR_H */
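
(Editor's aside, not part of the patch: a Xen guest cannot touch the
debug registers directly, which is why get_debugreg()/set_debugreg()
above wrap HYPERVISOR_get_debugreg()/HYPERVISOR_set_debugreg() instead
of mov-to/from %db<n>. A minimal usage sketch; example_arm_bp0() is a
hypothetical caller:

        static void example_arm_bp0(unsigned long addr)
        {
                unsigned long dr7;

                set_debugreg(addr, 0);  /* DR0 <- breakpoint address */
                get_debugreg(dr7, 7);   /* fetch current DR7 bits */
                dr7 |= 0x1;             /* enable local breakpoint 0 */
                set_debugreg(dr7, 7);
        }
)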
75381 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/ptrace.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/ptrace.h
75382 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/ptrace.h  1970-01-01 01:00:00.000000000 +0100
75383 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/ptrace.h       2006-06-26 09:51:32.000000000 +0200
75384 @@ -0,0 +1,125 @@
75385 +#ifndef _X86_64_PTRACE_H
75386 +#define _X86_64_PTRACE_H
75387 +
75388 +#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) 
75389 +#define R15 0
75390 +#define R14 8
75391 +#define R13 16
75392 +#define R12 24
75393 +#define RBP 32
75394 +#define RBX 40
75395 +/* arguments: interrupts/non-tracing syscalls only save up to here */
75396 +#define R11 48
75397 +#define R10 56 
75398 +#define R9 64
75399 +#define R8 72
75400 +#define RAX 80
75401 +#define RCX 88
75402 +#define RDX 96
75403 +#define RSI 104
75404 +#define RDI 112
75405 +#define ORIG_RAX 120       /* = ERROR */ 
75406 +/* end of arguments */         
75407 +/* cpu exception frame or undefined in case of fast syscall. */
75408 +#define RIP 128
75409 +#define CS 136
75410 +#define EFLAGS 144
75411 +#define RSP 152
75412 +#define SS 160
75413 +#define ARGOFFSET R11
75414 +#endif /* __ASSEMBLY__ */
75415 +
75416 +/* top of stack page */ 
75417 +#define FRAME_SIZE 168
75418 +
75419 +#define PTRACE_OLDSETOPTIONS         21
75420 +
75421 +#ifndef __ASSEMBLY__ 
75422 +
75423 +struct pt_regs {
75424 +       unsigned long r15;
75425 +       unsigned long r14;
75426 +       unsigned long r13;
75427 +       unsigned long r12;
75428 +       unsigned long rbp;
75429 +       unsigned long rbx;
75430 +/* arguments: non-interrupt/non-tracing syscalls only save up to here */
75431 +       unsigned long r11;
75432 +       unsigned long r10;      
75433 +       unsigned long r9;
75434 +       unsigned long r8;
75435 +       unsigned long rax;
75436 +       unsigned long rcx;
75437 +       unsigned long rdx;
75438 +       unsigned long rsi;
75439 +       unsigned long rdi;
75440 +       unsigned long orig_rax;
75441 +/* end of arguments */         
75442 +/* cpu exception frame or undefined */
75443 +       unsigned long rip;
75444 +       unsigned long cs;
75445 +       unsigned long eflags; 
75446 +       unsigned long rsp; 
75447 +       unsigned long ss;
75448 +/* top of stack page */ 
75449 +};
75450 +
75451 +#endif
75452 +
75453 +/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
75454 +#define PTRACE_GETREGS            12
75455 +#define PTRACE_SETREGS            13
75456 +#define PTRACE_GETFPREGS          14
75457 +#define PTRACE_SETFPREGS          15
75458 +#define PTRACE_GETFPXREGS         18
75459 +#define PTRACE_SETFPXREGS         19
75460 +
75461 +/* only useful for accessing 32-bit programs */
75462 +#define PTRACE_GET_THREAD_AREA    25
75463 +#define PTRACE_SET_THREAD_AREA    26
75464 +
75465 +#define PTRACE_ARCH_PRCTL        30    /* arch_prctl for child */
75466 +
75467 +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 
75468 +#define user_mode(regs) (!!((regs)->cs & 3))
75469 +#define user_mode_vm(regs) user_mode(regs)
75470 +#define instruction_pointer(regs) ((regs)->rip)
75471 +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
75472 +extern unsigned long profile_pc(struct pt_regs *regs);
75473 +#else
75474 +#define profile_pc(regs) instruction_pointer(regs)
75475 +#endif
75476 +
75477 +void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
75478 +
75479 +struct task_struct;
75480 +
75481 +extern unsigned long
75482 +convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs);
75483 +
75484 +enum {
75485 +        EF_CF   = 0x00000001,
75486 +        EF_PF   = 0x00000004,
75487 +        EF_AF   = 0x00000010,
75488 +        EF_ZF   = 0x00000040,
75489 +        EF_SF   = 0x00000080,
75490 +        EF_TF   = 0x00000100,
75491 +        EF_IE   = 0x00000200,
75492 +        EF_DF   = 0x00000400,
75493 +        EF_OF   = 0x00000800,
75494 +        EF_IOPL = 0x00003000,
75495 +        EF_IOPL_RING0 = 0x00000000,
75496 +        EF_IOPL_RING1 = 0x00001000,
75497 +        EF_IOPL_RING2 = 0x00002000,
75498 +        EF_NT   = 0x00004000,   /* nested task */
75499 +        EF_RF   = 0x00010000,   /* resume */
75500 +        EF_VM   = 0x00020000,   /* virtual mode */
75501 +        EF_AC   = 0x00040000,   /* alignment */
75502 +        EF_VIF  = 0x00080000,   /* virtual interrupt */
75503 +        EF_VIP  = 0x00100000,   /* virtual interrupt pending */
75504 +        EF_ID   = 0x00200000,   /* id */
75505 +};
75506 +
75507 +#endif
75508 +
75509 +#endif
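
(Editor's aside, not part of the patch: user_mode() above tests the low
two (RPL) bits of the saved %cs in the exception frame. A hedged sketch
of the usual caller pattern; example_report_fault() is hypothetical:

        static void example_report_fault(struct pt_regs *regs)
        {
                if (user_mode(regs))
                        printk("user fault at rip %lx\n", instruction_pointer(regs));
                else
                        printk("kernel fault at rip %lx\n", regs->rip);
        }
)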
75510 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/smp.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/smp.h
75511 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/smp.h     1970-01-01 01:00:00.000000000 +0100
75512 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/smp.h  2006-06-26 09:51:32.000000000 +0200
75513 @@ -0,0 +1,152 @@
75514 +#ifndef __ASM_SMP_H
75515 +#define __ASM_SMP_H
75516 +
75517 +/*
75518 + * We need the APIC definitions automatically as part of 'smp.h'
75519 + */
75520 +#ifndef __ASSEMBLY__
75521 +#include <linux/config.h>
75522 +#include <linux/threads.h>
75523 +#include <linux/cpumask.h>
75524 +#include <linux/bitops.h>
75525 +extern int disable_apic;
75526 +#endif
75527 +
75528 +#ifdef CONFIG_X86_LOCAL_APIC
75529 +#ifndef __ASSEMBLY__
75530 +#include <asm/fixmap.h>
75531 +#include <asm/mpspec.h>
75532 +#ifdef CONFIG_X86_IO_APIC
75533 +#include <asm/io_apic.h>
75534 +#endif
75535 +#include <asm/apic.h>
75536 +#include <asm/thread_info.h>
75537 +#endif
75538 +#endif
75539 +
75540 +#ifdef CONFIG_SMP
75541 +#ifndef ASSEMBLY
75542 +
75543 +#include <asm/pda.h>
75544 +
75545 +struct pt_regs;
75546 +
75547 +extern cpumask_t cpu_present_mask;
75548 +extern cpumask_t cpu_possible_map;
75549 +extern cpumask_t cpu_online_map;
75550 +extern cpumask_t cpu_initialized;
75551 +
75552 +/*
75553 + * Private routines/data
75554 + */
75555 +
75556 +extern void smp_alloc_memory(void);
75557 +extern volatile unsigned long smp_invalidate_needed;
75558 +extern int pic_mode;
75559 +extern void lock_ipi_call_lock(void);
75560 +extern void unlock_ipi_call_lock(void);
75561 +extern int smp_num_siblings;
75562 +extern void smp_send_reschedule(int cpu);
75563 +void smp_stop_cpu(void);
75564 +extern int smp_call_function_single(int cpuid, void (*func) (void *info),
75565 +                               void *info, int retry, int wait);
75566 +
75567 +extern cpumask_t cpu_sibling_map[NR_CPUS];
75568 +extern cpumask_t cpu_core_map[NR_CPUS];
75569 +extern int phys_proc_id[NR_CPUS];
75570 +extern int cpu_core_id[NR_CPUS];
75571 +
75572 +#define SMP_TRAMPOLINE_BASE 0x6000
75573 +
75574 +/*
75575 + * On x86 all CPUs are mapped 1:1 to the APIC space.
75576 + * This simplifies scheduling and IPI sending and
75577 + * compresses data structures.
75578 + */
75579 +
75580 +static inline int num_booting_cpus(void)
75581 +{
75582 +       return cpus_weight(cpu_possible_map);
75583 +}
75584 +
75585 +#define raw_smp_processor_id() read_pda(cpunumber)
75586 +
75587 +#ifdef CONFIG_X86_LOCAL_APIC
75588 +static inline int hard_smp_processor_id(void)
75589 +{
75590 +       /* we don't want to mark this access volatile - bad code generation */
75591 +       return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
75592 +}
75593 +#endif
75594 +
75595 +extern int safe_smp_processor_id(void);
75596 +extern int __cpu_disable(void);
75597 +extern void __cpu_die(unsigned int cpu);
75598 +extern void prefill_possible_map(void);
75599 +extern unsigned num_processors;
75600 +extern unsigned disabled_cpus;
75601 +
75602 +#endif /* !ASSEMBLY */
75603 +
75604 +#define NO_PROC_ID             0xFF            /* No processor magic marker */
75605 +
75606 +#endif
75607 +
75608 +#ifndef ASSEMBLY
75609 +/*
75610 + * Some lowlevel functions might want to know about
75611 + * the real APIC ID <-> CPU # mapping.
75612 + */
75613 +extern u8 x86_cpu_to_apicid[NR_CPUS];  /* physical ID */
75614 +extern u8 x86_cpu_to_log_apicid[NR_CPUS];
75615 +extern u8 bios_cpu_apicid[];
75616 +
75617 +#ifdef CONFIG_X86_LOCAL_APIC
75618 +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
75619 +{
75620 +       return cpus_addr(cpumask)[0];
75621 +}
75622 +
75623 +static inline int cpu_present_to_apicid(int mps_cpu)
75624 +{
75625 +       if (mps_cpu < NR_CPUS)
75626 +               return (int)bios_cpu_apicid[mps_cpu];
75627 +       else
75628 +               return BAD_APICID;
75629 +}
75630 +#endif
75631 +
75632 +#endif /* !ASSEMBLY */
75633 +
75634 +#ifndef CONFIG_SMP
75635 +#define stack_smp_processor_id() 0
75636 +#define safe_smp_processor_id() 0
75637 +#define cpu_logical_map(x) (x)
75638 +#else
75639 +#include <asm/thread_info.h>
75640 +#define stack_smp_processor_id() \
75641 +({                                                             \
75642 +       struct thread_info *ti;                                 \
75643 +       __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));      \
75644 +       ti->cpu;                                                \
75645 +})
75646 +#endif
75647 +
75648 +#ifndef __ASSEMBLY__
75649 +#ifdef CONFIG_X86_LOCAL_APIC
75650 +static __inline int logical_smp_processor_id(void)
75651 +{
75652 +       /* we don't want to mark this access volatile - bad code generation */
75653 +       return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
75654 +}
75655 +#endif
75656 +#endif
75657 +
75658 +#ifdef CONFIG_SMP
75659 +#define cpu_physical_id(cpu)           x86_cpu_to_apicid[cpu]
75660 +#else
75661 +#define cpu_physical_id(cpu)           boot_cpu_id
75662 +#endif
75663 +
75664 +#endif
75665 +
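(Editor's aside, not part of the patch: stack_smp_processor_id() above
relies on the kernel stack being THREAD_SIZE-aligned, so masking %rsp
with CURRENT_MASK lands on the thread_info at the base of the stack.
The same trick in plain C, as a sketch, assuming
CURRENT_MASK == ~(THREAD_SIZE - 1):

        static inline struct thread_info *example_ti_from_rsp(unsigned long rsp)
        {
                return (struct thread_info *)(rsp & ~(THREAD_SIZE - 1UL));
        }
)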
75666 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/synch_bitops.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/synch_bitops.h
75667 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/synch_bitops.h    1970-01-01 01:00:00.000000000 +0100
75668 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/synch_bitops.h 2006-06-26 09:51:32.000000000 +0200
75669 @@ -0,0 +1,2 @@
75670 +
75671 +#include <asm-i386/mach-xen/asm/synch_bitops.h>
75672 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/system.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/system.h
75673 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/system.h  1970-01-01 01:00:00.000000000 +0100
75674 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/system.h       2006-06-26 09:51:32.000000000 +0200
75675 @@ -0,0 +1,434 @@
75676 +#ifndef __ASM_SYSTEM_H
75677 +#define __ASM_SYSTEM_H
75678 +
75679 +#include <linux/config.h>
75680 +#include <linux/kernel.h>
75681 +#include <asm/segment.h>
75682 +#include <asm/synch_bitops.h>
75683 +#include <asm/hypervisor.h>
75684 +#include <xen/interface/arch-x86_64.h>
75685 +
75686 +#ifdef __KERNEL__
75687 +
75688 +#ifdef CONFIG_SMP
75689 +#define __vcpu_id smp_processor_id()
75690 +#else
75691 +#define __vcpu_id 0
75692 +#endif
75693 +
75694 +#ifdef CONFIG_SMP
75695 +#define LOCK_PREFIX "lock ; "
75696 +#else
75697 +#define LOCK_PREFIX ""
75698 +#endif
75699 +
75700 +#define __STR(x) #x
75701 +#define STR(x) __STR(x)
75702 +
75703 +#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
75704 +#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
75705 +
75706 +/* frame pointer must be last for get_wchan */
75707 +#define SAVE_CONTEXT    "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
75708 +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\n\t"
75709 +
75710 +#define __EXTRA_CLOBBER  \
75711 +       ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
75712 +
75713 +#define switch_to(prev,next,last) \
75714 +       asm volatile(SAVE_CONTEXT                                                   \
75715 +                    "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
75716 +                    "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
75717 +                    "call __switch_to\n\t"                                       \
75718 +                    ".globl thread_return\n"                                   \
75719 +                    "thread_return:\n\t"                                           \
75720 +                    "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"                       \
75721 +                    "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
75722 +                    LOCK "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"               \
75723 +                    "movq %%rax,%%rdi\n\t"                                       \
75724 +                    "jc   ret_from_fork\n\t"                                     \
75725 +                    RESTORE_CONTEXT                                                \
75726 +                    : "=a" (last)                                                \
75727 +                    : [next] "S" (next), [prev] "D" (prev),                      \
75728 +                      [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
75729 +                      [ti_flags] "i" (offsetof(struct thread_info, flags)),\
75730 +                      [tif_fork] "i" (TIF_FORK),                         \
75731 +                      [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
75732 +                      [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))   \
75733 +                    : "memory", "cc" __EXTRA_CLOBBER)
75734 +    
75735 +
75736 +extern void load_gs_index(unsigned);
75737 +
75738 +/*
75739 + * Load a segment. Fall back on loading the zero
75740 + * segment if something goes wrong..
75741 + */
75742 +#define loadsegment(seg,value) \
75743 +       asm volatile("\n"                       \
75744 +               "1:\t"                          \
75745 +               "movl %k0,%%" #seg "\n"         \
75746 +               "2:\n"                          \
75747 +               ".section .fixup,\"ax\"\n"      \
75748 +               "3:\t"                          \
75749 +               "movl %1,%%" #seg "\n\t"        \
75750 +               "jmp 2b\n"                      \
75751 +               ".previous\n"                   \
75752 +               ".section __ex_table,\"a\"\n\t" \
75753 +               ".align 8\n\t"                  \
75754 +               ".quad 1b,3b\n"                 \
75755 +               ".previous"                     \
75756 +               : :"r" (value), "r" (0))
75757 +
75758 +#define set_debug(value,register) \
75759 +                __asm__("movq %0,%%db" #register  \
75760 +               : /* no output */ \
75761 +               :"r" ((unsigned long) value))
75762 +
75763 +
75764 +#ifdef __KERNEL__
75765 +struct alt_instr { 
75766 +       __u8 *instr;            /* original instruction */
75767 +       __u8 *replacement;
75768 +       __u8  cpuid;            /* cpuid bit set for replacement */
75769 +       __u8  instrlen;         /* length of original instruction */
75770 +       __u8  replacementlen;   /* length of new instruction, <= instrlen */ 
75771 +       __u8  pad[5];
75772 +}; 
75773 +#endif
75774 +
75775 +/*
75776 + * Alternative instructions for different CPU types or capabilities.
75777 + * 
75778 + * This allows the use of optimized instructions even on generic binary
75779 + * kernels.
75780 + * 
75781 + * The length of oldinstr must be greater than or equal to the length
75782 + * of newinstr; the remainder can be padded with NOPs as needed.
75783 + * 
75784 + * For non barrier like inlines please define new variants
75785 + * without volatile and memory clobber.
75786 + */
75787 +#define alternative(oldinstr, newinstr, feature)       \
75788 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                 \
75789 +                     ".section .altinstructions,\"a\"\n"            \
75790 +                     "  .align 8\n"                                   \
75791 +                     "  .quad 661b\n"            /* label */          \
75792 +                     "  .quad 663f\n"            /* new instruction */ \
75793 +                     "  .byte %c0\n"             /* feature bit */    \
75794 +                     "  .byte 662b-661b\n"       /* sourcelen */      \
75795 +                     "  .byte 664f-663f\n"       /* replacementlen */ \
75796 +                     ".previous\n"                                     \
75797 +                     ".section .altinstr_replacement,\"ax\"\n"         \
75798 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
75799 +                     ".previous" :: "i" (feature) : "memory")  
75800 +
75801 +/*
75802 + * Alternative inline assembly with input.
75803 + * 
75804 + * Peculiarities:
75805 + * No memory clobber here. 
75806 + * Argument numbers start with 1.
75807 + * Best is to use constraints that are fixed size (like (%1) ... "r")
75808 + * If you use variable sized constraints like "m" or "g" in the 
75809 + * replacement make sure to pad to the worst case length.
75810 + */
75811 +#define alternative_input(oldinstr, newinstr, feature, input...)       \
75812 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
75813 +                     ".section .altinstructions,\"a\"\n"               \
75814 +                     "  .align 8\n"                                    \
75815 +                     "  .quad 661b\n"            /* label */           \
75816 +                     "  .quad 663f\n"            /* new instruction */ \
75817 +                     "  .byte %c0\n"             /* feature bit */     \
75818 +                     "  .byte 662b-661b\n"       /* sourcelen */       \
75819 +                     "  .byte 664f-663f\n"       /* replacementlen */  \
75820 +                     ".previous\n"                                     \
75821 +                     ".section .altinstr_replacement,\"ax\"\n"         \
75822 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
75823 +                     ".previous" :: "i" (feature), ##input)
75824 +
75825 +/* Like alternative_input, but with a single output argument */
75826 +#define alternative_io(oldinstr, newinstr, feature, output, input...) \
75827 +       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
75828 +                     ".section .altinstructions,\"a\"\n"               \
75829 +                     "  .align 8\n"                                    \
75830 +                     "  .quad 661b\n"            /* label */           \
75831 +                     "  .quad 663f\n"            /* new instruction */ \
75832 +                     "  .byte %c[feat]\n"        /* feature bit */     \
75833 +                     "  .byte 662b-661b\n"       /* sourcelen */       \
75834 +                     "  .byte 664f-663f\n"       /* replacementlen */  \
75835 +                     ".previous\n"                                     \
75836 +                     ".section .altinstr_replacement,\"ax\"\n"         \
75837 +                     "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
75838 +                     ".previous" : output : [feat] "i" (feature), ##input)
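
(Editor's aside, not part of the patch: prefetchw() in processor.h
earlier in this patch is a live user of alternative_input() -- the
generic "prefetcht0 (%1)" is rewritten to "prefetchw (%1)" at boot on
CPUs that advertise X86_FEATURE_3DNOW, exactly the
oldinstr/newinstr/feature triple described above.)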
75839 +
75840 +/*
75841 + * Clear and set 'TS' bit respectively
75842 + */
75843 +#define clts() (HYPERVISOR_fpu_taskswitch(0))
75844 +
75845 +static inline unsigned long read_cr0(void)
75846 +{ 
75847 +       unsigned long cr0;
75848 +       asm volatile("movq %%cr0,%0" : "=r" (cr0));
75849 +       return cr0;
75850 +} 
75851 +
75852 +static inline void write_cr0(unsigned long val) 
75853 +{ 
75854 +       asm volatile("movq %0,%%cr0" :: "r" (val));
75855 +} 
75856 +
75857 +#define read_cr3() ({ \
75858 +       unsigned long __dummy; \
75859 +       asm("movq %%cr3,%0" : "=r" (__dummy)); \
75860 +       machine_to_phys(__dummy); \
75861 +})
75862 +
75863 +static inline unsigned long read_cr4(void)
75864 +{ 
75865 +       unsigned long cr4;
75866 +       asm("movq %%cr4,%0" : "=r" (cr4));
75867 +       return cr4;
75868 +} 
75869 +
75870 +static inline void write_cr4(unsigned long val)
75871 +{ 
75872 +       asm volatile("movq %0,%%cr4" :: "r" (val));
75873 +} 
75874 +
75875 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
75876 +
75877 +#define wbinvd() \
75878 +       __asm__ __volatile__ ("wbinvd": : :"memory");
75879 +
75880 +/*
75881 + * On SMP systems, when the scheduler does migration-cost autodetection,
75882 + * it needs a way to flush as much of the CPU's caches as possible.
75883 + */
75884 +static inline void sched_cacheflush(void)
75885 +{
75886 +       wbinvd();
75887 +}
75888 +
75889 +#endif /* __KERNEL__ */
75890 +
75891 +#define nop() __asm__ __volatile__ ("nop")
75892 +
75893 +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
75894 +
75895 +#define tas(ptr) (xchg((ptr),1))
75896 +
75897 +#define __xg(x) ((volatile long *)(x))
75898 +
75899 +static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
75900 +{
75901 +       *ptr = val;
75902 +}
75903 +
75904 +#define _set_64bit set_64bit
75905 +
75906 +/*
75907 + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
75908 + * Note 2: xchg has side effect, so that attribute volatile is necessary,
75909 + *       but generally the primitive is invalid, *ptr is output argument. --ANK
75910 + */
75911 +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
75912 +{
75913 +       switch (size) {
75914 +               case 1:
75915 +                       __asm__ __volatile__("xchgb %b0,%1"
75916 +                               :"=q" (x)
75917 +                               :"m" (*__xg(ptr)), "0" (x)
75918 +                               :"memory");
75919 +                       break;
75920 +               case 2:
75921 +                       __asm__ __volatile__("xchgw %w0,%1"
75922 +                               :"=r" (x)
75923 +                               :"m" (*__xg(ptr)), "0" (x)
75924 +                               :"memory");
75925 +                       break;
75926 +               case 4:
75927 +                       __asm__ __volatile__("xchgl %k0,%1"
75928 +                               :"=r" (x)
75929 +                               :"m" (*__xg(ptr)), "0" (x)
75930 +                               :"memory");
75931 +                       break;
75932 +               case 8:
75933 +                       __asm__ __volatile__("xchgq %0,%1"
75934 +                               :"=r" (x)
75935 +                               :"m" (*__xg(ptr)), "0" (x)
75936 +                               :"memory");
75937 +                       break;
75938 +       }
75939 +       return x;
75940 +}
75941 +
75942 +/*
75943 + * Atomic compare and exchange.  Compare OLD with MEM, if identical,
75944 + * store NEW in MEM.  Return the initial value in MEM.  Success is
75945 + * indicated by comparing RETURN with OLD.
75946 + */
75947 +
75948 +#define __HAVE_ARCH_CMPXCHG 1
75949 +
75950 +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
75951 +                                     unsigned long new, int size)
75952 +{
75953 +       unsigned long prev;
75954 +       switch (size) {
75955 +       case 1:
75956 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
75957 +                                    : "=a"(prev)
75958 +                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
75959 +                                    : "memory");
75960 +               return prev;
75961 +       case 2:
75962 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
75963 +                                    : "=a"(prev)
75964 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
75965 +                                    : "memory");
75966 +               return prev;
75967 +       case 4:
75968 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
75969 +                                    : "=a"(prev)
75970 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
75971 +                                    : "memory");
75972 +               return prev;
75973 +       case 8:
75974 +               __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
75975 +                                    : "=a"(prev)
75976 +                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
75977 +                                    : "memory");
75978 +               return prev;
75979 +       }
75980 +       return old;
75981 +}
75982 +
75983 +#define cmpxchg(ptr,o,n)\
75984 +       ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
75985 +                                       (unsigned long)(n),sizeof(*(ptr))))
75986 +
75987 +#ifdef CONFIG_SMP
75988 +#define smp_mb()       mb()
75989 +#define smp_rmb()      rmb()
75990 +#define smp_wmb()      wmb()
75991 +#define smp_read_barrier_depends()     do {} while(0)
75992 +#else
75993 +#define smp_mb()       barrier()
75994 +#define smp_rmb()      barrier()
75995 +#define smp_wmb()      barrier()
75996 +#define smp_read_barrier_depends()     do {} while(0)
75997 +#endif
75998 +
75999 +    
76000 +/*
76001 + * Force strict CPU ordering.
76002 + * And yes, this is required on UP too when we're talking
76003 + * to devices.
76004 + */
76005 +#define mb()   asm volatile("mfence":::"memory")
76006 +#define rmb()  asm volatile("lfence":::"memory")
76007 +
76008 +#ifdef CONFIG_UNORDERED_IO
76009 +#define wmb()  asm volatile("sfence" ::: "memory")
76010 +#else
76011 +#define wmb()  asm volatile("" ::: "memory")
76012 +#endif
76013 +#define read_barrier_depends() do {} while(0)
76014 +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
76015 +#define set_wmb(var, value) do { var = value; wmb(); } while (0)
76016 +
76017 +#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
76018 +
76019 +
76020 +/* 
76021 + * The use of 'barrier' in the following reflects their use as local-lock
76022 + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
76023 + * critical operations are executed. All critical operations must complete
76024 + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
76025 + * includes these barriers, for example.
76026 + */
76027 +
76028 +#define __cli()                                                                \
76029 +do {                                                                   \
76030 +       vcpu_info_t *_vcpu;                                             \
76031 +       preempt_disable();                                              \
76032 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
76033 +       _vcpu->evtchn_upcall_mask = 1;                                  \
76034 +       preempt_enable_no_resched();                                    \
76035 +       barrier();                                                      \
76036 +} while (0)
76037 +
76038 +#define __sti()                                                                \
76039 +do {                                                                   \
76040 +       vcpu_info_t *_vcpu;                                             \
76041 +       barrier();                                                      \
76042 +       preempt_disable();                                              \
76043 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
76044 +       _vcpu->evtchn_upcall_mask = 0;                                  \
76045 +       barrier(); /* unmask then check (avoid races) */                \
76046 +       if ( unlikely(_vcpu->evtchn_upcall_pending) )                   \
76047 +               force_evtchn_callback();                                \
76048 +       preempt_enable();                                               \
76049 +} while (0)
76050 +
76051 +#define __save_flags(x)                                                        \
76052 +do {                                                                   \
76053 +       vcpu_info_t *_vcpu;                                             \
76054 +       preempt_disable();                                              \
76055 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
76056 +       (x) = _vcpu->evtchn_upcall_mask;                                \
76057 +       preempt_enable();                                               \
76058 +} while (0)
76059 +
76060 +#define __restore_flags(x)                                             \
76061 +do {                                                                   \
76062 +       vcpu_info_t *_vcpu;                                             \
76063 +       barrier();                                                      \
76064 +       preempt_disable();                                              \
76065 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
76066 +       if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {                   \
76067 +               barrier(); /* unmask then check (avoid races) */        \
76068 +               if ( unlikely(_vcpu->evtchn_upcall_pending) )           \
76069 +                       force_evtchn_callback();                        \
76070 +               preempt_enable();                                       \
76071 +       } else                                                          \
76072 +               preempt_enable_no_resched();                            \
76073 +} while (0)
76074 +
76075 +#define __save_and_cli(x)                                              \
76076 +do {                                                                   \
76077 +       vcpu_info_t *_vcpu;                                             \
76078 +       preempt_disable();                                              \
76079 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
76080 +       (x) = _vcpu->evtchn_upcall_mask;                                \
76081 +       _vcpu->evtchn_upcall_mask = 1;                                  \
76082 +       preempt_enable_no_resched();                                    \
76083 +       barrier();                                                      \
76084 +} while (0)
76085 +
76086 +#define local_irq_save(x)      __save_and_cli(x)
76087 +#define local_irq_restore(x)   __restore_flags(x)
76088 +#define local_save_flags(x)    __save_flags(x)
76089 +#define local_irq_disable()    __cli()
76090 +#define local_irq_enable()     __sti()
76091 +
76092 +/* Cannot use preempt_enable() here as we would recurse in preempt_sched(). */
76093 +#define irqs_disabled()                                                        \
76094 +({     int ___x;                                                       \
76095 +       vcpu_info_t *_vcpu;                                             \
76096 +       preempt_disable();                                              \
76097 +       _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];          \
76098 +       ___x = (_vcpu->evtchn_upcall_mask != 0);                        \
76099 +       preempt_enable_no_resched();                                    \
76100 +       ___x; })
76101 +
76102 +#define safe_halt()            ((void)0)
76103 +#define halt()                 ((void)0)
76104 +
76105 +void cpu_idle_wait(void);
76106 +
76107 +extern unsigned long arch_align_stack(unsigned long sp);
76108 +
76109 +#endif
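
(Editor's aside, not part of the patch: the usual retry-loop pattern
built on the cmpxchg() defined above -- success is detected by comparing
the returned value with the expected old value. A minimal sketch;
example_atomic_add() is a hypothetical helper:

        static inline void example_atomic_add(volatile unsigned long *p,
                                              unsigned long delta)
        {
                unsigned long old;

                do {
                        old = *p;                       /* snapshot */
                } while (cmpxchg(p, old, old + delta) != old);
        }
)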
76110 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/timer.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/timer.h
76111 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/timer.h   1970-01-01 01:00:00.000000000 +0100
76112 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/timer.h        2006-06-26 09:51:32.000000000 +0200
76113 @@ -0,0 +1,67 @@
76114 +#ifndef _ASMi386_TIMER_H
76115 +#define _ASMi386_TIMER_H
76116 +#include <linux/init.h>
76117 +
76118 +/**
76119 + * struct timer_opts - used to define a timer source
76120 + *
76121 + * @name: name of the timer.
76122 + * @init: Probes and initializes the timer. Takes clock= override 
76123 + *        string as an argument. Returns 0 on success, anything else
76124 + *        on failure.
76125 + * @mark_offset: called by the timer interrupt.
76126 + * @get_offset:  called by gettimeofday(). Returns the number of microseconds
76127 + *               since the last timer interrupt.
76128 + * @monotonic_clock: returns the number of nanoseconds since the init of the
76129 + *                   timer.
76130 + * @delay: delays this many clock cycles.
76131 + */
76132 +struct timer_opts {
76133 +       char* name;
76134 +       void (*mark_offset)(void);
76135 +       unsigned long (*get_offset)(void);
76136 +       unsigned long long (*monotonic_clock)(void);
76137 +       void (*delay)(unsigned long);
76138 +       unsigned long (*read_timer)(void);
76139 +       int (*suspend)(pm_message_t state);
76140 +       int (*resume)(void);
76141 +};
76142 +
76143 +struct init_timer_opts {
76144 +       int (*init)(char *override);
76145 +       struct timer_opts *opts;
76146 +};
76147 +
76148 +#define TICK_SIZE (tick_nsec / 1000)
76149 +
76150 +extern struct timer_opts* __init select_timer(void);
76151 +extern void clock_fallback(void);
76152 +void setup_pit_timer(void);
76153 +
76154 +/* Modifiers for buggy PIT handling */
76155 +
76156 +extern int pit_latch_buggy;
76157 +
76158 +extern struct timer_opts *cur_timer;
76159 +extern int timer_ack;
76160 +
76161 +/* list of externed timers */
76162 +extern struct timer_opts timer_none;
76163 +extern struct timer_opts timer_pit;
76164 +extern struct init_timer_opts timer_pit_init;
76165 +extern struct init_timer_opts timer_tsc_init;
76166 +#ifdef CONFIG_X86_CYCLONE_TIMER
76167 +extern struct init_timer_opts timer_cyclone_init;
76168 +#endif
76169 +
76170 +extern unsigned long calibrate_tsc(void);
76171 +extern void init_cpu_khz(void);
76172 +#ifdef CONFIG_HPET_TIMER
76173 +extern struct init_timer_opts timer_hpet_init;
76174 +extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr);
76175 +#endif
76176 +
76177 +#ifdef CONFIG_X86_PM_TIMER
76178 +extern struct init_timer_opts timer_pmtmr_init;
76179 +#endif
76180 +#endif
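
(Editor's aside, not part of the patch: a skeleton of how a timer source
plugs into the timer_opts interface above; every name here is
hypothetical and the stub bodies are placeholders:

        static void example_mark_offset(void) { }
        static unsigned long example_get_offset(void) { return 0; }
        static unsigned long long example_monotonic_clock(void) { return 0; }
        static void example_delay(unsigned long loops)
        {
                while (loops--)
                        cpu_relax();    /* rep;nop busy-wait, see processor.h */
        }

        static struct timer_opts timer_example = {
                .name            = "example",
                .mark_offset     = example_mark_offset,
                .get_offset      = example_get_offset,
                .monotonic_clock = example_monotonic_clock,
                .delay           = example_delay,
        };
)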
76181 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/tlbflush.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/tlbflush.h
76182 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/tlbflush.h        1970-01-01 01:00:00.000000000 +0100
76183 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/tlbflush.h     2006-06-26 09:51:32.000000000 +0200
76184 @@ -0,0 +1,104 @@
76185 +#ifndef _X8664_TLBFLUSH_H
76186 +#define _X8664_TLBFLUSH_H
76187 +
76188 +#include <linux/config.h>
76189 +#include <linux/mm.h>
76190 +#include <asm/processor.h>
76191 +
76192 +#define __flush_tlb()  xen_tlb_flush()
76193 +
76194 +/*
76195 + * Global pages have to be flushed a bit differently. Not a real
76196 + * performance problem because this does not happen often.
76197 + */
76198 +#define __flush_tlb_global()   xen_tlb_flush()
76199 +
76200 +
76201 +extern unsigned long pgkern_mask;
76202 +
76203 +#define __flush_tlb_all() __flush_tlb_global()
76204 +
76205 +#define __flush_tlb_one(addr)  xen_invlpg((unsigned long)addr)
76206 +
76207 +
76208 +/*
76209 + * TLB flushing:
76210 + *
76211 + *  - flush_tlb() flushes the current mm struct TLBs
76212 + *  - flush_tlb_all() flushes all processes TLBs
76213 + *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
76214 + *  - flush_tlb_page(vma, vmaddr) flushes one page
76215 + *  - flush_tlb_range(vma, start, end) flushes a range of pages
76216 + *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
76217 + *  - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
76218 + *
76219 + * x86-64 can only flush individual pages or full VMs. For a range flush
76220 + * we always do the full VM. Might be worth trying if for a small
76221 + * range a few INVLPGs in a row are a win.
76222 + */
76223 +
76224 +#ifndef CONFIG_SMP
76225 +
76226 +#define flush_tlb() __flush_tlb()
76227 +#define flush_tlb_all() __flush_tlb_all()
76228 +#define local_flush_tlb() __flush_tlb()
76229 +
76230 +static inline void flush_tlb_mm(struct mm_struct *mm)
76231 +{
76232 +       if (mm == current->active_mm)
76233 +               __flush_tlb();
76234 +}
76235 +
76236 +static inline void flush_tlb_page(struct vm_area_struct *vma,
76237 +       unsigned long addr)
76238 +{
76239 +       if (vma->vm_mm == current->active_mm)
76240 +               __flush_tlb_one(addr);
76241 +}
76242 +
76243 +static inline void flush_tlb_range(struct vm_area_struct *vma,
76244 +       unsigned long start, unsigned long end)
76245 +{
76246 +       if (vma->vm_mm == current->active_mm)
76247 +               __flush_tlb();
76248 +}
76249 +
76250 +#else
76251 +
76252 +#include <asm/smp.h>
76253 +
76254 +#define local_flush_tlb() \
76255 +       __flush_tlb()
76256 +
76257 +extern void flush_tlb_all(void);
76258 +extern void flush_tlb_current_task(void);
76259 +extern void flush_tlb_mm(struct mm_struct *);
76260 +extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
76261 +
76262 +#define flush_tlb()    flush_tlb_current_task()
76263 +
76264 +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end)
76265 +{
76266 +       flush_tlb_mm(vma->vm_mm);
76267 +}
76268 +
76269 +#define TLBSTATE_OK    1
76270 +#define TLBSTATE_LAZY  2
76271 +
76272 +/* Roughly an IPI every 20MB with 4k pages for freeing page table
76273 +   ranges. Cost is about 42k of memory for each CPU. */
76274 +#define ARCH_FREE_PTE_NR 5350  
76275 +
76276 +#endif
76277 +
76278 +#define flush_tlb_kernel_range(start, end) flush_tlb_all()
76279 +
76280 +static inline void flush_tlb_pgtables(struct mm_struct *mm,
76281 +                                     unsigned long start, unsigned long end)
76282 +{
76283 +       /* x86_64 does not keep any page table caches in a software TLB.
76284 +          The CPUs do in their hardware TLBs, but they are handled
76285 +          by the normal TLB flushing algorithms. */
76286 +}
76287 +
76288 +#endif /* _X8664_TLBFLUSH_H */
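
(Editor's aside, not part of the patch: the canonical caller pattern for
the interface above -- change a translation, then flush exactly the
stale entry. A sketch; set_pte_at() stands in for whichever PTE-update
primitive the caller actually uses:

        static void example_update_pte(struct vm_area_struct *vma,
                                       unsigned long addr,
                                       pte_t *ptep, pte_t newpte)
        {
                set_pte_at(vma->vm_mm, addr, ptep, newpte);
                flush_tlb_page(vma, addr);      /* single-page flush */
        }
)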
76289 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/vga.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/vga.h
76290 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/vga.h     1970-01-01 01:00:00.000000000 +0100
76291 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/vga.h  2006-06-26 09:51:32.000000000 +0200
76292 @@ -0,0 +1,20 @@
76293 +/*
76294 + *     Access to VGA videoram
76295 + *
76296 + *     (c) 1998 Martin Mares <mj@ucw.cz>
76297 + */
76298 +
76299 +#ifndef _LINUX_ASM_VGA_H_
76300 +#define _LINUX_ASM_VGA_H_
76301 +
76302 +/*
76303 + *     On the PC, we can just recalculate addresses and then
76304 + *     access the videoram directly without any black magic.
76305 + */
76306 +
76307 +#define VGA_MAP_MEM(x) (unsigned long)isa_bus_to_virt(x)
76308 +
76309 +#define vga_readb(x) (*(x))
76310 +#define vga_writeb(x,y) (*(y) = (x))
76311 +
76312 +#endif
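
(Editor's aside, not part of the patch: a one-line use of the helpers
above -- write the character byte of a text-mode cell; 0xb8000 is the
conventional VGA text buffer base and example_vga_putc() is
hypothetical:

        static void example_vga_putc(char c, unsigned int cell)
        {
                char *base = (char *)VGA_MAP_MEM(0xb8000);
                vga_writeb(c, base + 2 * cell); /* attribute byte untouched */
        }
)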
76313 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/xor.h linux-2.6.16/include/asm-x86_64/mach-xen/asm/xor.h
76314 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/asm/xor.h     1970-01-01 01:00:00.000000000 +0100
76315 +++ linux-2.6.16/include/asm-x86_64/mach-xen/asm/xor.h  2006-06-26 09:51:32.000000000 +0200
76316 @@ -0,0 +1,328 @@
76317 +/*
76318 + * x86-64 changes / gcc fixes from Andi Kleen. 
76319 + * Copyright 2002 Andi Kleen, SuSE Labs.
76320 + *
76321 + * This hasn't been optimized for the hammer yet, but there are likely
76322 + * no advantages to be gotten from x86-64 here anyway.
76323 + */
76324 +
76325 +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
76326 +
76327 +/* Doesn't use gcc to save the XMM registers, because there is no easy way to 
76328 +   tell it to do a clts before the register saving. */
76329 +#define XMMS_SAVE do {                         \
76330 +       preempt_disable();                      \
76331 +       if (!(current_thread_info()->status & TS_USEDFPU))      \
76332 +               clts();                         \
76333 +       __asm__ __volatile__ (                  \
76334 +               "movups %%xmm0,(%1)     ;\n\t"  \
76335 +               "movups %%xmm1,0x10(%1) ;\n\t"  \
76336 +               "movups %%xmm2,0x20(%1) ;\n\t"  \
76337 +               "movups %%xmm3,0x30(%1) ;\n\t"  \
76338 +               : "=&r" (cr0)                   \
76339 +               : "r" (xmm_save)                \
76340 +               : "memory");                    \
76341 +} while(0)
76342 +
76343 +#define XMMS_RESTORE do {                      \
76344 +       asm volatile (                          \
76345 +               "sfence                 ;\n\t"  \
76346 +               "movups (%1),%%xmm0     ;\n\t"  \
76347 +               "movups 0x10(%1),%%xmm1 ;\n\t"  \
76348 +               "movups 0x20(%1),%%xmm2 ;\n\t"  \
76349 +               "movups 0x30(%1),%%xmm3 ;\n\t"  \
76350 +               :                               \
76351 +               : "r" (cr0), "r" (xmm_save)     \
76352 +               : "memory");                    \
76353 +       if (!(current_thread_info()->status & TS_USEDFPU))      \
76354 +               stts();                         \
76355 +       preempt_enable();                       \
76356 +} while(0)
76357 +
76358 +#define OFFS(x)                "16*("#x")"
76359 +#define PF_OFFS(x)     "256+16*("#x")"
76360 +#define        PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
76361 +#define LD(x,y)                "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
76362 +#define ST(x,y)                "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
76363 +#define PF1(x)         "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
76364 +#define PF2(x)         "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
76365 +#define PF3(x)         "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
76366 +#define PF4(x)         "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
76367 +#define PF5(x)         "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
76368 +#define XO1(x,y)       "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
76369 +#define XO2(x,y)       "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
76370 +#define XO3(x,y)       "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
76371 +#define XO4(x,y)       "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
76372 +#define XO5(x,y)       "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
76373 +
76374 +
76375 +static void
76376 +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
76377 +{
76378 +        unsigned int lines = bytes >> 8;
76379 +       unsigned long cr0;
76380 +       xmm_store_t xmm_save[4];
76381 +
76382 +       XMMS_SAVE;
76383 +
76384 +        asm volatile (
76385 +#undef BLOCK
76386 +#define BLOCK(i) \
76387 +               LD(i,0)                                 \
76388 +                       LD(i+1,1)                       \
76389 +               PF1(i)                                  \
76390 +                               PF1(i+2)                \
76391 +                               LD(i+2,2)               \
76392 +                                       LD(i+3,3)       \
76393 +               PF0(i+4)                                \
76394 +                               PF0(i+6)                \
76395 +               XO1(i,0)                                \
76396 +                       XO1(i+1,1)                      \
76397 +                               XO1(i+2,2)              \
76398 +                                       XO1(i+3,3)      \
76399 +               ST(i,0)                                 \
76400 +                       ST(i+1,1)                       \
76401 +                               ST(i+2,2)               \
76402 +                                       ST(i+3,3)       \
76403 +
76404 +
76405 +               PF0(0)
76406 +                               PF0(2)
76407 +
76408 +       " .align 32                     ;\n"
76409 +        " 1:                            ;\n"
76410 +
76411 +               BLOCK(0)
76412 +               BLOCK(4)
76413 +               BLOCK(8)
76414 +               BLOCK(12)
76415 +
76416 +        "       addq %[inc], %[p1]           ;\n"
76417 +        "       addq %[inc], %[p2]           ;\n"
76418 +               "               decl %[cnt] ; jnz 1b"
76419 +       : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
76420 +       : [inc] "r" (256UL) 
76421 +        : "memory");
76422 +
76423 +       XMMS_RESTORE;
76424 +}
76425 +
76426 +static void
76427 +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
76428 +         unsigned long *p3)
76429 +{
76430 +       unsigned int lines = bytes >> 8;
76431 +       xmm_store_t xmm_save[4];
76432 +       unsigned long cr0;
76433 +
76434 +       XMMS_SAVE;
76435 +
76436 +        __asm__ __volatile__ (
76437 +#undef BLOCK
76438 +#define BLOCK(i) \
76439 +               PF1(i)                                  \
76440 +                               PF1(i+2)                \
76441 +               LD(i,0)                                 \
76442 +                       LD(i+1,1)                       \
76443 +                               LD(i+2,2)               \
76444 +                                       LD(i+3,3)       \
76445 +               PF2(i)                                  \
76446 +                               PF2(i+2)                \
76447 +               PF0(i+4)                                \
76448 +                               PF0(i+6)                \
76449 +               XO1(i,0)                                \
76450 +                       XO1(i+1,1)                      \
76451 +                               XO1(i+2,2)              \
76452 +                                       XO1(i+3,3)      \
76453 +               XO2(i,0)                                \
76454 +                       XO2(i+1,1)                      \
76455 +                               XO2(i+2,2)              \
76456 +                                       XO2(i+3,3)      \
76457 +               ST(i,0)                                 \
76458 +                       ST(i+1,1)                       \
76459 +                               ST(i+2,2)               \
76460 +                                       ST(i+3,3)       \
76461 +
76462 +
76463 +               PF0(0)
76464 +                               PF0(2)
76465 +
76466 +       " .align 32                     ;\n"
76467 +        " 1:                            ;\n"
76468 +
76469 +               BLOCK(0)
76470 +               BLOCK(4)
76471 +               BLOCK(8)
76472 +               BLOCK(12)
76473 +
76474 +        "       addq %[inc], %[p1]           ;\n"
76475 +        "       addq %[inc], %[p2]          ;\n"
76476 +        "       addq %[inc], %[p3]           ;\n"
76477 +               "               decl %[cnt] ; jnz 1b"
76478 +       : [cnt] "+r" (lines),
76479 +         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
76480 +       : [inc] "r" (256UL)
76481 +       : "memory"); 
76482 +       XMMS_RESTORE;
76483 +}
76484 +
76485 +static void
76486 +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
76487 +         unsigned long *p3, unsigned long *p4)
76488 +{
76489 +       unsigned int lines = bytes >> 8;
76490 +       xmm_store_t xmm_save[4]; 
76491 +       unsigned long cr0;
76492 +
76493 +       XMMS_SAVE;
76494 +
76495 +        __asm__ __volatile__ (
76496 +#undef BLOCK
76497 +#define BLOCK(i) \
76498 +               PF1(i)                                  \
76499 +                               PF1(i+2)                \
76500 +               LD(i,0)                                 \
76501 +                       LD(i+1,1)                       \
76502 +                               LD(i+2,2)               \
76503 +                                       LD(i+3,3)       \
76504 +               PF2(i)                                  \
76505 +                               PF2(i+2)                \
76506 +               XO1(i,0)                                \
76507 +                       XO1(i+1,1)                      \
76508 +                               XO1(i+2,2)              \
76509 +                                       XO1(i+3,3)      \
76510 +               PF3(i)                                  \
76511 +                               PF3(i+2)                \
76512 +               PF0(i+4)                                \
76513 +                               PF0(i+6)                \
76514 +               XO2(i,0)                                \
76515 +                       XO2(i+1,1)                      \
76516 +                               XO2(i+2,2)              \
76517 +                                       XO2(i+3,3)      \
76518 +               XO3(i,0)                                \
76519 +                       XO3(i+1,1)                      \
76520 +                               XO3(i+2,2)              \
76521 +                                       XO3(i+3,3)      \
76522 +               ST(i,0)                                 \
76523 +                       ST(i+1,1)                       \
76524 +                               ST(i+2,2)               \
76525 +                                       ST(i+3,3)       \
76526 +
76527 +
76528 +               PF0(0)
76529 +                               PF0(2)
76530 +
76531 +       " .align 32                     ;\n"
76532 +        " 1:                            ;\n"
76533 +
76534 +               BLOCK(0)
76535 +               BLOCK(4)
76536 +               BLOCK(8)
76537 +               BLOCK(12)
76538 +
76539 +        "       addq %[inc], %[p1]           ;\n"
76540 +        "       addq %[inc], %[p2]           ;\n"
76541 +        "       addq %[inc], %[p3]           ;\n"
76542 +        "       addq %[inc], %[p4]           ;\n"
76543 +       "       decl %[cnt] ; jnz 1b"
76544 +       : [cnt] "+c" (lines),
76545 +         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
76546 +       : [inc] "r" (256UL)
76547 +        : "memory" );
76548 +
76549 +       XMMS_RESTORE;
76550 +}
76551 +
76552 +static void
76553 +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
76554 +         unsigned long *p3, unsigned long *p4, unsigned long *p5)
76555 +{
76556 +        unsigned int lines = bytes >> 8;
76557 +       xmm_store_t xmm_save[4];
76558 +       unsigned long cr0;
76559 +
76560 +       XMMS_SAVE;
76561 +
76562 +        __asm__ __volatile__ (
76563 +#undef BLOCK
76564 +#define BLOCK(i) \
76565 +               PF1(i)                                  \
76566 +                               PF1(i+2)                \
76567 +               LD(i,0)                                 \
76568 +                       LD(i+1,1)                       \
76569 +                               LD(i+2,2)               \
76570 +                                       LD(i+3,3)       \
76571 +               PF2(i)                                  \
76572 +                               PF2(i+2)                \
76573 +               XO1(i,0)                                \
76574 +                       XO1(i+1,1)                      \
76575 +                               XO1(i+2,2)              \
76576 +                                       XO1(i+3,3)      \
76577 +               PF3(i)                                  \
76578 +                               PF3(i+2)                \
76579 +               XO2(i,0)                                \
76580 +                       XO2(i+1,1)                      \
76581 +                               XO2(i+2,2)              \
76582 +                                       XO2(i+3,3)      \
76583 +               PF4(i)                                  \
76584 +                               PF4(i+2)                \
76585 +               PF0(i+4)                                \
76586 +                               PF0(i+6)                \
76587 +               XO3(i,0)                                \
76588 +                       XO3(i+1,1)                      \
76589 +                               XO3(i+2,2)              \
76590 +                                       XO3(i+3,3)      \
76591 +               XO4(i,0)                                \
76592 +                       XO4(i+1,1)                      \
76593 +                               XO4(i+2,2)              \
76594 +                                       XO4(i+3,3)      \
76595 +               ST(i,0)                                 \
76596 +                       ST(i+1,1)                       \
76597 +                               ST(i+2,2)               \
76598 +                                       ST(i+3,3)       \
76599 +
76600 +
76601 +               PF0(0)
76602 +                               PF0(2)
76603 +
76604 +       " .align 32                     ;\n"
76605 +        " 1:                            ;\n"
76606 +
76607 +               BLOCK(0)
76608 +               BLOCK(4)
76609 +               BLOCK(8)
76610 +               BLOCK(12)
76611 +
76612 +        "       addq %[inc], %[p1]           ;\n"
76613 +        "       addq %[inc], %[p2]           ;\n"
76614 +        "       addq %[inc], %[p3]           ;\n"
76615 +        "       addq %[inc], %[p4]           ;\n"
76616 +        "       addq %[inc], %[p5]           ;\n"
76617 +       "       decl %[cnt] ; jnz 1b"
76618 +       : [cnt] "+c" (lines),
76619 +         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), 
76620 +         [p5] "+r" (p5)
76621 +       : [inc] "r" (256UL)
76622 +       : "memory");
76623 +
76624 +       XMMS_RESTORE;
76625 +}
76626 +
76627 +static struct xor_block_template xor_block_sse = {
76628 +        .name = "generic_sse",
76629 +        .do_2 = xor_sse_2,
76630 +        .do_3 = xor_sse_3,
76631 +        .do_4 = xor_sse_4,
76632 +        .do_5 = xor_sse_5,
76633 +};
76634 +
76635 +#undef XOR_TRY_TEMPLATES
76636 +#define XOR_TRY_TEMPLATES                              \
76637 +       do {                                            \
76638 +               xor_speed(&xor_block_sse);      \
76639 +       } while (0)
76640 +
76641 +/* We force the use of the SSE xor block because it can write around L2.
76642 +   We may also be able to load into the L1 only depending on how the cpu
76643 +   deals with a load to a line that is being prefetched.  */
76644 +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
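The do_2..do_5 routines above are invoked through struct xor_block_template by the generic XOR layer. Note that the byte count must be a multiple of 256 (each loop iteration consumes four 64-byte BLOCKs per source) and the buffers must be 16-byte aligned, since LD/ST use the aligned movaps. A minimal caller, as a hedged sketch with illustrative names:

static unsigned long example_src[64] __attribute__((aligned(16)));      /* 512 bytes */
static unsigned long example_dst[64] __attribute__((aligned(16)));

static void example_xor(void)
{
        /* example_dst[i] ^= example_src[i] over the whole 512-byte range;
         * the result is stored back into the first argument */
        xor_sse_2(sizeof(example_dst), example_dst, example_src);
}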
76645 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/irq_vectors.h linux-2.6.16/include/asm-x86_64/mach-xen/irq_vectors.h
76646 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/irq_vectors.h 1970-01-01 01:00:00.000000000 +0100
76647 +++ linux-2.6.16/include/asm-x86_64/mach-xen/irq_vectors.h      2006-06-26 09:51:32.000000000 +0200
76648 @@ -0,0 +1,123 @@
76649 +/*
76650 + * This file should contain #defines for all of the interrupt vector
76651 + * numbers used by this architecture.
76652 + *
76653 + * In addition, there are some standard defines:
76654 + *
76655 + *     FIRST_EXTERNAL_VECTOR:
76656 + *             The first free place for external interrupts
76657 + *
76658 + *     SYSCALL_VECTOR:
76659 + *             The IRQ vector a syscall makes the user to kernel transition
76660 + *             The IRQ vector through which a syscall makes the
76661 + *             user-to-kernel transition.
76662 + *     TIMER_IRQ:
76663 + *             The IRQ number the timer interrupt comes in at.
76664 + *
76665 + *     NR_IRQS:
76666 + *             The total number of interrupt vectors (including all the
76667 + *             architecture specific interrupts) needed.
76668 + *
76669 + */                    
76670 +#ifndef _ASM_IRQ_VECTORS_H
76671 +#define _ASM_IRQ_VECTORS_H
76672 +
76673 +/*
76674 + * IDT vectors usable for external interrupt sources start
76675 + * at 0x20:
76676 + */
76677 +#define FIRST_EXTERNAL_VECTOR  0x20
76678 +
76679 +#define SYSCALL_VECTOR         0x80
76680 +
76681 +/*
76682 + * Vectors 0x20-0x2f are used for ISA interrupts.
76683 + */
76684 +
76685 +#if 0
76686 +/*
76687 + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
76688 + *
76689 + *  some of the following vectors are 'rare', they are merged
76690 + *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
76691 + *  TLB, reschedule and local APIC vectors are performance-critical.
76692 + *
76693 + *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
76694 + */
76695 +#define INVALIDATE_TLB_VECTOR  0xfd
76696 +#define RESCHEDULE_VECTOR      0xfc
76697 +#define CALL_FUNCTION_VECTOR   0xfb
76698 +
76699 +#define THERMAL_APIC_VECTOR    0xf0
76700 +/*
76701 + * Local APIC timer IRQ vector is on a different priority level,
76702 + * to work around the 'lost local interrupt if more than 2 IRQ
76703 + * sources per level' errata.
76704 + */
76705 +#define LOCAL_TIMER_VECTOR     0xef
76706 +#endif
76707 +
76708 +#define SPURIOUS_APIC_VECTOR   0xff
76709 +#define ERROR_APIC_VECTOR      0xfe
76710 +
76711 +/*
76712 + * First APIC vector available to drivers: (vectors 0x30-0xee)
76713 + * we start at 0x31 to spread out vectors evenly between priority
76714 + * levels. (0x80 is the syscall vector)
76715 + */
76716 +#define FIRST_DEVICE_VECTOR    0x31
76717 +#define FIRST_SYSTEM_VECTOR    0xef
76718 +
76719 +/*
76720 + * 16 8259A IRQ's, 208 potential APIC interrupt sources.
76721 + * Right now the APIC is mostly only used for SMP.
76722 + * 256 vectors is an architectural limit. (we can have
76723 + * more than 256 devices theoretically, but they will
76724 + * have to use shared interrupts)
76725 + * Since vectors 0x00-0x1f are used/reserved for the CPU,
76726 + * the usable vector space is 0x20-0xff (224 vectors)
76727 + */
76728 +
76729 +#define RESCHEDULE_VECTOR      0
76730 +#define CALL_FUNCTION_VECTOR   1
76731 +#define NR_IPIS                        2
76732 +
76733 +/*
76734 + * The maximum number of vectors supported by i386 processors
76735 + * is limited to 256. For processors other than i386, NR_VECTORS
76736 + * should be changed accordingly.
76737 + */
76738 +#define NR_VECTORS 256
76739 +
76740 +#define FPU_IRQ                        13
76741 +
76742 +#define        FIRST_VM86_IRQ          3
76743 +#define LAST_VM86_IRQ          15
76744 +#define invalid_vm86_irq(irq)  ((irq) < 3 || (irq) > 15)
76745 +
76746 +/*
76747 + * The flat IRQ space is divided into two regions:
76748 + *  1. A one-to-one mapping of real physical IRQs. This space is only used
76749 + *     if we have physical device-access privilege. This region is at the 
76750 + *     start of the IRQ space so that existing device drivers do not need
76751 + *     to be modified to translate physical IRQ numbers into our IRQ space.
76752 + *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
76753 + *     are bound using the provided bind/unbind functions.
76754 + */
76755 +
76756 +#define PIRQ_BASE              0
76757 +#define NR_PIRQS               256
76758 +
76759 +#define DYNIRQ_BASE            (PIRQ_BASE + NR_PIRQS)
76760 +#define NR_DYNIRQS             256
76761 +
76762 +#define NR_IRQS                        (NR_PIRQS + NR_DYNIRQS)
76763 +#define NR_IRQ_VECTORS         NR_IRQS
76764 +
76765 +#define pirq_to_irq(_x)                ((_x) + PIRQ_BASE)
76766 +#define irq_to_pirq(_x)                ((_x) - PIRQ_BASE)
76767 +
76768 +#define dynirq_to_irq(_x)      ((_x) + DYNIRQ_BASE)
76769 +#define irq_to_dynirq(_x)      ((_x) - DYNIRQ_BASE)
76770 +
76771 +#endif /* _ASM_IRQ_VECTORS_H */
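With the constants above (PIRQ_BASE 0, NR_PIRQS 256, DYNIRQ_BASE 256), the conversion macros reduce to plain offset arithmetic. A hedged illustration:

        int a = pirq_to_irq(5);         /* physical IRQ 5 -> flat IRQ 5        */
        int b = dynirq_to_irq(7);       /* dynamic IRQ 7  -> flat IRQ 263      */
        int c = irq_to_dynirq(b);       /* back to 7                           */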
76772 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/mach_time.h linux-2.6.16/include/asm-x86_64/mach-xen/mach_time.h
76773 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/mach_time.h   1970-01-01 01:00:00.000000000 +0100
76774 +++ linux-2.6.16/include/asm-x86_64/mach-xen/mach_time.h        2006-06-26 09:51:32.000000000 +0200
76775 @@ -0,0 +1,122 @@
76776 +/*
76777 + *  include/asm-i386/mach-default/mach_time.h
76778 + *
76779 + *  Machine specific set RTC function for generic.
76780 + *  Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
76781 + */
76782 +#ifndef _MACH_TIME_H
76783 +#define _MACH_TIME_H
76784 +
76785 +#include <asm-i386/mc146818rtc.h>
76786 +
76787 +/* window for timing the set_rtc_mmss() call: 500 ms */
76788 +/* used in arch/i386/time.c::do_timer_interrupt() */
76789 +#define USEC_AFTER     500000
76790 +#define USEC_BEFORE    500000
76791 +
76792 +/*
76793 + * In order to set the CMOS clock precisely, set_rtc_mmss has to be
76794 + * called 500 ms after the second nowtime has started, because when
76795 + * nowtime is written into the registers of the CMOS clock, it will
76796 + * jump to the next second precisely 500 ms later. Check the Motorola
76797 + * MC146818A or Dallas DS12887 data sheet for details.
76798 + *
76799 + * BUG: This routine does not handle hour overflow properly; it just
76800 + *      sets the minutes. Usually you'll only notice that after reboot!
76801 + */
76802 +static inline int mach_set_rtc_mmss(unsigned long nowtime)
76803 +{
76804 +       int retval = 0;
76805 +       int real_seconds, real_minutes, cmos_minutes;
76806 +       unsigned char save_control, save_freq_select;
76807 +
76808 +       save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
76809 +       CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
76810 +
76811 +       save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
76812 +       CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
76813 +
76814 +       cmos_minutes = CMOS_READ(RTC_MINUTES);
76815 +       if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
76816 +               BCD_TO_BIN(cmos_minutes);
76817 +
76818 +       /*
76819 +        * since we're only adjusting minutes and seconds,
76820 +        * don't interfere with hour overflow. This avoids
76821 +        * messing with unknown time zones but requires your
76822 +        * RTC not to be off by more than 15 minutes
76823 +        */
76824 +       real_seconds = nowtime % 60;
76825 +       real_minutes = nowtime / 60;
76826 +       if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
76827 +               real_minutes += 30;             /* correct for half hour time zone */
76828 +       real_minutes %= 60;
76829 +
76830 +       if (abs(real_minutes - cmos_minutes) < 30) {
76831 +               if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
76832 +                       BIN_TO_BCD(real_seconds);
76833 +                       BIN_TO_BCD(real_minutes);
76834 +               }
76835 +               CMOS_WRITE(real_seconds,RTC_SECONDS);
76836 +               CMOS_WRITE(real_minutes,RTC_MINUTES);
76837 +       } else {
76838 +               printk(KERN_WARNING
76839 +                      "set_rtc_mmss: can't update from %d to %d\n",
76840 +                      cmos_minutes, real_minutes);
76841 +               retval = -1;
76842 +       }
76843 +
76844 +       /* The following flags have to be released exactly in this order,
76845 +        * otherwise the DS12887 (popular MC146818A clone with integrated
76846 +        * battery and quartz) will not reset the oscillator and will not
76847 +        * update precisely 500 ms later. You won't find this mentioned in
76848 +        * the Dallas Semiconductor data sheets, but who believes data
76849 +        * sheets anyway ...                           -- Markus Kuhn
76850 +        */
76851 +       CMOS_WRITE(save_control, RTC_CONTROL);
76852 +       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
76853 +
76854 +       return retval;
76855 +}
76856 +
76857 +static inline unsigned long mach_get_cmos_time(void)
76858 +{
76859 +       unsigned int year, mon, day, hour, min, sec;
76860 +       int i;
76861 +
76862 +       /* The Linux interpretation of the CMOS clock register contents:
76863 +        * When the Update-In-Progress (UIP) flag goes from 1 to 0, the
76864 +        * RTC registers show the second which has precisely just started.
76865 +        * Let's hope other operating systems interpret the RTC the same way.
76866 +        */
76867 +       /* read RTC exactly on falling edge of update flag */
76868 +       for (i = 0 ; i < 1000000 ; i++) /* may take up to 1 second... */
76869 +               if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)
76870 +                       break;
76871 +       for (i = 0 ; i < 1000000 ; i++) /* must try at least 2.228 ms */
76872 +               if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
76873 +                       break;
76874 +       do { /* Isn't this overkill ? UIP above should guarantee consistency */
76875 +               sec = CMOS_READ(RTC_SECONDS);
76876 +               min = CMOS_READ(RTC_MINUTES);
76877 +               hour = CMOS_READ(RTC_HOURS);
76878 +               day = CMOS_READ(RTC_DAY_OF_MONTH);
76879 +               mon = CMOS_READ(RTC_MONTH);
76880 +               year = CMOS_READ(RTC_YEAR);
76881 +       } while (sec != CMOS_READ(RTC_SECONDS));
76882 +       if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
76883 +         {
76884 +           BCD_TO_BIN(sec);
76885 +           BCD_TO_BIN(min);
76886 +           BCD_TO_BIN(hour);
76887 +           BCD_TO_BIN(day);
76888 +           BCD_TO_BIN(mon);
76889 +           BCD_TO_BIN(year);
76890 +         }
76891 +       if ((year += 1900) < 1970)
76892 +               year += 100;
76893 +
76894 +       return mktime(year, mon, day, hour, min, sec);
76895 +}
76896 +
76897 +#endif /* !_MACH_TIME_H */
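The BCD conversions above come from <linux/mc146818rtc.h>; in kernels of this vintage BCD_TO_BIN is essentially the arithmetic below, shown here as a hedged standalone sketch:

static inline unsigned int bcd_to_bin_example(unsigned int val)
{
        return (val & 15) + (val >> 4) * 10;    /* e.g. CMOS byte 0x59 -> 59 */
}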
76898 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/mach_timer.h linux-2.6.16/include/asm-x86_64/mach-xen/mach_timer.h
76899 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/mach_timer.h  1970-01-01 01:00:00.000000000 +0100
76900 +++ linux-2.6.16/include/asm-x86_64/mach-xen/mach_timer.h       2006-06-26 09:51:32.000000000 +0200
76901 @@ -0,0 +1,48 @@
76902 +/*
76903 + *  include/asm-i386/mach-default/mach_timer.h
76904 + *
76905 + *  Machine specific calibrate_tsc() for generic.
76906 + *  Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
76907 + */
76908 +/* ------ Calibrate the TSC ------- 
76909 + * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
76910 + * Too much 64-bit arithmetic here to do this cleanly in C, and for
76911 + * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
76912 + * output busy loop as low as possible. We avoid reading the CTC registers
76913 + * directly because of the awkward 8-bit access mechanism of the 82C54
76914 + * device.
76915 + */
76916 +#ifndef _MACH_TIMER_H
76917 +#define _MACH_TIMER_H
76918 +
76919 +#define CALIBRATE_LATCH        (5 * LATCH)
76920 +
76921 +static inline void mach_prepare_counter(void)
76922 +{
76923 +       /* Set the Gate high, disable speaker */
76924 +       outb((inb(0x61) & ~0x02) | 0x01, 0x61);
76925 +
76926 +       /*
76927 +        * Now let's take care of CTC channel 2
76928 +        *
76929 +        * Set the Gate high, program CTC channel 2 for mode 0,
76930 +        * (interrupt on terminal count mode), binary count,
76931 +        * load 5 * LATCH count, (LSB and MSB) to begin countdown.
76932 +        *
76933 +        * Some devices need a delay here.
76934 +        */
76935 +       outb(0xb0, 0x43);                       /* binary, mode 0, LSB/MSB, Ch 2 */
76936 +       outb_p(CALIBRATE_LATCH & 0xff, 0x42);   /* LSB of count */
76937 +       outb_p(CALIBRATE_LATCH >> 8, 0x42);       /* MSB of count */
76938 +}
76939 +
76940 +static inline void mach_countup(unsigned long *count_p)
76941 +{
76942 +       unsigned long count = 0;
76943 +       do {
76944 +               count++;
76945 +       } while ((inb_p(0x61) & 0x20) == 0);
76946 +       *count_p = count;
76947 +}
76948 +
76949 +#endif /* !_MACH_TIMER_H */
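A caller such as the era's calibrate_tsc() brackets mach_countup() with TSC reads; the elapsed cycle count then corresponds to CALIBRATE_LATCH PIT ticks (5 * LATCH, roughly 50 ms at 1193182 Hz). A hedged sketch, assuming the rdtscll() macro from <asm/msr.h>:

static unsigned long long example_calibrate(void)
{
        unsigned long long start, end;
        unsigned long count;

        mach_prepare_counter();         /* start the ~50 ms countdown */
        rdtscll(start);
        mach_countup(&count);           /* spin until terminal count  */
        rdtscll(end);
        return end - start;             /* TSC cycles per CALIBRATE_LATCH ticks */
}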
76950 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/setup_arch_post.h linux-2.6.16/include/asm-x86_64/mach-xen/setup_arch_post.h
76951 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/setup_arch_post.h     1970-01-01 01:00:00.000000000 +0100
76952 +++ linux-2.6.16/include/asm-x86_64/mach-xen/setup_arch_post.h  2006-06-26 09:51:32.000000000 +0200
76953 @@ -0,0 +1,28 @@
76954 +/**
76955 + * machine_specific_* - Hooks for machine specific setup.
76956 + *
76957 + * Description:
76958 + *     This is included late in kernel/setup.c so that it can make
76959 + *     use of all of the static functions.
76960 + **/
76961 +
76962 +extern void hypervisor_callback(void);
76963 +extern void failsafe_callback(void);
76964 +extern void nmi(void);
76965 +
76966 +static void __init machine_specific_arch_setup(void)
76967 +{
76968 +#ifdef CONFIG_X86_LOCAL_APIC
76969 +       struct xennmi_callback cb;
76970 +#endif
76971 +
76972 +       HYPERVISOR_set_callbacks(
76973 +                (unsigned long) hypervisor_callback,
76974 +                (unsigned long) failsafe_callback,
76975 +                (unsigned long) system_call);
76976 +
76977 +#ifdef CONFIG_X86_LOCAL_APIC
76978 +       cb.handler_address = (unsigned long)&nmi;
76979 +       HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
76980 +#endif
76981 +}
76982 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/asm-x86_64/mach-xen/setup_arch_pre.h linux-2.6.16/include/asm-x86_64/mach-xen/setup_arch_pre.h
76983 --- linux-2.6.16.orig/include/asm-x86_64/mach-xen/setup_arch_pre.h      1970-01-01 01:00:00.000000000 +0100
76984 +++ linux-2.6.16/include/asm-x86_64/mach-xen/setup_arch_pre.h   2006-06-26 09:51:32.000000000 +0200
76985 @@ -0,0 +1,5 @@
76986 +/* Hook to call BIOS initialisation function */
76987 +
76988 +#define ARCH_SETUP machine_specific_arch_setup();
76989 +
76990 +static void __init machine_specific_arch_setup(void);
76991 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/linux/gfp.h linux-2.6.16/include/linux/gfp.h
76992 --- linux-2.6.16.orig/include/linux/gfp.h       2006-03-20 06:53:29.000000000 +0100
76993 +++ linux-2.6.16/include/linux/gfp.h    2006-06-26 09:51:32.000000000 +0200
76994 @@ -98,7 +98,11 @@
76995   */
76996  
76997  #ifndef HAVE_ARCH_FREE_PAGE
76998 -static inline void arch_free_page(struct page *page, int order) { }
76999 +/*
77000 + * If arch_free_page returns non-zero then the generic free_page code can
77001 + * immediately bail: the arch-specific function has done all the work.
77002 + */
77003 +static inline int arch_free_page(struct page *page, int order) { return 0; }
77004  #endif
77005  
77006  extern struct page *
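The changed contract means the generic free path can now short-circuit, exactly as the new comment states. A hedged sketch of what a call site in the page allocator looks like under this patch:

        if (arch_free_page(page, order))
                return;         /* arch hook (e.g. a Xen balloon path) consumed the page */
        /* ... otherwise continue with normal buddy freeing ... */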
77007 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/linux/highmem.h linux-2.6.16/include/linux/highmem.h
77008 --- linux-2.6.16.orig/include/linux/highmem.h   2006-03-20 06:53:29.000000000 +0100
77009 +++ linux-2.6.16/include/linux/highmem.h        2006-06-26 09:51:32.000000000 +0200
77010 @@ -13,10 +13,16 @@
77011  
77012  /* declarations for linux/mm/highmem.c */
77013  unsigned int nr_free_highpages(void);
77014 +#ifdef CONFIG_XEN
77015 +void kmap_flush_unused(void);
77016 +#endif
77017  
77018  #else /* CONFIG_HIGHMEM */
77019  
77020  static inline unsigned int nr_free_highpages(void) { return 0; }
77021 +#ifdef CONFIG_XEN
77022 +static inline void kmap_flush_unused(void) { }
77023 +#endif
77024  
77025  static inline void *kmap(struct page *page)
77026  {
77027 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/linux/mm.h linux-2.6.16/include/linux/mm.h
77028 --- linux-2.6.16.orig/include/linux/mm.h        2006-06-26 09:49:45.000000000 +0200
77029 +++ linux-2.6.16/include/linux/mm.h     2006-06-26 09:51:32.000000000 +0200
77030 @@ -166,6 +166,9 @@
77031  #define VM_NONLINEAR   0x00800000      /* Is non-linear (remap_file_pages) */
77032  #define VM_MAPPED_COPY 0x01000000      /* T if mapped copy of data (nommu mmap) */
77033  #define VM_INSERTPAGE  0x02000000      /* The vma has had "vm_insert_page()" done on it */
77034 +#ifdef CONFIG_XEN
77035 +#define VM_FOREIGN     0x04000000      /* Has pages belonging to another VM */
77036 +#endif
77037  
77038  #ifndef VM_STACK_DEFAULT_FLAGS         /* arch can override this */
77039  #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
77040 @@ -229,9 +232,10 @@
77041                 unsigned long private;          /* Mapping-private opaque data:
77042                                                  * usually used for buffer_heads
77043                                                  * if PagePrivate set; used for
77044 -                                                * swp_entry_t if PageSwapCache;
77045 +                                                * swp_entry_t if PageSwapCache.
77046 +                                                * When page is free, this
77047                                                  * indicates order in the buddy
77048 -                                                * system if PG_buddy is set.
77049 +                                                * system.
77050                                                  */
77051                 struct address_space *mapping;  /* If low bit clear, points to
77052                                                  * inode address_space, or NULL.
77053 @@ -244,6 +248,9 @@
77054  #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
77055             spinlock_t ptl;
77056  #endif
77057 +#ifdef CONFIG_XEN
77058 +           struct list_head ballooned;
77059 +#endif
77060         };
77061         pgoff_t index;                  /* Our offset within mapping. */
77062         struct list_head lru;           /* Pageout list, eg. active_list
77063 @@ -1012,6 +1019,13 @@
77064  #define FOLL_GET       0x04    /* do get_page on page */
77065  #define FOLL_ANON      0x08    /* give ZERO_PAGE if no pgtable */
77066  
77067 +#ifdef CONFIG_XEN
77068 +typedef int (*pte_fn_t)(pte_t *pte, struct page *pmd_page, unsigned long addr,
77069 +                       void *data);
77070 +extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
77071 +                              unsigned long size, pte_fn_t fn, void *data);
77072 +#endif
77073 +
77074  #ifdef CONFIG_PROC_FS
77075  void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
77076  #else
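apply_to_page_range() walks the page tables covering [address, address + size) and invokes fn on every PTE, aborting early on a non-zero return. A hedged sketch of a pte_fn_t callback; the function name and body are illustrative only:

static int example_count_present(pte_t *pte, struct page *pmd_page,
                                 unsigned long addr, void *data)
{
        if (pte_present(*pte))
                (*(unsigned long *)data)++;
        return 0;                       /* non-zero would abort the walk */
}

/* unsigned long n = 0;
 * apply_to_page_range(mm, vaddr, PAGE_SIZE * nr, example_count_present, &n); */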
77077 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/linux/skbuff.h linux-2.6.16/include/linux/skbuff.h
77078 --- linux-2.6.16.orig/include/linux/skbuff.h    2006-06-26 09:49:46.000000000 +0200
77079 +++ linux-2.6.16/include/linux/skbuff.h 2006-06-26 09:51:32.000000000 +0200
77080 @@ -189,6 +189,8 @@
77081   *     @local_df: allow local fragmentation
77082   *     @cloned: Head may be cloned (check refcnt to be sure)
77083   *     @nohdr: Payload reference only, must not modify header
77084 + *     @proto_data_valid: Protocol data validated since arriving at localhost
77085 + *     @proto_csum_blank: Protocol csum must be added before leaving localhost
77086   *     @pkt_type: Packet class
77087   *     @fclone: skbuff clone status
77088   *     @ip_summed: Driver fed us an IP checksum
77089 @@ -265,7 +267,13 @@
77090                                 nfctinfo:3;
77091         __u8                    pkt_type:3,
77092                                 fclone:2,
77093 +#ifndef CONFIG_XEN
77094                                 ipvs_property:1;
77095 +#else
77096 +                               ipvs_property:1,
77097 +                               proto_data_valid:1,
77098 +                               proto_csum_blank:1;
77099 +#endif
77100         __be16                  protocol;
77101  
77102         void                    (*destructor)(struct sk_buff *skb);
77103 @@ -325,7 +333,8 @@
77104  
77105  extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
77106                                             unsigned int size,
77107 -                                           gfp_t priority);
77108 +                                           gfp_t priority,
77109 +                                           int fclone);
77110  extern void           kfree_skbmem(struct sk_buff *skb);
77111  extern struct sk_buff *skb_clone(struct sk_buff *skb,
77112                                  gfp_t priority);
77113 @@ -1055,7 +1064,7 @@
77114         return skb;
77115  }
77116  #else
77117 -extern struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask);
77118 +extern struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask);
77119  #endif
77120  
77121  /**
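The two new skb bits let Xen's split drivers carry checksum state between domains. A hedged sketch of how a transmit or receive path might consult them:

        if (skb->proto_csum_blank) {
                /* partial checksum: must be completed before the packet
                 * leaves this host */
        } else if (skb->proto_data_valid) {
                /* already validated on arrival; re-checksumming can be
                 * skipped */
        }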
77122 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/balloon.h linux-2.6.16/include/xen/balloon.h
77123 --- linux-2.6.16.orig/include/xen/balloon.h     1970-01-01 01:00:00.000000000 +0100
77124 +++ linux-2.6.16/include/xen/balloon.h  2006-06-26 09:51:32.000000000 +0200
77125 @@ -0,0 +1,73 @@
77126 +/******************************************************************************
77127 + * balloon.h
77128 + *
77129 + * Xen balloon driver - enables returning/claiming memory to/from Xen.
77130 + *
77131 + * Copyright (c) 2003, B Dragovic
77132 + * Copyright (c) 2003-2004, M Williamson, K Fraser
77133 + * 
77134 + * This program is free software; you can redistribute it and/or
77135 + * modify it under the terms of the GNU General Public License version 2
77136 + * as published by the Free Software Foundation; or, when distributed
77137 + * separately from the Linux kernel or incorporated into other
77138 + * software packages, subject to the following license:
77139 + * 
77140 + * Permission is hereby granted, free of charge, to any person obtaining a copy
77141 + * of this source file (the "Software"), to deal in the Software without
77142 + * restriction, including without limitation the rights to use, copy, modify,
77143 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
77144 + * and to permit persons to whom the Software is furnished to do so, subject to
77145 + * the following conditions:
77146 + * 
77147 + * The above copyright notice and this permission notice shall be included in
77148 + * all copies or substantial portions of the Software.
77149 + * 
77150 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
77151 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
77152 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
77153 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
77154 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
77155 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
77156 + * IN THE SOFTWARE.
77157 + */
77158 +
77159 +#ifndef __ASM_BALLOON_H__
77160 +#define __ASM_BALLOON_H__
77161 +
77162 +/*
77163 + * Inform the balloon driver that it should allow some slop for device-driver
77164 + * memory activities.
77165 + */
77166 +extern void
77167 +balloon_update_driver_allowance(
77168 +       long delta);
77169 +
77170 +/* Allocate an empty low-memory page range. */
77171 +extern struct page *
77172 +balloon_alloc_empty_page_range(
77173 +       unsigned long nr_pages);
77174 +
77175 +/* Deallocate an empty page range, adding to the balloon. */
77176 +extern void
77177 +balloon_dealloc_empty_page_range(
77178 +       struct page *page, unsigned long nr_pages);
77179 +
77180 +/*
77181 + * Prevent the balloon driver from changing the memory reservation during
77182 + * a driver critical region.
77183 + */
77184 +extern spinlock_t balloon_lock;
77185 +#define balloon_lock(__flags)   spin_lock_irqsave(&balloon_lock, __flags)
77186 +#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
77187 +
77188 +#endif /* __ASM_BALLOON_H__ */
77189 +
77190 +/*
77191 + * Local variables:
77192 + *  c-file-style: "linux"
77193 + *  indent-tabs-mode: t
77194 + *  c-indent-level: 8
77195 + *  c-basic-offset: 8
77196 + *  tab-width: 8
77197 + * End:
77198 + */
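The lock macros deliberately shadow the spinlock's own name; usage is the standard irqsave pattern. A hedged sketch:

        unsigned long flags;

        balloon_lock(flags);
        /* ... reservation-sensitive work; the balloon driver cannot
         * change the memory reservation underneath us here ... */
        balloon_unlock(flags);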
77199 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/driver_util.h linux-2.6.16/include/xen/driver_util.h
77200 --- linux-2.6.16.orig/include/xen/driver_util.h 1970-01-01 01:00:00.000000000 +0100
77201 +++ linux-2.6.16/include/xen/driver_util.h      2006-06-26 09:51:32.000000000 +0200
77202 @@ -0,0 +1,26 @@
77203 +
77204 +#ifndef __ASM_XEN_DRIVER_UTIL_H__
77205 +#define __ASM_XEN_DRIVER_UTIL_H__
77206 +
77207 +#include <linux/config.h>
77208 +#include <linux/vmalloc.h>
77209 +
77210 +/* Allocate/destroy a 'vmalloc' VM area. */
77211 +extern struct vm_struct *alloc_vm_area(unsigned long size);
77212 +extern void free_vm_area(struct vm_struct *area);
77213 +
77214 +/* Lock an area so that PTEs are accessible in the current address space. */
77215 +extern void lock_vm_area(struct vm_struct *area);
77216 +extern void unlock_vm_area(struct vm_struct *area);
77217 +
77218 +#endif /* __ASM_XEN_DRIVER_UTIL_H__ */
77219 +
77220 +/*
77221 + * Local variables:
77222 + *  c-file-style: "linux"
77223 + *  indent-tabs-mode: t
77224 + *  c-indent-level: 8
77225 + *  c-basic-offset: 8
77226 + *  tab-width: 8
77227 + * End:
77228 + */
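A hedged sketch of the intended pairing: reserve vmalloc address space without backing pages, map something foreign into it, then release it:

        struct vm_struct *area = alloc_vm_area(PAGE_SIZE);

        if (area) {
                void *vaddr = area->addr;       /* kernel virtual address */
                /* ... e.g. map a granted frame at vaddr ... */
                free_vm_area(area);
        }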
77229 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/evtchn.h linux-2.6.16/include/xen/evtchn.h
77230 --- linux-2.6.16.orig/include/xen/evtchn.h      1970-01-01 01:00:00.000000000 +0100
77231 +++ linux-2.6.16/include/xen/evtchn.h   2006-06-26 09:51:32.000000000 +0200
77232 @@ -0,0 +1,126 @@
77233 +/******************************************************************************
77234 + * evtchn.h
77235 + * 
77236 + * Communication via Xen event channels.
77237 + * Also definitions for the device that demuxes notifications to userspace.
77238 + * 
77239 + * Copyright (c) 2004-2005, K A Fraser
77240 + * 
77241 + * This program is free software; you can redistribute it and/or
77242 + * modify it under the terms of the GNU General Public License version 2
77243 + * as published by the Free Software Foundation; or, when distributed
77244 + * separately from the Linux kernel or incorporated into other
77245 + * software packages, subject to the following license:
77246 + * 
77247 + * Permission is hereby granted, free of charge, to any person obtaining a copy
77248 + * of this source file (the "Software"), to deal in the Software without
77249 + * restriction, including without limitation the rights to use, copy, modify,
77250 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
77251 + * and to permit persons to whom the Software is furnished to do so, subject to
77252 + * the following conditions:
77253 + * 
77254 + * The above copyright notice and this permission notice shall be included in
77255 + * all copies or substantial portions of the Software.
77256 + * 
77257 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
77258 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
77259 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
77260 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
77261 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
77262 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
77263 + * IN THE SOFTWARE.
77264 + */
77265 +
77266 +#ifndef __ASM_EVTCHN_H__
77267 +#define __ASM_EVTCHN_H__
77268 +
77269 +#include <linux/config.h>
77270 +#include <linux/interrupt.h>
77271 +#include <asm/hypervisor.h>
77272 +#include <asm/ptrace.h>
77273 +#include <asm/synch_bitops.h>
77274 +#include <xen/interface/event_channel.h>
77275 +#include <linux/smp.h>
77276 +
77277 +/*
77278 + * LOW-LEVEL DEFINITIONS
77279 + */
77280 +
77281 +/*
77282 + * Dynamically bind an event source to an IRQ-like callback handler.
77283 + * On some platforms this may not be implemented via the Linux IRQ subsystem.
77284 + * The IRQ argument passed to the callback handler is the same as returned
77285 + * from the bind call. It may not correspond to a Linux IRQ number.
77286 + * Returns IRQ or negative errno.
77287 + * UNBIND: Takes IRQ to unbind from; automatically closes the event channel.
77288 + */
77289 +extern int bind_evtchn_to_irqhandler(
77290 +       unsigned int evtchn,
77291 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
77292 +       unsigned long irqflags,
77293 +       const char *devname,
77294 +       void *dev_id);
77295 +extern int bind_virq_to_irqhandler(
77296 +       unsigned int virq,
77297 +       unsigned int cpu,
77298 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
77299 +       unsigned long irqflags,
77300 +       const char *devname,
77301 +       void *dev_id);
77302 +extern int bind_ipi_to_irqhandler(
77303 +       unsigned int ipi,
77304 +       unsigned int cpu,
77305 +       irqreturn_t (*handler)(int, void *, struct pt_regs *),
77306 +       unsigned long irqflags,
77307 +       const char *devname,
77308 +       void *dev_id);
77309 +
77310 +/*
77311 + * Common unbind function for all event sources. Takes IRQ to unbind from.
77312 + * Automatically closes the underlying event channel (even for bindings
77313 + * made with bind_evtchn_to_irqhandler()).
77314 + */
77315 +extern void unbind_from_irqhandler(unsigned int irq, void *dev_id);
77316 +
77317 +extern void irq_resume(void);
77318 +
77319 +/* Entry point for notifications into Linux subsystems. */
77320 +asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
77321 +
77322 +/* Entry point for notifications into the userland character device. */
77323 +extern void evtchn_device_upcall(int port);
77324 +
77325 +extern void mask_evtchn(int port);
77326 +extern void unmask_evtchn(int port);
77327 +
77328 +static inline void clear_evtchn(int port)
77329 +{
77330 +       shared_info_t *s = HYPERVISOR_shared_info;
77331 +       synch_clear_bit(port, &s->evtchn_pending[0]);
77332 +}
77333 +
77334 +static inline void notify_remote_via_evtchn(int port)
77335 +{
77336 +       evtchn_op_t op;
77337 +       op.cmd         = EVTCHNOP_send,
77338 +       op.u.send.port = port;
77339 +       (void)HYPERVISOR_event_channel_op(&op);
77340 +}
77341 +
77342 +/*
77343 + * Unlike notify_remote_via_evtchn(), this is safe to use across
77344 + * save/restore. Notifications on a broken connection are silently dropped.
77345 + */
77346 +extern void notify_remote_via_irq(int irq);
77347 +
77348 +#endif /* __ASM_EVTCHN_H__ */
77349 +
77350 +/*
77351 + * Local variables:
77352 + *  c-file-style: "linux"
77353 + *  indent-tabs-mode: t
77354 + *  c-indent-level: 8
77355 + *  c-basic-offset: 8
77356 + *  tab-width: 8
77357 + * End:
77358 + */
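A hedged sketch of binding and unbinding a handler; evtchn is assumed to come from an earlier event-channel allocation, and the handler name is illustrative:

static irqreturn_t example_interrupt(int irq, void *dev_id,
                                     struct pt_regs *regs)
{
        /* acknowledge and handle the event */
        return IRQ_HANDLED;
}

        int irq = bind_evtchn_to_irqhandler(evtchn, example_interrupt,
                                            0, "example", NULL);
        if (irq >= 0) {
                /* ... */
                unbind_from_irqhandler(irq, NULL);      /* also closes the channel */
        }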
77359 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/features.h linux-2.6.16/include/xen/features.h
77360 --- linux-2.6.16.orig/include/xen/features.h    1970-01-01 01:00:00.000000000 +0100
77361 +++ linux-2.6.16/include/xen/features.h 2006-06-26 09:51:32.000000000 +0200
77362 @@ -0,0 +1,20 @@
77363 +/******************************************************************************
77364 + * features.h
77365 + *
77366 + * Query the features reported by Xen.
77367 + *
77368 + * Copyright (c) 2006, Ian Campbell
77369 + */
77370 +
77371 +#ifndef __ASM_XEN_FEATURES_H__
77372 +#define __ASM_XEN_FEATURES_H__
77373 +
77374 +#include <xen/interface/version.h>
77375 +
77376 +extern void setup_xen_features(void);
77377 +
77378 +extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
77379 +
77380 +#define xen_feature(flag)      (xen_features[flag])
77381 +
77382 +#endif /* __ASM_XEN_FEATURES_H__ */
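A hedged usage sketch: setup_xen_features() must run once during boot before any query, and XENFEAT_auto_translated_physmap is one of the flags defined by the interface headers added elsewhere in this patch:

        setup_xen_features();

        if (xen_feature(XENFEAT_auto_translated_physmap)) {
                /* the hypervisor translates pseudo-physical addresses
                 * for us; skip manual p2m maintenance */
        }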
77383 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/foreign_page.h linux-2.6.16/include/xen/foreign_page.h
77384 --- linux-2.6.16.orig/include/xen/foreign_page.h        1970-01-01 01:00:00.000000000 +0100
77385 +++ linux-2.6.16/include/xen/foreign_page.h     2006-06-26 09:51:32.000000000 +0200
77386 @@ -0,0 +1,40 @@
77387 +/******************************************************************************
77388 + * foreign_page.h
77389 + * 
77390 + * Provide a "foreign" page type, that is owned by a foreign allocator and 
77391 + * not the normal buddy allocator in page_alloc.c
77392 + * 
77393 + * Copyright (c) 2004, K A Fraser
77394 + */
77395 +
77396 +#ifndef __ASM_XEN_FOREIGN_PAGE_H__
77397 +#define __ASM_XEN_FOREIGN_PAGE_H__
77398 +
77399 +#define PG_foreign             PG_arch_1
77400 +
77401 +#define PageForeign(page)      test_bit(PG_foreign, &(page)->flags)
77402 +
77403 +#define SetPageForeign(page, dtor) do {                \
77404 +       set_bit(PG_foreign, &(page)->flags);    \
77405 +       (page)->mapping = (void *)dtor;         \
77406 +} while (0)
77407 +
77408 +#define ClearPageForeign(page) do {            \
77409 +       clear_bit(PG_foreign, &(page)->flags);  \
77410 +       (page)->mapping = NULL;                 \
77411 +} while (0)
77412 +
77413 +#define PageForeignDestructor(page)    \
77414 +       ( (void (*) (struct page *)) (page)->mapping )
77415 +
77416 +#endif /* __ASM_XEN_FOREIGN_PAGE_H__ */
77417 +
77418 +/*
77419 + * Local variables:
77420 + *  c-file-style: "linux"
77421 + *  indent-tabs-mode: t
77422 + *  c-indent-level: 8
77423 + *  c-basic-offset: 8
77424 + *  tab-width: 8
77425 + * End:
77426 + */
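Because the destructor is stashed in page->mapping, a foreign page must never reach the normal free path with that field reinterpreted. A hedged sketch of the producer and consumer sides:

static void example_foreign_dtor(struct page *page)
{
        ClearPageForeign(page);
        /* hand the page back to its real owner here */
}

/* Producer:   SetPageForeign(page, example_foreign_dtor);
 * Free path:  if (PageForeign(page))
 *                     PageForeignDestructor(page)(page);      */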
77427 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/gnttab.h linux-2.6.16/include/xen/gnttab.h
77428 --- linux-2.6.16.orig/include/xen/gnttab.h      1970-01-01 01:00:00.000000000 +0100
77429 +++ linux-2.6.16/include/xen/gnttab.h   2006-06-26 09:51:32.000000000 +0200
77430 @@ -0,0 +1,126 @@
77431 +/******************************************************************************
77432 + * gnttab.h
77433 + * 
77434 + * Two sets of functionality:
77435 + * 1. Granting foreign access to our memory reservation.
77436 + * 2. Accessing others' memory reservations via grant references.
77437 + * (i.e., mechanisms for both sender and recipient of grant references)
77438 + * 
77439 + * Copyright (c) 2004-2005, K A Fraser
77440 + * Copyright (c) 2005, Christopher Clark
77441 + * 
77442 + * This program is free software; you can redistribute it and/or
77443 + * modify it under the terms of the GNU General Public License version 2
77444 + * as published by the Free Software Foundation; or, when distributed
77445 + * separately from the Linux kernel or incorporated into other
77446 + * software packages, subject to the following license:
77447 + * 
77448 + * Permission is hereby granted, free of charge, to any person obtaining a copy
77449 + * of this source file (the "Software"), to deal in the Software without
77450 + * restriction, including without limitation the rights to use, copy, modify,
77451 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
77452 + * and to permit persons to whom the Software is furnished to do so, subject to
77453 + * the following conditions:
77454 + * 
77455 + * The above copyright notice and this permission notice shall be included in
77456 + * all copies or substantial portions of the Software.
77457 + * 
77458 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
77459 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
77460 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
77461 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
77462 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
77463 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
77464 + * IN THE SOFTWARE.
77465 + */
77466 +
77467 +#ifndef __ASM_GNTTAB_H__
77468 +#define __ASM_GNTTAB_H__
77469 +
77470 +#include <linux/config.h>
77471 +#include <asm/hypervisor.h>
77472 +#include <xen/interface/grant_table.h>
77473 +
77474 +/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
77475 +#ifdef __ia64__
77476 +#define NR_GRANT_FRAMES 1
77477 +#else
77478 +#define NR_GRANT_FRAMES 4
77479 +#endif
77480 +
77481 +struct gnttab_free_callback {
77482 +       struct gnttab_free_callback *next;
77483 +       void (*fn)(void *);
77484 +       void *arg;
77485 +       u16 count;
77486 +};
77487 +
77488 +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
77489 +                               int readonly);
77490 +
77491 +/*
77492 + * End access through the given grant reference, iff the grant entry is no
77493 + * longer in use.  Return 1 if the grant entry was freed, 0 if it is still in
77494 + * use.
77495 + */
77496 +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
77497 +
77498 +/*
77499 + * Eventually end access through the given grant reference, and once that
77500 + * access has been ended, free the given page too.  Access will be ended
77501 + * immediately iff the grant entry is not in use, otherwise it will happen
77502 + * some time later.  page may be 0, in which case no freeing will occur.
77503 + */
77504 +void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
77505 +                              unsigned long page);
77506 +
77507 +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
77508 +
77509 +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
77510 +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
77511 +
77512 +int gnttab_query_foreign_access(grant_ref_t ref);
77513 +
77514 +/*
77515 + * operations on reserved batches of grant references
77516 + */
77517 +int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
77518 +
77519 +void gnttab_free_grant_reference(grant_ref_t ref);
77520 +
77521 +void gnttab_free_grant_references(grant_ref_t head);
77522 +
77523 +int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
77524 +
77525 +void gnttab_release_grant_reference(grant_ref_t *private_head,
77526 +                                   grant_ref_t release);
77527 +
77528 +void gnttab_request_free_callback(struct gnttab_free_callback *callback,
77529 +                                 void (*fn)(void *), void *arg, u16 count);
77530 +
77531 +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
77532 +                                    unsigned long frame, int readonly);
77533 +
77534 +void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
77535 +                                      unsigned long pfn);
77536 +
77537 +#ifdef __ia64__
77538 +#define gnttab_map_vaddr(map) __va(map.dev_bus_addr)
77539 +#else
77540 +#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
77541 +#endif
77542 +
77543 +int gnttab_suspend(void);
77544 +int gnttab_resume(void);
77545 +
77546 +#endif /* __ASM_GNTTAB_H__ */
77547 +
77548 +/*
77549 + * Local variables:
77550 + *  c-file-style: "linux"
77551 + *  indent-tabs-mode: t
77552 + *  c-indent-level: 8
77553 + *  c-basic-offset: 8
77554 + *  tab-width: 8
77555 + * End:
77556 + */
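A hedged sketch of the sender-side lifecycle for sharing one frame read-only; domid and frame are assumed inputs:

        int ref = gnttab_grant_foreign_access(domid, frame, 1 /* readonly */);

        if (ref >= 0) {
                /* ... advertise ref to the peer, which maps the frame ... */
                gnttab_end_foreign_access(ref, 1, 0 /* no page to free */);
        }
        /* on failure, gnttab_request_free_callback() can queue a retry
         * once references are released */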
77557 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/hypervisor_sysfs.h linux-2.6.16/include/xen/hypervisor_sysfs.h
77558 --- linux-2.6.16.orig/include/xen/hypervisor_sysfs.h    1970-01-01 01:00:00.000000000 +0100
77559 +++ linux-2.6.16/include/xen/hypervisor_sysfs.h 2006-06-26 09:51:32.000000000 +0200
77560 @@ -0,0 +1,32 @@
77561 +/*
77562 + *  copyright (c) 2006 IBM Corporation
77563 + *  Authored by: Mike D. Day <ncmike@us.ibm.com>
77564 + *
77565 + *  This program is free software; you can redistribute it and/or modify
77566 + *  it under the terms of the GNU General Public License version 2 as
77567 + *  published by the Free Software Foundation.
77568 + */
77569 +
77570 +#ifndef _HYP_SYSFS_H_
77571 +#define _HYP_SYSFS_H_
77572 +
77573 +#include <linux/kobject.h>
77574 +#include <linux/sysfs.h>
77575 +
77576 +#define HYPERVISOR_ATTR_RO(_name) \
77577 +static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
77578 +
77579 +#define HYPERVISOR_ATTR_RW(_name) \
77580 +static struct hyp_sysfs_attr _name##_attr = \
77581 +       __ATTR(_name, 0644, _name##_show, _name##_store)
77582 +
77583 +extern struct subsystem hypervisor_subsys;
77584 +
77585 +struct hyp_sysfs_attr {
77586 +       struct attribute attr;
77587 +       ssize_t (*show)(struct hyp_sysfs_attr *, char *);
77588 +       ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
77589 +       void *hyp_attr_data;
77590 +};
77591 +
77592 +#endif /* _HYP_SYSFS_H_ */
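The macros rely on the <name>_show / <name>_store naming convention baked into __ATTR_RO and __ATTR. A hedged sketch of a read-only attribute, with illustrative names:

static ssize_t example_show(struct hyp_sysfs_attr *attr, char *buffer)
{
        return sprintf(buffer, "42\n");
}
HYPERVISOR_ATTR_RO(example);    /* defines example_attr with .show = example_show */

/* registered later with something like:
 * sysfs_create_file(&hypervisor_subsys.kset.kobj, &example_attr.attr); */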
77593 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/COPYING linux-2.6.16/include/xen/interface/COPYING
77594 --- linux-2.6.16.orig/include/xen/interface/COPYING     1970-01-01 01:00:00.000000000 +0100
77595 +++ linux-2.6.16/include/xen/interface/COPYING  2006-06-26 09:51:32.000000000 +0200
77596 @@ -0,0 +1,28 @@
77597 +XEN NOTICE
77598 +==========
77599 +
77600 +This copyright applies to all files within this subdirectory. All
77601 +other files in the Xen source distribution are covered by version 2 of
77602 +the GNU General Public License.
77603 +
77604 + -- Keir Fraser (on behalf of the Xen team)
77605 +
77606 +=====================================================================
77607 +
77608 +Permission is hereby granted, free of charge, to any person obtaining a copy
77609 +of this software and associated documentation files (the "Software"), to
77610 +deal in the Software without restriction, including without limitation the
77611 +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
77612 +sell copies of the Software, and to permit persons to whom the Software is
77613 +furnished to do so, subject to the following conditions:
77614 +
77615 +The above copyright notice and this permission notice shall be included in
77616 +all copies or substantial portions of the Software.
77617 +
77618 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
77619 +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
77620 +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
77621 +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
77622 +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
77623 +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
77624 +DEALINGS IN THE SOFTWARE.
77625 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/acm.h linux-2.6.16/include/xen/interface/acm.h
77626 --- linux-2.6.16.orig/include/xen/interface/acm.h       1970-01-01 01:00:00.000000000 +0100
77627 +++ linux-2.6.16/include/xen/interface/acm.h    2006-06-26 09:51:32.000000000 +0200
77628 @@ -0,0 +1,181 @@
77629 +/*
77630 + * acm.h: Xen access control module interface definitions
77631 + *
77632 + * Reiner Sailer <sailer@watson.ibm.com>
77633 + * Copyright (c) 2005, International Business Machines Corporation.
77634 + */
77635 +
77636 +#ifndef _XEN_PUBLIC_ACM_H
77637 +#define _XEN_PUBLIC_ACM_H
77638 +
77639 +#include "xen.h"
77640 +#include "sched_ctl.h"
77641 +
77642 +/* If ACM_DEBUG is defined, all hooks print
77643 + * a short trace message. Comment the define
77644 + * out when not in testing mode.
77645 + */
77646 +/* #define ACM_DEBUG */
77647 +
77648 +#ifdef ACM_DEBUG
77649 +#  define printkd(fmt, args...) printk(fmt,## args)
77650 +#else
77651 +#  define printkd(fmt, args...)
77652 +#endif
77653 +
77654 +/* default ssid reference value if not supplied */
77655 +#define ACM_DEFAULT_SSID  0x0
77656 +#define ACM_DEFAULT_LOCAL_SSID  0x0
77657 +
77658 +/* Internal ACM ERROR types */
77659 +#define ACM_OK     0
77660 +#define ACM_UNDEF   -1
77661 +#define ACM_INIT_SSID_ERROR  -2
77662 +#define ACM_INIT_SOID_ERROR  -3
77663 +#define ACM_ERROR          -4
77664 +
77665 +/* External ACCESS DECISIONS */
77666 +#define ACM_ACCESS_PERMITTED        0
77667 +#define ACM_ACCESS_DENIED           -111
77668 +#define ACM_NULL_POINTER_ERROR      -200
77669 +
77670 +/* primary policy in lower 4 bits */
77671 +#define ACM_NULL_POLICY 0
77672 +#define ACM_CHINESE_WALL_POLICY 1
77673 +#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2
77674 +#define ACM_POLICY_UNDEFINED 15
77675 +
77676 +/* combinations have secondary policy component in higher 4bit */
77677 +#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY \
77678 +    ((ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY << 4) | ACM_CHINESE_WALL_POLICY)
77679 +
77680 +/* policy: */
77681 +#define ACM_POLICY_NAME(X) \
77682 + ((X) == (ACM_NULL_POLICY)) ? "NULL policy" :                        \
77683 +    ((X) == (ACM_CHINESE_WALL_POLICY)) ? "CHINESE WALL policy" :        \
77684 +    ((X) == (ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "SIMPLE TYPE ENFORCEMENT policy" : \
77685 +    ((X) == (ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT policy" : \
77686 +     "UNDEFINED policy"
77687 +
77688 +/* the following policy versions must be increased
77689 + * whenever the interpretation of the related
77690 + * policy's data structure changes
77691 + */
77692 +#define ACM_POLICY_VERSION 1
77693 +#define ACM_CHWALL_VERSION 1
77694 +#define ACM_STE_VERSION  1
77695 +
77696 +/* defines a ssid reference used by xen */
77697 +typedef uint32_t ssidref_t;
77698 +
77699 +/* hooks that are known to domains */
77700 +enum acm_hook_type {NONE=0, SHARING};
77701 +
77702 +/* -------security policy relevant type definitions-------- */
77703 +
77704 +/* type identifier; compares to "equal" or "not equal" */
77705 +typedef uint16_t domaintype_t;
77706 +
77707 +/* CHINESE WALL POLICY DATA STRUCTURES
77708 + *
77709 + * current accumulated conflict type set:
77710 + * When a domain is started and has a type that is in
77711 + * a conflict set, the conflicting types are incremented in
77712 + * the aggregate set. When a domain is destroyed, the 
77713 + * conflicting types to its type are decremented.
77714 + * If a domain has multiple types, this procedure works over
77715 + * all those types.
77716 + *
77717 + * conflict_aggregate_set[i] holds the number of
77718 + *   running domains that have a conflict with type i.
77719 + *
77720 + * running_types[i] holds the number of running domains
77721 + *        that include type i in their ssidref-referenced type set
77722 + *
77723 + * conflict_sets[i][j] is "0" if type j has no conflict
77724 + *    with type i and is "1" otherwise.
77725 + */
77726 +/* high-16 = version, low-16 = check magic */
77727 +#define ACM_MAGIC  0x0001debc
77728 +
77729 +/* each offset in bytes from start of the struct they
77730 + * are part of */
77731 +
77732 +/* each buffer consists of all policy information for
77733 + * the respective policy given in the policy code
77734 + *
77735 + * acm_policy_buffer, acm_chwall_policy_buffer,
77736 + * and acm_ste_policy_buffer need to stay 32-bit aligned
77737 + * because binary policies are also created with external
77738 + * tools that assume packed representations (e.g. the Java tool).
77739 + */
77740 +struct acm_policy_buffer {
77741 +    uint32_t policy_version; /* ACM_POLICY_VERSION */
77742 +    uint32_t magic;
77743 +    uint32_t len;
77744 +    uint32_t primary_policy_code;
77745 +    uint32_t primary_buffer_offset;
77746 +    uint32_t secondary_policy_code;
77747 +    uint32_t secondary_buffer_offset;
77748 +};
77749 +
77750 +struct acm_chwall_policy_buffer {
77751 +    uint32_t policy_version; /* ACM_CHWALL_VERSION */
77752 +    uint32_t policy_code;
77753 +    uint32_t chwall_max_types;
77754 +    uint32_t chwall_max_ssidrefs;
77755 +    uint32_t chwall_max_conflictsets;
77756 +    uint32_t chwall_ssid_offset;
77757 +    uint32_t chwall_conflict_sets_offset;
77758 +    uint32_t chwall_running_types_offset;
77759 +    uint32_t chwall_conflict_aggregate_offset;
77760 +};
77761 +
77762 +struct acm_ste_policy_buffer {
77763 +    uint32_t policy_version; /* ACM_STE_VERSION */
77764 +    uint32_t policy_code;
77765 +    uint32_t ste_max_types;
77766 +    uint32_t ste_max_ssidrefs;
77767 +    uint32_t ste_ssid_offset;
77768 +};
77769 +
77770 +struct acm_stats_buffer {
77771 +    uint32_t magic;
77772 +    uint32_t len;
77773 +    uint32_t primary_policy_code;
77774 +    uint32_t primary_stats_offset;
77775 +    uint32_t secondary_policy_code;
77776 +    uint32_t secondary_stats_offset;
77777 +};
77778 +
77779 +struct acm_ste_stats_buffer {
77780 +    uint32_t ec_eval_count;
77781 +    uint32_t gt_eval_count;
77782 +    uint32_t ec_denied_count;
77783 +    uint32_t gt_denied_count;
77784 +    uint32_t ec_cachehit_count;
77785 +    uint32_t gt_cachehit_count;
77786 +};
77787 +
77788 +struct acm_ssid_buffer {
77789 +    uint32_t len;
77790 +    ssidref_t ssidref;
77791 +    uint32_t primary_policy_code;
77792 +    uint32_t primary_max_types;
77793 +    uint32_t primary_types_offset;
77794 +    uint32_t secondary_policy_code;
77795 +    uint32_t secondary_max_types;
77796 +    uint32_t secondary_types_offset;
77797 +};
77798 +
77799 +#endif
77800 +
77801 +/*
77802 + * Local variables:
77803 + * mode: C
77804 + * c-set-style: "BSD"
77805 + * c-basic-offset: 4
77806 + * tab-width: 4
77807 + * indent-tabs-mode: nil
77808 + * End:
77809 + */
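
Illustrative decoding of the combined policy code defined above (a
non-normative sketch; the values follow directly from the macros):

    uint32_t code = ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY; /* 0x21 */
    uint32_t primary   = code & 0xf;        /* ACM_CHINESE_WALL_POLICY (1) */
    uint32_t secondary = (code >> 4) & 0xf; /* ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY (2) */
    printk("%s\n", ACM_POLICY_NAME(code));  /* prints the combined policy name */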
77810 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/acm_ops.h linux-2.6.16/include/xen/interface/acm_ops.h
77811 --- linux-2.6.16.orig/include/xen/interface/acm_ops.h   1970-01-01 01:00:00.000000000 +0100
77812 +++ linux-2.6.16/include/xen/interface/acm_ops.h        2006-06-26 09:51:32.000000000 +0200
77813 @@ -0,0 +1,98 @@
77814 +/*
77815 + * acm_ops.h: Xen access control module hypervisor commands
77816 + *
77817 + * Reiner Sailer <sailer@watson.ibm.com>
77818 + * Copyright (c) 2005, International Business Machines Corporation.
77819 + */
77820 +
77821 +#ifndef __XEN_PUBLIC_ACM_OPS_H__
77822 +#define __XEN_PUBLIC_ACM_OPS_H__
77823 +
77824 +#include "xen.h"
77825 +#include "sched_ctl.h"
77826 +#include "acm.h"
77827 +
77828 +/*
77829 + * Make sure you increment the interface version whenever you modify this file!
77830 + * This ensures that old versions of acm tools will stop working in a
77831 + * well-defined way (rather than crashing the machine, for instance).
77832 + */
77833 +#define ACM_INTERFACE_VERSION   0xAAAA0005
77834 +
77835 +/************************************************************************/
77836 +
77837 +#define ACM_SETPOLICY         4
77838 +struct acm_setpolicy {
77839 +    /* OUT variables */
77840 +    void *pushcache;
77841 +    uint32_t pushcache_size;
77842 +};
77843 +
77844 +
77845 +#define ACM_GETPOLICY         5
77846 +struct acm_getpolicy {
77847 +    /* OUT variables */
77848 +    void *pullcache;
77849 +    uint32_t pullcache_size;
77850 +};
77851 +
77852 +
77853 +#define ACM_DUMPSTATS         6
77854 +struct acm_dumpstats {
77855 +    void *pullcache;
77856 +    uint32_t pullcache_size;
77857 +};
77858 +
77859 +
77860 +#define ACM_GETSSID           7
77861 +enum get_type {UNSET=0, SSIDREF, DOMAINID};
77862 +struct acm_getssid {
77863 +    enum get_type get_ssid_by;
77864 +    union {
77865 +        domaintype_t domainid;
77866 +        ssidref_t    ssidref;
77867 +    } id;
77868 +    void *ssidbuf;
77869 +    uint32_t ssidbuf_size;
77870 +};
77871 +
77872 +#define ACM_GETDECISION        8
77873 +struct acm_getdecision {
77874 +    enum get_type get_decision_by1; /* in */
77875 +    enum get_type get_decision_by2;
77876 +    union {
77877 +        domaintype_t domainid;
77878 +        ssidref_t    ssidref;
77879 +    } id1;
77880 +    union {
77881 +        domaintype_t domainid;
77882 +        ssidref_t    ssidref;
77883 +    } id2;
77884 +    enum acm_hook_type hook;
77885 +    int acm_decision;           /* out */
77886 +};
77887 +
77888 +typedef struct acm_op {
77889 +    uint32_t cmd;
77890 +    uint32_t interface_version;      /* ACM_INTERFACE_VERSION */
77891 +    union {
77892 +        struct acm_setpolicy setpolicy;
77893 +        struct acm_getpolicy getpolicy;
77894 +        struct acm_dumpstats dumpstats;
77895 +        struct acm_getssid getssid;
77896 +        struct acm_getdecision getdecision;
77897 +    } u;
77898 +} acm_op_t;
77899 +DEFINE_GUEST_HANDLE(acm_op_t);
77900 +
77901 +#endif                          /* __XEN_PUBLIC_ACM_OPS_H__ */
77902 +
77903 +/*
77904 + * Local variables:
77905 + * mode: C
77906 + * c-set-style: "BSD"
77907 + * c-basic-offset: 4
77908 + * tab-width: 4
77909 + * indent-tabs-mode: nil
77910 + * End:
77911 + */
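
Illustrative sketch of filling in the acm_op structure for ACM_GETSSID (the
domain id and buffer `buf` are hypothetical; issuing the actual hypercall is
omitted):

    struct acm_op op;
    memset(&op, 0, sizeof(op));
    op.cmd = ACM_GETSSID;
    op.interface_version = ACM_INTERFACE_VERSION;
    op.u.getssid.get_ssid_by = DOMAINID;
    op.u.getssid.id.domainid = 1;            /* hypothetical domain */
    op.u.getssid.ssidbuf = buf;              /* caller-supplied buffer */
    op.u.getssid.ssidbuf_size = sizeof(buf);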
77912 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/arch-ia64.h linux-2.6.16/include/xen/interface/arch-ia64.h
77913 --- linux-2.6.16.orig/include/xen/interface/arch-ia64.h 1970-01-01 01:00:00.000000000 +0100
77914 +++ linux-2.6.16/include/xen/interface/arch-ia64.h      2006-06-26 09:51:32.000000000 +0200
77915 @@ -0,0 +1,337 @@
77916 +/******************************************************************************
77917 + * arch-ia64/hypervisor-if.h
77918 + * 
77919 + * Guest OS interface to IA64 Xen.
77920 + */
77921 +
77922 +#ifndef __HYPERVISOR_IF_IA64_H__
77923 +#define __HYPERVISOR_IF_IA64_H__
77924 +
77925 +#ifdef __XEN__
77926 +#define __DEFINE_GUEST_HANDLE(name, type) \
77927 +    typedef struct { type *p; } __guest_handle_ ## name
77928 +#else
77929 +#define __DEFINE_GUEST_HANDLE(name, type) \
77930 +    typedef type * __guest_handle_ ## name
77931 +#endif
77932 +
77933 +#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
77934 +#define GUEST_HANDLE(name)        __guest_handle_ ## name
77935 +
77936 +#ifndef __ASSEMBLY__
77937 +/* Guest handles for primitive C types. */
77938 +__DEFINE_GUEST_HANDLE(uchar, unsigned char);
77939 +__DEFINE_GUEST_HANDLE(uint,  unsigned int);
77940 +__DEFINE_GUEST_HANDLE(ulong, unsigned long);
77941 +DEFINE_GUEST_HANDLE(char);
77942 +DEFINE_GUEST_HANDLE(int);
77943 +DEFINE_GUEST_HANDLE(long);
77944 +DEFINE_GUEST_HANDLE(void);
77945 +#endif
77946 +
77947 +/* Maximum number of virtual CPUs in multi-processor guests. */
77948 +/* WARNING: before changing this, check that shared_info fits on a page */
77949 +#define MAX_VIRT_CPUS 4
77950 +
77951 +#ifndef __ASSEMBLY__
77952 +
77953 +#define MAX_NR_SECTION  32  /* at most 32 memory holes */
77954 +typedef struct {
77955 +    unsigned long start;  /* start of memory hole */
77956 +    unsigned long end;    /* end of memory hole */
77957 +} mm_section_t;
77958 +
77959 +typedef struct {
77960 +    unsigned long mfn : 56;
77961 +    unsigned long type: 8;
77962 +} pmt_entry_t;
77963 +
77964 +#define GPFN_MEM          (0UL << 56) /* Guest pfn is normal mem */
77965 +#define GPFN_FRAME_BUFFER (1UL << 56) /* VGA framebuffer */
77966 +#define GPFN_LOW_MMIO     (2UL << 56) /* Low MMIO range */
77967 +#define GPFN_PIB          (3UL << 56) /* PIB base */
77968 +#define GPFN_IOSAPIC      (4UL << 56) /* IOSAPIC base */
77969 +#define GPFN_LEGACY_IO    (5UL << 56) /* Legacy I/O base */
77970 +#define GPFN_GFW          (6UL << 56) /* Guest Firmware */
77971 +#define GPFN_HIGH_MMIO    (7UL << 56) /* High MMIO range */
77972 +
77973 +#define GPFN_IO_MASK     (7UL << 56)  /* Guest pfn is I/O type */
77974 +#define GPFN_INV_MASK    (31UL << 59) /* Guest pfn is invalid */
77975 +
77976 +#define INVALID_MFN       (~0UL)
77977 +
77978 +#define MEM_G   (1UL << 30)
77979 +#define MEM_M   (1UL << 20)
77980 +
77981 +#define MMIO_START       (3 * MEM_G)
77982 +#define MMIO_SIZE        (512 * MEM_M)
77983 +
77984 +#define VGA_IO_START     0xA0000UL
77985 +#define VGA_IO_SIZE      0x20000
77986 +
77987 +#define LEGACY_IO_START  (MMIO_START + MMIO_SIZE)
77988 +#define LEGACY_IO_SIZE   (64*MEM_M)
77989 +
77990 +#define IO_PAGE_START (LEGACY_IO_START + LEGACY_IO_SIZE)
77991 +#define IO_PAGE_SIZE  PAGE_SIZE
77992 +
77993 +#define STORE_PAGE_START (IO_PAGE_START + IO_PAGE_SIZE)
77994 +#define STORE_PAGE_SIZE         PAGE_SIZE
77995 +
77996 +#define IO_SAPIC_START   0xfec00000UL
77997 +#define IO_SAPIC_SIZE    0x100000
77998 +
77999 +#define PIB_START 0xfee00000UL
78000 +#define PIB_SIZE 0x100000
78001 +
78002 +#define GFW_START        (4*MEM_G -16*MEM_M)
78003 +#define GFW_SIZE         (16*MEM_M)
78004 +
78005 +/*
78006 + * NB. This may become a 64-bit count with no shift. If this happens then the 
78007 + * structure size will still be 8 bytes, so no other alignments will change.
78008 + */
78009 +typedef struct {
78010 +    unsigned int  tsc_bits;      /* 0: 32 bits read from the CPU's TSC. */
78011 +    unsigned int  tsc_bitshift;  /* 4: 'tsc_bits' uses N:N+31 of TSC.   */
78012 +} tsc_timestamp_t; /* 8 bytes */
78013 +
78014 +struct pt_fpreg {
78015 +    union {
78016 +        unsigned long bits[2];
78017 +        long double __dummy;    /* force 16-byte alignment */
78018 +    } u;
78019 +};
78020 +
78021 +typedef struct cpu_user_regs {
78022 +    /* The following registers are saved by SAVE_MIN: */
78023 +    unsigned long b6;  /* scratch */
78024 +    unsigned long b7;  /* scratch */
78025 +
78026 +    unsigned long ar_csd; /* used by cmp8xchg16 (scratch) */
78027 +    unsigned long ar_ssd; /* reserved for future use (scratch) */
78028 +
78029 +    unsigned long r8;  /* scratch (return value register 0) */
78030 +    unsigned long r9;  /* scratch (return value register 1) */
78031 +    unsigned long r10; /* scratch (return value register 2) */
78032 +    unsigned long r11; /* scratch (return value register 3) */
78033 +
78034 +    unsigned long cr_ipsr; /* interrupted task's psr */
78035 +    unsigned long cr_iip;  /* interrupted task's instruction pointer */
78036 +    unsigned long cr_ifs;  /* interrupted task's function state */
78037 +
78038 +    unsigned long ar_unat; /* interrupted task's NaT register (preserved) */
78039 +    unsigned long ar_pfs;  /* prev function state  */
78040 +    unsigned long ar_rsc;  /* RSE configuration */
78041 +    /* The following two are valid only if cr_ipsr.cpl > 0: */
78042 +    unsigned long ar_rnat;  /* RSE NaT */
78043 +    unsigned long ar_bspstore; /* RSE bspstore */
78044 +
78045 +    unsigned long pr;  /* 64 predicate registers (1 bit each) */
78046 +    unsigned long b0;  /* return pointer (bp) */
78047 +    unsigned long loadrs;  /* size of dirty partition << 16 */
78048 +
78049 +    unsigned long r1;  /* the gp pointer */
78050 +    unsigned long r12; /* interrupted task's memory stack pointer */
78051 +    unsigned long r13; /* thread pointer */
78052 +
78053 +    unsigned long ar_fpsr;  /* floating point status (preserved) */
78054 +    unsigned long r15;  /* scratch */
78055 +
78056 + /* The remaining registers are NOT saved for system calls.  */
78057 +
78058 +    unsigned long r14;  /* scratch */
78059 +    unsigned long r2;  /* scratch */
78060 +    unsigned long r3;  /* scratch */
78061 +    unsigned long r16;  /* scratch */
78062 +    unsigned long r17;  /* scratch */
78063 +    unsigned long r18;  /* scratch */
78064 +    unsigned long r19;  /* scratch */
78065 +    unsigned long r20;  /* scratch */
78066 +    unsigned long r21;  /* scratch */
78067 +    unsigned long r22;  /* scratch */
78068 +    unsigned long r23;  /* scratch */
78069 +    unsigned long r24;  /* scratch */
78070 +    unsigned long r25;  /* scratch */
78071 +    unsigned long r26;  /* scratch */
78072 +    unsigned long r27;  /* scratch */
78073 +    unsigned long r28;  /* scratch */
78074 +    unsigned long r29;  /* scratch */
78075 +    unsigned long r30;  /* scratch */
78076 +    unsigned long r31;  /* scratch */
78077 +    unsigned long ar_ccv;  /* compare/exchange value (scratch) */
78078 +
78079 +    /*
78080 +     * Floating point registers that the kernel considers scratch:
78081 +     */
78082 +    struct pt_fpreg f6;  /* scratch */
78083 +    struct pt_fpreg f7;  /* scratch */
78084 +    struct pt_fpreg f8;  /* scratch */
78085 +    struct pt_fpreg f9;  /* scratch */
78086 +    struct pt_fpreg f10;  /* scratch */
78087 +    struct pt_fpreg f11;  /* scratch */
78088 +    unsigned long r4;  /* preserved */
78089 +    unsigned long r5;  /* preserved */
78090 +    unsigned long r6;  /* preserved */
78091 +    unsigned long r7;  /* preserved */
78092 +    unsigned long eml_unat;    /* used for emulating instructions */
78093 +    unsigned long rfi_pfs;     /* used for emulating rfi */
78094 +
78095 +} cpu_user_regs_t;
78096 +
78097 +typedef union {
78098 +    unsigned long value;
78099 +    struct {
78100 +        int a_int:1;
78101 +        int a_from_int_cr:1;
78102 +        int a_to_int_cr:1;
78103 +        int a_from_psr:1;
78104 +        int a_from_cpuid:1;
78105 +        int a_cover:1;
78106 +        int a_bsw:1;
78107 +        long reserved:57;
78108 +    };
78109 +} vac_t;
78110 +
78111 +typedef union {
78112 +    unsigned long value;
78113 +    struct {
78114 +        int d_vmsw:1;
78115 +        int d_extint:1;
78116 +        int d_ibr_dbr:1;
78117 +        int d_pmc:1;
78118 +        int d_to_pmd:1;
78119 +        int d_itm:1;
78120 +        long reserved:58;
78121 +    };
78122 +} vdc_t;
78123 +
78124 +typedef struct {
78125 +    vac_t   vac;
78126 +    vdc_t   vdc;
78127 +    unsigned long  virt_env_vaddr;
78128 +    unsigned long  reserved1[29];
78129 +    unsigned long  vhpi;
78130 +    unsigned long  reserved2[95];
78131 +    union {
78132 +        unsigned long  vgr[16];
78133 +        unsigned long bank1_regs[16]; // bank1 regs (r16-r31) when bank0 active
78134 +    };
78135 +    union {
78136 +        unsigned long  vbgr[16];
78137 +        unsigned long bank0_regs[16]; // bank0 regs (r16-r31) when bank1 active
78138 +    };
78139 +    unsigned long  vnat;
78140 +    unsigned long  vbnat;
78141 +    unsigned long  vcpuid[5];
78142 +    unsigned long  reserved3[11];
78143 +    unsigned long  vpsr;
78144 +    unsigned long  vpr;
78145 +    unsigned long  reserved4[76];
78146 +    union {
78147 +        unsigned long  vcr[128];
78148 +        struct {
78149 +            unsigned long dcr;  // CR0
78150 +            unsigned long itm;
78151 +            unsigned long iva;
78152 +            unsigned long rsv1[5];
78153 +            unsigned long pta;  // CR8
78154 +            unsigned long rsv2[7];
78155 +            unsigned long ipsr;  // CR16
78156 +            unsigned long isr;
78157 +            unsigned long rsv3;
78158 +            unsigned long iip;
78159 +            unsigned long ifa;
78160 +            unsigned long itir;
78161 +            unsigned long iipa;
78162 +            unsigned long ifs;
78163 +            unsigned long iim;  // CR24
78164 +            unsigned long iha;
78165 +            unsigned long rsv4[38];
78166 +            unsigned long lid;  // CR64
78167 +            unsigned long ivr;
78168 +            unsigned long tpr;
78169 +            unsigned long eoi;
78170 +            unsigned long irr[4];
78171 +            unsigned long itv;  // CR72
78172 +            unsigned long pmv;
78173 +            unsigned long cmcv;
78174 +            unsigned long rsv5[5];
78175 +            unsigned long lrr0;  // CR80
78176 +            unsigned long lrr1;
78177 +            unsigned long rsv6[46];
78178 +        };
78179 +    };
78180 +    union {
78181 +        unsigned long  reserved5[128];
78182 +        struct {
78183 +            unsigned long precover_ifs;
78184 +            unsigned long unat;  // not sure if this is needed until NaT arch is done
78185 +            int interrupt_collection_enabled; // virtual psr.ic
78186 +            int interrupt_delivery_enabled; // virtual psr.i
78187 +            int pending_interruption;
78188 +            int incomplete_regframe; // see SDM vol2 6.8
78189 +            unsigned long reserved5_1[4];
78190 +            int metaphysical_mode; // 1 = use metaphys mapping, 0 = use virtual
78191 +            int banknum; // 0 or 1, which virtual register bank is active
78192 +            unsigned long rrs[8]; // region registers
78193 +            unsigned long krs[8]; // kernel registers
78194 +            unsigned long pkrs[8]; // protection key registers
78195 +            unsigned long tmp[8]; // temp registers (e.g. for hyperprivops)
78196 +            // FIXME: tmp[8] is temporarily being used for virtual psr.pp
78197 +        };
78198 +    };
78199 +    unsigned long  reserved6[3456];
78200 +    unsigned long  vmm_avail[128];
78201 +    unsigned long  reserved7[4096];
78202 +} mapped_regs_t;
78203 +
78204 +typedef struct {
78205 +    mapped_regs_t *privregs;
78206 +    int evtchn_vector;
78207 +} arch_vcpu_info_t;
78208 +
78209 +typedef mapped_regs_t vpd_t;
78210 +
78211 +typedef struct {
78212 +    unsigned int flags;
78213 +    unsigned long start_info_pfn;
78214 +} arch_shared_info_t;
78215 +
78216 +typedef struct {
78217 +    unsigned long start;
78218 +    unsigned long size;
78219 +} arch_initrd_info_t;
78220 +
78221 +#define IA64_COMMAND_LINE_SIZE 512
78222 +typedef struct vcpu_guest_context {
78223 +#define VGCF_FPU_VALID (1<<0)
78224 +#define VGCF_VMX_GUEST (1<<1)
78225 +#define VGCF_IN_KERNEL (1<<2)
78226 +    unsigned long flags;       /* VGCF_* flags */
78227 +    unsigned long pt_base;     /* PMT table base */
78228 +    unsigned long share_io_pg; /* Shared page for I/O emulation */
78229 +    unsigned long sys_pgnr;    /* System pages out of domain memory */
78230 +    unsigned long vm_assist;   /* VMASST_TYPE_* bitmap, now none on IPF */
78231 +
78232 +    cpu_user_regs_t regs;
78233 +    arch_vcpu_info_t vcpu;
78234 +    arch_shared_info_t shared;
78235 +    arch_initrd_info_t initrd;
78236 +    char cmdline[IA64_COMMAND_LINE_SIZE];
78237 +} vcpu_guest_context_t;
78238 +DEFINE_GUEST_HANDLE(vcpu_guest_context_t);
78239 +
78240 +#endif /* !__ASSEMBLY__ */
78241 +
78242 +#endif /* __HYPERVISOR_IF_IA64_H__ */
78243 +
78244 +/*
78245 + * Local variables:
78246 + * mode: C
78247 + * c-set-style: "BSD"
78248 + * c-basic-offset: 4
78249 + * tab-width: 4
78250 + * indent-tabs-mode: nil
78251 + * End:
78252 + */
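
Illustrative use of the GPFN type encoding defined above (a non-normative
sketch; the entry value is made up):

    unsigned long entry = GPFN_IOSAPIC | 0xfec00UL;  /* hypothetical pfn + type */
    if (entry & GPFN_INV_MASK)
        ;  /* invalid guest pfn */
    else if ((entry & GPFN_IO_MASK) == GPFN_IOSAPIC)
        ;  /* pfn falls in the IOSAPIC range */
    else if ((entry & GPFN_IO_MASK) == GPFN_MEM)
        ;  /* normal memory */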
78253 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/arch-x86_32.h linux-2.6.16/include/xen/interface/arch-x86_32.h
78254 --- linux-2.6.16.orig/include/xen/interface/arch-x86_32.h       1970-01-01 01:00:00.000000000 +0100
78255 +++ linux-2.6.16/include/xen/interface/arch-x86_32.h    2006-06-26 09:51:32.000000000 +0200
78256 @@ -0,0 +1,195 @@
78257 +/******************************************************************************
78258 + * arch-x86_32.h
78259 + * 
78260 + * Guest OS interface to x86 32-bit Xen.
78261 + * 
78262 + * Copyright (c) 2004, K A Fraser
78263 + */
78264 +
78265 +#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
78266 +#define __XEN_PUBLIC_ARCH_X86_32_H__
78267 +
78268 +#ifdef __XEN__
78269 +#define __DEFINE_GUEST_HANDLE(name, type) \
78270 +    typedef struct { type *p; } __guest_handle_ ## name
78271 +#else
78272 +#define __DEFINE_GUEST_HANDLE(name, type) \
78273 +    typedef type * __guest_handle_ ## name
78274 +#endif
78275 +
78276 +#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
78277 +#define GUEST_HANDLE(name)        __guest_handle_ ## name
78278 +
78279 +#ifndef __ASSEMBLY__
78280 +/* Guest handles for primitive C types. */
78281 +__DEFINE_GUEST_HANDLE(uchar, unsigned char);
78282 +__DEFINE_GUEST_HANDLE(uint,  unsigned int);
78283 +__DEFINE_GUEST_HANDLE(ulong, unsigned long);
78284 +DEFINE_GUEST_HANDLE(char);
78285 +DEFINE_GUEST_HANDLE(int);
78286 +DEFINE_GUEST_HANDLE(long);
78287 +DEFINE_GUEST_HANDLE(void);
78288 +#endif
78289 +
78290 +/*
78291 + * SEGMENT DESCRIPTOR TABLES
78292 + */
78293 +/*
78294 + * A number of GDT entries are reserved by Xen. These are not situated at the
78295 + * start of the GDT because some stupid OSes export hard-coded selector values
78296 + * in their ABI. These hard-coded values are always near the start of the GDT,
78297 + * so Xen places itself out of the way, at the far end of the GDT.
78298 + */
78299 +#define FIRST_RESERVED_GDT_PAGE  14
78300 +#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
78301 +#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
78302 +
78303 +/*
78304 + * These flat segments are in the Xen-private section of every GDT. Since these
78305 + * are also present in the initial GDT, many OSes will be able to avoid
78306 + * installing their own GDT.
78307 + */
78308 +#define FLAT_RING1_CS 0xe019    /* GDT index 259 */
78309 +#define FLAT_RING1_DS 0xe021    /* GDT index 260 */
78310 +#define FLAT_RING1_SS 0xe021    /* GDT index 260 */
78311 +#define FLAT_RING3_CS 0xe02b    /* GDT index 261 */
78312 +#define FLAT_RING3_DS 0xe033    /* GDT index 262 */
78313 +#define FLAT_RING3_SS 0xe033    /* GDT index 262 */
78314 +
78315 +#define FLAT_KERNEL_CS FLAT_RING1_CS
78316 +#define FLAT_KERNEL_DS FLAT_RING1_DS
78317 +#define FLAT_KERNEL_SS FLAT_RING1_SS
78318 +#define FLAT_USER_CS    FLAT_RING3_CS
78319 +#define FLAT_USER_DS    FLAT_RING3_DS
78320 +#define FLAT_USER_SS    FLAT_RING3_SS
78321 +
78322 +/* And the trap vector is... */
78323 +#define TRAP_INSTR "int $0x82"
78324 +
78325 +/*
78326 + * Virtual addresses beyond this are not modifiable by guest OSes. The 
78327 + * machine->physical mapping table starts at this address, read-only.
78328 + */
78329 +#ifdef CONFIG_X86_PAE
78330 +#define __HYPERVISOR_VIRT_START 0xF5800000
78331 +#else
78332 +#define __HYPERVISOR_VIRT_START 0xFC000000
78333 +#endif
78334 +
78335 +#ifndef HYPERVISOR_VIRT_START
78336 +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
78337 +#endif
78338 +
78339 +#ifndef machine_to_phys_mapping
78340 +#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
78341 +#endif
78342 +
78343 +/* Maximum number of virtual CPUs in multi-processor guests. */
78344 +#define MAX_VIRT_CPUS 32
78345 +
78346 +#ifndef __ASSEMBLY__
78347 +
78348 +/*
78349 + * Send an array of these to HYPERVISOR_set_trap_table()
78350 + */
78351 +#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
78352 +#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
78353 +#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
78354 +#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))
78355 +typedef struct trap_info {
78356 +    uint8_t       vector;  /* exception vector                              */
78357 +    uint8_t       flags;   /* 0-3: privilege level; 4: clear event enable?  */
78358 +    uint16_t      cs;      /* code selector                                 */
78359 +    unsigned long address; /* code offset                                   */
78360 +} trap_info_t;
78361 +DEFINE_GUEST_HANDLE(trap_info_t);
78362 +
78363 +typedef struct cpu_user_regs {
78364 +    uint32_t ebx;
78365 +    uint32_t ecx;
78366 +    uint32_t edx;
78367 +    uint32_t esi;
78368 +    uint32_t edi;
78369 +    uint32_t ebp;
78370 +    uint32_t eax;
78371 +    uint16_t error_code;    /* private */
78372 +    uint16_t entry_vector;  /* private */
78373 +    uint32_t eip;
78374 +    uint16_t cs;
78375 +    uint8_t  saved_upcall_mask;
78376 +    uint8_t  _pad0;
78377 +    uint32_t eflags;        /* eflags.IF == !saved_upcall_mask */
78378 +    uint32_t esp;
78379 +    uint16_t ss, _pad1;
78380 +    uint16_t es, _pad2;
78381 +    uint16_t ds, _pad3;
78382 +    uint16_t fs, _pad4;
78383 +    uint16_t gs, _pad5;
78384 +} cpu_user_regs_t;
78385 +DEFINE_GUEST_HANDLE(cpu_user_regs_t);
78386 +
78387 +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
78388 +
78389 +/*
78390 + * The following is all CPU context. Note that the fpu_ctxt block is filled 
78391 + * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
78392 + */
78393 +typedef struct vcpu_guest_context {
78394 +    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
78395 +    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
78396 +#define VGCF_I387_VALID (1<<0)
78397 +#define VGCF_HVM_GUEST  (1<<1)
78398 +#define VGCF_IN_KERNEL  (1<<2)
78399 +    unsigned long flags;                    /* VGCF_* flags                 */
78400 +    cpu_user_regs_t user_regs;              /* User-level CPU registers     */
78401 +    struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
78402 +    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
78403 +    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
78404 +    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
78405 +    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
78406 +    unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
78407 +    unsigned long event_callback_cs;        /* CS:EIP of event callback     */
78408 +    unsigned long event_callback_eip;
78409 +    unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
78410 +    unsigned long failsafe_callback_eip;
78411 +    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
78412 +} vcpu_guest_context_t;
78413 +DEFINE_GUEST_HANDLE(vcpu_guest_context_t);
78414 +
78415 +typedef struct arch_shared_info {
78416 +    unsigned long max_pfn;                  /* max pfn that appears in table */
78417 +    /* Frame containing list of mfns containing list of mfns containing p2m. */
78418 +    unsigned long pfn_to_mfn_frame_list_list;
78419 +    unsigned long nmi_reason;
78420 +} arch_shared_info_t;
78421 +
78422 +typedef struct {
78423 +    unsigned long cr2;
78424 +    unsigned long pad[5]; /* sizeof(vcpu_info_t) == 64 */
78425 +} arch_vcpu_info_t;
78426 +
78427 +#endif /* !__ASSEMBLY__ */
78428 +
78429 +/*
78430 + * Prefix forces emulation of some non-trapping instructions.
78431 + * Currently only CPUID.
78432 + */
78433 +#ifdef __ASSEMBLY__
78434 +#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
78435 +#define XEN_CPUID          XEN_EMULATE_PREFIX cpuid
78436 +#else
78437 +#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
78438 +#define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
78439 +#endif
78440 +
78441 +#endif
78442 +
78443 +/*
78444 + * Local variables:
78445 + * mode: C
78446 + * c-set-style: "BSD"
78447 + * c-basic-offset: 4
78448 + * tab-width: 4
78449 + * indent-tabs-mode: nil
78450 + * End:
78451 + */
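
Illustrative sketch of building one virtual IDT entry with the TI_* helpers
above (the vector and handler address are hypothetical):

    trap_info_t ti = {
        .vector  = 0x80,                      /* hypothetical trap vector */
        .cs      = FLAT_KERNEL_CS,
        .address = (unsigned long)my_handler, /* hypothetical entry point */
    };
    TI_SET_DPL(&ti, 3);   /* ring 3 (user mode) may raise this trap */
    TI_SET_IF(&ti, 1);    /* mask event delivery while handling it */
    /* an array of such entries is passed to HYPERVISOR_set_trap_table() */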
78452 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/arch-x86_64.h linux-2.6.16/include/xen/interface/arch-x86_64.h
78453 --- linux-2.6.16.orig/include/xen/interface/arch-x86_64.h       1970-01-01 01:00:00.000000000 +0100
78454 +++ linux-2.6.16/include/xen/interface/arch-x86_64.h    2006-06-26 09:51:32.000000000 +0200
78455 @@ -0,0 +1,271 @@
78456 +/******************************************************************************
78457 + * arch-x86_64.h
78458 + * 
78459 + * Guest OS interface to x86 64-bit Xen.
78460 + * 
78461 + * Copyright (c) 2004, K A Fraser
78462 + */
78463 +
78464 +#ifndef __XEN_PUBLIC_ARCH_X86_64_H__
78465 +#define __XEN_PUBLIC_ARCH_X86_64_H__
78466 +
78467 +#ifdef __XEN__
78468 +#define __DEFINE_GUEST_HANDLE(name, type) \
78469 +    typedef struct { type *p; } __guest_handle_ ## name
78470 +#else
78471 +#define __DEFINE_GUEST_HANDLE(name, type) \
78472 +    typedef type * __guest_handle_ ## name
78473 +#endif
78474 +
78475 +#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
78476 +#define GUEST_HANDLE(name)        __guest_handle_ ## name
78477 +
78478 +#ifndef __ASSEMBLY__
78479 +/* Guest handles for primitive C types. */
78480 +__DEFINE_GUEST_HANDLE(uchar, unsigned char);
78481 +__DEFINE_GUEST_HANDLE(uint,  unsigned int);
78482 +__DEFINE_GUEST_HANDLE(ulong, unsigned long);
78483 +DEFINE_GUEST_HANDLE(char);
78484 +DEFINE_GUEST_HANDLE(int);
78485 +DEFINE_GUEST_HANDLE(long);
78486 +DEFINE_GUEST_HANDLE(void);
78487 +#endif
78488 +
78489 +/*
78490 + * SEGMENT DESCRIPTOR TABLES
78491 + */
78492 +/*
78493 + * A number of GDT entries are reserved by Xen. These are not situated at the
78494 + * start of the GDT because some stupid OSes export hard-coded selector values
78495 + * in their ABI. These hard-coded values are always near the start of the GDT,
78496 + * so Xen places itself out of the way, at the far end of the GDT.
78497 + */
78498 +#define FIRST_RESERVED_GDT_PAGE  14
78499 +#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
78500 +#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
78501 +
78502 +/*
78503 + * 64-bit segment selectors
78504 + * These flat segments are in the Xen-private section of every GDT. Since these
78505 + * are also present in the initial GDT, many OSes will be able to avoid
78506 + * installing their own GDT.
78507 + */
78508 +
78509 +#define FLAT_RING3_CS32 0xe023  /* GDT index 260 */
78510 +#define FLAT_RING3_CS64 0xe033  /* GDT index 261 */
78511 +#define FLAT_RING3_DS32 0xe02b  /* GDT index 262 */
78512 +#define FLAT_RING3_DS64 0x0000  /* NULL selector */
78513 +#define FLAT_RING3_SS32 0xe02b  /* GDT index 262 */
78514 +#define FLAT_RING3_SS64 0xe02b  /* GDT index 262 */
78515 +
78516 +#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
78517 +#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
78518 +#define FLAT_KERNEL_DS   FLAT_KERNEL_DS64
78519 +#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
78520 +#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
78521 +#define FLAT_KERNEL_CS   FLAT_KERNEL_CS64
78522 +#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
78523 +#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
78524 +#define FLAT_KERNEL_SS   FLAT_KERNEL_SS64
78525 +
78526 +#define FLAT_USER_DS64 FLAT_RING3_DS64
78527 +#define FLAT_USER_DS32 FLAT_RING3_DS32
78528 +#define FLAT_USER_DS   FLAT_USER_DS64
78529 +#define FLAT_USER_CS64 FLAT_RING3_CS64
78530 +#define FLAT_USER_CS32 FLAT_RING3_CS32
78531 +#define FLAT_USER_CS   FLAT_USER_CS64
78532 +#define FLAT_USER_SS64 FLAT_RING3_SS64
78533 +#define FLAT_USER_SS32 FLAT_RING3_SS32
78534 +#define FLAT_USER_SS   FLAT_USER_SS64
78535 +
78536 +/* And the trap vector is... */
78537 +#define TRAP_INSTR "syscall"
78538 +
78539 +#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
78540 +#define __HYPERVISOR_VIRT_END   0xFFFF880000000000
78541 +
78542 +#ifndef HYPERVISOR_VIRT_START
78543 +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
78544 +#define HYPERVISOR_VIRT_END   mk_unsigned_long(__HYPERVISOR_VIRT_END)
78545 +#endif
78546 +
78547 +/* Maximum number of virtual CPUs in multi-processor guests. */
78548 +#define MAX_VIRT_CPUS 32
78549 +
78550 +#ifndef __ASSEMBLY__
78551 +
78552 +/* The machine->physical mapping table starts at this address, read-only. */
78553 +#ifndef machine_to_phys_mapping
78554 +#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
78555 +#endif
78556 +
78557 +/*
78558 + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
78559 + *  @which == SEGBASE_*  ;  @base == 64-bit base address
78560 + * Returns 0 on success.
78561 + */
78562 +#define SEGBASE_FS          0
78563 +#define SEGBASE_GS_USER     1
78564 +#define SEGBASE_GS_KERNEL   2
78565 +#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
78566 +
78567 +/*
78568 + * int HYPERVISOR_iret(void)
78569 + * All arguments are on the kernel stack, in the following format.
78570 + * Never returns if successful. Current kernel context is lost.
78571 + * The saved CS is mapped as follows:
78572 + *   RING0 -> RING3 kernel mode.
78573 + *   RING1 -> RING3 kernel mode.
78574 + *   RING2 -> RING3 kernel mode.
78575 + *   RING3 -> RING3 user mode.
78576 + * However, RING0 indicates that the guest kernel should return to itself
78577 + * directly with
78578 + *      orb   $3,1*8(%rsp)
78579 + *      iretq
78580 + * If flags contains VGCF_IN_SYSCALL:
78581 + *   Restore RAX, RIP, RFLAGS, RSP.
78582 + *   Discard R11, RCX, CS, SS.
78583 + * Otherwise:
78584 + *   Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
78585 + * All other registers are saved on hypercall entry and restored to user.
78586 + */
78587 +/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
78588 +#define VGCF_IN_SYSCALL (1<<8)
78589 +struct iret_context {
78590 +    /* Top of stack (%rsp at point of hypercall). */
78591 +    uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
78592 +    /* Bottom of iret stack frame. */
78593 +};
78594 +
78595 +/*
78596 + * Send an array of these to HYPERVISOR_set_trap_table().
78597 + * N.B. As in x86/32 mode, the privilege level specifies which modes may enter
78598 + * a trap via a software interrupt. Since rings 1 and 2 are unavailable, we
78599 + * allocate privilege levels as follows:
78600 + *  Level == 0: No one may enter
78601 + *  Level == 1: Kernel may enter
78602 + *  Level == 2: Kernel may enter
78603 + *  Level == 3: Everyone may enter
78604 + */
78605 +#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
78606 +#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
78607 +#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
78608 +#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))
78609 +typedef struct trap_info {
78610 +    uint8_t       vector;  /* exception vector                              */
78611 +    uint8_t       flags;   /* 0-3: privilege level; 4: clear event enable?  */
78612 +    uint16_t      cs;      /* code selector                                 */
78613 +    unsigned long address; /* code offset                                   */
78614 +} trap_info_t;
78615 +DEFINE_GUEST_HANDLE(trap_info_t);
78616 +
78617 +#ifdef __GNUC__
78618 +/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
78619 +#define __DECL_REG(name) union { uint64_t r ## name, e ## name; }
78620 +#else
78621 +/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
78622 +#define __DECL_REG(name) uint64_t r ## name
78623 +#endif
78624 +
78625 +typedef struct cpu_user_regs {
78626 +    uint64_t r15;
78627 +    uint64_t r14;
78628 +    uint64_t r13;
78629 +    uint64_t r12;
78630 +    __DECL_REG(bp);
78631 +    __DECL_REG(bx);
78632 +    uint64_t r11;
78633 +    uint64_t r10;
78634 +    uint64_t r9;
78635 +    uint64_t r8;
78636 +    __DECL_REG(ax);
78637 +    __DECL_REG(cx);
78638 +    __DECL_REG(dx);
78639 +    __DECL_REG(si);
78640 +    __DECL_REG(di);
78641 +    uint32_t error_code;    /* private */
78642 +    uint32_t entry_vector;  /* private */
78643 +    __DECL_REG(ip);
78644 +    uint16_t cs, _pad0[1];
78645 +    uint8_t  saved_upcall_mask;
78646 +    uint8_t  _pad1[3];
78647 +    __DECL_REG(flags);      /* rflags.IF == !saved_upcall_mask */
78648 +    __DECL_REG(sp);
78649 +    uint16_t ss, _pad2[3];
78650 +    uint16_t es, _pad3[3];
78651 +    uint16_t ds, _pad4[3];
78652 +    uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base.     */
78653 +    uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
78654 +} cpu_user_regs_t;
78655 +DEFINE_GUEST_HANDLE(cpu_user_regs_t);
78656 +
78657 +#undef __DECL_REG
78658 +
78659 +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
78660 +
78661 +/*
78662 + * The following is all CPU context. Note that the fpu_ctxt block is filled 
78663 + * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
78664 + */
78665 +typedef struct vcpu_guest_context {
78666 +    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
78667 +    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
78668 +#define VGCF_I387_VALID (1<<0)
78669 +#define VGCF_HVM_GUEST  (1<<1)
78670 +#define VGCF_IN_KERNEL  (1<<2)
78671 +    unsigned long flags;                    /* VGCF_* flags                 */
78672 +    cpu_user_regs_t user_regs;              /* User-level CPU registers     */
78673 +    struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
78674 +    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
78675 +    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
78676 +    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
78677 +    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
78678 +    unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
78679 +    unsigned long event_callback_eip;
78680 +    unsigned long failsafe_callback_eip;
78681 +    unsigned long syscall_callback_eip;
78682 +    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
78683 +    /* Segment base addresses. */
78684 +    uint64_t      fs_base;
78685 +    uint64_t      gs_base_kernel;
78686 +    uint64_t      gs_base_user;
78687 +} vcpu_guest_context_t;
78688 +DEFINE_GUEST_HANDLE(vcpu_guest_context_t);
78689 +
78690 +typedef struct arch_shared_info {
78691 +    unsigned long max_pfn;                  /* max pfn that appears in table */
78692 +    /* Frame containing list of mfns containing list of mfns containing p2m. */
78693 +    unsigned long pfn_to_mfn_frame_list_list;
78694 +    unsigned long nmi_reason;
78695 +} arch_shared_info_t;
78696 +
78697 +typedef struct {
78698 +    unsigned long cr2;
78699 +    unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
78700 +} arch_vcpu_info_t;
78701 +
78702 +#endif /* !__ASSEMBLY__ */
78703 +
78704 +/*
78705 + * Prefix forces emulation of some non-trapping instructions.
78706 + * Currently only CPUID.
78707 + */
78708 +#ifdef __ASSEMBLY__
78709 +#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
78710 +#define XEN_CPUID          XEN_EMULATE_PREFIX cpuid
78711 +#else
78712 +#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
78713 +#define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
78714 +#endif
78715 +
78716 +#endif
78717 +
78718 +/*
78719 + * Local variables:
78720 + * mode: C
78721 + * c-set-style: "BSD"
78722 + * c-basic-offset: 4
78723 + * tab-width: 4
78724 + * indent-tabs-mode: nil
78725 + * End:
78726 + */
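
Illustrative use of the emulate prefix defined above to force a trapped
CPUID from guest C code (a non-normative sketch; leaf 0x40000000 is the
conventional hypervisor leaf and is assumed here):

    unsigned int eax = 0x40000000, ebx, ecx, edx;
    asm volatile ( XEN_CPUID
                   : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                   : "0" (eax) );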
78727 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/dom0_ops.h linux-2.6.16/include/xen/interface/dom0_ops.h
78728 --- linux-2.6.16.orig/include/xen/interface/dom0_ops.h  1970-01-01 01:00:00.000000000 +0100
78729 +++ linux-2.6.16/include/xen/interface/dom0_ops.h       2006-06-26 09:51:32.000000000 +0200
78730 @@ -0,0 +1,531 @@
78731 +/******************************************************************************
78732 + * dom0_ops.h
78733 + * 
78734 + * Process command requests from domain-0 guest OS.
78735 + * 
78736 + * Copyright (c) 2002-2003, B Dragovic
78737 + * Copyright (c) 2002-2004, K Fraser
78738 + */
78739 +
78740 +
78741 +#ifndef __XEN_PUBLIC_DOM0_OPS_H__
78742 +#define __XEN_PUBLIC_DOM0_OPS_H__
78743 +
78744 +#include "xen.h"
78745 +#include "sched_ctl.h"
78746 +
78747 +/*
78748 + * Make sure you increment the interface version whenever you modify this file!
78749 + * This ensures that old versions of dom0 tools will stop working in a
78750 + * well-defined way (rather than crashing the machine, for instance).
78751 + */
78752 +#define DOM0_INTERFACE_VERSION   0x03000000
78753 +
78754 +/************************************************************************/
78755 +
78756 +#define DOM0_GETMEMLIST        2
78757 +typedef struct dom0_getmemlist {
78758 +    /* IN variables. */
78759 +    domid_t       domain;
78760 +    unsigned long max_pfns;
78761 +    GUEST_HANDLE(ulong) buffer;
78762 +    /* OUT variables. */
78763 +    unsigned long num_pfns;
78764 +} dom0_getmemlist_t;
78765 +DEFINE_GUEST_HANDLE(dom0_getmemlist_t);
78766 +
78767 +#define DOM0_SCHEDCTL          6
78768 + /* struct sched_ctl_cmd is from sched-ctl.h   */
78769 +typedef struct sched_ctl_cmd dom0_schedctl_t;
78770 +DEFINE_GUEST_HANDLE(dom0_schedctl_t);
78771 +
78772 +#define DOM0_ADJUSTDOM         7
78773 +/* struct sched_adjdom_cmd is from sched-ctl.h */
78774 +typedef struct sched_adjdom_cmd dom0_adjustdom_t;
78775 +DEFINE_GUEST_HANDLE(dom0_adjustdom_t);
78776 +
78777 +#define DOM0_CREATEDOMAIN      8
78778 +typedef struct dom0_createdomain {
78779 +    /* IN parameters */
78780 +    uint32_t ssidref;
78781 +    xen_domain_handle_t handle;
78782 +    /* IN/OUT parameters. */
78783 +    /* Identifier for new domain (auto-allocate if zero is specified). */
78784 +    domid_t domain;
78785 +} dom0_createdomain_t;
78786 +DEFINE_GUEST_HANDLE(dom0_createdomain_t);
78787 +
78788 +#define DOM0_DESTROYDOMAIN     9
78789 +typedef struct dom0_destroydomain {
78790 +    /* IN variables. */
78791 +    domid_t domain;
78792 +} dom0_destroydomain_t;
78793 +DEFINE_GUEST_HANDLE(dom0_destroydomain_t);
78794 +
78795 +#define DOM0_PAUSEDOMAIN      10
78796 +typedef struct dom0_pausedomain {
78797 +    /* IN parameters. */
78798 +    domid_t domain;
78799 +} dom0_pausedomain_t;
78800 +DEFINE_GUEST_HANDLE(dom0_pausedomain_t);
78801 +
78802 +#define DOM0_UNPAUSEDOMAIN    11
78803 +typedef struct dom0_unpausedomain {
78804 +    /* IN parameters. */
78805 +    domid_t domain;
78806 +} dom0_unpausedomain_t;
78807 +DEFINE_GUEST_HANDLE(dom0_unpausedomain_t);
78808 +
78809 +#define DOM0_GETDOMAININFO    12
78810 +typedef struct dom0_getdomaininfo {
78811 +    /* IN variables. */
78812 +    domid_t  domain;                  /* NB. IN/OUT variable. */
78813 +    /* OUT variables. */
78814 +#define DOMFLAGS_DYING     (1<<0) /* Domain is scheduled to die.             */
78815 +#define DOMFLAGS_SHUTDOWN  (1<<2) /* The guest OS has shut down.             */
78816 +#define DOMFLAGS_PAUSED    (1<<3) /* Currently paused by control software.   */
78817 +#define DOMFLAGS_BLOCKED   (1<<4) /* Currently blocked pending an event.     */
78818 +#define DOMFLAGS_RUNNING   (1<<5) /* Domain is currently running.            */
78819 +#define DOMFLAGS_CPUMASK      255 /* CPU to which this domain is bound.      */
78820 +#define DOMFLAGS_CPUSHIFT       8
78821 +#define DOMFLAGS_SHUTDOWNMASK 255 /* DOMFLAGS_SHUTDOWN guest-supplied code.  */
78822 +#define DOMFLAGS_SHUTDOWNSHIFT 16
78823 +    uint32_t flags;
78824 +    unsigned long tot_pages;
78825 +    unsigned long max_pages;
78826 +    unsigned long shared_info_frame;       /* MFN of shared_info struct */
78827 +    uint64_t cpu_time;
78828 +    uint32_t nr_online_vcpus;     /* Number of VCPUs currently online. */
78829 +    uint32_t max_vcpu_id;         /* Maximum VCPUID in use by this domain. */
78830 +    uint32_t ssidref;
78831 +    xen_domain_handle_t handle;
78832 +} dom0_getdomaininfo_t;
78833 +DEFINE_GUEST_HANDLE(dom0_getdomaininfo_t);
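
Illustrative decoding of the packed flags word above (a non-normative
sketch; `info` stands for a dom0_getdomaininfo_t filled in by the
hypervisor):

    uint32_t f       = info.flags;
    int      cpu     = (f >> DOMFLAGS_CPUSHIFT) & DOMFLAGS_CPUMASK;
    int      sd_code = (f >> DOMFLAGS_SHUTDOWNSHIFT) & DOMFLAGS_SHUTDOWNMASK;
    if (f & DOMFLAGS_SHUTDOWN)
        ;  /* guest-supplied shutdown reason is in sd_code */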
78834 +
78835 +#define DOM0_SETVCPUCONTEXT   13
78836 +typedef struct dom0_setvcpucontext {
78837 +    /* IN variables. */
78838 +    domid_t               domain;
78839 +    uint32_t              vcpu;
78840 +    /* IN/OUT parameters */
78841 +    GUEST_HANDLE(vcpu_guest_context_t) ctxt;
78842 +} dom0_setvcpucontext_t;
78843 +DEFINE_GUEST_HANDLE(dom0_setvcpucontext_t);
78844 +
78845 +#define DOM0_MSR              15
78846 +typedef struct dom0_msr {
78847 +    /* IN variables. */
78848 +    uint32_t write;
78849 +    cpumap_t cpu_mask;
78850 +    uint32_t msr;
78851 +    uint32_t in1;
78852 +    uint32_t in2;
78853 +    /* OUT variables. */
78854 +    uint32_t out1;
78855 +    uint32_t out2;
78856 +} dom0_msr_t;
78857 +DEFINE_GUEST_HANDLE(dom0_msr_t);
78858 +
78859 +/*
78860 + * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
78861 + * 1 January, 1970 if the current system time was <system_time>.
78862 + */
78863 +#define DOM0_SETTIME          17
78864 +typedef struct dom0_settime {
78865 +    /* IN variables. */
78866 +    uint32_t secs;
78867 +    uint32_t nsecs;
78868 +    uint64_t system_time;
78869 +} dom0_settime_t;
78870 +DEFINE_GUEST_HANDLE(dom0_settime_t);
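
Illustrative sketch of the clock semantics above (non-normative): to make
the wall clock read 12:00:00 UTC on 1 January 2006 as of the current system
time `now_ns` (hypothetical):

    dom0_settime_t st;
    st.secs        = 1136116800;   /* seconds since 00:00:00 UTC, 1 Jan 1970 */
    st.nsecs       = 0;
    st.system_time = now_ns;       /* current system time, in nanoseconds */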
78871 +
78872 +#define DOM0_GETPAGEFRAMEINFO 18
78873 +#define LTAB_SHIFT 28
78874 +#define NOTAB 0         /* normal page */
78875 +#define L1TAB (1<<LTAB_SHIFT)
78876 +#define L2TAB (2<<LTAB_SHIFT)
78877 +#define L3TAB (3<<LTAB_SHIFT)
78878 +#define L4TAB (4<<LTAB_SHIFT)
78879 +#define LPINTAB  (1<<31)
78880 +#define XTAB  (0xf<<LTAB_SHIFT) /* invalid page */
78881 +#define LTAB_MASK XTAB
78882 +#define LTABTYPE_MASK (0x7<<LTAB_SHIFT)
78883 +
78884 +typedef struct dom0_getpageframeinfo {
78885 +    /* IN variables. */
78886 +    unsigned long mfn;     /* Machine page frame number to query.       */
78887 +    domid_t domain;        /* To which domain does the frame belong?    */
78888 +    /* OUT variables. */
78889 +    /* Is the page PINNED to a type? */
78890 +    uint32_t type;         /* see above type defs */
78891 +} dom0_getpageframeinfo_t;
78892 +DEFINE_GUEST_HANDLE(dom0_getpageframeinfo_t);
78893 +
78894 +/*
78895 + * Read console content from Xen buffer ring.
78896 + */
78897 +#define DOM0_READCONSOLE      19
78898 +typedef struct dom0_readconsole {
78899 +    /* IN variables. */
78900 +    uint32_t clear;            /* Non-zero -> clear after reading. */
78901 +    /* IN/OUT variables. */
78902 +    GUEST_HANDLE(char) buffer; /* In: Buffer start; Out: Used buffer start */
78903 +    uint32_t count;            /* In: Buffer size;  Out: Used buffer size  */
78904 +} dom0_readconsole_t;
78905 +DEFINE_GUEST_HANDLE(dom0_readconsole_t);
78906 +
78907 +/*
78908 + * Set which physical cpus a vcpu can execute on.
78909 + */
78910 +#define DOM0_SETVCPUAFFINITY  20
78911 +typedef struct dom0_setvcpuaffinity {
78912 +    /* IN variables. */
78913 +    domid_t   domain;
78914 +    uint32_t  vcpu;
78915 +    cpumap_t  cpumap;
78916 +} dom0_setvcpuaffinity_t;
78917 +DEFINE_GUEST_HANDLE(dom0_setvcpuaffinity_t);
78918 +
78919 +/* Get trace buffers machine base address */
78920 +#define DOM0_TBUFCONTROL       21
78921 +typedef struct dom0_tbufcontrol {
78922 +    /* IN variables */
78923 +#define DOM0_TBUF_GET_INFO     0
78924 +#define DOM0_TBUF_SET_CPU_MASK 1
78925 +#define DOM0_TBUF_SET_EVT_MASK 2
78926 +#define DOM0_TBUF_SET_SIZE     3
78927 +#define DOM0_TBUF_ENABLE       4
78928 +#define DOM0_TBUF_DISABLE      5
78929 +    uint32_t      op;
78930 +    /* IN/OUT variables */
78931 +    cpumap_t      cpu_mask;
78932 +    uint32_t      evt_mask;
78933 +    /* OUT variables */
78934 +    unsigned long buffer_mfn;
78935 +    uint32_t size;
78936 +} dom0_tbufcontrol_t;
78937 +DEFINE_GUEST_HANDLE(dom0_tbufcontrol_t);
78938 +
78939 +/*
78940 + * Get physical information about the host machine
78941 + */
78942 +#define DOM0_PHYSINFO         22
78943 +typedef struct dom0_physinfo {
78944 +    uint32_t threads_per_core;
78945 +    uint32_t cores_per_socket;
78946 +    uint32_t sockets_per_node;
78947 +    uint32_t nr_nodes;
78948 +    uint32_t cpu_khz;
78949 +    unsigned long total_pages;
78950 +    unsigned long free_pages;
78951 +    uint32_t hw_cap[8];
78952 +} dom0_physinfo_t;
78953 +DEFINE_GUEST_HANDLE(dom0_physinfo_t);
78954 +
78955 +/*
78956 + * Get the ID of the current scheduler.
78957 + */
78958 +#define DOM0_SCHED_ID        24
78959 +typedef struct dom0_sched_id {
78960 +    /* OUT variable */
78961 +    uint32_t sched_id;
78962 +} dom0_sched_id_t;
78963 +DEFINE_GUEST_HANDLE(dom0_sched_id_t);
78964 +
78965 +/*
78966 + * Control shadow pagetables operation
78967 + */
78968 +#define DOM0_SHADOW_CONTROL  25
78969 +
78970 +#define DOM0_SHADOW_CONTROL_OP_OFF         0
78971 +#define DOM0_SHADOW_CONTROL_OP_ENABLE_TEST 1
78972 +#define DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY 2
78973 +#define DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE 3
78974 +
78975 +#define DOM0_SHADOW_CONTROL_OP_FLUSH       10     /* table ops */
78976 +#define DOM0_SHADOW_CONTROL_OP_CLEAN       11
78977 +#define DOM0_SHADOW_CONTROL_OP_PEEK        12
78978 +
78979 +typedef struct dom0_shadow_control_stats {
78980 +    uint32_t fault_count;
78981 +    uint32_t dirty_count;
78982 +    uint32_t dirty_net_count;
78983 +    uint32_t dirty_block_count;
78984 +} dom0_shadow_control_stats_t;
78985 +DEFINE_GUEST_HANDLE(dom0_shadow_control_stats_t);
78986 +
78987 +typedef struct dom0_shadow_control {
78988 +    /* IN variables. */
78989 +    domid_t        domain;
78990 +    uint32_t       op;
78991 +    GUEST_HANDLE(ulong) dirty_bitmap;
78992 +    /* IN/OUT variables. */
78993 +    unsigned long  pages;        /* size of buffer, updated with actual size */
78994 +    /* OUT variables. */
78995 +    dom0_shadow_control_stats_t stats;
78996 +} dom0_shadow_control_t;
78997 +DEFINE_GUEST_HANDLE(dom0_shadow_control_t);
78998 +
78999 +#define DOM0_SETDOMAINMAXMEM   28
79000 +typedef struct dom0_setdomainmaxmem {
79001 +    /* IN variables. */
79002 +    domid_t       domain;
79003 +    unsigned long max_memkb;
79004 +} dom0_setdomainmaxmem_t;
79005 +DEFINE_GUEST_HANDLE(dom0_setdomainmaxmem_t);
79006 +
79007 +#define DOM0_GETPAGEFRAMEINFO2 29   /* batched interface */
79008 +typedef struct dom0_getpageframeinfo2 {
79009 +    /* IN variables. */
79010 +    domid_t        domain;
79011 +    unsigned long  num;
79012 +    /* IN/OUT variables. */
79013 +    GUEST_HANDLE(ulong) array;
79014 +} dom0_getpageframeinfo2_t;
79015 +DEFINE_GUEST_HANDLE(dom0_getpageframeinfo2_t);
79016 +
79017 +/*
79018 + * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type.
79019 + * On x86, @type is an architecture-defined MTRR memory type.
79020 + * On success, returns the MTRR that was used (@reg) and a handle that can
79021 + * be passed to DOM0_DEL_MEMTYPE to accurately tear down the new setting.
79022 + * (x86-specific).
79023 + */
79024 +#define DOM0_ADD_MEMTYPE         31
79025 +typedef struct dom0_add_memtype {
79026 +    /* IN variables. */
79027 +    unsigned long mfn;
79028 +    unsigned long nr_mfns;
79029 +    uint32_t      type;
79030 +    /* OUT variables. */
79031 +    uint32_t      handle;
79032 +    uint32_t      reg;
79033 +} dom0_add_memtype_t;
79034 +DEFINE_GUEST_HANDLE(dom0_add_memtype_t);
79035 +
79036 +/*
79037 + * Tear down an existing memory-range type. If @handle is remembered then it
79038 + * should be passed in to accurately tear down the correct setting (in case
79039 + * of overlapping memory regions with differing types). If it is not known
79040 + * then @handle should be set to zero. In all cases @reg must be set.
79041 + * (x86-specific).
79042 + */
79043 +#define DOM0_DEL_MEMTYPE         32
79044 +typedef struct dom0_del_memtype {
79045 +    /* IN variables. */
79046 +    uint32_t handle;
79047 +    uint32_t reg;
79048 +} dom0_del_memtype_t;
79049 +DEFINE_GUEST_HANDLE(dom0_del_memtype_t);
79050 +
79051 +/* Read current type of an MTRR (x86-specific). */
79052 +#define DOM0_READ_MEMTYPE        33
79053 +typedef struct dom0_read_memtype {
79054 +    /* IN variables. */
79055 +    uint32_t reg;
79056 +    /* OUT variables. */
79057 +    unsigned long mfn;
79058 +    unsigned long nr_mfns;
79059 +    uint32_t type;
79060 +} dom0_read_memtype_t;
79061 +DEFINE_GUEST_HANDLE(dom0_read_memtype_t);
79062 +
79063 +/* Interface for controlling Xen software performance counters. */
79064 +#define DOM0_PERFCCONTROL        34
79065 +/* Sub-operations: */
79066 +#define DOM0_PERFCCONTROL_OP_RESET 1   /* Reset all counters to zero. */
79067 +#define DOM0_PERFCCONTROL_OP_QUERY 2   /* Get perfctr information. */
79068 +typedef struct dom0_perfc_desc {
79069 +    char         name[80];             /* name of perf counter */
79070 +    uint32_t     nr_vals;              /* number of values for this counter */
79071 +    uint32_t     vals[64];             /* array of values */
79072 +} dom0_perfc_desc_t;
79073 +DEFINE_GUEST_HANDLE(dom0_perfc_desc_t);
79074 +typedef struct dom0_perfccontrol {
79075 +    /* IN variables. */
79076 +    uint32_t       op;                /*  DOM0_PERFCCONTROL_OP_??? */
79077 +    /* OUT variables. */
79078 +    uint32_t       nr_counters;       /*  number of counters */
79079 +    GUEST_HANDLE(dom0_perfc_desc_t) desc; /*  counter information (or NULL) */
79080 +} dom0_perfccontrol_t;
79081 +DEFINE_GUEST_HANDLE(dom0_perfccontrol_t);
79082 +
79083 +#define DOM0_MICROCODE           35
79084 +typedef struct dom0_microcode {
79085 +    /* IN variables. */
79086 +    GUEST_HANDLE(void) data;          /* Pointer to microcode data */
79087 +    uint32_t length;                  /* Length of microcode data. */
79088 +} dom0_microcode_t;
79089 +DEFINE_GUEST_HANDLE(dom0_microcode_t);
79090 +
79091 +#define DOM0_IOPORT_PERMISSION   36
79092 +typedef struct dom0_ioport_permission {
79093 +    domid_t  domain;                  /* domain to be affected */
79094 +    uint32_t first_port;              /* first port in range */
79095 +    uint32_t nr_ports;                /* size of port range */
79096 +    uint8_t  allow_access;            /* allow or deny access to range? */
79097 +} dom0_ioport_permission_t;
79098 +DEFINE_GUEST_HANDLE(dom0_ioport_permission_t);
79099 +
79100 +#define DOM0_GETVCPUCONTEXT      37
79101 +typedef struct dom0_getvcpucontext {
79102 +    /* IN variables. */
79103 +    domid_t  domain;                  /* domain to be affected */
79104 +    uint32_t vcpu;                    /* vcpu # */
79105 +    /* OUT variables. */
79106 +    GUEST_HANDLE(vcpu_guest_context_t) ctxt;
79107 +} dom0_getvcpucontext_t;
79108 +DEFINE_GUEST_HANDLE(dom0_getvcpucontext_t);
79109 +
79110 +#define DOM0_GETVCPUINFO         43
79111 +typedef struct dom0_getvcpuinfo {
79112 +    /* IN variables. */
79113 +    domid_t  domain;                  /* domain to be affected */
79114 +    uint32_t vcpu;                    /* vcpu # */
79115 +    /* OUT variables. */
79116 +    uint8_t  online;                  /* currently online (not hotplugged)? */
79117 +    uint8_t  blocked;                 /* blocked waiting for an event? */
79118 +    uint8_t  running;                 /* currently scheduled on its CPU? */
79119 +    uint64_t cpu_time;                /* total cpu time consumed (ns) */
79120 +    uint32_t cpu;                     /* current mapping   */
79121 +    cpumap_t cpumap;                  /* allowable mapping */
79122 +} dom0_getvcpuinfo_t;
79123 +DEFINE_GUEST_HANDLE(dom0_getvcpuinfo_t);
79124 +
79125 +#define DOM0_GETDOMAININFOLIST   38
79126 +typedef struct dom0_getdomaininfolist {
79127 +    /* IN variables. */
79128 +    domid_t               first_domain;
79129 +    uint32_t              max_domains;
79130 +    GUEST_HANDLE(dom0_getdomaininfo_t) buffer;
79131 +    /* OUT variables. */
79132 +    uint32_t              num_domains;
79133 +} dom0_getdomaininfolist_t;
79134 +DEFINE_GUEST_HANDLE(dom0_getdomaininfolist_t);
79135 +
79136 +#define DOM0_PLATFORM_QUIRK      39
79137 +#define QUIRK_NOIRQBALANCING  1
79138 +typedef struct dom0_platform_quirk {
79139 +    /* IN variables. */
79140 +    uint32_t quirk_id;
79141 +} dom0_platform_quirk_t;
79142 +DEFINE_GUEST_HANDLE(dom0_platform_quirk_t);
79143 +
79144 +#define DOM0_PHYSICAL_MEMORY_MAP 40
79145 +typedef struct dom0_memory_map_entry {
79146 +    uint64_t start, end;
79147 +    uint32_t flags; /* reserved */
79148 +    uint8_t  is_ram;
79149 +} dom0_memory_map_entry_t;
79150 +DEFINE_GUEST_HANDLE(dom0_memory_map_entry_t);
79151 +typedef struct dom0_physical_memory_map {
79152 +    /* IN variables. */
79153 +    uint32_t max_map_entries;
79154 +    /* OUT variables. */
79155 +    uint32_t nr_map_entries;
79156 +    GUEST_HANDLE(dom0_memory_map_entry_t) memory_map;
79157 +} dom0_physical_memory_map_t;
79158 +DEFINE_GUEST_HANDLE(dom0_physical_memory_map_t);
79159 +
79160 +#define DOM0_MAX_VCPUS 41
79161 +typedef struct dom0_max_vcpus {
79162 +    domid_t  domain;        /* domain to be affected */
79163 +    uint32_t max;           /* maximum number of vcpus */
79164 +} dom0_max_vcpus_t;
79165 +DEFINE_GUEST_HANDLE(dom0_max_vcpus_t);
79166 +
79167 +#define DOM0_SETDOMAINHANDLE 44
79168 +typedef struct dom0_setdomainhandle {
79169 +    domid_t domain;
79170 +    xen_domain_handle_t handle;
79171 +} dom0_setdomainhandle_t;
79172 +DEFINE_GUEST_HANDLE(dom0_setdomainhandle_t);
79173 +
79174 +#define DOM0_SETDEBUGGING 45
79175 +typedef struct dom0_setdebugging {
79176 +    domid_t domain;
79177 +    uint8_t enable;
79178 +} dom0_setdebugging_t;
79179 +DEFINE_GUEST_HANDLE(dom0_setdebugging_t);
79180 +
79181 +#define DOM0_IRQ_PERMISSION 46
79182 +typedef struct dom0_irq_permission {
79183 +    domid_t domain;          /* domain to be affected */
79184 +    uint8_t pirq;
79185 +    uint8_t allow_access;    /* flag to specify enable/disable of IRQ access */
79186 +} dom0_irq_permission_t;
79187 +DEFINE_GUEST_HANDLE(dom0_irq_permission_t);
79188 +
79189 +#define DOM0_IOMEM_PERMISSION 47
79190 +typedef struct dom0_iomem_permission {
79191 +    domid_t  domain;          /* domain to be affected */
79192 +    unsigned long first_mfn;  /* first page (physical page number) in range */
79193 +    unsigned long nr_mfns;    /* number of pages in range (>0) */
79194 +    uint8_t allow_access;     /* allow (!0) or deny (0) access to range? */
79195 +} dom0_iomem_permission_t;
79196 +DEFINE_GUEST_HANDLE(dom0_iomem_permission_t);
79197 +
79198 +#define DOM0_HYPERCALL_INIT   48
79199 +typedef struct dom0_hypercall_init {
79200 +    domid_t  domain;          /* domain to be affected */
79201 +    unsigned long mfn;        /* machine frame to be initialised */
79202 +} dom0_hypercall_init_t;
79203 +DEFINE_GUEST_HANDLE(dom0_hypercall_init_t);
79204 +
79205 +typedef struct dom0_op {
79206 +    uint32_t cmd;
79207 +    uint32_t interface_version; /* DOM0_INTERFACE_VERSION */
79208 +    union {
79209 +        struct dom0_createdomain      createdomain;
79210 +        struct dom0_pausedomain       pausedomain;
79211 +        struct dom0_unpausedomain     unpausedomain;
79212 +        struct dom0_destroydomain     destroydomain;
79213 +        struct dom0_getmemlist        getmemlist;
79214 +        struct sched_ctl_cmd          schedctl;
79215 +        struct sched_adjdom_cmd       adjustdom;
79216 +        struct dom0_setvcpucontext    setvcpucontext;
79217 +        struct dom0_getdomaininfo     getdomaininfo;
79218 +        struct dom0_getpageframeinfo  getpageframeinfo;
79219 +        struct dom0_msr               msr;
79220 +        struct dom0_settime           settime;
79221 +        struct dom0_readconsole       readconsole;
79222 +        struct dom0_setvcpuaffinity   setvcpuaffinity;
79223 +        struct dom0_tbufcontrol       tbufcontrol;
79224 +        struct dom0_physinfo          physinfo;
79225 +        struct dom0_sched_id          sched_id;
79226 +        struct dom0_shadow_control    shadow_control;
79227 +        struct dom0_setdomainmaxmem   setdomainmaxmem;
79228 +        struct dom0_getpageframeinfo2 getpageframeinfo2;
79229 +        struct dom0_add_memtype       add_memtype;
79230 +        struct dom0_del_memtype       del_memtype;
79231 +        struct dom0_read_memtype      read_memtype;
79232 +        struct dom0_perfccontrol      perfccontrol;
79233 +        struct dom0_microcode         microcode;
79234 +        struct dom0_ioport_permission ioport_permission;
79235 +        struct dom0_getvcpucontext    getvcpucontext;
79236 +        struct dom0_getvcpuinfo       getvcpuinfo;
79237 +        struct dom0_getdomaininfolist getdomaininfolist;
79238 +        struct dom0_platform_quirk    platform_quirk;
79239 +        struct dom0_physical_memory_map physical_memory_map;
79240 +        struct dom0_max_vcpus         max_vcpus;
79241 +        struct dom0_setdomainhandle   setdomainhandle;
79242 +        struct dom0_setdebugging      setdebugging;
79243 +        struct dom0_irq_permission    irq_permission;
79244 +        struct dom0_iomem_permission  iomem_permission;
79245 +        struct dom0_hypercall_init    hypercall_init;
79246 +        uint8_t                       pad[128];
79247 +    } u;
79248 +} dom0_op_t;
79249 +DEFINE_GUEST_HANDLE(dom0_op_t);
79250 +
79251 +#endif /* __XEN_PUBLIC_DOM0_OPS_H__ */
79252 +
79253 +/*
79254 + * Local variables:
79255 + * mode: C
79256 + * c-set-style: "BSD"
79257 + * c-basic-offset: 4
79258 + * tab-width: 4
79259 + * indent-tabs-mode: nil
79260 + * End:
79261 + */
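
As a usage sketch (not defined by this header): a privileged guest fills in a
dom0_op_t and passes it to the dom0 hypercall. HYPERVISOR_dom0_op() and
set_xen_guest_handle() are assumed helpers from elsewhere in the tree; only
the structure layout above is authoritative.

/* Hedged sketch: enumerate up to 32 domains via DOM0_GETDOMAININFOLIST. */
static int list_domains(void)
{
    dom0_getdomaininfo_t info[32];
    dom0_op_t op;
    int ret;

    op.cmd = DOM0_GETDOMAININFOLIST;
    op.interface_version = DOM0_INTERFACE_VERSION;
    op.u.getdomaininfolist.first_domain = 0;
    op.u.getdomaininfolist.max_domains  = 32;
    set_xen_guest_handle(op.u.getdomaininfolist.buffer, info);

    ret = HYPERVISOR_dom0_op(&op);        /* assumed hypercall wrapper */
    return ret ? ret : (int)op.u.getdomaininfolist.num_domains;
}
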
79262 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/event_channel.h linux-2.6.16/include/xen/interface/event_channel.h
79263 --- linux-2.6.16.orig/include/xen/interface/event_channel.h     1970-01-01 01:00:00.000000000 +0100
79264 +++ linux-2.6.16/include/xen/interface/event_channel.h  2006-06-26 09:51:32.000000000 +0200
79265 @@ -0,0 +1,205 @@
79266 +/******************************************************************************
79267 + * event_channel.h
79268 + * 
79269 + * Event channels between domains.
79270 + * 
79271 + * Copyright (c) 2003-2004, K A Fraser.
79272 + */
79273 +
79274 +#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
79275 +#define __XEN_PUBLIC_EVENT_CHANNEL_H__
79276 +
79277 +typedef uint32_t evtchn_port_t;
79278 +DEFINE_GUEST_HANDLE(evtchn_port_t);
79279 +
79280 +/*
79281 + * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
79282 + * accepting interdomain bindings from domain <remote_dom>. A fresh port
79283 + * is allocated in <dom> and returned as <port>.
79284 + * NOTES:
79285 + *  1. If the caller is unprivileged then <dom> must be DOMID_SELF.
79286 + *  2. <remote_dom> may be DOMID_SELF, allowing loopback connections.
79287 + */
79288 +#define EVTCHNOP_alloc_unbound    6
79289 +typedef struct evtchn_alloc_unbound {
79290 +    /* IN parameters */
79291 +    domid_t dom, remote_dom;
79292 +    /* OUT parameters */
79293 +    evtchn_port_t port;
79294 +} evtchn_alloc_unbound_t;
79295 +
79296 +/*
79297 + * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
79298 + * the calling domain and <remote_dom>. <remote_dom,remote_port> must identify
79299 + * a port that is unbound and marked as accepting bindings from the calling
79300 + * domain. A fresh port is allocated in the calling domain and returned as
79301 + * <local_port>.
79302 + * NOTES:
79303 + *  1. <remote_dom> may be DOMID_SELF, allowing loopback connections.
79304 + */
79305 +#define EVTCHNOP_bind_interdomain 0
79306 +typedef struct evtchn_bind_interdomain {
79307 +    /* IN parameters. */
79308 +    domid_t remote_dom;
79309 +    evtchn_port_t remote_port;
79310 +    /* OUT parameters. */
79311 +    evtchn_port_t local_port;
79312 +} evtchn_bind_interdomain_t;
79313 +
79314 +/*
79315 + * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified
79316 + * vcpu.
79317 + * NOTES:
79318 + *  1. A virtual IRQ may be bound to at most one event channel per vcpu.
79319 + *  2. The allocated event channel is bound to the specified vcpu. The binding
79320 + *     may not be changed.
79321 + */
79322 +#define EVTCHNOP_bind_virq        1
79323 +typedef struct evtchn_bind_virq {
79324 +    /* IN parameters. */
79325 +    uint32_t virq;
79326 +    uint32_t vcpu;
79327 +    /* OUT parameters. */
79328 +    evtchn_port_t port;
79329 +} evtchn_bind_virq_t;
79330 +
79331 +/*
79332 + * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.
79333 + * NOTES:
79334 + *  1. A physical IRQ may be bound to at most one event channel per domain.
79335 + *  2. Only a sufficiently-privileged domain may bind to a physical IRQ.
79336 + */
79337 +#define EVTCHNOP_bind_pirq        2
79338 +typedef struct evtchn_bind_pirq {
79339 +    /* IN parameters. */
79340 +    uint32_t pirq;
79341 +#define BIND_PIRQ__WILL_SHARE 1
79342 +    uint32_t flags; /* BIND_PIRQ__* */
79343 +    /* OUT parameters. */
79344 +    evtchn_port_t port;
79345 +} evtchn_bind_pirq_t;
79346 +
79347 +/*
79348 + * EVTCHNOP_bind_ipi: Bind a local event channel to receive IPI events on <vcpu>.
79349 + * NOTES:
79350 + *  1. The allocated event channel is bound to the specified vcpu. The binding
79351 + *     may not be changed.
79352 + */
79353 +#define EVTCHNOP_bind_ipi         7
79354 +typedef struct evtchn_bind_ipi {
79355 +    uint32_t vcpu;
79356 +    /* OUT parameters. */
79357 +    evtchn_port_t port;
79358 +} evtchn_bind_ipi_t;
79359 +
79360 +/*
79361 + * EVTCHNOP_close: Close a local event channel <port>. If the channel is
79362 + * interdomain then the remote end is placed in the unbound state
79363 + * (EVTCHNSTAT_unbound), awaiting a new connection.
79364 + */
79365 +#define EVTCHNOP_close            3
79366 +typedef struct evtchn_close {
79367 +    /* IN parameters. */
79368 +    evtchn_port_t port;
79369 +} evtchn_close_t;
79370 +
79371 +/*
79372 + * EVTCHNOP_send: Send an event to the remote end of the channel whose local
79373 + * endpoint is <port>.
79374 + */
79375 +#define EVTCHNOP_send             4
79376 +typedef struct evtchn_send {
79377 +    /* IN parameters. */
79378 +    evtchn_port_t port;
79379 +} evtchn_send_t;
79380 +
79381 +/*
79382 + * EVTCHNOP_status: Get the current status of the communication channel which
79383 + * has an endpoint at <dom, port>.
79384 + * NOTES:
79385 + *  1. <dom> may be specified as DOMID_SELF.
79386 + *  2. Only a sufficiently-privileged domain may obtain the status of an event
79387 + *     channel for which <dom> is not DOMID_SELF.
79388 + */
79389 +#define EVTCHNOP_status           5
79390 +typedef struct evtchn_status {
79391 +    /* IN parameters */
79392 +    domid_t  dom;
79393 +    evtchn_port_t port;
79394 +    /* OUT parameters */
79395 +#define EVTCHNSTAT_closed       0  /* Channel is not in use.                 */
79396 +#define EVTCHNSTAT_unbound      1  /* Channel is awaiting interdom connection.*/
79397 +#define EVTCHNSTAT_interdomain  2  /* Channel is connected to remote domain. */
79398 +#define EVTCHNSTAT_pirq         3  /* Channel is bound to a phys IRQ line.   */
79399 +#define EVTCHNSTAT_virq         4  /* Channel is bound to a virtual IRQ line */
79400 +#define EVTCHNSTAT_ipi          5  /* Channel is bound to a virtual IPI line */
79401 +    uint32_t status;
79402 +    uint32_t vcpu;                 /* VCPU to which this channel is bound.   */
79403 +    union {
79404 +        struct {
79405 +            domid_t dom;
79406 +        } unbound; /* EVTCHNSTAT_unbound */
79407 +        struct {
79408 +            domid_t dom;
79409 +            evtchn_port_t port;
79410 +        } interdomain; /* EVTCHNSTAT_interdomain */
79411 +        uint32_t pirq;      /* EVTCHNSTAT_pirq        */
79412 +        uint32_t virq;      /* EVTCHNSTAT_virq        */
79413 +    } u;
79414 +} evtchn_status_t;
79415 +
79416 +/*
79417 + * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
79418 + * event is pending.
79419 + * NOTES:
79420 + *  1. IPI- and VIRQ-bound channels always notify the vcpu that initialised
79421 + *     the binding. This binding cannot be changed.
79422 + *  2. All other channels notify vcpu0 by default. This default is set when
79423 + *     the channel is allocated (a port that is freed and subsequently reused
79424 + *     has its binding reset to vcpu0).
79425 + */
79426 +#define EVTCHNOP_bind_vcpu        8
79427 +typedef struct evtchn_bind_vcpu {
79428 +    /* IN parameters. */
79429 +    evtchn_port_t port;
79430 +    uint32_t vcpu;
79431 +} evtchn_bind_vcpu_t;
79432 +
79433 +/*
79434 + * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
79435 + * a notification to the appropriate VCPU if an event is pending.
79436 + */
79437 +#define EVTCHNOP_unmask           9
79438 +typedef struct evtchn_unmask {
79439 +    /* IN parameters. */
79440 +    evtchn_port_t port;
79441 +} evtchn_unmask_t;
79442 +
79443 +typedef struct evtchn_op {
79444 +    uint32_t cmd; /* EVTCHNOP_* */
79445 +    union {
79446 +        evtchn_alloc_unbound_t    alloc_unbound;
79447 +        evtchn_bind_interdomain_t bind_interdomain;
79448 +        evtchn_bind_virq_t        bind_virq;
79449 +        evtchn_bind_pirq_t        bind_pirq;
79450 +        evtchn_bind_ipi_t         bind_ipi;
79451 +        evtchn_close_t            close;
79452 +        evtchn_send_t             send;
79453 +        evtchn_status_t           status;
79454 +        evtchn_bind_vcpu_t        bind_vcpu;
79455 +        evtchn_unmask_t           unmask;
79456 +    } u;
79457 +} evtchn_op_t;
79458 +DEFINE_GUEST_HANDLE(evtchn_op_t);
79459 +
79460 +#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
79461 +
79462 +/*
79463 + * Local variables:
79464 + * mode: C
79465 + * c-set-style: "BSD"
79466 + * c-basic-offset: 4
79467 + * tab-width: 4
79468 + * indent-tabs-mode: nil
79469 + * End:
79470 + */
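
A minimal sketch of the alloc_unbound flow described above, assuming the
conventional single-argument HYPERVISOR_event_channel_op() wrapper; DOMID_SELF
comes from the main xen.h interface header.

static int make_unbound_port(domid_t remote_dom, evtchn_port_t *port)
{
    evtchn_op_t op;
    int ret;

    op.cmd = EVTCHNOP_alloc_unbound;
    op.u.alloc_unbound.dom        = DOMID_SELF;  /* caller may be unprivileged */
    op.u.alloc_unbound.remote_dom = remote_dom;

    ret = HYPERVISOR_event_channel_op(&op);      /* assumed hypercall wrapper */
    if (ret == 0)
        *port = op.u.alloc_unbound.port;         /* OUT parameter */
    return ret;
}
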
79471 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/features.h linux-2.6.16/include/xen/interface/features.h
79472 --- linux-2.6.16.orig/include/xen/interface/features.h  1970-01-01 01:00:00.000000000 +0100
79473 +++ linux-2.6.16/include/xen/interface/features.h       2006-06-26 09:51:32.000000000 +0200
79474 @@ -0,0 +1,53 @@
79475 +/******************************************************************************
79476 + * features.h
79477 + * 
79478 + * Feature flags, reported by XENVER_get_features.
79479 + * 
79480 + * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
79481 + */
79482 +
79483 +#ifndef __XEN_PUBLIC_FEATURES_H__
79484 +#define __XEN_PUBLIC_FEATURES_H__
79485 +
79486 +/*
79487 + * If set, the guest does not need to write-protect its pagetables, and can
79488 + * update them via direct writes.
79489 + */
79490 +#define XENFEAT_writable_page_tables       0
79491 +
79492 +/*
79493 + * If set, the guest does not need to write-protect its segment descriptor
79494 + * tables, and can update them via direct writes.
79495 + */
79496 +#define XENFEAT_writable_descriptor_tables 1
79497 +
79498 +/*
79499 + * If set, translation between the guest's 'pseudo-physical' address space
79500 + * and the host's machine address space is handled by the hypervisor. In this
79501 + * mode the guest does not need to perform phys-to/from-machine translations
79502 + * when performing page table operations.
79503 + */
79504 +#define XENFEAT_auto_translated_physmap    2
79505 +
79506 +/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
79507 +#define XENFEAT_supervisor_mode_kernel     3
79508 +
79509 +/*
79510 + * If set, the guest does not need to allocate x86 PAE page directories
79511 + * below 4GB. This flag is usually implied by auto_translated_physmap.
79512 + */
79513 +#define XENFEAT_pae_pgdir_above_4gb        4
79514 +
79515 +#define XENFEAT_NR_SUBMAPS 1
79516 +
79517 +#endif /* __XEN_PUBLIC_FEATURES_H__ */
79518 +
79519 +/*
79520 + * Local variables:
79521 + * mode: C
79522 + * c-set-style: "BSD"
79523 + * c-basic-offset: 4
79524 + * tab-width: 4
79525 + * indent-tabs-mode: nil
79526 + * End:
79527 + */
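
Since each feature flag above is a bit position (32 per submap word returned
by XENVER_get_features), a guest can test for a feature with plain bit
arithmetic; a minimal sketch:

static inline int xen_feature_present(uint32_t submap, unsigned int feat)
{
    return (submap >> (feat % 32)) & 1;
}

/* e.g. xen_feature_present(submap0, XENFEAT_writable_page_tables) */
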
79528 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/grant_table.h linux-2.6.16/include/xen/interface/grant_table.h
79529 --- linux-2.6.16.orig/include/xen/interface/grant_table.h       1970-01-01 01:00:00.000000000 +0100
79530 +++ linux-2.6.16/include/xen/interface/grant_table.h    2006-06-26 09:51:32.000000000 +0200
79531 @@ -0,0 +1,311 @@
79532 +/******************************************************************************
79533 + * grant_table.h
79534 + * 
79535 + * Interface for granting foreign access to page frames, and receiving
79536 + * page-ownership transfers.
79537 + * 
79538 + * Copyright (c) 2004, K A Fraser
79539 + */
79540 +
79541 +#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
79542 +#define __XEN_PUBLIC_GRANT_TABLE_H__
79543 +
79544 +
79545 +/***********************************
79546 + * GRANT TABLE REPRESENTATION
79547 + */
79548 +
79549 +/* Some rough guidelines on accessing and updating grant-table entries
79550 + * in a concurrency-safe manner. For more information, see the reference
79551 + * implementation for guest OSes in Linux (arch/xen/kernel/grant_table.c).
79552 + * 
79553 + * NB. WMB is a no-op on current-generation x86 processors. However, a
79554 + *     compiler barrier will still be required.
79555 + * 
79556 + * Introducing a valid entry into the grant table:
79557 + *  1. Write ent->domid.
79558 + *  2. Write ent->frame:
79559 + *      GTF_permit_access:   Frame to which access is permitted.
79560 + *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
79561 + *                           frame, or zero if none.
79562 + *  3. Write memory barrier (WMB).
79563 + *  4. Write ent->flags, inc. valid type.
79564 + * 
79565 + * Invalidating an unused GTF_permit_access entry:
79566 + *  1. flags = ent->flags.
79567 + *  2. Observe that !(flags & (GTF_reading|GTF_writing)).
79568 + *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
79569 + *  NB. No need for WMB as reuse of entry is control-dependent on success of
79570 + *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
79571 + *
79572 + * Invalidating an in-use GTF_permit_access entry:
79573 + *  This cannot be done directly. Request assistance from the domain controller
79574 + *  which can set a timeout on the use of a grant entry and take necessary
79575 + *  action. (NB. This is not yet implemented!).
79576 + * 
79577 + * Invalidating an unused GTF_accept_transfer entry:
79578 + *  1. flags = ent->flags.
79579 + *  2. Observe that !(flags & GTF_transfer_committed). [*]
79580 + *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
79581 + *  NB. No need for WMB as reuse of entry is control-dependent on success of
79582 + *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
79583 + *  [*] If GTF_transfer_committed is set then the grant entry is 'committed'.
79584 + *      The guest must /not/ modify the grant entry until the address of the
79585 + *      transferred frame is written. It is safe for the guest to spin waiting
79586 + *      for this to occur (detect by observing GTF_transfer_completed in
79587 + *      ent->flags).
79588 + *
79589 + * Invalidating a committed GTF_accept_transfer entry:
79590 + *  1. Wait for (ent->flags & GTF_transfer_completed).
79591 + *
79592 + * Changing a GTF_permit_access from writable to read-only:
79593 + *  Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing.
79594 + * 
79595 + * Changing a GTF_permit_access from read-only to writable:
79596 + *  Use SMP-safe bit-setting instruction.
79597 + */
79598 +
79599 +/*
79600 + * A grant table comprises a packed array of grant entries in one or more
79601 + * page frames shared between Xen and a guest.
79602 + * [XEN]: This field is written by Xen and read by the sharing guest.
79603 + * [GST]: This field is written by the guest and read by Xen.
79604 + */
79605 +typedef struct grant_entry {
79606 +    /* GTF_xxx: various type and flag information.  [XEN,GST] */
79607 +    uint16_t flags;
79608 +    /* The domain being granted foreign privileges. [GST] */
79609 +    domid_t  domid;
79610 +    /*
79611 +     * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
79612 +     * GTF_accept_transfer: Frame whose ownership is transferred by @domid. [XEN]
79613 +     */
79614 +    uint32_t frame;
79615 +} grant_entry_t;
79616 +
79617 +/*
79618 + * Type of grant entry.
79619 + *  GTF_invalid: This grant entry grants no privileges.
79620 + *  GTF_permit_access: Allow @domid to map/access @frame.
79621 + *  GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
79622 + *                       to this guest. Xen writes the page number to @frame.
79623 + */
79624 +#define GTF_invalid         (0U<<0)
79625 +#define GTF_permit_access   (1U<<0)
79626 +#define GTF_accept_transfer (2U<<0)
79627 +#define GTF_type_mask       (3U<<0)
79628 +
79629 +/*
79630 + * Subflags for GTF_permit_access.
79631 + *  GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
79632 + *  GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
79633 + *  GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
79634 + */
79635 +#define _GTF_readonly       (2)
79636 +#define GTF_readonly        (1U<<_GTF_readonly)
79637 +#define _GTF_reading        (3)
79638 +#define GTF_reading         (1U<<_GTF_reading)
79639 +#define _GTF_writing        (4)
79640 +#define GTF_writing         (1U<<_GTF_writing)
79641 +
79642 +/*
79643 + * Subflags for GTF_accept_transfer:
79644 + *  GTF_transfer_committed: Xen sets this flag to indicate that it is committed
79645 + *      to transferring ownership of a page frame. When a guest sees this flag
79646 + *      it must /not/ modify the grant entry until GTF_transfer_completed is
79647 + *      set by Xen.
79648 + *  GTF_transfer_completed: It is safe for the guest to spin-wait on this flag
79649 + *      after reading GTF_transfer_committed. Xen will always write the frame
79650 + *      address, followed by ORing this flag, in a timely manner.
79651 + */
79652 +#define _GTF_transfer_committed (2)
79653 +#define GTF_transfer_committed  (1U<<_GTF_transfer_committed)
79654 +#define _GTF_transfer_completed (3)
79655 +#define GTF_transfer_completed  (1U<<_GTF_transfer_completed)
79656 +
79657 +
79658 +/***********************************
79659 + * GRANT TABLE QUERIES AND USES
79660 + */
79661 +
79662 +/*
79663 + * Reference to a grant entry in a specified domain's grant table.
79664 + */
79665 +typedef uint32_t grant_ref_t;
79666 +
79667 +/*
79668 + * Handle to track a mapping created via a grant reference.
79669 + */
79670 +typedef uint32_t grant_handle_t;
79671 +
79672 +/*
79673 + * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
79674 + * by devices and/or host CPUs. If successful, <handle> is a tracking number
79675 + * that must be presented later to destroy the mapping(s). On error, <handle>
79676 + * is a negative status code.
79677 + * NOTES:
79678 + *  1. If GNTPIN_map_for_dev is specified then <dev_bus_addr> is the address
79679 + *     via which I/O devices may access the granted frame.
79680 + *  2. If GNTPIN_map_for_host is specified then a mapping will be added at
79681 + *     either a host virtual address in the current address space, or at
79682 + *     a PTE at the specified machine address.  The type of mapping to
79683 + *     perform is selected through the GNTMAP_contains_pte flag, and the 
79684 + *     address is specified in <host_addr>.
79685 + *  3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a
79686 + *     host mapping is destroyed by other means then it is *NOT* guaranteed
79687 + *     to be accounted to the correct grant reference!
79688 + */
79689 +#define GNTTABOP_map_grant_ref        0
79690 +typedef struct gnttab_map_grant_ref {
79691 +    /* IN parameters. */
79692 +    uint64_t host_addr;
79693 +    uint32_t flags;               /* GNTMAP_* */
79694 +    grant_ref_t ref;
79695 +    domid_t  dom;
79696 +    /* OUT parameters. */
79697 +    int16_t  status;              /* GNTST_* */
79698 +    grant_handle_t handle;
79699 +    uint64_t dev_bus_addr;
79700 +} gnttab_map_grant_ref_t;
79701 +DEFINE_GUEST_HANDLE(gnttab_map_grant_ref_t);
79702 +
79703 +/*
79704 + * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
79705 + * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that
79706 + * field is ignored. If non-zero, they must refer to a device/host mapping
79707 + * that is tracked by <handle>.
79708 + * NOTES:
79709 + *  1. The call may fail in an undefined manner if either mapping is not
79710 + *     tracked by <handle>.
79711 + *  2. After executing a batch of unmaps, it is guaranteed that no stale
79712 + *     mappings will remain in the device or host TLBs.
79713 + */
79714 +#define GNTTABOP_unmap_grant_ref      1
79715 +typedef struct gnttab_unmap_grant_ref {
79716 +    /* IN parameters. */
79717 +    uint64_t host_addr;
79718 +    uint64_t dev_bus_addr;
79719 +    grant_handle_t handle;
79720 +    /* OUT parameters. */
79721 +    int16_t  status;              /* GNTST_* */
79722 +} gnttab_unmap_grant_ref_t;
79723 +DEFINE_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
79724 +
79725 +/*
79726 + * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
79727 + * <nr_frames> pages. The frame addresses are written to the <frame_list>.
79728 + * Only <nr_frames> addresses are written, even if the table is larger.
79729 + * NOTES:
79730 + *  1. <dom> may be specified as DOMID_SELF.
79731 + *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
79732 + *  3. Xen may not support more than a single grant-table page per domain.
79733 + */
79734 +#define GNTTABOP_setup_table          2
79735 +typedef struct gnttab_setup_table {
79736 +    /* IN parameters. */
79737 +    domid_t  dom;
79738 +    uint32_t nr_frames;
79739 +    /* OUT parameters. */
79740 +    int16_t  status;              /* GNTST_* */
79741 +    GUEST_HANDLE(ulong) frame_list;
79742 +} gnttab_setup_table_t;
79743 +DEFINE_GUEST_HANDLE(gnttab_setup_table_t);
79744 +
79745 +/*
79746 + * GNTTABOP_dump_table: Dump the contents of the grant table to the
79747 + * xen console. Debugging use only.
79748 + */
79749 +#define GNTTABOP_dump_table           3
79750 +typedef struct gnttab_dump_table {
79751 +    /* IN parameters. */
79752 +    domid_t dom;
79753 +    /* OUT parameters. */
79754 +    int16_t status;               /* GNTST_* */
79755 +} gnttab_dump_table_t;
79756 +DEFINE_GUEST_HANDLE(gnttab_dump_table_t);
79757 +
79758 +/*
79759 + * GNTTABOP_transfer: Transfer <frame> to a foreign domain. The
79760 + * foreign domain has previously registered its interest in the transfer via
79761 + * <domid, ref>.
79762 + * 
79763 + * Note that, even if the transfer fails, the specified page no longer belongs
79764 + * to the calling domain *unless* the error is GNTST_bad_page.
79765 + */
79766 +#define GNTTABOP_transfer                4
79767 +typedef struct gnttab_transfer {
79768 +    /* IN parameters. */
79769 +    unsigned long mfn;
79770 +    domid_t       domid;
79771 +    grant_ref_t   ref;
79772 +    /* OUT parameters. */
79773 +    int16_t       status;
79774 +} gnttab_transfer_t;
79775 +DEFINE_GUEST_HANDLE(gnttab_transfer_t);
79776 +
79777 +/*
79778 + * Bitfield values for gnttab_map_grant_ref.flags (GNTMAP_*).
79779 + */
79780 + /* Map the grant entry for access by I/O devices. */
79781 +#define _GNTMAP_device_map      (0)
79782 +#define GNTMAP_device_map       (1<<_GNTMAP_device_map)
79783 + /* Map the grant entry for access by host CPUs. */
79784 +#define _GNTMAP_host_map        (1)
79785 +#define GNTMAP_host_map         (1<<_GNTMAP_host_map)
79786 + /* Accesses to the granted frame will be restricted to read-only access. */
79787 +#define _GNTMAP_readonly        (2)
79788 +#define GNTMAP_readonly         (1<<_GNTMAP_readonly)
79789 + /*
79790 +  * GNTMAP_host_map subflag:
79791 +  *  0 => The host mapping is usable only by the guest OS.
79792 +  *  1 => The host mapping is usable by guest OS + current application.
79793 +  */
79794 +#define _GNTMAP_application_map (3)
79795 +#define GNTMAP_application_map  (1<<_GNTMAP_application_map)
79796 +
79797 + /*
79798 +  * GNTMAP_contains_pte subflag:
79799 +  *  0 => This map request contains a host virtual address.
79800 +  *  1 => This map request contains the machine address of the PTE to update.
79801 +  */
79802 +#define _GNTMAP_contains_pte    (4)
79803 +#define GNTMAP_contains_pte     (1<<_GNTMAP_contains_pte)
79804 +
79805 +/*
79806 + * Values for error status returns. All errors are -ve.
79807 + */
79808 +#define GNTST_okay             (0)  /* Normal return.                        */
79809 +#define GNTST_general_error    (-1) /* General undefined error.              */
79810 +#define GNTST_bad_domain       (-2) /* Unrecognised domain id.               */
79811 +#define GNTST_bad_gntref       (-3) /* Unrecognised or inappropriate gntref. */
79812 +#define GNTST_bad_handle       (-4) /* Unrecognised or inappropriate handle. */
79813 +#define GNTST_bad_virt_addr    (-5) /* Inappropriate virtual address to map. */
79814 +#define GNTST_bad_dev_addr     (-6) /* Inappropriate device address to unmap.*/
79815 +#define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.              */
79816 +#define GNTST_permission_denied (-8) /* Not enough privilege for operation.  */
79817 +#define GNTST_bad_page         (-9) /* Specified page was invalid for op.    */
79818 +
79819 +#define GNTTABOP_error_msgs {                   \
79820 +    "okay",                                     \
79821 +    "undefined error",                          \
79822 +    "unrecognised domain id",                   \
79823 +    "invalid grant reference",                  \
79824 +    "invalid mapping handle",                   \
79825 +    "invalid virtual address",                  \
79826 +    "invalid device address",                   \
79827 +    "no spare translation slot in the I/O MMU", \
79828 +    "permission denied",                        \
79829 +    "bad page"                                  \
79830 +}
79831 +
79832 +#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
79833 +
79834 +/*
79835 + * Local variables:
79836 + * mode: C
79837 + * c-set-style: "BSD"
79838 + * c-basic-offset: 4
79839 + * tab-width: 4
79840 + * indent-tabs-mode: nil
79841 + * End:
79842 + */
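
A minimal sketch of the "Introducing a valid entry" steps above; wmb() is the
kernel's write memory barrier, and @gnttab is assumed to point at the shared
grant-table page(s):

static void grant_access(grant_entry_t *gnttab, grant_ref_t ref,
                         domid_t dom, uint32_t frame, int readonly)
{
    gnttab[ref].domid = dom;                           /* step 1 */
    gnttab[ref].frame = frame;                         /* step 2 */
    wmb();                                             /* step 3 */
    gnttab[ref].flags = GTF_permit_access |
                        (readonly ? GTF_readonly : 0); /* step 4 */
}
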
79843 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/hvm/hvm_info_table.h linux-2.6.16/include/xen/interface/hvm/hvm_info_table.h
79844 --- linux-2.6.16.orig/include/xen/interface/hvm/hvm_info_table.h        1970-01-01 01:00:00.000000000 +0100
79845 +++ linux-2.6.16/include/xen/interface/hvm/hvm_info_table.h     2006-06-26 09:51:32.000000000 +0200
79846 @@ -0,0 +1,24 @@
79847 +/******************************************************************************
79848 + * hvm/hvm_info_table.h
79849 + * 
79850 + * HVM parameter and information table, written into guest memory map.
79851 + */
79852 +
79853 +#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
79854 +#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
79855 +
79856 +#define HVM_INFO_PFN         0x09F
79857 +#define HVM_INFO_OFFSET      0x800
79858 +#define HVM_INFO_PADDR       ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
79859 +
79860 +struct hvm_info_table {
79861 +    char        signature[8]; /* "HVM INFO" */
79862 +    uint32_t    length;
79863 +    uint8_t     checksum;
79864 +    uint8_t     acpi_enabled;
79865 +    uint8_t     apic_enabled;
79866 +    uint8_t     pae_enabled;
79867 +    uint32_t    nr_vcpus;
79868 +};
79869 +
79870 +#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
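
A hedged sketch of validating the table found at HVM_INFO_PADDR, assuming the
common firmware convention (not spelled out here) that all bytes of the table
sum to zero modulo 256:

static int hvm_info_checksum_ok(const struct hvm_info_table *t)
{
    const uint8_t *p = (const uint8_t *)t;
    uint8_t sum = 0;
    uint32_t i;

    for (i = 0; i < t->length; i++)
        sum += p[i];
    return sum == 0;         /* the signature should also read "HVM INFO" */
}
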
79871 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/hvm/ioreq.h linux-2.6.16/include/xen/interface/hvm/ioreq.h
79872 --- linux-2.6.16.orig/include/xen/interface/hvm/ioreq.h 1970-01-01 01:00:00.000000000 +0100
79873 +++ linux-2.6.16/include/xen/interface/hvm/ioreq.h      2006-06-26 09:51:32.000000000 +0200
79874 @@ -0,0 +1,94 @@
79875 +/*
79876 + * ioreq.h: I/O request definitions for device models
79877 + * Copyright (c) 2004, Intel Corporation.
79878 + *
79879 + * This program is free software; you can redistribute it and/or modify it
79880 + * under the terms and conditions of the GNU General Public License,
79881 + * version 2, as published by the Free Software Foundation.
79882 + *
79883 + * This program is distributed in the hope it will be useful, but WITHOUT
79884 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
79885 + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
79886 + * more details.
79887 + *
79888 + * You should have received a copy of the GNU General Public License along with
79889 + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
79890 + * Place - Suite 330, Boston, MA 02111-1307 USA.
79891 + *
79892 + */
79893 +
79894 +#ifndef _IOREQ_H_
79895 +#define _IOREQ_H_
79896 +
79897 +#define IOREQ_READ      1
79898 +#define IOREQ_WRITE     0
79899 +
79900 +#define STATE_INVALID           0
79901 +#define STATE_IOREQ_READY       1
79902 +#define STATE_IOREQ_INPROCESS   2
79903 +#define STATE_IORESP_READY      3
79904 +#define STATE_IORESP_HOOK       4
79905 +
79906 +#define IOREQ_TYPE_PIO          0 /* pio */
79907 +#define IOREQ_TYPE_COPY         1 /* mmio ops */
79908 +#define IOREQ_TYPE_AND          2
79909 +#define IOREQ_TYPE_OR           3
79910 +#define IOREQ_TYPE_XOR          4
79911 +#define IOREQ_TYPE_XCHG         5
79912 +
79913 +/*
79914 + * The VMExit dispatcher should cooperate with the instruction decoder to
79915 + * prepare this structure and notify the service OS and the DM by sending
79916 + * a virq.
79917 + */
79918 +typedef struct {
79919 +    uint64_t addr;          /*  physical address            */
79920 +    uint64_t size;          /*  size in bytes               */
79921 +    uint64_t count;         /*  for rep prefixes            */
79922 +    union {
79923 +        uint64_t data;      /*  data                        */
79924 +        void    *pdata;     /*  pointer to data             */
79925 +    } u;
79926 +    uint8_t state:4;
79927 +    uint8_t pdata_valid:1;  /* if 1, use pdata above        */
79928 +    uint8_t dir:1;          /*  1=read, 0=write             */
79929 +    uint8_t df:1;
79930 +    uint8_t type;           /* I/O type                     */
79931 +    uint64_t io_count;      /* How many I/Os done on this vcpu */
79932 +} ioreq_t;
79933 +
79934 +#define MAX_VECTOR      256
79935 +#define BITS_PER_BYTE   8
79936 +#define INTR_LEN        (MAX_VECTOR/(BITS_PER_BYTE * sizeof(uint64_t)))
79937 +#define INTR_LEN_32     (MAX_VECTOR/(BITS_PER_BYTE * sizeof(uint32_t)))
79938 +
79939 +typedef struct {
79940 +    uint16_t    pic_elcr;
79941 +    uint16_t    pic_irr;
79942 +    uint16_t    pic_last_irr;
79943 +    uint16_t    pic_clear_irr;
79944 +} global_iodata_t;
79945 +
79946 +typedef struct {
79947 +    ioreq_t         vp_ioreq;
79948 +    /* Event channel port */
79949 +    unsigned int    vp_eport;   /* VMX vcpu uses this to notify DM */
79950 +    unsigned int    dm_eport;   /* DM uses this to notify VMX vcpu */
79951 +} vcpu_iodata_t;
79952 +
79953 +typedef struct {
79954 +    global_iodata_t sp_global;
79955 +    vcpu_iodata_t   vcpu_iodata[1];
79956 +} shared_iopage_t;
79957 +
79958 +#endif /* _IOREQ_H_ */
79959 +
79960 +/*
79961 + * Local variables:
79962 + * mode: C
79963 + * c-set-style: "BSD"
79964 + * c-basic-offset: 4
79965 + * tab-width: 4
79966 + * indent-tabs-mode: nil
79967 + * End:
79968 + */
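
A skeleton of one device-model step over a vcpu's ioreq slot, illustrating the
STATE_* handshake above; the actual emulation and the event-channel
notification back to the vcpu are elided:

static void handle_ioreq(ioreq_t *req)
{
    if (req->state != STATE_IOREQ_READY)
        return;                              /* nothing pending */
    req->state = STATE_IOREQ_INPROCESS;

    if (req->type == IOREQ_TYPE_PIO && req->dir == IOREQ_READ)
        req->u.data = 0;                     /* emulate the port read here */
    /* ... handle writes and the other IOREQ_TYPE_* ops ... */

    req->state = STATE_IORESP_READY;         /* hand the result back */
}
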
79969 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/hvm/vmx_assist.h linux-2.6.16/include/xen/interface/hvm/vmx_assist.h
79970 --- linux-2.6.16.orig/include/xen/interface/hvm/vmx_assist.h    1970-01-01 01:00:00.000000000 +0100
79971 +++ linux-2.6.16/include/xen/interface/hvm/vmx_assist.h 2006-06-26 09:51:32.000000000 +0200
79972 @@ -0,0 +1,97 @@
79973 +/*
79974 + * vmx_assist.h: Context definitions for the VMXASSIST world switch.
79975 + *
79976 + * Leendert van Doorn, leendert@watson.ibm.com
79977 + * Copyright (c) 2005, International Business Machines Corporation.
79978 + */
79979 +
79980 +#ifndef _VMX_ASSIST_H_
79981 +#define _VMX_ASSIST_H_
79982 +
79983 +#define VMXASSIST_BASE         0xD0000
79984 +#define VMXASSIST_MAGIC        0x17101966
79985 +#define VMXASSIST_MAGIC_OFFSET (VMXASSIST_BASE+8)
79986 +
79987 +#define VMXASSIST_NEW_CONTEXT (VMXASSIST_BASE + 12)
79988 +#define VMXASSIST_OLD_CONTEXT (VMXASSIST_NEW_CONTEXT + 4)
79989 +
79990 +#ifndef __ASSEMBLY__
79991 +
79992 +union vmcs_arbytes {
79993 +    struct arbyte_fields {
79994 +        unsigned int seg_type : 4,
79995 +            s         : 1,
79996 +            dpl       : 2,
79997 +            p         : 1,
79998 +            reserved0 : 4,
79999 +            avl       : 1,
80000 +            reserved1 : 1,
80001 +            default_ops_size: 1,
80002 +            g         : 1,
80003 +            null_bit  : 1,
80004 +            reserved2 : 15;
80005 +    } fields;
80006 +    unsigned int bytes;
80007 +};
80008 +
80009 +/*
80010 + * World switch state
80011 + */
80012 +typedef struct vmx_assist_context {
80013 +    uint32_t  eip;        /* execution pointer */
80014 +    uint32_t  esp;        /* stack pointer */
80015 +    uint32_t  eflags;     /* flags register */
80016 +    uint32_t  cr0;
80017 +    uint32_t  cr3;        /* page table directory */
80018 +    uint32_t  cr4;
80019 +    uint32_t  idtr_limit; /* idt */
80020 +    uint32_t  idtr_base;
80021 +    uint32_t  gdtr_limit; /* gdt */
80022 +    uint32_t  gdtr_base;
80023 +    uint32_t  cs_sel;     /* cs selector */
80024 +    uint32_t  cs_limit;
80025 +    uint32_t  cs_base;
80026 +    union vmcs_arbytes cs_arbytes;
80027 +    uint32_t  ds_sel;     /* ds selector */
80028 +    uint32_t  ds_limit;
80029 +    uint32_t  ds_base;
80030 +    union vmcs_arbytes ds_arbytes;
80031 +    uint32_t  es_sel;     /* es selector */
80032 +    uint32_t  es_limit;
80033 +    uint32_t  es_base;
80034 +    union vmcs_arbytes es_arbytes;
80035 +    uint32_t  ss_sel;     /* ss selector */
80036 +    uint32_t  ss_limit;
80037 +    uint32_t  ss_base;
80038 +    union vmcs_arbytes ss_arbytes;
80039 +    uint32_t  fs_sel;     /* fs selector */
80040 +    uint32_t  fs_limit;
80041 +    uint32_t  fs_base;
80042 +    union vmcs_arbytes fs_arbytes;
80043 +    uint32_t  gs_sel;     /* gs selector */
80044 +    uint32_t  gs_limit;
80045 +    uint32_t  gs_base;
80046 +    union vmcs_arbytes gs_arbytes;
80047 +    uint32_t  tr_sel;     /* task selector */
80048 +    uint32_t  tr_limit;
80049 +    uint32_t  tr_base;
80050 +    union vmcs_arbytes tr_arbytes;
80051 +    uint32_t  ldtr_sel;   /* ldtr selector */
80052 +    uint32_t  ldtr_limit;
80053 +    uint32_t  ldtr_base;
80054 +    union vmcs_arbytes ldtr_arbytes;
80055 +} vmx_assist_context_t;
80056 +
80057 +#endif /* __ASSEMBLY__ */
80058 +
80059 +#endif /* _VMX_ASSIST_H_ */
80060 +
80061 +/*
80062 + * Local variables:
80063 + * mode: C
80064 + * c-set-style: "BSD"
80065 + * c-basic-offset: 4
80066 + * tab-width: 4
80067 + * indent-tabs-mode: nil
80068 + * End:
80069 + */
80070 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/io/blkif.h linux-2.6.16/include/xen/interface/io/blkif.h
80071 --- linux-2.6.16.orig/include/xen/interface/io/blkif.h  1970-01-01 01:00:00.000000000 +0100
80072 +++ linux-2.6.16/include/xen/interface/io/blkif.h       2006-06-26 09:51:32.000000000 +0200
80073 @@ -0,0 +1,85 @@
80074 +/******************************************************************************
80075 + * blkif.h
80076 + * 
80077 + * Unified block-device I/O interface for Xen guest OSes.
80078 + * 
80079 + * Copyright (c) 2003-2004, Keir Fraser
80080 + */
80081 +
80082 +#ifndef __XEN_PUBLIC_IO_BLKIF_H__
80083 +#define __XEN_PUBLIC_IO_BLKIF_H__
80084 +
80085 +#include "ring.h"
80086 +#include "../grant_table.h"
80087 +
80088 +/*
80089 + * Front->back notifications: When enqueuing a new request, sending a
80090 + * notification can be made conditional on req_event (i.e., the generic
80091 + * hold-off mechanism provided by the ring macros). Backends must set
80092 + * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
80093 + * 
80094 + * Back->front notifications: When enqueuing a new response, sending a
80095 + * notification can be made conditional on rsp_event (i.e., the generic
80096 + * hold-off mechanism provided by the ring macros). Frontends must set
80097 + * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
80098 + */
80099 +
80100 +#ifndef blkif_vdev_t
80101 +#define blkif_vdev_t   uint16_t
80102 +#endif
80103 +#define blkif_sector_t uint64_t
80104 +
80105 +#define BLKIF_OP_READ      0
80106 +#define BLKIF_OP_WRITE     1
80107 +
80108 +/*
80109 + * Maximum scatter/gather segments per request.
80110 + * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
80111 + * NB. This could be 12 if the ring indexes weren't stored in the same page.
80112 + */
80113 +#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
80114 +
80115 +typedef struct blkif_request {
80116 +    uint8_t        operation;    /* BLKIF_OP_???                         */
80117 +    uint8_t        nr_segments;  /* number of segments                   */
80118 +    blkif_vdev_t   handle;       /* only for read/write requests         */
80119 +    uint64_t       id;           /* private guest value, echoed in resp  */
80120 +    blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
80121 +    struct blkif_request_segment {
80122 +        grant_ref_t gref;        /* reference to I/O buffer frame        */
80123 +        /* @first_sect: first sector in frame to transfer (inclusive).   */
80124 +        /* @last_sect: last sector in frame to transfer (inclusive).     */
80125 +        uint8_t     first_sect, last_sect;
80126 +    } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
80127 +} blkif_request_t;
80128 +
80129 +typedef struct blkif_response {
80130 +    uint64_t        id;              /* copied from request */
80131 +    uint8_t         operation;       /* copied from request */
80132 +    int16_t         status;          /* BLKIF_RSP_???       */
80133 +} blkif_response_t;
80134 +
80135 +#define BLKIF_RSP_ERROR  -1 /* non-specific 'error' */
80136 +#define BLKIF_RSP_OKAY    0 /* non-specific 'okay'  */
80137 +
80138 +/*
80139 + * Generate blkif ring structures and types.
80140 + */
80141 +
80142 +DEFINE_RING_TYPES(blkif, blkif_request_t, blkif_response_t);
80143 +
80144 +#define VDISK_CDROM        0x1
80145 +#define VDISK_REMOVABLE    0x2
80146 +#define VDISK_READONLY     0x4
80147 +
80148 +#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
80149 +
80150 +/*
80151 + * Local variables:
80152 + * mode: C
80153 + * c-set-style: "BSD"
80154 + * c-basic-offset: 4
80155 + * tab-width: 4
80156 + * indent-tabs-mode: nil
80157 + * End:
80158 + */
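
A minimal sketch of describing a one-segment read; pushing the request onto
the shared ring itself uses the macros from ring.h further below:

static void fill_read_req(blkif_request_t *req, blkif_vdev_t handle,
                          uint64_t id, blkif_sector_t sector,
                          grant_ref_t gref, uint8_t first, uint8_t last)
{
    req->operation     = BLKIF_OP_READ;
    req->nr_segments   = 1;
    req->handle        = handle;
    req->id            = id;          /* echoed in blkif_response_t */
    req->sector_number = sector;
    req->seg[0].gref       = gref;
    req->seg[0].first_sect = first;   /* inclusive */
    req->seg[0].last_sect  = last;    /* inclusive */
}
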
80159 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/io/console.h linux-2.6.16/include/xen/interface/io/console.h
80160 --- linux-2.6.16.orig/include/xen/interface/io/console.h        1970-01-01 01:00:00.000000000 +0100
80161 +++ linux-2.6.16/include/xen/interface/io/console.h     2006-06-26 09:51:32.000000000 +0200
80162 @@ -0,0 +1,33 @@
80163 +/******************************************************************************
80164 + * console.h
80165 + * 
80166 + * Console I/O interface for Xen guest OSes.
80167 + * 
80168 + * Copyright (c) 2005, Keir Fraser
80169 + */
80170 +
80171 +#ifndef __XEN_PUBLIC_IO_CONSOLE_H__
80172 +#define __XEN_PUBLIC_IO_CONSOLE_H__
80173 +
80174 +typedef uint32_t XENCONS_RING_IDX;
80175 +
80176 +#define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring)-1))
80177 +
80178 +struct xencons_interface {
80179 +    char in[1024];
80180 +    char out[2048];
80181 +    XENCONS_RING_IDX in_cons, in_prod;
80182 +    XENCONS_RING_IDX out_cons, out_prod;
80183 +};
80184 +
80185 +#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */
80186 +
80187 +/*
80188 + * Local variables:
80189 + * mode: C
80190 + * c-set-style: "BSD"
80191 + * c-basic-offset: 4
80192 + * tab-width: 4
80193 + * indent-tabs-mode: nil
80194 + * End:
80195 + */
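
A minimal sketch of draining the input ring; the indexes are free-running and
only masked on access, and the memory barriers a real driver needs are elided:

static int cons_read_char(struct xencons_interface *intf)
{
    if (intf->in_cons == intf->in_prod)
        return -1;                                   /* ring empty */
    return intf->in[MASK_XENCONS_IDX(intf->in_cons++, intf->in)];
}
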
80196 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/io/netif.h linux-2.6.16/include/xen/interface/io/netif.h
80197 --- linux-2.6.16.orig/include/xen/interface/io/netif.h  1970-01-01 01:00:00.000000000 +0100
80198 +++ linux-2.6.16/include/xen/interface/io/netif.h       2006-06-26 09:51:32.000000000 +0200
80199 @@ -0,0 +1,84 @@
80200 +/******************************************************************************
80201 + * netif.h
80202 + * 
80203 + * Unified network-device I/O interface for Xen guest OSes.
80204 + * 
80205 + * Copyright (c) 2003-2004, Keir Fraser
80206 + */
80207 +
80208 +#ifndef __XEN_PUBLIC_IO_NETIF_H__
80209 +#define __XEN_PUBLIC_IO_NETIF_H__
80210 +
80211 +#include "ring.h"
80212 +#include "../grant_table.h"
80213 +
80214 +/*
80215 + * Note that there is *never* any need to notify the backend when enqueuing
80216 + * receive requests (netif_rx_request_t). Notifications after enqueuing any
80217 + * other type of message should be conditional on the appropriate req_event
80218 + * or rsp_event field in the shared ring.
80219 + */
80220 +
80221 +/* Protocol checksum field is blank in the packet (hardware offload)? */
80222 +#define _NETTXF_csum_blank     (0)
80223 +#define  NETTXF_csum_blank     (1U<<_NETTXF_csum_blank)
80224 +
80225 +/* Packet data has been validated against protocol checksum. */
80226 +#define _NETTXF_data_validated (1)
80227 +#define  NETTXF_data_validated (1U<<_NETTXF_data_validated)
80228 +
80229 +typedef struct netif_tx_request {
80230 +    grant_ref_t gref;      /* Reference to buffer page */
80231 +    uint16_t offset;       /* Offset within buffer page */
80232 +    uint16_t flags;        /* NETTXF_* */
80233 +    uint16_t id;           /* Echoed in response message. */
80234 +    uint16_t size;         /* Packet size in bytes.       */
80235 +} netif_tx_request_t;
80236 +
80237 +typedef struct netif_tx_response {
80238 +    uint16_t id;
80239 +    int16_t  status;       /* NETIF_RSP_* */
80240 +} netif_tx_response_t;
80241 +
80242 +typedef struct {
80243 +    uint16_t    id;        /* Echoed in response message.        */
80244 +    grant_ref_t gref;      /* Reference to incoming granted frame */
80245 +} netif_rx_request_t;
80246 +
80247 +/* Packet data has been validated against protocol checksum. */
80248 +#define _NETRXF_data_validated (0)
80249 +#define  NETRXF_data_validated (1U<<_NETRXF_data_validated)
80250 +
80251 +/* Protocol checksum field is blank in the packet (hardware offload)? */
80252 +#define _NETRXF_csum_blank     (1)
80253 +#define  NETRXF_csum_blank     (1U<<_NETRXF_csum_blank)
80254 +
80255 +typedef struct {
80256 +    uint16_t id;
80257 +    uint16_t offset;       /* Offset in page of start of received packet  */
80258 +    uint16_t flags;        /* NETRXF_* */
80259 +    int16_t  status;       /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
80260 +} netif_rx_response_t;
80261 +
80262 +/*
80263 + * Generate netif ring structures and types.
80264 + */
80265 +
80266 +DEFINE_RING_TYPES(netif_tx, netif_tx_request_t, netif_tx_response_t);
80267 +DEFINE_RING_TYPES(netif_rx, netif_rx_request_t, netif_rx_response_t);
80268 +
80269 +#define NETIF_RSP_DROPPED         -2
80270 +#define NETIF_RSP_ERROR           -1
80271 +#define NETIF_RSP_OKAY             0
80272 +
80273 +#endif
80274 +
80275 +/*
80276 + * Local variables:
80277 + * mode: C
80278 + * c-set-style: "BSD"
80279 + * c-basic-offset: 4
80280 + * tab-width: 4
80281 + * indent-tabs-mode: nil
80282 + * End:
80283 + */
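
A minimal sketch of describing one transmit packet within a granted page,
requesting checksum offload via the flags defined above:

static void fill_tx_req(netif_tx_request_t *tx, grant_ref_t gref,
                        uint16_t offset, uint16_t size, uint16_t id)
{
    tx->gref   = gref;                /* granted frame holding the packet */
    tx->offset = offset;              /* start of packet within the frame */
    tx->size   = size;                /* total packet size in bytes */
    tx->id     = id;                  /* echoed in netif_tx_response_t */
    tx->flags  = NETTXF_csum_blank | NETTXF_data_validated;
}
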
80284 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/io/pciif.h linux-2.6.16/include/xen/interface/io/pciif.h
80285 --- linux-2.6.16.orig/include/xen/interface/io/pciif.h  1970-01-01 01:00:00.000000000 +0100
80286 +++ linux-2.6.16/include/xen/interface/io/pciif.h       2006-06-26 09:51:32.000000000 +0200
80287 @@ -0,0 +1,55 @@
80288 +/*
80289 + * PCI Backend/Frontend Common Data Structures & Macros
80290 + *
80291 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
80292 + */
80293 +#ifndef __XEN_PCI_COMMON_H__
80294 +#define __XEN_PCI_COMMON_H__
80295 +
80296 +/* Be sure to bump this number if you change this file */
80297 +#define XEN_PCI_MAGIC          "7"
80298 +
80299 +/* xen_pci_sharedinfo flags */
80300 +#define _XEN_PCIF_active     (0)
80301 +#define XEN_PCIF_active      (1<<_XEN_PCIF_active)
80302 +
80303 +/* xen_pci_op commands */
80304 +#define XEN_PCI_OP_conf_read    (0)
80305 +#define XEN_PCI_OP_conf_write   (1)
80306 +
80307 +/* xen_pci_op error numbers */
80308 +#define XEN_PCI_ERR_success          (0)
80309 +#define XEN_PCI_ERR_dev_not_found   (-1)
80310 +#define XEN_PCI_ERR_invalid_offset  (-2)
80311 +#define XEN_PCI_ERR_access_denied   (-3)
80312 +#define XEN_PCI_ERR_not_implemented (-4)
80313 +/* XEN_PCI_ERR_op_failed - backend failed to complete the operation */
80314 +#define XEN_PCI_ERR_op_failed       (-5)
80315 +
80316 +struct xen_pci_op {
80317 +       /* IN: what action to perform: XEN_PCI_OP_* */
80318 +       uint32_t cmd;
80319 +
80320 +       /* OUT: will contain an error number (if any) from errno.h */
80321 +       int32_t err;
80322 +
80323 +       /* IN: which device to touch */
80324 +       uint32_t domain; /* PCI Domain/Segment */
80325 +       uint32_t bus;
80326 +       uint32_t devfn;
80327 +
80328 +       /* IN: which configuration registers to touch */
80329 +       int32_t offset;
80330 +       int32_t size;
80331 +
80332 +       /* IN/OUT: Contains the result after a READ or the value to WRITE */
80333 +       uint32_t value;
80334 +};
80335 +
80336 +struct xen_pci_sharedinfo {
80337 +       /* flags - XEN_PCIF_* */
80338 +       uint32_t flags;
80339 +       struct xen_pci_op op;
80340 +};
80341 +
80342 +#endif /* __XEN_PCI_COMMON_H__ */
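
A frontend-side sketch of a config-space read through the shared page; setting
XEN_PCIF_active, notifying the backend, and waiting for completion are elided,
since that handshake runs over an event channel defined elsewhere:

static void pcif_start_conf_read(struct xen_pci_sharedinfo *info,
                                 uint32_t domain, uint32_t bus,
                                 uint32_t devfn, int32_t offset, int32_t size)
{
    struct xen_pci_op *op = &info->op;

    op->cmd    = XEN_PCI_OP_conf_read;
    op->domain = domain;              /* PCI domain/segment */
    op->bus    = bus;
    op->devfn  = devfn;
    op->offset = offset;              /* config register to read */
    op->size   = size;                /* access width in bytes */
    /* set XEN_PCIF_active, notify backend, await completion;
       on success op->err == 0 and op->value holds the result */
}
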
80343 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/io/ring.h linux-2.6.16/include/xen/interface/io/ring.h
80344 --- linux-2.6.16.orig/include/xen/interface/io/ring.h   1970-01-01 01:00:00.000000000 +0100
80345 +++ linux-2.6.16/include/xen/interface/io/ring.h        2006-06-26 09:51:32.000000000 +0200
80346 @@ -0,0 +1,265 @@
80347 +/******************************************************************************
80348 + * ring.h
80349 + * 
80350 + * Shared producer-consumer ring macros.
80351 + *
80352 + * Tim Deegan and Andrew Warfield November 2004.
80353 + */
80354 +
80355 +#ifndef __XEN_PUBLIC_IO_RING_H__
80356 +#define __XEN_PUBLIC_IO_RING_H__
80357 +
80358 +typedef unsigned int RING_IDX;
80359 +
80360 +/* Round a 32-bit unsigned constant down to the nearest power of two. */
80361 +#define __RD2(_x)  (((_x) & 0x00000002) ? 0x2                  : ((_x) & 0x1))
80362 +#define __RD4(_x)  (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2    : __RD2(_x))
80363 +#define __RD8(_x)  (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4    : __RD4(_x))
80364 +#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8    : __RD8(_x))
80365 +#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
80366 +
80367 +/*
80368 + * Calculate size of a shared ring, given the total available space for the
80369 + * ring and indexes (_sz), and the name tag of the request/response structure.
80370 + * A ring contains as many entries as will fit, rounded down to the nearest 
80371 + * power of two (so we can mask with (size-1) to loop around).
80372 + */
80373 +#define __RING_SIZE(_s, _sz) \
80374 +    (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
80375 +
80376 +/*
80377 + * Macros to make the correct C datatypes for a new kind of ring.
80378 + * 
80379 + * To make a new ring datatype, you need to have two message structures,
80380 + * let's say request_t, and response_t already defined.
80381 + *
80382 + * In a header where you want the ring datatype declared, you then do:
80383 + *
80384 + *     DEFINE_RING_TYPES(mytag, request_t, response_t);
80385 + *
80386 + * These expand out to give you a set of types, as you can see below.
80387 + * The most important of these are:
80388 + * 
80389 + *     mytag_sring_t      - The shared ring.
80390 + *     mytag_front_ring_t - The 'front' half of the ring.
80391 + *     mytag_back_ring_t  - The 'back' half of the ring.
80392 + *
80393 + * To initialize a ring in your code you need to know the location and size
80394 + * of the shared memory area (PAGE_SIZE, for instance). To initialise
80395 + * the front half:
80396 + *
80397 + *     mytag_front_ring_t front_ring;
80398 + *     SHARED_RING_INIT((mytag_sring_t *)shared_page);
80399 + *     FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
80400 + *
80401 + * Initializing the back follows similarly (note that only the front
80402 + * initializes the shared ring):
80403 + *
80404 + *     mytag_back_ring_t back_ring;
80405 + *     BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
80406 + */
80407 +
80408 +#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t)                     \
80409 +                                                                        \
80410 +/* Shared ring entry */                                                 \
80411 +union __name##_sring_entry {                                            \
80412 +    __req_t req;                                                        \
80413 +    __rsp_t rsp;                                                        \
80414 +};                                                                      \
80415 +                                                                        \
80416 +/* Shared ring page */                                                  \
80417 +struct __name##_sring {                                                 \
80418 +    RING_IDX req_prod, req_event;                                       \
80419 +    RING_IDX rsp_prod, rsp_event;                                       \
80420 +    uint8_t  pad[48];                                                   \
80421 +    union __name##_sring_entry ring[1]; /* variable-length */           \
80422 +};                                                                      \
80423 +                                                                        \
80424 +/* "Front" end's private variables */                                   \
80425 +struct __name##_front_ring {                                            \
80426 +    RING_IDX req_prod_pvt;                                              \
80427 +    RING_IDX rsp_cons;                                                  \
80428 +    unsigned int nr_ents;                                               \
80429 +    struct __name##_sring *sring;                                       \
80430 +};                                                                      \
80431 +                                                                        \
80432 +/* "Back" end's private variables */                                    \
80433 +struct __name##_back_ring {                                             \
80434 +    RING_IDX rsp_prod_pvt;                                              \
80435 +    RING_IDX req_cons;                                                  \
80436 +    unsigned int nr_ents;                                               \
80437 +    struct __name##_sring *sring;                                       \
80438 +};                                                                      \
80439 +                                                                        \
80440 +/* Syntactic sugar */                                                   \
80441 +typedef struct __name##_sring __name##_sring_t;                         \
80442 +typedef struct __name##_front_ring __name##_front_ring_t;               \
80443 +typedef struct __name##_back_ring __name##_back_ring_t
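A hedged, self-contained sketch of the pattern described in the comment above; the message structures and the mytag name are hypothetical, and a kernel context is assumed for memset() and PAGE_SIZE:

    /* Hypothetical message types, purely for illustration. */
    typedef struct { uint32_t id; uint64_t sector; } my_request_t;
    typedef struct { uint32_t id; int16_t  status; } my_response_t;

    /* Expands to union mytag_sring_entry, struct mytag_sring, the two
     * private ring structs, and the mytag_*_t typedefs. */
    DEFINE_RING_TYPES(mytag, my_request_t, my_response_t);

    /* Front end: initialise the shared page, then the private view of it.
     * 'shared_page' is one page already shared with the peer (grant-table
     * setup not shown). */
    static void example_front_init(void *shared_page, mytag_front_ring_t *front)
    {
        mytag_sring_t *sring = (mytag_sring_t *)shared_page;
        SHARED_RING_INIT(sring);
        FRONT_RING_INIT(front, sring, PAGE_SIZE);
    }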
80444 +
80445 +/*
80446 + * Macros for manipulating rings.
80447 + * 
80448 + * FRONT_RING_whatever works on the "front end" of a ring: here 
80449 + * requests are pushed on to the ring and responses taken off it.
80450 + * 
80451 + * BACK_RING_whatever works on the "back end" of a ring: here 
80452 + * requests are taken off the ring and responses put on.
80453 + * 
80454 + * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL. 
80455 + * This is OK in 1-for-1 request-response situations where the 
80456 + * requestor (front end) never has more than RING_SIZE()-1
80457 + * outstanding requests.
80458 + */
80459 +
80460 +/* Initialising empty rings */
80461 +#define SHARED_RING_INIT(_s) do {                                       \
80462 +    (_s)->req_prod  = (_s)->rsp_prod  = 0;                              \
80463 +    (_s)->req_event = (_s)->rsp_event = 1;                              \
80464 +    memset((_s)->pad, 0, sizeof((_s)->pad));                            \
80465 +} while(0)
80466 +
80467 +#define FRONT_RING_INIT(_r, _s, __size) do {                            \
80468 +    (_r)->req_prod_pvt = 0;                                             \
80469 +    (_r)->rsp_cons = 0;                                                 \
80470 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
80471 +    (_r)->sring = (_s);                                                 \
80472 +} while (0)
80473 +
80474 +#define BACK_RING_INIT(_r, _s, __size) do {                             \
80475 +    (_r)->rsp_prod_pvt = 0;                                             \
80476 +    (_r)->req_cons = 0;                                                 \
80477 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
80478 +    (_r)->sring = (_s);                                                 \
80479 +} while (0)
80480 +
80481 +/* Initialize to existing shared indexes -- for recovery */
80482 +#define FRONT_RING_ATTACH(_r, _s, __size) do {                          \
80483 +    (_r)->sring = (_s);                                                 \
80484 +    (_r)->req_prod_pvt = (_s)->req_prod;                                \
80485 +    (_r)->rsp_cons = (_s)->rsp_prod;                                    \
80486 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
80487 +} while (0)
80488 +
80489 +#define BACK_RING_ATTACH(_r, _s, __size) do {                           \
80490 +    (_r)->sring = (_s);                                                 \
80491 +    (_r)->rsp_prod_pvt = (_s)->rsp_prod;                                \
80492 +    (_r)->req_cons = (_s)->req_prod;                                    \
80493 +    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
80494 +} while (0)
80495 +
80496 +/* How big is this ring? */
80497 +#define RING_SIZE(_r)                                                   \
80498 +    ((_r)->nr_ents)
80499 +
80500 +/* Test if there is an empty slot available on the front ring.
80501 + * (This is only meaningful from the front.)
80502 + */
80503 +#define RING_FULL(_r)                                                   \
80504 +    (((_r)->req_prod_pvt - (_r)->rsp_cons) == RING_SIZE(_r))
80505 +
80506 +/* Test if there are outstanding messages to be processed on a ring. */
80507 +#define RING_HAS_UNCONSUMED_RESPONSES(_r)                               \
80508 +    ((_r)->rsp_cons != (_r)->sring->rsp_prod)
80509 +
80510 +#define RING_HAS_UNCONSUMED_REQUESTS(_r)                                \
80511 +    (((_r)->req_cons != (_r)->sring->req_prod) &&                       \
80512 +     (((_r)->req_cons - (_r)->rsp_prod_pvt) != RING_SIZE(_r)))
80513 +
80514 +/* Direct access to individual ring elements, by index. */
80515 +#define RING_GET_REQUEST(_r, _idx)                                      \
80516 +    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
80517 +
80518 +#define RING_GET_RESPONSE(_r, _idx)                                     \
80519 +    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
80520 +
80521 +/* Loop termination condition: Would the specified index overflow the ring? */
80522 +#define RING_REQUEST_CONS_OVERFLOW(_r, _cons)                           \
80523 +    (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
80524 +
80525 +#define RING_PUSH_REQUESTS(_r) do {                                     \
80526 +    wmb(); /* back sees requests /before/ updated producer index */     \
80527 +    (_r)->sring->req_prod = (_r)->req_prod_pvt;                         \
80528 +} while (0)
80529 +
80530 +#define RING_PUSH_RESPONSES(_r) do {                                    \
80531 +    wmb(); /* front sees responses /before/ updated producer index */   \
80532 +    (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt;                         \
80533 +} while (0)
80534 +
80535 +/*
80536 + * Notification hold-off (req_event and rsp_event):
80537 + * 
80538 + * When queueing requests or responses on a shared ring, it may not always be
80539 + * necessary to notify the remote end. For example, if requests are in flight
80540 + * in a backend, the front may be able to queue further requests without
80541 + * notifying the back (if the back checks for new requests when it queues
80542 + * responses).
80543 + * 
80544 + * When enqueuing requests or responses:
80545 + * 
80546 + *  Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument
80547 + *  is a boolean return value. True indicates that the receiver requires an
80548 + *  asynchronous notification.
80549 + * 
80550 + * After dequeuing requests or responses (before putting the connection to sleep):
80551 + * 
80552 + *  Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES().
80553 + *  The second argument is a boolean return value. True indicates that there
80554 + *  are pending messages on the ring (i.e., the connection should not be put
80555 + *  to sleep).
80556 + * 
80557 + *  These macros will set the req_event/rsp_event field to trigger a
80558 + *  notification on the very next message that is enqueued. If you want to
80559 + *  create batches of work (i.e., only receive a notification after several
80560 + *  messages have been enqueued) then you will need to create a customised
80561 + *  version of the FINAL_CHECK macro in your own code, which sets the event
80562 + *  field appropriately.
80563 + */
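To make the hold-off discipline concrete, a sketch of a front-end producer built from the macros above, reusing the hypothetical mytag ring from the earlier sketch; notify_peer() stands in for an event-channel kick, which lives outside this header:

    /* Queue one request; notify the back end only if it asked to be told. */
    static void example_send_request(mytag_front_ring_t *front,
                                     const my_request_t *req)
    {
        int notify;

        /* Caller is assumed to have verified !RING_FULL(front). */
        *RING_GET_REQUEST(front, front->req_prod_pvt) = *req;
        front->req_prod_pvt++;

        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(front, notify);
        if (notify)
            notify_peer();  /* hypothetical event-channel notification */
    }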
80564 +
80565 +#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do {           \
80566 +    RING_IDX __old = (_r)->sring->req_prod;                             \
80567 +    RING_IDX __new = (_r)->req_prod_pvt;                                \
80568 +    wmb(); /* back sees requests /before/ updated producer index */     \
80569 +    (_r)->sring->req_prod = __new;                                      \
80570 +    mb(); /* back sees new requests /before/ we check req_event */      \
80571 +    (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) <           \
80572 +                 (RING_IDX)(__new - __old));                            \
80573 +} while (0)
80574 +
80575 +#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do {          \
80576 +    RING_IDX __old = (_r)->sring->rsp_prod;                             \
80577 +    RING_IDX __new = (_r)->rsp_prod_pvt;                                \
80578 +    wmb(); /* front sees responses /before/ updated producer index */   \
80579 +    (_r)->sring->rsp_prod = __new;                                      \
80580 +    mb(); /* front sees new responses /before/ we check rsp_event */    \
80581 +    (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) <           \
80582 +                 (RING_IDX)(__new - __old));                            \
80583 +} while (0)
80584 +
80585 +#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do {             \
80586 +    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                   \
80587 +    if (_work_to_do) break;                                             \
80588 +    (_r)->sring->req_event = (_r)->req_cons + 1;                        \
80589 +    mb();                                                               \
80590 +    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                   \
80591 +} while (0)
80592 +
80593 +#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do {            \
80594 +    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                  \
80595 +    if (_work_to_do) break;                                             \
80596 +    (_r)->sring->rsp_event = (_r)->rsp_cons + 1;                        \
80597 +    mb();                                                               \
80598 +    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                  \
80599 +} while (0)
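Correspondingly, a hedged sketch of the back-end consume loop that RING_FINAL_CHECK_FOR_REQUESTS() is designed for; process_request() is a hypothetical handler, and response production is omitted:

    /* Drain pending requests, then re-arm req_event before going idle.
     * The final check closes the race with a request that arrives between
     * the last poll and the event re-arm. */
    static void example_back_poll(mytag_back_ring_t *back)
    {
        int more;

        do {
            while (RING_HAS_UNCONSUMED_REQUESTS(back)) {
                process_request(RING_GET_REQUEST(back, back->req_cons));
                back->req_cons++;
            }
            RING_FINAL_CHECK_FOR_REQUESTS(back, more);
        } while (more);
    }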
80600 +
80601 +#endif /* __XEN_PUBLIC_IO_RING_H__ */
80602 +
80603 +/*
80604 + * Local variables:
80605 + * mode: C
80606 + * c-set-style: "BSD"
80607 + * c-basic-offset: 4
80608 + * tab-width: 4
80609 + * indent-tabs-mode: nil
80610 + * End:
80611 + */
80612 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/io/tpmif.h linux-2.6.16/include/xen/interface/io/tpmif.h
80613 --- linux-2.6.16.orig/include/xen/interface/io/tpmif.h  1970-01-01 01:00:00.000000000 +0100
80614 +++ linux-2.6.16/include/xen/interface/io/tpmif.h       2006-06-26 09:51:32.000000000 +0200
80615 @@ -0,0 +1,56 @@
80616 +/******************************************************************************
80617 + * tpmif.h
80618 + *
80619 + * TPM I/O interface for Xen guest OSes.
80620 + *
80621 + * Copyright (c) 2005, IBM Corporation
80622 + *
80623 + * Author: Stefan Berger, stefanb@us.ibm.com
80624 + * Grant table support: Mahadevan Gomathisankaran
80625 + *
80626 + * This code has been derived from tools/libxc/xen/io/netif.h
80627 + *
80628 + * Copyright (c) 2003-2004, Keir Fraser
80629 + */
80630 +
80631 +#ifndef __XEN_PUBLIC_IO_TPMIF_H__
80632 +#define __XEN_PUBLIC_IO_TPMIF_H__
80633 +
80634 +#include "../grant_table.h"
80635 +
80636 +typedef struct {
80637 +    unsigned long addr;   /* Machine address of packet.   */
80638 +    grant_ref_t ref;      /* grant table access reference */
80639 +    uint16_t unused;
80640 +    uint16_t size;        /* Packet size in bytes.        */
80641 +} tpmif_tx_request_t;
80642 +
80643 +/*
80644 + * The TPMIF_TX_RING_SIZE defines the number of pages the
80645 + * front-end and backend can exchange (= size of array).
80646 + */
80647 +typedef uint32_t TPMIF_RING_IDX;
80648 +
80649 +#define TPMIF_TX_RING_SIZE 10
80650 +
80651 +/* This structure must fit in a memory page. */
80652 +
80653 +typedef struct {
80654 +    tpmif_tx_request_t req;
80655 +} tpmif_ring_t;
80656 +
80657 +typedef struct {
80658 +    tpmif_ring_t ring[TPMIF_TX_RING_SIZE];
80659 +} tpmif_tx_interface_t;
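A minimal sketch of filling one slot of the fixed-size ring above; the machine address and grant reference are placeholders that real code obtains from the grant-table interfaces, which are not shown:

    static void example_tpm_queue(tpmif_tx_interface_t *tx, unsigned int idx,
                                  unsigned long addr, grant_ref_t ref,
                                  uint16_t len)
    {
        tpmif_tx_request_t *req = &tx->ring[idx % TPMIF_TX_RING_SIZE].req;

        req->addr   = addr;  /* machine address of the packet page */
        req->ref    = ref;   /* grant reference covering that page */
        req->size   = len;   /* packet length in bytes */
        req->unused = 0;
    }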
80660 +
80661 +#endif
80662 +
80663 +/*
80664 + * Local variables:
80665 + * mode: C
80666 + * c-set-style: "BSD"
80667 + * c-basic-offset: 4
80668 + * tab-width: 4
80669 + * indent-tabs-mode: nil
80670 + * End:
80671 + */
80672 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/io/xenbus.h linux-2.6.16/include/xen/interface/io/xenbus.h
80673 --- linux-2.6.16.orig/include/xen/interface/io/xenbus.h 1970-01-01 01:00:00.000000000 +0100
80674 +++ linux-2.6.16/include/xen/interface/io/xenbus.h      2006-06-26 09:51:32.000000000 +0200
80675 @@ -0,0 +1,42 @@
80676 +/*****************************************************************************
80677 + * xenbus.h
80678 + *
80679 + * Xenbus protocol details.
80680 + *
80681 + * Copyright (C) 2005 XenSource Ltd.
80682 + */
80683 +
80684 +#ifndef _XEN_PUBLIC_IO_XENBUS_H
80685 +#define _XEN_PUBLIC_IO_XENBUS_H
80686 +
80687 +/* The state of either end of the Xenbus, i.e. the current communication
80688 +   status of initialisation across the bus.  States here imply nothing about
80689 +   the state of the connection between the driver and the kernel's device
80690 +   layers.  */
80691 +typedef enum
80692 +{
80693 +  XenbusStateUnknown      = 0,
80694 +  XenbusStateInitialising = 1,
80695 +  XenbusStateInitWait     = 2,  /* Finished early initialisation, but waiting
80696 +                                   for information from the peer or hotplug
80697 +                                   scripts. */
80698 +  XenbusStateInitialised  = 3,  /* Initialised and waiting for a connection
80699 +                                   from the peer. */
80700 +  XenbusStateConnected    = 4,
80701 +  XenbusStateClosing      = 5,  /* The device is being closed due to an error
80702 +                                   or an unplug event. */
80703 +  XenbusStateClosed       = 6
80704 +
80705 +} XenbusState;
80706 +
80707 +#endif /* _XEN_PUBLIC_IO_XENBUS_H */
80708 +
80709 +/*
80710 + * Local variables:
80711 + *  c-file-style: "linux"
80712 + *  indent-tabs-mode: t
80713 + *  c-indent-level: 8
80714 + *  c-basic-offset: 8
80715 + *  tab-width: 8
80716 + * End:
80717 + */
80718 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/io/xs_wire.h linux-2.6.16/include/xen/interface/io/xs_wire.h
80719 --- linux-2.6.16.orig/include/xen/interface/io/xs_wire.h        1970-01-01 01:00:00.000000000 +0100
80720 +++ linux-2.6.16/include/xen/interface/io/xs_wire.h     2006-06-26 09:51:32.000000000 +0200
80721 @@ -0,0 +1,97 @@
80722 +/*
80723 + * Details of the "wire" protocol between Xen Store Daemon and client
80724 + * library or guest kernel.
80725 + * Copyright (C) 2005 Rusty Russell IBM Corporation
80726 + */
80727 +
80728 +#ifndef _XS_WIRE_H
80729 +#define _XS_WIRE_H
80730 +
80731 +enum xsd_sockmsg_type
80732 +{
80733 +    XS_DEBUG,
80734 +    XS_DIRECTORY,
80735 +    XS_READ,
80736 +    XS_GET_PERMS,
80737 +    XS_WATCH,
80738 +    XS_UNWATCH,
80739 +    XS_TRANSACTION_START,
80740 +    XS_TRANSACTION_END,
80741 +    XS_INTRODUCE,
80742 +    XS_RELEASE,
80743 +    XS_GET_DOMAIN_PATH,
80744 +    XS_WRITE,
80745 +    XS_MKDIR,
80746 +    XS_RM,
80747 +    XS_SET_PERMS,
80748 +    XS_WATCH_EVENT,
80749 +    XS_ERROR,
80750 +    XS_IS_DOMAIN_INTRODUCED
80751 +};
80752 +
80753 +#define XS_WRITE_NONE "NONE"
80754 +#define XS_WRITE_CREATE "CREATE"
80755 +#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"
80756 +
80757 +/* We hand errors back as strings, for portability. */
80758 +struct xsd_errors
80759 +{
80760 +    int errnum;
80761 +    const char *errstring;
80762 +};
80763 +#define XSD_ERROR(x) { x, #x }
80764 +static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
80765 +    XSD_ERROR(EINVAL),
80766 +    XSD_ERROR(EACCES),
80767 +    XSD_ERROR(EEXIST),
80768 +    XSD_ERROR(EISDIR),
80769 +    XSD_ERROR(ENOENT),
80770 +    XSD_ERROR(ENOMEM),
80771 +    XSD_ERROR(ENOSPC),
80772 +    XSD_ERROR(EIO),
80773 +    XSD_ERROR(ENOTEMPTY),
80774 +    XSD_ERROR(ENOSYS),
80775 +    XSD_ERROR(EROFS),
80776 +    XSD_ERROR(EBUSY),
80777 +    XSD_ERROR(EAGAIN),
80778 +    XSD_ERROR(EISCONN)
80779 +};
80780 +
80781 +struct xsd_sockmsg
80782 +{
80783 +    uint32_t type;  /* XS_??? */
80784 +    uint32_t req_id;/* Request identifier, echoed in daemon's response.  */
80785 +    uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */
80786 +    uint32_t len;   /* Length of data following this. */
80787 +
80788 +    /* Generally followed by nul-terminated string(s). */
80789 +};
80790 +
80791 +enum xs_watch_type
80792 +{
80793 +    XS_WATCH_PATH = 0,
80794 +    XS_WATCH_TOKEN
80795 +};
80796 +
80797 +/* Inter-domain shared memory communications. */
80798 +#define XENSTORE_RING_SIZE 1024
80799 +typedef uint32_t XENSTORE_RING_IDX;
80800 +#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1))
80801 +struct xenstore_domain_interface {
80802 +    char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */
80803 +    char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */
80804 +    XENSTORE_RING_IDX req_cons, req_prod;
80805 +    XENSTORE_RING_IDX rsp_cons, rsp_prod;
80806 +};
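To illustrate the free-running indexes above (used space is simply prod - cons), a hedged sketch of the producer side of the request ring; the write barrier and event-channel kick that real code needs are reduced to a comment:

    /* Copy up to 'len' request bytes into the shared page; returns the
     * number actually written. */
    static int example_xb_write(struct xenstore_domain_interface *intf,
                                const char *data, int len)
    {
        XENSTORE_RING_IDX prod = intf->req_prod;
        int avail = XENSTORE_RING_SIZE - (prod - intf->req_cons);
        int i;

        if (len > avail)
            len = avail;
        for (i = 0; i < len; i++)
            intf->req[MASK_XENSTORE_IDX(prod++)] = data[i];
        /* Real code issues a write barrier here, publishes req_prod, and
         * then notifies the daemon via its event channel. */
        intf->req_prod = prod;
        return len;
    }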
80807 +
80808 +#endif /* _XS_WIRE_H */
80809 +
80810 +/*
80811 + * Local variables:
80812 + * mode: C
80813 + * c-set-style: "BSD"
80814 + * c-basic-offset: 4
80815 + * tab-width: 4
80816 + * indent-tabs-mode: nil
80817 + * End:
80818 + */
80819 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/memory.h linux-2.6.16/include/xen/interface/memory.h
80820 --- linux-2.6.16.orig/include/xen/interface/memory.h    1970-01-01 01:00:00.000000000 +0100
80821 +++ linux-2.6.16/include/xen/interface/memory.h 2006-06-26 09:51:32.000000000 +0200
80822 @@ -0,0 +1,155 @@
80823 +/******************************************************************************
80824 + * memory.h
80825 + * 
80826 + * Memory reservation and information.
80827 + * 
80828 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
80829 + */
80830 +
80831 +#ifndef __XEN_PUBLIC_MEMORY_H__
80832 +#define __XEN_PUBLIC_MEMORY_H__
80833 +
80834 +/*
80835 + * Increase or decrease the specified domain's memory reservation. Returns a
80836 + * -ve errcode on failure, or the # extents successfully allocated or freed.
80837 + * arg == addr of struct xen_memory_reservation.
80838 + */
80839 +#define XENMEM_increase_reservation 0
80840 +#define XENMEM_decrease_reservation 1
80841 +#define XENMEM_populate_physmap     6
80842 +typedef struct xen_memory_reservation {
80843 +
80844 +    /*
80845 +     * XENMEM_increase_reservation:
80846 +     *   OUT: MFN (*not* GMFN) bases of extents that were allocated
80847 +     * XENMEM_decrease_reservation:
80848 +     *   IN:  GMFN bases of extents to free
80849 +     * XENMEM_populate_physmap:
80850 +     *   IN:  GPFN bases of extents to populate with memory
80851 +     *   OUT: GMFN bases of extents that were allocated
80852 +     *   (NB. This command also updates the mach_to_phys translation table)
80853 +     */
80854 +    GUEST_HANDLE(ulong) extent_start;
80855 +
80856 +    /* Number of extents, and size/alignment of each (2^extent_order pages). */
80857 +    unsigned long  nr_extents;
80858 +    unsigned int   extent_order;
80859 +
80860 +    /*
80861 +     * Maximum # bits addressable by the user of the allocated region (e.g., 
80862 +     * I/O devices often have a 32-bit limitation even in 64-bit systems). If 
80863 +     * zero then the user has no addressing restriction.
80864 +     * This field is not used by XENMEM_decrease_reservation.
80865 +     */
80866 +    unsigned int   address_bits;
80867 +
80868 +    /*
80869 +     * Domain whose reservation is being changed.
80870 +     * Unprivileged domains can specify only DOMID_SELF.
80871 +     */
80872 +    domid_t        domid;
80873 +
80874 +} xen_memory_reservation_t;
80875 +DEFINE_GUEST_HANDLE(xen_memory_reservation_t);
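A hedged sketch of asking for one extra page with XENMEM_increase_reservation; both set_xen_guest_handle() and the HYPERVISOR_memory_op() wrapper are assumed from the architecture headers elsewhere in this patch:

    static int example_grow_by_one_page(unsigned long *mfn_out)
    {
        struct xen_memory_reservation res = {
            .nr_extents   = 1,
            .extent_order = 0,          /* 2^0 pages, i.e. single pages */
            .address_bits = 0,          /* no addressing restriction */
            .domid        = DOMID_SELF,
        };

        set_xen_guest_handle(res.extent_start, mfn_out); /* assumed helper */
        /* Returns the number of extents allocated (1) or a -ve errno. */
        return HYPERVISOR_memory_op(XENMEM_increase_reservation, &res);
    }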
80876 +
80877 +/*
80878 + * Returns the maximum machine frame number of mapped RAM in this system.
80879 + * This command always succeeds (it never returns an error code).
80880 + * arg == NULL.
80881 + */
80882 +#define XENMEM_maximum_ram_page     2
80883 +
80884 +/*
80885 + * Returns the current or maximum memory reservation, in pages, of the
80886 + * specified domain (may be DOMID_SELF). Returns -ve errcode on failure.
80887 + * arg == addr of domid_t.
80888 + */
80889 +#define XENMEM_current_reservation  3
80890 +#define XENMEM_maximum_reservation  4
80891 +
80892 +/*
80893 + * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
80894 + * mapping table. Architectures which do not have a m2p table do not implement
80895 + * this command.
80896 + * arg == addr of xen_machphys_mfn_list_t.
80897 + */
80898 +#define XENMEM_machphys_mfn_list    5
80899 +typedef struct xen_machphys_mfn_list {
80900 +    /*
80901 +     * Size of the 'extent_start' array. Fewer entries will be filled if the
80902 +     * machphys table is smaller than max_extents * 2MB.
80903 +     */
80904 +    unsigned int max_extents;
80905 +
80906 +    /*
80907 +     * Pointer to buffer to fill with list of extent starts. If there are
80908 +     * any large discontiguities in the machine address space, 2MB gaps in
80909 +     * the machphys table will be represented by an MFN base of zero.
80910 +     */
80911 +    GUEST_HANDLE(ulong) extent_start;
80912 +
80913 +    /*
80914 +     * Number of extents written to the above array. This will be smaller
80915 +     * than 'max_extents' if the machphys table is smaller than max_extents * 2MB.
80916 +     */
80917 +    unsigned int nr_extents;
80918 +} xen_machphys_mfn_list_t;
80919 +DEFINE_GUEST_HANDLE(xen_machphys_mfn_list_t);
80920 +
80921 +/*
80922 + * Sets the GPFN at which a particular page appears in the specified guest's
80923 + * pseudophysical address space.
80924 + * arg == addr of xen_add_to_physmap_t.
80925 + */
80926 +#define XENMEM_add_to_physmap      7
80927 +typedef struct xen_add_to_physmap {
80928 +    /* Which domain to change the mapping for. */
80929 +    domid_t domid;
80930 +
80931 +    /* Source mapping space. */
80932 +#define XENMAPSPACE_shared_info 0 /* shared info page */
80933 +#define XENMAPSPACE_grant_table 1 /* grant table page */
80934 +    unsigned int space;
80935 +
80936 +    /* Index into source mapping space. */
80937 +    unsigned long idx;
80938 +
80939 +    /* GPFN where the source mapping page should appear. */
80940 +    unsigned long gpfn;
80941 +} xen_add_to_physmap_t;
80942 +DEFINE_GUEST_HANDLE(xen_add_to_physmap_t);
80943 +
80944 +/*
80945 + * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
80946 + * code on failure. This call only works for auto-translated guests.
80947 + */
80948 +#define XENMEM_translate_gpfn_list  8
80949 +typedef struct xen_translate_gpfn_list {
80950 +    /* Which domain to translate for? */
80951 +    domid_t domid;
80952 +
80953 +    /* Length of list. */
80954 +    unsigned long nr_gpfns;
80955 +
80956 +    /* List of GPFNs to translate. */
80957 +    GUEST_HANDLE(ulong) gpfn_list;
80958 +
80959 +    /*
80960 +     * Output list to contain MFN translations. May be the same as the input
80961 +     * list (in which case each input GPFN is overwritten with the output MFN).
80962 +     */
80963 +    GUEST_HANDLE(ulong) mfn_list;
80964 +} xen_translate_gpfn_list_t;
80965 +DEFINE_GUEST_HANDLE(xen_translate_gpfn_list_t);
80966 +
80967 +#endif /* __XEN_PUBLIC_MEMORY_H__ */
80968 +
80969 +/*
80970 + * Local variables:
80971 + * mode: C
80972 + * c-set-style: "BSD"
80973 + * c-basic-offset: 4
80974 + * tab-width: 4
80975 + * indent-tabs-mode: nil
80976 + * End:
80977 + */
80978 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/nmi.h linux-2.6.16/include/xen/interface/nmi.h
80979 --- linux-2.6.16.orig/include/xen/interface/nmi.h       1970-01-01 01:00:00.000000000 +0100
80980 +++ linux-2.6.16/include/xen/interface/nmi.h    2006-06-26 09:51:32.000000000 +0200
80981 @@ -0,0 +1,59 @@
80982 +/******************************************************************************
80983 + * nmi.h
80984 + * 
80985 + * NMI callback registration and reason codes.
80986 + * 
80987 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
80988 + */
80989 +
80990 +#ifndef __XEN_PUBLIC_NMI_H__
80991 +#define __XEN_PUBLIC_NMI_H__
80992 +
80993 +/*
80994 + * NMI reason codes:
80995 + * Currently these are x86-specific, stored in arch_shared_info.nmi_reason.
80996 + */
80997 + /* I/O-check error reported via ISA port 0x61, bit 6. */
80998 +#define _XEN_NMIREASON_io_error     0
80999 +#define XEN_NMIREASON_io_error      (1UL << _XEN_NMIREASON_io_error)
81000 + /* Parity error reported via ISA port 0x61, bit 7. */
81001 +#define _XEN_NMIREASON_parity_error 1
81002 +#define XEN_NMIREASON_parity_error  (1UL << _XEN_NMIREASON_parity_error)
81003 + /* Unknown hardware-generated NMI. */
81004 +#define _XEN_NMIREASON_unknown      2
81005 +#define XEN_NMIREASON_unknown       (1UL << _XEN_NMIREASON_unknown)
81006 +
81007 +/*
81008 + * long nmi_op(unsigned int cmd, void *arg)
81009 + * NB. All ops return zero on success, else a negative error code.
81010 + */
81011 +
81012 +/*
81013 + * Register NMI callback for this (calling) VCPU. Currently this only makes
81014 + * sense for domain 0, vcpu 0. All other callers receive EINVAL.
81015 + * arg == pointer to xennmi_callback structure.
81016 + */
81017 +#define XENNMI_register_callback   0
81018 +typedef struct xennmi_callback {
81019 +    unsigned long handler_address;
81020 +    unsigned long pad;
81021 +} xennmi_callback_t;
81022 +DEFINE_GUEST_HANDLE(xennmi_callback_t);
81023 +
81024 +/*
81025 + * Deregister NMI callback for this (calling) VCPU.
81026 + * arg == NULL.
81027 + */
81028 +#define XENNMI_unregister_callback 1
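A small sketch of registering the callback described above from domain 0, vcpu 0; the HYPERVISOR_nmi_op() wrapper name is an assumption derived from the nmi_op prototype in the comment:

    static int example_register_nmi(unsigned long handler_va)
    {
        xennmi_callback_t cb = {
            .handler_address = handler_va, /* guest VA of the NMI handler */
            .pad             = 0,
        };

        /* Zero on success, else a negative error code (EINVAL unless
         * called by domain 0, vcpu 0). */
        return HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
    }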
81029 +
81030 +#endif /* __XEN_PUBLIC_NMI_H__ */
81031 +
81032 +/*
81033 + * Local variables:
81034 + * mode: C
81035 + * c-set-style: "BSD"
81036 + * c-basic-offset: 4
81037 + * tab-width: 4
81038 + * indent-tabs-mode: nil
81039 + * End:
81040 + */
81041 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/physdev.h linux-2.6.16/include/xen/interface/physdev.h
81042 --- linux-2.6.16.orig/include/xen/interface/physdev.h   1970-01-01 01:00:00.000000000 +0100
81043 +++ linux-2.6.16/include/xen/interface/physdev.h        2006-06-26 09:51:32.000000000 +0200
81044 @@ -0,0 +1,71 @@
81045 +
81046 +#ifndef __XEN_PUBLIC_PHYSDEV_H__
81047 +#define __XEN_PUBLIC_PHYSDEV_H__
81048 +
81049 +/* Commands to HYPERVISOR_physdev_op() */
81050 +#define PHYSDEVOP_IRQ_UNMASK_NOTIFY     4
81051 +#define PHYSDEVOP_IRQ_STATUS_QUERY      5
81052 +#define PHYSDEVOP_SET_IOPL              6
81053 +#define PHYSDEVOP_SET_IOBITMAP          7
81054 +#define PHYSDEVOP_APIC_READ             8
81055 +#define PHYSDEVOP_APIC_WRITE            9
81056 +#define PHYSDEVOP_ASSIGN_VECTOR         10
81057 +
81058 +typedef struct physdevop_irq_status_query {
81059 +    /* IN */
81060 +    uint32_t irq;
81061 +    /* OUT */
81062 +/* Need to call PHYSDEVOP_IRQ_UNMASK_NOTIFY when the IRQ has been serviced? */
81063 +#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY (1<<0)
81064 +    uint32_t flags;
81065 +} physdevop_irq_status_query_t;
81066 +
81067 +typedef struct physdevop_set_iopl {
81068 +    /* IN */
81069 +    uint32_t iopl;
81070 +} physdevop_set_iopl_t;
81071 +
81072 +typedef struct physdevop_set_iobitmap {
81073 +    /* IN */
81074 +    uint8_t *bitmap;
81075 +    uint32_t nr_ports;
81076 +} physdevop_set_iobitmap_t;
81077 +
81078 +typedef struct physdevop_apic {
81079 +    /* IN */
81080 +    unsigned long apic_physbase;
81081 +    uint32_t reg;
81082 +    /* IN or OUT */
81083 +    uint32_t value;
81084 +} physdevop_apic_t;
81085 +
81086 +typedef struct physdevop_irq {
81087 +    /* IN */
81088 +    uint32_t irq;
81089 +    /* OUT */
81090 +    uint32_t vector;
81091 +} physdevop_irq_t;
81092 +
81093 +typedef struct physdev_op {
81094 +    uint32_t cmd;
81095 +    union {
81096 +        physdevop_irq_status_query_t      irq_status_query;
81097 +        physdevop_set_iopl_t              set_iopl;
81098 +        physdevop_set_iobitmap_t          set_iobitmap;
81099 +        physdevop_apic_t                  apic_op;
81100 +        physdevop_irq_t                   irq_op;
81101 +    } u;
81102 +} physdev_op_t;
81103 +DEFINE_GUEST_HANDLE(physdev_op_t);
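A hedged example of the command/union pattern above, raising the caller's I/O privilege level; the HYPERVISOR_physdev_op() wrapper is assumed from the hypercall headers added elsewhere in this patch:

    static int example_set_iopl(uint32_t iopl)
    {
        physdev_op_t op;

        op.cmd             = PHYSDEVOP_SET_IOPL;
        op.u.set_iopl.iopl = iopl;          /* desired privilege level, 0-3 */
        return HYPERVISOR_physdev_op(&op);  /* assumed wrapper */
    }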
81104 +
81105 +#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
81106 +
81107 +/*
81108 + * Local variables:
81109 + * mode: C
81110 + * c-set-style: "BSD"
81111 + * c-basic-offset: 4
81112 + * tab-width: 4
81113 + * indent-tabs-mode: nil
81114 + * End:
81115 + */
81116 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/sched.h linux-2.6.16/include/xen/interface/sched.h
81117 --- linux-2.6.16.orig/include/xen/interface/sched.h     1970-01-01 01:00:00.000000000 +0100
81118 +++ linux-2.6.16/include/xen/interface/sched.h  2006-06-26 09:51:32.000000000 +0200
81119 @@ -0,0 +1,87 @@
81120 +/******************************************************************************
81121 + * sched.h
81122 + * 
81123 + * Scheduler state interactions
81124 + * 
81125 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
81126 + */
81127 +
81128 +#ifndef __XEN_PUBLIC_SCHED_H__
81129 +#define __XEN_PUBLIC_SCHED_H__
81130 +
81131 +#include "event_channel.h"
81132 +
81133 +/*
81134 + * The prototype for this hypercall is:
81135 + *  long sched_op(int cmd, void *arg)
81136 + * @cmd == SCHEDOP_??? (scheduler operation).
81137 + * @arg == Operation-specific extra argument(s), as described below.
81138 + * 
81139 + * Versions of Xen prior to 3.0.2 provided only the following legacy version
81140 + * of this hypercall, supporting only the commands yield, block and shutdown:
81141 + *  long sched_op(int cmd, unsigned long arg)
81142 + * @cmd == SCHEDOP_??? (scheduler operation).
81143 + * @arg == 0               (SCHEDOP_yield and SCHEDOP_block)
81144 + *      == SHUTDOWN_* code (SCHEDOP_shutdown)
81145 + * This legacy version is available to new guests as sched_op_compat().
81146 + */
81147 +
81148 +/*
81149 + * Voluntarily yield the CPU.
81150 + * @arg == NULL.
81151 + */
81152 +#define SCHEDOP_yield       0
81153 +
81154 +/*
81155 + * Block execution of this VCPU until an event is received for processing.
81156 + * If called with event upcalls masked, this operation will atomically
81157 + * reenable event delivery and check for pending events before blocking the
81158 + * VCPU. This avoids a "wakeup waiting" race.
81159 + * @arg == NULL.
81160 + */
81161 +#define SCHEDOP_block       1
81162 +
81163 +/*
81164 + * Halt execution of this domain (all VCPUs) and notify the system controller.
81165 + * @arg == pointer to sched_shutdown structure.
81166 + */
81167 +#define SCHEDOP_shutdown    2
81168 +typedef struct sched_shutdown {
81169 +    unsigned int reason; /* SHUTDOWN_* */
81170 +} sched_shutdown_t;
81171 +DEFINE_GUEST_HANDLE(sched_shutdown_t);
81172 +
81173 +/*
81174 + * Poll a set of event-channel ports. Return when one or more are pending. An
81175 + * optional timeout may be specified.
81176 + * @arg == pointer to sched_poll structure.
81177 + */
81178 +#define SCHEDOP_poll        3
81179 +typedef struct sched_poll {
81180 +    GUEST_HANDLE(evtchn_port_t) ports;
81181 +    unsigned int nr_ports;
81182 +    uint64_t timeout;
81183 +} sched_poll_t;
81184 +DEFINE_GUEST_HANDLE(sched_poll_t);
81185 +
81186 +/*
81187 + * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
81188 + * software to determine the appropriate action. For the most part, Xen does
81189 + * not care about the shutdown code.
81190 + */
81191 +#define SHUTDOWN_poweroff   0  /* Domain exited normally. Clean up and kill. */
81192 +#define SHUTDOWN_reboot     1  /* Clean up, kill, and then restart.          */
81193 +#define SHUTDOWN_suspend    2  /* Clean up, save suspend info, kill.         */
81194 +#define SHUTDOWN_crash      3  /* Tell controller we've crashed.             */
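Tying the definitions together, a sketch of a clean shutdown using the current hypercall form described at the top of this file; the HYPERVISOR_sched_op() wrapper name is an assumption:

    static void example_poweroff(void)
    {
        sched_shutdown_t arg = {
            .reason = SHUTDOWN_poweroff, /* domain exited normally */
        };

        HYPERVISOR_sched_op(SCHEDOP_shutdown, &arg);
        /* Does not normally return: the domain is torn down. */
    }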
81195 +
81196 +#endif /* __XEN_PUBLIC_SCHED_H__ */
81197 +
81198 +/*
81199 + * Local variables:
81200 + * mode: C
81201 + * c-set-style: "BSD"
81202 + * c-basic-offset: 4
81203 + * tab-width: 4
81204 + * indent-tabs-mode: nil
81205 + * End:
81206 + */
81207 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/sched_ctl.h linux-2.6.16/include/xen/interface/sched_ctl.h
81208 --- linux-2.6.16.orig/include/xen/interface/sched_ctl.h 1970-01-01 01:00:00.000000000 +0100
81209 +++ linux-2.6.16/include/xen/interface/sched_ctl.h      2006-06-26 09:51:32.000000000 +0200
81210 @@ -0,0 +1,64 @@
81211 +/******************************************************************************
81212 + * Generic scheduler control interface.
81213 + *
81214 + * Mark Williamson, (C) 2004 Intel Research Cambridge
81215 + */
81216 +
81217 +#ifndef __XEN_PUBLIC_SCHED_CTL_H__
81218 +#define __XEN_PUBLIC_SCHED_CTL_H__
81219 +
81220 +/* Scheduler types. */
81221 +#define SCHED_BVT      0
81222 +#define SCHED_SEDF     4
81223 +
81224 +/* Set or get info? */
81225 +#define SCHED_INFO_PUT 0
81226 +#define SCHED_INFO_GET 1
81227 +
81228 +/*
81229 + * Generic scheduler control command - used to adjust system-wide scheduler
81230 + * parameters
81231 + */
81232 +struct sched_ctl_cmd {
81233 +    uint32_t sched_id;
81234 +    uint32_t direction;
81235 +    union {
81236 +        struct bvt_ctl {
81237 +            uint32_t ctx_allow;
81238 +        } bvt;
81239 +    } u;
81240 +};
81241 +
81242 +struct sched_adjdom_cmd {
81243 +    uint32_t sched_id;
81244 +    uint32_t direction;
81245 +    domid_t  domain;
81246 +    union {
81247 +        struct bvt_adjdom {
81248 +            uint32_t mcu_adv;      /* mcu advance: inverse of weight */
81249 +            uint32_t warpback;     /* warp? */
81250 +            int32_t  warpvalue;    /* warp value */
81251 +            int64_t  warpl;        /* warp limit */
81252 +            int64_t  warpu;        /* unwarp time requirement */
81253 +        } bvt;
81254 +        struct sedf_adjdom {
81255 +            uint64_t period;
81256 +            uint64_t slice;
81257 +            uint64_t latency;
81258 +            uint32_t extratime;
81259 +            uint32_t weight;
81260 +        } sedf;
81261 +    } u;
81262 +};
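For illustration, filling the per-domain adjustment command above for the SEDF scheduler; the parameter values are arbitrary placeholders, and how the command is submitted (a dom0 control operation) is outside this header:

    static void example_fill_sedf(struct sched_adjdom_cmd *cmd, domid_t dom)
    {
        cmd->sched_id  = SCHED_SEDF;
        cmd->direction = SCHED_INFO_PUT;      /* set rather than get */
        cmd->domain    = dom;
        cmd->u.sedf.period    = 100000000ULL; /* placeholder, in ns */
        cmd->u.sedf.slice     =  10000000ULL; /* placeholder, in ns */
        cmd->u.sedf.latency   = 0;
        cmd->u.sedf.extratime = 1;
        cmd->u.sedf.weight    = 0;
    }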
81263 +
81264 +#endif /* __XEN_PUBLIC_SCHED_CTL_H__ */
81265 +
81266 +/*
81267 + * Local variables:
81268 + * mode: C
81269 + * c-set-style: "BSD"
81270 + * c-basic-offset: 4
81271 + * tab-width: 4
81272 + * indent-tabs-mode: nil
81273 + * End:
81274 + */
81275 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/trace.h linux-2.6.16/include/xen/interface/trace.h
81276 --- linux-2.6.16.orig/include/xen/interface/trace.h     1970-01-01 01:00:00.000000000 +0100
81277 +++ linux-2.6.16/include/xen/interface/trace.h  2006-06-26 09:51:32.000000000 +0200
81278 @@ -0,0 +1,86 @@
81279 +/******************************************************************************
81280 + * include/public/trace.h
81281 + * 
81282 + * Mark Williamson, (C) 2004 Intel Research Cambridge
81283 + * Copyright (C) 2005 Bin Ren
81284 + */
81285 +
81286 +#ifndef __XEN_PUBLIC_TRACE_H__
81287 +#define __XEN_PUBLIC_TRACE_H__
81288 +
81289 +/* Trace classes */
81290 +#define TRC_CLS_SHIFT 16
81291 +#define TRC_GEN     0x0001f000    /* General trace            */
81292 +#define TRC_SCHED   0x0002f000    /* Xen Scheduler trace      */
81293 +#define TRC_DOM0OP  0x0004f000    /* Xen DOM0 operation trace */
81294 +#define TRC_VMX     0x0008f000    /* Xen VMX trace            */
81295 +#define TRC_MEM     0x000af000    /* Xen memory trace         */
81296 +#define TRC_ALL     0xfffff000
81297 +
81298 +/* Trace subclasses */
81299 +#define TRC_SUBCLS_SHIFT 12
81300 +/* trace subclasses for VMX */
81301 +#define TRC_VMXEXIT  0x00081000   /* VMX exit trace            */
81302 +#define TRC_VMXTIMER 0x00082000   /* VMX timer trace           */
81303 +#define TRC_VMXINT   0x00084000   /* VMX interrupt trace       */
81304 +#define TRC_VMXIO    0x00088000   /* VMX io emulation trace  */
81305 +
81306 +/* Trace events per class */
81307 +
81308 +#define TRC_SCHED_DOM_ADD       (TRC_SCHED +  1)
81309 +#define TRC_SCHED_DOM_REM       (TRC_SCHED +  2)
81310 +#define TRC_SCHED_SLEEP         (TRC_SCHED +  3)
81311 +#define TRC_SCHED_WAKE          (TRC_SCHED +  4)
81312 +#define TRC_SCHED_YIELD         (TRC_SCHED +  5)
81313 +#define TRC_SCHED_BLOCK         (TRC_SCHED +  6)
81314 +#define TRC_SCHED_SHUTDOWN      (TRC_SCHED +  7)
81315 +#define TRC_SCHED_CTL           (TRC_SCHED +  8)
81316 +#define TRC_SCHED_ADJDOM        (TRC_SCHED +  9)
81317 +#define TRC_SCHED_SWITCH        (TRC_SCHED + 10)
81318 +#define TRC_SCHED_S_TIMER_FN    (TRC_SCHED + 11)
81319 +#define TRC_SCHED_T_TIMER_FN    (TRC_SCHED + 12)
81320 +#define TRC_SCHED_DOM_TIMER_FN  (TRC_SCHED + 13)
81321 +#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED + 14)
81322 +#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED + 15)
81323 +
81324 +#define TRC_MEM_PAGE_GRANT_MAP      (TRC_MEM + 1)
81325 +#define TRC_MEM_PAGE_GRANT_UNMAP    (TRC_MEM + 2)
81326 +#define TRC_MEM_PAGE_GRANT_TRANSFER (TRC_MEM + 3)
81327 +
81328 +/* trace events per subclass */
81329 +#define TRC_VMX_VMEXIT          (TRC_VMXEXIT + 1)
81330 +#define TRC_VMX_VMENTRY         (TRC_VMXEXIT + 2)
81331 +
81332 +#define TRC_VMX_TIMER_INTR      (TRC_VMXTIMER + 1)
81333 +
81334 +#define TRC_VMX_INT             (TRC_VMXINT + 1)
81335 +
81336 +
81337 +/* This structure represents a single trace buffer record. */
81338 +struct t_rec {
81339 +    uint64_t cycles;          /* cycle counter timestamp */
81340 +    uint32_t event;           /* event ID                */
81341 +    unsigned long data[5];    /* event data items        */
81342 +};
81343 +
81344 +/*
81345 + * This structure contains the metadata for a single trace buffer.  The cons
81346 + * and prod fields index into the array of struct t_rec's that follows it.
81347 + */
81348 +struct t_buf {
81349 +    uint32_t cons;      /* Next item to be consumed by control tools. */
81350 +    uint32_t prod;      /* Next item to be produced by Xen.           */
81351 +    /* 'nr_recs' records follow immediately after the meta-data header.    */
81352 +};
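A heavily hedged sketch of a consumer walking the buffer above. Only cons and prod live in the header, so the record count is a hypothetical parameter here, and the indexes are assumed free-running with the count as modulus; real tools must mirror Xen's actual wrapping rules:

    static void example_drain_tbuf(struct t_buf *buf, uint32_t nr_recs,
                                   void (*handle)(struct t_rec *))
    {
        /* Records are laid out immediately after the header. */
        struct t_rec *recs = (struct t_rec *)(buf + 1);

        while (buf->cons != buf->prod) {
            handle(&recs[buf->cons % nr_recs]);
            buf->cons++;
        }
    }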
81353 +
81354 +#endif /* __XEN_PUBLIC_TRACE_H__ */
81355 +
81356 +/*
81357 + * Local variables:
81358 + * mode: C
81359 + * c-set-style: "BSD"
81360 + * c-basic-offset: 4
81361 + * tab-width: 4
81362 + * indent-tabs-mode: nil
81363 + * End:
81364 + */
81365 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/vcpu.h linux-2.6.16/include/xen/interface/vcpu.h
81366 --- linux-2.6.16.orig/include/xen/interface/vcpu.h      1970-01-01 01:00:00.000000000 +0100
81367 +++ linux-2.6.16/include/xen/interface/vcpu.h   2006-06-26 09:51:32.000000000 +0200
81368 @@ -0,0 +1,119 @@
81369 +/******************************************************************************
81370 + * vcpu.h
81371 + * 
81372 + * VCPU initialisation, query, and hotplug.
81373 + * 
81374 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
81375 + */
81376 +
81377 +#ifndef __XEN_PUBLIC_VCPU_H__
81378 +#define __XEN_PUBLIC_VCPU_H__
81379 +
81380 +/*
81381 + * Prototype for this hypercall is:
81382 + *  int vcpu_op(int cmd, int vcpuid, void *extra_args)
81383 + * @cmd        == VCPUOP_??? (VCPU operation).
81384 + * @vcpuid     == VCPU to operate on.
81385 + * @extra_args == Operation-specific extra arguments (NULL if none).
81386 + */
81387 +
81388 +/*
81389 + * Initialise a VCPU. Each VCPU can be initialised only once. A 
81390 + * newly-initialised VCPU will not run until it is brought up by VCPUOP_up.
81391 + * 
81392 + * @extra_arg == pointer to vcpu_guest_context structure containing initial
81393 + *               state for the VCPU.
81394 + */
81395 +#define VCPUOP_initialise           0
81396 +
81397 +/*
81398 + * Bring up a VCPU. This makes the VCPU runnable. This operation will fail
81399 + * if the VCPU has not been initialised (VCPUOP_initialise).
81400 + */
81401 +#define VCPUOP_up                   1
81402 +
81403 +/*
81404 + * Bring down a VCPU (i.e., make it non-runnable).
81405 + * There are a few caveats that callers should observe:
81406 + *  1. This operation may return, and VCPUOP_is_up may return false, before the
81407 + *     VCPU stops running (i.e., the command is asynchronous). It is a good
81408 + *     idea to ensure that the VCPU has entered a non-critical loop before
81409 + *     bringing it down. Alternatively, this operation is guaranteed
81410 + *     synchronous if invoked by the VCPU itself.
81411 + *  2. After a VCPU is initialised, there is currently no way to drop all its
81412 + *     references to domain memory. Even a VCPU that is down still holds
81413 + *     memory references via its pagetable base pointer and GDT. It is good
81414 + *     practice to move a VCPU onto an 'idle' or default page table, LDT and
81415 + *     GDT before bringing it down.
81416 + */
81417 +#define VCPUOP_down                 2
81418 +
81419 +/* Returns 1 if the given VCPU is up. */
81420 +#define VCPUOP_is_up                3
81421 +
81422 +/*
81423 + * Return information about the state and running time of a VCPU.
81424 + * @extra_arg == pointer to vcpu_runstate_info structure.
81425 + */
81426 +#define VCPUOP_get_runstate_info    4
81427 +typedef struct vcpu_runstate_info {
81428 +    /* VCPU's current state (RUNSTATE_*). */
81429 +    int      state;
81430 +    /* When was current state entered (system time, ns)? */
81431 +    uint64_t state_entry_time;
81432 +    /*
81433 +     * Time spent in each RUNSTATE_* (ns). The sum of these times is
81434 +     * guaranteed not to drift from system time.
81435 +     */
81436 +    uint64_t time[4];
81437 +} vcpu_runstate_info_t;
81438 +
81439 +/* VCPU is currently running on a physical CPU. */
81440 +#define RUNSTATE_running  0
81441 +
81442 +/* VCPU is runnable, but not currently scheduled on any physical CPU. */
81443 +#define RUNSTATE_runnable 1
81444 +
81445 +/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
81446 +#define RUNSTATE_blocked  2
81447 +
81448 +/*
81449 + * VCPU is not runnable, but it is not blocked.
81450 + * This is a 'catch all' state for things like hotplug and pauses by the
81451 + * system administrator (or for critical sections in the hypervisor).
81452 + * RUNSTATE_blocked dominates this state (it is the preferred state).
81453 + */
81454 +#define RUNSTATE_offline  3
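A sketch of querying the runstate described above for one VCPU; the HYPERVISOR_vcpu_op() wrapper matching the prototype in the header comment is assumed:

    static int example_vcpu_is_running(int vcpuid)
    {
        vcpu_runstate_info_t info;
        int rc = HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, vcpuid, &info);

        if (rc < 0)
            return rc;
        /* info.time[RUNSTATE_*] holds ns spent in each state. */
        return info.state == RUNSTATE_running;
    }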
81455 +
81456 +/*
81457 + * Register a shared memory area from which the guest may obtain its own
81458 + * runstate information without needing to execute a hypercall.
81459 + * Notes:
81460 + *  1. The registered address may be virtual or physical, depending on the
81461 + *     platform. The virtual address should be registered on x86 systems.
81462 + *  2. Only one shared area may be registered per VCPU. The shared area is
81463 + *     updated by the hypervisor each time the VCPU is scheduled. Thus
81464 + *     runstate.state will always be RUNSTATE_running and
81465 + *     runstate.state_entry_time will indicate the system time at which the
81466 + *     VCPU was last scheduled to run.
81467 + * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
81468 + */
81469 +#define VCPUOP_register_runstate_memory_area 5
81470 +typedef struct vcpu_register_runstate_memory_area {
81471 +    union {
81472 +        struct vcpu_runstate_info *v;
81473 +        uint64_t p;
81474 +    } addr;
81475 +} vcpu_register_runstate_memory_area_t;
81476 +
81477 +#endif /* __XEN_PUBLIC_VCPU_H__ */
81478 +
81479 +/*
81480 + * Local variables:
81481 + * mode: C
81482 + * c-set-style: "BSD"
81483 + * c-basic-offset: 4
81484 + * tab-width: 4
81485 + * indent-tabs-mode: nil
81486 + * End:
81487 + */
81488 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/version.h linux-2.6.16/include/xen/interface/version.h
81489 --- linux-2.6.16.orig/include/xen/interface/version.h   1970-01-01 01:00:00.000000000 +0100
81490 +++ linux-2.6.16/include/xen/interface/version.h        2006-06-26 09:51:32.000000000 +0200
81491 @@ -0,0 +1,64 @@
81492 +/******************************************************************************
81493 + * version.h
81494 + * 
81495 + * Xen version, type, and compile information.
81496 + * 
81497 + * Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com>
81498 + * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
81499 + */
81500 +
81501 +#ifndef __XEN_PUBLIC_VERSION_H__
81502 +#define __XEN_PUBLIC_VERSION_H__
81503 +
81504 +/* NB. All ops return zero on success, except XENVER_version. */
81505 +
81506 +/* arg == NULL; returns major:minor (16:16). */
81507 +#define XENVER_version      0
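The 16:16 packing can be unpacked as below; the HYPERVISOR_xen_version() wrapper and a kernel context for printk() are assumed:

    static void example_print_xen_version(void)
    {
        int ver   = HYPERVISOR_xen_version(XENVER_version, NULL);
        int major = (ver >> 16) & 0xffff;
        int minor = ver & 0xffff;

        printk("running on Xen %d.%d\n", major, minor);
    }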
81508 +
81509 +/* arg == xen_extraversion_t. */
81510 +#define XENVER_extraversion 1
81511 +typedef char xen_extraversion_t[16];
81512 +#define XEN_EXTRAVERSION_LEN (sizeof(xen_extraversion_t))
81513 +
81514 +/* arg == xen_compile_info_t. */
81515 +#define XENVER_compile_info 2
81516 +typedef struct xen_compile_info {
81517 +    char compiler[64];
81518 +    char compile_by[16];
81519 +    char compile_domain[32];
81520 +    char compile_date[32];
81521 +} xen_compile_info_t;
81522 +
81523 +#define XENVER_capabilities 3
81524 +typedef char xen_capabilities_info_t[1024];
81525 +#define XEN_CAPABILITIES_INFO_LEN (sizeof(xen_capabilities_info_t))
81526 +
81527 +#define XENVER_changeset 4
81528 +typedef char xen_changeset_info_t[64];
81529 +#define XEN_CHANGESET_INFO_LEN (sizeof(xen_changeset_info_t))
81530 +
81531 +#define XENVER_platform_parameters 5
81532 +typedef struct xen_platform_parameters {
81533 +    unsigned long virt_start;
81534 +} xen_platform_parameters_t;
81535 +
81536 +#define XENVER_get_features 6
81537 +typedef struct xen_feature_info {
81538 +    unsigned int submap_idx;    /* IN: which 32-bit submap to return */
81539 +    uint32_t     submap;        /* OUT: 32-bit submap */
81540 +} xen_feature_info_t;
81541 +
81542 +/* Declares the features reported by XENVER_get_features. */
81543 +#include "features.h"
81544 +
81545 +#endif /* __XEN_PUBLIC_VERSION_H__ */
81546 +
81547 +/*
81548 + * Local variables:
81549 + * mode: C
81550 + * c-set-style: "BSD"
81551 + * c-basic-offset: 4
81552 + * tab-width: 4
81553 + * indent-tabs-mode: nil
81554 + * End:
81555 + */
81556 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/xen-compat.h linux-2.6.16/include/xen/interface/xen-compat.h
81557 --- linux-2.6.16.orig/include/xen/interface/xen-compat.h        1970-01-01 01:00:00.000000000 +0100
81558 +++ linux-2.6.16/include/xen/interface/xen-compat.h     2006-06-26 09:51:32.000000000 +0200
81559 @@ -0,0 +1,31 @@
81560 +/******************************************************************************
81561 + * xen-compat.h
81562 + * 
81563 + * Guest OS interface to Xen.  Compatibility layer.
81564 + * 
81565 + * Copyright (c) 2006, Christian Limpach
81566 + */
81567 +
81568 +#ifndef __XEN_PUBLIC_XEN_COMPAT_H__
81569 +#define __XEN_PUBLIC_XEN_COMPAT_H__
81570 +
81571 +#define __XEN_LATEST_INTERFACE_VERSION__ 0x00030101
81572 +
81573 +#if defined(__XEN__)
81574 +/* Xen is built with matching headers and implements the latest interface. */
81575 +#define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__
81576 +#elif !defined(__XEN_INTERFACE_VERSION__)
81577 +/* Guests which do not specify a version get the legacy interface. */
81578 +#define __XEN_INTERFACE_VERSION__ 0x00000000
81579 +#endif
81580 +
81581 +#if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__
81582 +#error "These header files do not support the requested interface version."
81583 +#endif
81584 +
81585 +#if __XEN_INTERFACE_VERSION__ < 0x00030101
81586 +#undef __HYPERVISOR_sched_op
81587 +#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_compat
81588 +#endif
81589 +
81590 +#endif /* __XEN_PUBLIC_XEN_COMPAT_H__ */
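In practice a guest opts into the newer interface by defining the version before any Xen header is included; a sketch, with the value mirroring __XEN_LATEST_INTERFACE_VERSION__ above:

    /* In a central guest header, before any xen/interface include: */
    #define __XEN_INTERFACE_VERSION__ 0x00030101 /* keep the new sched_op */
    #include <xen/interface/xen.h>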
81591 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/interface/xen.h linux-2.6.16/include/xen/interface/xen.h
81592 --- linux-2.6.16.orig/include/xen/interface/xen.h       1970-01-01 01:00:00.000000000 +0100
81593 +++ linux-2.6.16/include/xen/interface/xen.h    2006-06-26 09:51:32.000000000 +0200
81594 @@ -0,0 +1,451 @@
81595 +/******************************************************************************
81596 + * xen.h
81597 + * 
81598 + * Guest OS interface to Xen.
81599 + * 
81600 + * Copyright (c) 2004, K A Fraser
81601 + */
81602 +
81603 +#ifndef __XEN_PUBLIC_XEN_H__
81604 +#define __XEN_PUBLIC_XEN_H__
81605 +
81606 +#if defined(__i386__)
81607 +#include "arch-x86_32.h"
81608 +#elif defined(__x86_64__)
81609 +#include "arch-x86_64.h"
81610 +#elif defined(__ia64__)
81611 +#include "arch-ia64.h"
81612 +#else
81613 +#error "Unsupported architecture"
81614 +#endif
81615 +
81616 +/*
81617 + * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
81618 + */
81619 +
81620 +/*
81621 + * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.
81622 + *         EAX = return value
81623 + *         (argument registers may be clobbered on return)
81624 + * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6. 
81625 + *         RAX = return value
81626 + *         (argument registers not clobbered on return; RCX, R11 are)
81627 + */
81628 +#define __HYPERVISOR_set_trap_table        0
81629 +#define __HYPERVISOR_mmu_update            1
81630 +#define __HYPERVISOR_set_gdt               2
81631 +#define __HYPERVISOR_stack_switch          3
81632 +#define __HYPERVISOR_set_callbacks         4
81633 +#define __HYPERVISOR_fpu_taskswitch        5
81634 +#define __HYPERVISOR_sched_op_compat       6 /* compat as of 0x00030101 */
81635 +#define __HYPERVISOR_dom0_op               7
81636 +#define __HYPERVISOR_set_debugreg          8
81637 +#define __HYPERVISOR_get_debugreg          9
81638 +#define __HYPERVISOR_update_descriptor    10
81639 +#define __HYPERVISOR_memory_op            12
81640 +#define __HYPERVISOR_multicall            13
81641 +#define __HYPERVISOR_update_va_mapping    14
81642 +#define __HYPERVISOR_set_timer_op         15
81643 +#define __HYPERVISOR_event_channel_op     16
81644 +#define __HYPERVISOR_xen_version          17
81645 +#define __HYPERVISOR_console_io           18
81646 +#define __HYPERVISOR_physdev_op           19
81647 +#define __HYPERVISOR_grant_table_op       20
81648 +#define __HYPERVISOR_vm_assist            21
81649 +#define __HYPERVISOR_update_va_mapping_otherdomain 22
81650 +#define __HYPERVISOR_iret                 23 /* x86 only */
81651 +#define __HYPERVISOR_vcpu_op              24
81652 +#define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
81653 +#define __HYPERVISOR_mmuext_op            26
81654 +#define __HYPERVISOR_acm_op               27
81655 +#define __HYPERVISOR_nmi_op               28
81656 +#define __HYPERVISOR_sched_op             29
81657 +
81658 +/* 
81659 + * VIRTUAL INTERRUPTS
81660 + * 
81661 + * Virtual interrupts that a guest OS may receive from Xen.
81662 + */
81663 +#define VIRQ_TIMER      0  /* Timebase update, and/or requested timeout.  */
81664 +#define VIRQ_DEBUG      1  /* Request guest to dump debug info.           */
81665 +#define VIRQ_CONSOLE    2  /* (DOM0) Bytes received on emergency console. */
81666 +#define VIRQ_DOM_EXC    3  /* (DOM0) Exceptional event for some domain.   */
81667 +#define VIRQ_DEBUGGER   6  /* (DOM0) A domain has paused for debugging.   */
81668 +#define NR_VIRQS        8
81669 +
81670 +/*
81671 + * MMU-UPDATE REQUESTS
81672 + * 
81673 + * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
81674 + * A foreigndom (FD) can be specified (or DOMID_SELF for none).
81675 + * Where the FD has some effect, it is described below.
81676 + * ptr[1:0] specifies the appropriate MMU_* command.
81677 + * 
81678 + * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
81679 + * Updates an entry in a page table. If updating an L1 table, and the new
81680 + * table entry is valid/present, the mapped frame must belong to the FD, if
81681 + * an FD has been specified. If attempting to map an I/O page then the
81682 + * caller assumes the privilege of the FD.
81683 + * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
81684 + * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
81685 + * ptr[:2]  -- Machine address of the page-table entry to modify.
81686 + * val      -- Value to write.
81687 + * 
81688 + * ptr[1:0] == MMU_MACHPHYS_UPDATE:
81689 + * Updates an entry in the machine->pseudo-physical mapping table.
81690 + * ptr[:2]  -- Machine address within the frame whose mapping to modify.
81691 + *             The frame must belong to the FD, if one is specified.
81692 + * val      -- Value to write into the mapping entry.
81693 + */
81694 +#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is MA.       */
81695 +#define MMU_MACHPHYS_UPDATE      1 /* ptr = MA of frame to modify entry for  */
81696 +
81697 +/*
81698 + * MMU EXTENDED OPERATIONS
81699 + * 
81700 + * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
81701 + * A foreigndom (FD) can be specified (or DOMID_SELF for none).
81702 + * Where the FD has some effect, it is described below.
81703 + * 
81704 + * cmd: MMUEXT_(UN)PIN_*_TABLE
81705 + * mfn: Machine frame number to be (un)pinned as a p.t. page.
81706 + *      The frame must belong to the FD, if one is specified.
81707 + * 
81708 + * cmd: MMUEXT_NEW_BASEPTR
81709 + * mfn: Machine frame number of new page-table base to install in MMU.
81710 + * 
81711 + * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
81712 + * mfn: Machine frame number of new page-table base to install in MMU
81713 + *      when in user space.
81714 + * 
81715 + * cmd: MMUEXT_TLB_FLUSH_LOCAL
81716 + * No additional arguments. Flushes local TLB.
81717 + * 
81718 + * cmd: MMUEXT_INVLPG_LOCAL
81719 + * linear_addr: Linear address to be flushed from the local TLB.
81720 + * 
81721 + * cmd: MMUEXT_TLB_FLUSH_MULTI
81722 + * vcpumask: Pointer to bitmap of VCPUs to be flushed.
81723 + * 
81724 + * cmd: MMUEXT_INVLPG_MULTI
81725 + * linear_addr: Linear address to be flushed.
81726 + * vcpumask: Pointer to bitmap of VCPUs to be flushed.
81727 + * 
81728 + * cmd: MMUEXT_TLB_FLUSH_ALL
81729 + * No additional arguments. Flushes all VCPUs' TLBs.
81730 + * 
81731 + * cmd: MMUEXT_INVLPG_ALL
81732 + * linear_addr: Linear address to be flushed from all VCPUs' TLBs.
81733 + * 
81734 + * cmd: MMUEXT_FLUSH_CACHE
81735 + * No additional arguments. Writes back and flushes cache contents.
81736 + * 
81737 + * cmd: MMUEXT_SET_LDT
81738 + * linear_addr: Linear address of LDT base (NB. must be page-aligned).
81739 + * nr_ents: Number of entries in LDT.
81740 + */
81741 +#define MMUEXT_PIN_L1_TABLE      0
81742 +#define MMUEXT_PIN_L2_TABLE      1
81743 +#define MMUEXT_PIN_L3_TABLE      2
81744 +#define MMUEXT_PIN_L4_TABLE      3
81745 +#define MMUEXT_UNPIN_TABLE       4
81746 +#define MMUEXT_NEW_BASEPTR       5
81747 +#define MMUEXT_TLB_FLUSH_LOCAL   6
81748 +#define MMUEXT_INVLPG_LOCAL      7
81749 +#define MMUEXT_TLB_FLUSH_MULTI   8
81750 +#define MMUEXT_INVLPG_MULTI      9
81751 +#define MMUEXT_TLB_FLUSH_ALL    10
81752 +#define MMUEXT_INVLPG_ALL       11
81753 +#define MMUEXT_FLUSH_CACHE      12
81754 +#define MMUEXT_SET_LDT          13
81755 +#define MMUEXT_NEW_USER_BASEPTR 15
81756 +
81757 +#ifndef __ASSEMBLY__
81758 +typedef struct mmuext_op {
81759 +    unsigned int cmd;
81760 +    union {
81761 +        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
81762 +        unsigned long mfn;
81763 +        /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
81764 +        unsigned long linear_addr;
81765 +    } arg1;
81766 +    union {
81767 +        /* SET_LDT */
81768 +        unsigned int nr_ents;
81769 +        /* TLB_FLUSH_MULTI, INVLPG_MULTI */
81770 +        void *vcpumask;
81771 +    } arg2;
81772 +} mmuext_op_t;
81773 +DEFINE_GUEST_HANDLE(mmuext_op_t);
81774 +#endif
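
For illustration, a minimal sketch of submitting one of these extended
operations. It assumes the HYPERVISOR_mmuext_op(op, count, success_count,
domid) wrapper that the architecture's hypercall header provides; the
wrapper is not declared in this file.

    /* Hedged sketch: flush a single linear address from the local TLB. */
    static void flush_one_va(unsigned long va)
    {
            struct mmuext_op op;

            op.cmd = MMUEXT_INVLPG_LOCAL;
            op.arg1.linear_addr = va & PAGE_MASK;
            if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                    BUG();
    }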
81775 +
81776 +/* These are passed as 'flags' to update_va_mapping. They can be ORed. */
81777 +/* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap.   */
81778 +/* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer.         */
81779 +#define UVMF_NONE               (0UL<<0) /* No flushing at all.   */
81780 +#define UVMF_TLB_FLUSH          (1UL<<0) /* Flush entire TLB(s).  */
81781 +#define UVMF_INVLPG             (2UL<<0) /* Flush only one entry. */
81782 +#define UVMF_FLUSHTYPE_MASK     (3UL<<0)
81783 +#define UVMF_MULTI              (0UL<<2) /* Flush subset of TLBs. */
81784 +#define UVMF_LOCAL              (0UL<<2) /* Flush local TLB.      */
81785 +#define UVMF_ALL                (1UL<<2) /* Flush all TLBs.       */
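
A hedged sketch of how these flags combine in practice.
HYPERVISOR_update_va_mapping(va, new_pte, flags) and pfn_pte_ma() are
assumed to come from the architecture's Xen headers and are not declared
here.

    /* Hedged sketch: install a new PTE for va, then flush only that
     * entry, and only on the local CPU. */
    static void remap_one_va(unsigned long va, unsigned long mfn)
    {
            if (HYPERVISOR_update_va_mapping(va, pfn_pte_ma(mfn, PAGE_KERNEL),
                                             UVMF_INVLPG | UVMF_LOCAL))
                    BUG();
    }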
81786 +
81787 +/*
81788 + * Commands to HYPERVISOR_console_io().
81789 + */
81790 +#define CONSOLEIO_write         0
81791 +#define CONSOLEIO_read          1
81792 +
81793 +/*
81794 + * Commands to HYPERVISOR_vm_assist().
81795 + */
81796 +#define VMASST_CMD_enable                0
81797 +#define VMASST_CMD_disable               1
81798 +#define VMASST_TYPE_4gb_segments         0
81799 +#define VMASST_TYPE_4gb_segments_notify  1
81800 +#define VMASST_TYPE_writable_pagetables  2
81801 +#define MAX_VMASST_TYPE 2
81802 +
81803 +#ifndef __ASSEMBLY__
81804 +
81805 +typedef uint16_t domid_t;
81806 +
81807 +/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */
81808 +#define DOMID_FIRST_RESERVED (0x7FF0U)
81809 +
81810 +/* DOMID_SELF is used in certain contexts to refer to oneself. */
81811 +#define DOMID_SELF (0x7FF0U)
81812 +
81813 +/*
81814 + * DOMID_IO is used to restrict page-table updates to mapping I/O memory.
81815 + * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO
81816 + * is useful to ensure that no mappings to the OS's own heap are accidentally
81817 + * installed. (e.g., in Linux this could cause havoc as reference counts
81818 + * aren't adjusted on the I/O-mapping code path).
81819 + * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can
81820 + * be specified by any calling domain.
81821 + */
81822 +#define DOMID_IO   (0x7FF1U)
81823 +
81824 +/*
81825 + * DOMID_XEN is used to allow privileged domains to map restricted parts of
81826 + * Xen's heap space (e.g., the machine_to_phys table).
81827 + * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if
81828 + * the caller is privileged.
81829 + */
81830 +#define DOMID_XEN  (0x7FF2U)
81831 +
81832 +/*
81833 + * Send an array of these to HYPERVISOR_mmu_update().
81834 + * NB. The fields are natural pointer/address size for this architecture.
81835 + */
81836 +typedef struct mmu_update {
81837 +    uint64_t ptr;       /* Machine address of PTE. */
81838 +    uint64_t val;       /* New contents of PTE.    */
81839 +} mmu_update_t;
81840 +DEFINE_GUEST_HANDLE(mmu_update_t);
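
A sketch of the calling convention described under MMU-UPDATE REQUESTS
above. HYPERVISOR_mmu_update(), virt_to_machine() and pte_val_ma() are
assumptions taken from the architecture's Xen headers.

    /* Hedged sketch: rewrite one PTE through the hypervisor. The low two
     * bits of ptr select the command; MMU_NORMAL_PT_UPDATE is 0, so an
     * aligned machine address needs no further encoding. */
    static void set_pte_via_xen(pte_t *ptep, pte_t new_pte)
    {
            struct mmu_update u;

            u.ptr = virt_to_machine(ptep);
            u.val = pte_val_ma(new_pte);
            if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
                    BUG();
    }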
81841 +
81842 +/*
81843 + * Send an array of these to HYPERVISOR_multicall().
81844 + * NB. The fields are natural register size for this architecture.
81845 + */
81846 +typedef struct multicall_entry {
81847 +    unsigned long op, result;
81848 +    unsigned long args[6];
81849 +} multicall_entry_t;
81850 +DEFINE_GUEST_HANDLE(multicall_entry_t);
81851 +
81852 +/*
81853 + * Event channel endpoints per domain:
81854 + *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
81855 + */
81856 +#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
81857 +
81858 +typedef struct vcpu_time_info {
81859 +    /*
81860 +     * Updates to the following values are preceded and followed by an
81861 +     * increment of 'version'. The guest can therefore detect updates by
81862 +     * looking for changes to 'version'. If the least-significant bit of
81863 +     * the version number is set then an update is in progress and the guest
81864 +     * must wait to read a consistent set of values.
81865 +     * The correct way to interact with the version number is similar to
81866 +     * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry.
81867 +     */
81868 +    uint32_t version;
81869 +    uint32_t pad0;
81870 +    uint64_t tsc_timestamp;   /* TSC at last update of time vals.  */
81871 +    uint64_t system_time;     /* Time, in nanosecs, since boot.    */
81872 +    /*
81873 +     * Current system time:
81874 +     *   system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul
81875 +     * CPU frequency (Hz):
81876 +     *   ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
81877 +     */
81878 +    uint32_t tsc_to_system_mul;
81879 +    int8_t   tsc_shift;
81880 +    int8_t   pad1[3];
81881 +} vcpu_time_info_t; /* 32 bytes */
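
A minimal sketch of the seqlock-style reader the comment calls for,
assuming the kernel's rmb() barrier; it is not a verbatim copy of any
in-tree reader.

    /* Hedged sketch: snapshot a consistent copy of the time record. */
    static void read_time_info(vcpu_time_info_t *src, vcpu_time_info_t *dst)
    {
            uint32_t ver;

            do {
                    ver = src->version;
                    rmb();          /* read version before the payload  */
                    *dst = *src;
                    rmb();          /* read payload before the re-check */
            } while ((ver & 1) || (src->version != ver));
    }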
81882 +
81883 +typedef struct vcpu_info {
81884 +    /*
81885 +     * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
81886 +     * a pending notification for a particular VCPU. It is then cleared 
81887 +     * by the guest OS /before/ checking for pending work, thus avoiding
81888 +     * a set-and-check race. Note that the mask is only accessed by Xen
81889 +     * on the CPU that is currently hosting the VCPU. This means that the
81890 +     * pending and mask flags can be updated by the guest without special
81891 +     * synchronisation (i.e., no need for the x86 LOCK prefix).
81892 +     * This may seem suboptimal because if the pending flag is set by
81893 +     * a different CPU then an IPI may be scheduled even when the mask
81894 +     * is set. However, note:
81895 +     *  1. The task of 'interrupt holdoff' is covered by the per-event-
81896 +     *     channel mask bits. A 'noisy' event that is continually being
81897 +     *     triggered can be masked at source at this very precise
81898 +     *     granularity.
81899 +     *  2. The main purpose of the per-VCPU mask is therefore to restrict
81900 +     *     reentrant execution: whether for concurrency control, or to
81901 +     *     prevent unbounded stack usage. Whatever the purpose, we expect
81902 +     *     that the mask will be asserted only for short periods at a time,
81903 +     *     and so the likelihood of a 'spurious' IPI is suitably small.
81904 +     * The mask is read before making an event upcall to the guest: a
81905 +     * non-zero mask therefore guarantees that the VCPU will not receive
81906 +     * an upcall activation. The mask is cleared when the VCPU requests
81907 +     * to block: this avoids wakeup-waiting races.
81908 +     */
81909 +    uint8_t evtchn_upcall_pending;
81910 +    uint8_t evtchn_upcall_mask;
81911 +    unsigned long evtchn_pending_sel;
81912 +    arch_vcpu_info_t arch;
81913 +    vcpu_time_info_t time;
81914 +} vcpu_info_t; /* 64 bytes (x86) */
81915 +
81916 +/*
81917 + * Xen/kernel shared data -- pointer provided in start_info.
81918 + * NB. We expect that this struct is smaller than a page.
81919 + */
81920 +typedef struct shared_info {
81921 +    vcpu_info_t vcpu_info[MAX_VIRT_CPUS];
81922 +
81923 +    /*
81924 +     * A domain can create "event channels" on which it can send and receive
81925 +     * asynchronous event notifications. There are three classes of event that
81926 +     * are delivered by this mechanism:
81927 +     *  1. Bi-directional inter- and intra-domain connections. Domains must
81928 +     *     arrange out-of-band to set up a connection (usually by allocating
81929 +     *     an unbound 'listener' port and advertising that via a storage service
81930 +     *     such as xenstore).
81931 +     *  2. Physical interrupts. A domain with suitable hardware-access
81932 +     *     privileges can bind an event-channel port to a physical interrupt
81933 +     *     source.
81934 +     *  3. Virtual interrupts ('events'). A domain can bind an event-channel
81935 +     *     port to a virtual interrupt source, such as the virtual-timer
81936 +     *     device or the emergency console.
81937 +     * 
81938 +     * Event channels are addressed by a "port index". Each channel is
81939 +     * associated with two bits of information:
81940 +     *  1. PENDING -- notifies the domain that there is a pending notification
81941 +     *     to be processed. This bit is cleared by the guest.
81942 +     *  2. MASK -- if this bit is clear then a 0->1 transition of PENDING
81943 +     *     will cause an asynchronous upcall to be scheduled. This bit is only
81944 +     *     updated by the guest. It is read-only within Xen. If a channel
81945 +     *     becomes pending while the channel is masked then the 'edge' is lost
81946 +     *     (i.e., when the channel is unmasked, the guest must manually handle
81947 +     *     pending notifications as no upcall will be scheduled by Xen).
81948 +     * 
81949 +     * To expedite scanning of pending notifications, any 0->1 pending
81950 +     * transition on an unmasked channel causes a corresponding bit in a
81951 +     * per-vcpu selector word to be set. Each bit in the selector covers a
81952 +     * 'C long' in the PENDING bitfield array.
81953 +     */
81954 +    unsigned long evtchn_pending[sizeof(unsigned long) * 8];
81955 +    unsigned long evtchn_mask[sizeof(unsigned long) * 8];
81956 +
81957 +    /*
81958 +     * Wallclock time: updated only by control software. Guests should base
81959 +     * their gettimeofday() syscall on this wallclock-base value.
81960 +     */
81961 +    uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. */
81962 +    uint32_t wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
81963 +    uint32_t wc_nsec;         /* Nsecs 00:00:00 UTC, Jan 1, 1970.  */
81964 +
81965 +    arch_shared_info_t arch;
81966 +
81967 +} shared_info_t;
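
A hedged sketch of the two-level scan that the per-VCPU selector word
enables. xchg(), __ffs() and BITS_PER_LONG are the usual kernel
primitives; handle_port() stands in for a real dispatcher, which is also
expected to clear the PENDING bit as described above.

    static void scan_pending(shared_info_t *s, vcpu_info_t *v)
    {
            unsigned long l1, l2;
            unsigned int l1i, l2i, port;

            l1 = xchg(&v->evtchn_pending_sel, 0);
            while (l1 != 0) {
                    l1i = __ffs(l1);
                    l1 &= ~(1UL << l1i);

                    l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
                    while (l2 != 0) {
                            l2i = __ffs(l2);
                            l2 &= ~(1UL << l2i);
                            port = (l1i * BITS_PER_LONG) + l2i;
                            handle_port(port);      /* hypothetical */
                    }
            }
    }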
81968 +
81969 +/*
81970 + * Start-of-day memory layout for the initial domain (DOM0):
81971 + *  1. The domain is started within contiguous virtual-memory region.
81972 + *  2. The contiguous region begins and ends on an aligned 4MB boundary.
81973 + *  3. The region start corresponds to the load address of the OS image.
81974 + *     If the load address is not 4MB aligned then the address is rounded down.
81975 + *  4. This is the order of bootstrap elements in the initial virtual region:
81976 + *      a. relocated kernel image
81977 + *      b. initial ram disk              [mod_start, mod_len]
81978 + *      c. list of allocated page frames [mfn_list, nr_pages]
81979 + *      d. start_info_t structure        [register ESI (x86)]
81980 + *      e. bootstrap page tables         [pt_base, CR3 (x86)]
81981 + *      f. bootstrap stack               [register ESP (x86)]
81982 + *  5. Bootstrap elements are packed together, but each is 4kB-aligned.
81983 + *  6. The initial ram disk may be omitted.
81984 + *  7. The list of page frames forms a contiguous 'pseudo-physical' memory
81985 + *     layout for the domain. In particular, the bootstrap virtual-memory
81986 + *     region is a 1:1 mapping to the first section of the pseudo-physical map.
81987 + *  8. All bootstrap elements are mapped read-writable for the guest OS. The
81988 + *     only exception is the bootstrap page table, which is mapped read-only.
81989 + *  9. There is guaranteed to be at least 512kB padding after the final
81990 + *     bootstrap element. If necessary, the bootstrap virtual region is
81991 + *     extended by an extra 4MB to ensure this.
81992 + */
81993 +
81994 +#define MAX_GUEST_CMDLINE 1024
81995 +typedef struct start_info {
81996 +    /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME.    */
81997 +    char magic[32];             /* "xen-<version>-<platform>".            */
81998 +    unsigned long nr_pages;     /* Total pages allocated to this domain.  */
81999 +    unsigned long shared_info;  /* MACHINE address of shared info struct. */
82000 +    uint32_t flags;             /* SIF_xxx flags.                         */
82001 +    unsigned long store_mfn;    /* MACHINE page number of shared page.    */
82002 +    uint32_t store_evtchn;      /* Event channel for store communication. */
82003 +    unsigned long console_mfn;  /* MACHINE address of console page.       */
82004 +    uint32_t console_evtchn;    /* Event channel for console messages.    */
82005 +    /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME).     */
82006 +    unsigned long pt_base;      /* VIRTUAL address of page directory.     */
82007 +    unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames.       */
82008 +    unsigned long mfn_list;     /* VIRTUAL address of page-frame list.    */
82009 +    unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
82010 +    unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
82011 +    int8_t cmd_line[MAX_GUEST_CMDLINE];
82012 +} start_info_t;
82013 +
82014 +/* These flags are passed in the 'flags' field of start_info_t. */
82015 +#define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
82016 +#define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
82017 +
82018 +typedef uint64_t cpumap_t;
82019 +
82020 +typedef uint8_t xen_domain_handle_t[16];
82021 +
82022 +/* Turn a plain number into a C unsigned long constant. */
82023 +#define __mk_unsigned_long(x) x ## UL
82024 +#define mk_unsigned_long(x) __mk_unsigned_long(x)
82025 +
82026 +#else /* __ASSEMBLY__ */
82027 +
82028 +/* In assembly code we cannot use C numeric constant suffixes. */
82029 +#define mk_unsigned_long(x) x
82030 +
82031 +#endif /* !__ASSEMBLY__ */
82032 +
82033 +#include "xen-compat.h"
82034 +
82035 +#endif /* __XEN_PUBLIC_XEN_H__ */
82036 +
82037 +/*
82038 + * Local variables:
82039 + * mode: C
82040 + * c-set-style: "BSD"
82041 + * c-basic-offset: 4
82042 + * tab-width: 4
82043 + * indent-tabs-mode: nil
82044 + * End:
82045 + */
82046 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/net_driver_util.h linux-2.6.16/include/xen/net_driver_util.h
82047 --- linux-2.6.16.orig/include/xen/net_driver_util.h     1970-01-01 01:00:00.000000000 +0100
82048 +++ linux-2.6.16/include/xen/net_driver_util.h  2006-06-26 09:51:32.000000000 +0200
82049 @@ -0,0 +1,58 @@
82050 +/*****************************************************************************
82051 + *
82052 + * Utility functions for Xen network devices.
82053 + *
82054 + * Copyright (c) 2005 XenSource Ltd.
82055 + * 
82056 + * This program is free software; you can redistribute it and/or
82057 + * modify it under the terms of the GNU General Public License version 2
82058 + * as published by the Free Software Foundation; or, when distributed
82059 + * separately from the Linux kernel or incorporated into other
82060 + * software packages, subject to the following license:
82061 + * 
82062 + * Permission is hereby granted, free of charge, to any person obtaining a
82063 + * copy of this source file (the "Software"), to deal in the Software without
82064 + * restriction, including without limitation the rights to use, copy, modify,
82065 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82066 + * and to permit persons to whom the Software is furnished to do so, subject
82067 + * to the following conditions:
82068 + * 
82069 + * The above copyright notice and this permission notice shall be included in
82070 + * all copies or substantial portions of the Software.
82071 + * 
82072 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82073 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82074 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82075 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82076 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82077 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
82078 + * DEALINGS IN THE SOFTWARE.
82079 + */
82080 +
82081 +#ifndef _ASM_XEN_NET_DRIVER_UTIL_H
82082 +#define _ASM_XEN_NET_DRIVER_UTIL_H
82083 +
82084 +
82085 +#include <xen/xenbus.h>
82086 +
82087 +
82088 +/**
82089 + * Read the 'mac' node at the given device's node in the store, and parse that
82090 + * as colon-separated octets, placing the result in the given mac array.  mac must be
82091 + * a preallocated array of length ETH_ALEN (as declared in linux/if_ether.h).
82092 + * Return 0 on success, or -errno on error.
82093 + */
82094 +int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]);
82095 +
82096 +
82097 +#endif /* _ASM_XEN_NET_DRIVER_UTIL_H */
82098 +
82099 +/*
82100 + * Local variables:
82101 + *  c-file-style: "linux"
82102 + *  indent-tabs-mode: t
82103 + *  c-indent-level: 8
82104 + *  c-basic-offset: 8
82105 + *  tab-width: 8
82106 + * End:
82107 + */
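
A short usage sketch with hypothetical names: a frontend probe routine
filling in its station address via xen_net_read_mac() and reporting
failure through xenbus_dev_fatal() (declared in xenbus.h later in this
patch).

    static int hypothetical_probe(struct xenbus_device *dev,
                                  struct net_device *netdev)
    {
            u8 mac[ETH_ALEN];
            int err;

            err = xen_net_read_mac(dev, mac);
            if (err) {
                    xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
                    return err;
            }
            memcpy(netdev->dev_addr, mac, ETH_ALEN);
            return 0;
    }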
82108 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/pcifront.h linux-2.6.16/include/xen/pcifront.h
82109 --- linux-2.6.16.orig/include/xen/pcifront.h    1970-01-01 01:00:00.000000000 +0100
82110 +++ linux-2.6.16/include/xen/pcifront.h 2006-06-26 09:51:32.000000000 +0200
82111 @@ -0,0 +1,39 @@
82112 +/*
82113 + * PCI Frontend - arch-dependent declarations
82114 + *
82115 + *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
82116 + */
82117 +#ifndef __XEN_ASM_PCIFRONT_H__
82118 +#define __XEN_ASM_PCIFRONT_H__
82119 +
82120 +#include <linux/config.h>
82121 +#include <linux/spinlock.h>
82122 +
82123 +#ifdef __KERNEL__
82124 +
82125 +struct pcifront_device;
82126 +
82127 +struct pcifront_sd {
82128 +       int domain;
82129 +       struct pcifront_device *pdev;
82130 +};
82131 +
82132 +struct pci_bus;
82133 +
82134 +#ifdef CONFIG_PCI_DOMAINS
82135 +static inline int pci_domain_nr(struct pci_bus *bus)
82136 +{
82137 +       struct pcifront_sd *sd = bus->sysdata;
82138 +       return sd->domain;
82139 +}
82140 +static inline int pci_proc_domain(struct pci_bus *bus)
82141 +{
82142 +       return pci_domain_nr(bus);
82143 +}
82144 +#endif /* CONFIG_PCI_DOMAINS */
82145 +
82146 +extern spinlock_t pci_bus_lock;
82147 +
82148 +#endif /* __KERNEL__ */
82149 +
82150 +#endif /* __XEN_ASM_PCIFRONT_H__ */
82151 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/public/evtchn.h linux-2.6.16/include/xen/public/evtchn.h
82152 --- linux-2.6.16.orig/include/xen/public/evtchn.h       1970-01-01 01:00:00.000000000 +0100
82153 +++ linux-2.6.16/include/xen/public/evtchn.h    2006-06-26 09:51:32.000000000 +0200
82154 @@ -0,0 +1,101 @@
82155 +/******************************************************************************
82156 + * evtchn.h
82157 + * 
82158 + * Interface to /dev/xen/evtchn.
82159 + * 
82160 + * Copyright (c) 2003-2005, K A Fraser
82161 + * 
82162 + * This program is free software; you can redistribute it and/or
82163 + * modify it under the terms of the GNU General Public License version 2
82164 + * as published by the Free Software Foundation; or, when distributed
82165 + * separately from the Linux kernel or incorporated into other
82166 + * software packages, subject to the following license:
82167 + * 
82168 + * Permission is hereby granted, free of charge, to any person obtaining a copy
82169 + * of this source file (the "Software"), to deal in the Software without
82170 + * restriction, including without limitation the rights to use, copy, modify,
82171 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82172 + * and to permit persons to whom the Software is furnished to do so, subject to
82173 + * the following conditions:
82174 + * 
82175 + * The above copyright notice and this permission notice shall be included in
82176 + * all copies or substantial portions of the Software.
82177 + * 
82178 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82179 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82180 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82181 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82182 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82183 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
82184 + * IN THE SOFTWARE.
82185 + */
82186 +
82187 +#ifndef __LINUX_PUBLIC_EVTCHN_H__
82188 +#define __LINUX_PUBLIC_EVTCHN_H__
82189 +
82190 +/* /dev/xen/evtchn resides at device number major=10, minor=201 */
82191 +#define EVTCHN_MINOR 201
82192 +
82193 +/*
82194 + * Bind a fresh port to VIRQ @virq.
82195 + * Return allocated port.
82196 + */
82197 +#define IOCTL_EVTCHN_BIND_VIRQ                         \
82198 +       _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
82199 +struct ioctl_evtchn_bind_virq {
82200 +       unsigned int virq;
82201 +};
82202 +
82203 +/*
82204 + * Bind a fresh port to remote <@remote_domain, @remote_port>.
82205 + * Return allocated port.
82206 + */
82207 +#define IOCTL_EVTCHN_BIND_INTERDOMAIN                  \
82208 +       _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
82209 +struct ioctl_evtchn_bind_interdomain {
82210 +       unsigned int remote_domain, remote_port;
82211 +};
82212 +
82213 +/*
82214 + * Allocate a fresh port for binding to @remote_domain.
82215 + * Return allocated port.
82216 + */
82217 +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT                 \
82218 +       _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
82219 +struct ioctl_evtchn_bind_unbound_port {
82220 +       unsigned int remote_domain;
82221 +};
82222 +
82223 +/*
82224 + * Unbind previously allocated @port.
82225 + */
82226 +#define IOCTL_EVTCHN_UNBIND                            \
82227 +       _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
82228 +struct ioctl_evtchn_unbind {
82229 +       unsigned int port;
82230 +};
82231 +
82232 +/*
82233 + * Send notification to previously allocated @port.
82234 + */
82235 +#define IOCTL_EVTCHN_NOTIFY                            \
82236 +       _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
82237 +struct ioctl_evtchn_notify {
82238 +       unsigned int port;
82239 +};
82240 +
82241 +/* Clear and reinitialise the event buffer. Clear error condition. */
82242 +#define IOCTL_EVTCHN_RESET                             \
82243 +       _IOC(_IOC_NONE, 'E', 5, 0)
82244 +
82245 +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
82246 +
82247 +/*
82248 + * Local variables:
82249 + *  c-file-style: "linux"
82250 + *  indent-tabs-mode: t
82251 + *  c-indent-level: 8
82252 + *  c-basic-offset: 8
82253 + *  tab-width: 8
82254 + * End:
82255 + */
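
A hedged user-space sketch of the ioctl flow, together with the
definitions from this header: bind a fresh port to a VIRQ (the value 6,
i.e. VIRQ_DEBUGGER, is an assumption used purely as an illustration);
fired ports are then collected with read(2) on the same descriptor.

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int bind_debug_virq(void)
    {
            struct ioctl_evtchn_bind_virq bind = { .virq = 6 };
            int fd, port;

            fd = open("/dev/xen/evtchn", O_RDWR);
            if (fd < 0)
                    return -1;

            port = ioctl(fd, IOCTL_EVTCHN_BIND_VIRQ, &bind);
            if (port < 0) {
                    close(fd);
                    return -1;
            }
            /* read(2) on fd now yields the bound ports as they fire. */
            return fd;
    }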
82256 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/public/privcmd.h linux-2.6.16/include/xen/public/privcmd.h
82257 --- linux-2.6.16.orig/include/xen/public/privcmd.h      1970-01-01 01:00:00.000000000 +0100
82258 +++ linux-2.6.16/include/xen/public/privcmd.h   2006-06-26 09:51:32.000000000 +0200
82259 @@ -0,0 +1,94 @@
82260 +/******************************************************************************
82261 + * privcmd.h
82262 + * 
82263 + * Interface to /proc/xen/privcmd.
82264 + * 
82265 + * Copyright (c) 2003-2005, K A Fraser
82266 + * 
82267 + * This program is free software; you can redistribute it and/or
82268 + * modify it under the terms of the GNU General Public License version 2
82269 + * as published by the Free Software Foundation; or, when distributed
82270 + * separately from the Linux kernel or incorporated into other
82271 + * software packages, subject to the following license:
82272 + * 
82273 + * Permission is hereby granted, free of charge, to any person obtaining a copy
82274 + * of this source file (the "Software"), to deal in the Software without
82275 + * restriction, including without limitation the rights to use, copy, modify,
82276 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82277 + * and to permit persons to whom the Software is furnished to do so, subject to
82278 + * the following conditions:
82279 + * 
82280 + * The above copyright notice and this permission notice shall be included in
82281 + * all copies or substantial portions of the Software.
82282 + * 
82283 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82284 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82285 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82286 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82287 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82288 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
82289 + * IN THE SOFTWARE.
82290 + */
82291 +
82292 +#ifndef __LINUX_PUBLIC_PRIVCMD_H__
82293 +#define __LINUX_PUBLIC_PRIVCMD_H__
82294 +
82295 +#ifndef __user
82296 +#define __user
82297 +#endif
82298 +
82299 +typedef struct privcmd_hypercall
82300 +{
82301 +       unsigned long op;
82302 +       unsigned long arg[5];
82303 +} privcmd_hypercall_t;
82304 +
82305 +typedef struct privcmd_mmap_entry {
82306 +       unsigned long va;
82307 +       unsigned long mfn;
82308 +       unsigned long npages;
82309 +} privcmd_mmap_entry_t; 
82310 +
82311 +typedef struct privcmd_mmap {
82312 +       int num;
82313 +       domid_t dom; /* target domain */
82314 +       privcmd_mmap_entry_t __user *entry;
82315 +} privcmd_mmap_t; 
82316 +
82317 +typedef struct privcmd_mmapbatch {
82318 +       int num;     /* number of pages to populate */
82319 +       domid_t dom; /* target domain */
82320 +       unsigned long addr;  /* virtual address */
82321 +       unsigned long __user *arr; /* array of mfns - top nibble set on err */
82322 +} privcmd_mmapbatch_t; 
82323 +
82324 +typedef struct privcmd_blkmsg
82325 +{
82326 +       unsigned long op;
82327 +       void         *buf;
82328 +       int           buf_size;
82329 +} privcmd_blkmsg_t;
82330 +
82331 +/*
82332 + * @cmd: IOCTL_PRIVCMD_HYPERCALL
82333 + * @arg: &privcmd_hypercall_t
82334 + * Return: Value returned from execution of the specified hypercall.
82335 + */
82336 +#define IOCTL_PRIVCMD_HYPERCALL                                        \
82337 +       _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
82338 +#define IOCTL_PRIVCMD_MMAP                                     \
82339 +       _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
82340 +#define IOCTL_PRIVCMD_MMAPBATCH                                        \
82341 +       _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
82342 +
82343 +#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
82344 +
82345 +/*
82346 + * Local variables:
82347 + *  c-file-style: "linux"
82348 + *  indent-tabs-mode: t
82349 + *  c-indent-level: 8
82350 + *  c-basic-offset: 8
82351 + *  tab-width: 8
82352 + * End:
82353 + */
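
A hedged user-space sketch of IOCTL_PRIVCMD_HYPERCALL. The numeric values
(17 for __HYPERVISOR_xen_version, subcommand 0 for XENVER_version) are
assumptions for illustration only.

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int xen_version(void)
    {
            privcmd_hypercall_t call = { .op = 17, .arg = { 0, 0 } };
            int fd, ret;

            fd = open("/proc/xen/privcmd", O_RDWR);
            if (fd < 0)
                    return -1;
            ret = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
            close(fd);
            return ret;     /* (major << 16) | minor on success */
    }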
82354 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/tpmfe.h linux-2.6.16/include/xen/tpmfe.h
82355 --- linux-2.6.16.orig/include/xen/tpmfe.h       1970-01-01 01:00:00.000000000 +0100
82356 +++ linux-2.6.16/include/xen/tpmfe.h    2006-06-26 09:51:32.000000000 +0200
82357 @@ -0,0 +1,40 @@
82358 +#ifndef TPM_FE_H
82359 +#define TPM_FE_H
82360 +
82361 +struct tpm_private;
82362 +
82363 +struct tpmfe_device {
82364 +       /*
82365 +        * Let upper layer receive data from front-end
82366 +        */
82367 +       int (*receive)(const u8 *buffer, size_t count, const void *ptr);
82368 +       /*
82369 +        * Indicate the status of the front-end to the upper
82370 +        * layer.
82371 +        */
82372 +       void (*status)(unsigned int flags);
82373 +
82374 +       /*
82375 +        * This field indicates the maximum size the driver can
82376 +        * transfer in one chunk. It is filled out by the front-end
82377 +        * driver and should be propagated to the generic tpm driver
82378 +        * for allocation of buffers.
82379 +        */
82380 +       unsigned int max_tx_size;
82381 +       /*
82382 +        * The following is a private structure of the underlying
82383 +        * driver. It's expected as first parameter in the send function.
82384 +        */
82385 +       struct tpm_private *tpm_private;
82386 +};
82387 +
82388 +enum {
82389 +       TPMFE_STATUS_DISCONNECTED = 0x0,
82390 +       TPMFE_STATUS_CONNECTED = 0x1
82391 +};
82392 +
82393 +int tpm_fe_send(struct tpm_private *tp, const u8 *buf, size_t count, void *ptr);
82394 +int tpm_fe_register_receiver(struct tpmfe_device *);
82395 +void tpm_fe_unregister_receiver(void);
82396 +
82397 +#endif
82398 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/xen_proc.h linux-2.6.16/include/xen/xen_proc.h
82399 --- linux-2.6.16.orig/include/xen/xen_proc.h    1970-01-01 01:00:00.000000000 +0100
82400 +++ linux-2.6.16/include/xen/xen_proc.h 2006-06-26 09:51:32.000000000 +0200
82401 @@ -0,0 +1,23 @@
82402 +
82403 +#ifndef __ASM_XEN_PROC_H__
82404 +#define __ASM_XEN_PROC_H__
82405 +
82406 +#include <linux/config.h>
82407 +#include <linux/proc_fs.h>
82408 +
82409 +extern struct proc_dir_entry *create_xen_proc_entry(
82410 +       const char *name, mode_t mode);
82411 +extern void remove_xen_proc_entry(
82412 +       const char *name);
82413 +
82414 +#endif /* __ASM_XEN_PROC_H__ */
82415 +
82416 +/*
82417 + * Local variables:
82418 + *  c-file-style: "linux"
82419 + *  indent-tabs-mode: t
82420 + *  c-indent-level: 8
82421 + *  c-basic-offset: 8
82422 + *  tab-width: 8
82423 + * End:
82424 + */
82425 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/xenbus.h linux-2.6.16/include/xen/xenbus.h
82426 --- linux-2.6.16.orig/include/xen/xenbus.h      1970-01-01 01:00:00.000000000 +0100
82427 +++ linux-2.6.16/include/xen/xenbus.h   2006-06-26 09:51:32.000000000 +0200
82428 @@ -0,0 +1,306 @@
82429 +/******************************************************************************
82430 + * xenbus.h
82431 + *
82432 + * Talks to Xen Store to figure out what devices we have.
82433 + *
82434 + * Copyright (C) 2005 Rusty Russell, IBM Corporation
82435 + * Copyright (C) 2005 XenSource Ltd.
82436 + * 
82437 + * This program is free software; you can redistribute it and/or
82438 + * modify it under the terms of the GNU General Public License version 2
82439 + * as published by the Free Software Foundation; or, when distributed
82440 + * separately from the Linux kernel or incorporated into other
82441 + * software packages, subject to the following license:
82442 + * 
82443 + * Permission is hereby granted, free of charge, to any person obtaining a copy
82444 + * of this source file (the "Software"), to deal in the Software without
82445 + * restriction, including without limitation the rights to use, copy, modify,
82446 + * merge, publish, distribute, sublicense, and/or sell copies of the Software,
82447 + * and to permit persons to whom the Software is furnished to do so, subject to
82448 + * the following conditions:
82449 + * 
82450 + * The above copyright notice and this permission notice shall be included in
82451 + * all copies or substantial portions of the Software.
82452 + * 
82453 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
82454 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
82455 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82456 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
82457 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
82458 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
82459 + * IN THE SOFTWARE.
82460 + */
82461 +
82462 +#ifndef _XEN_XENBUS_H
82463 +#define _XEN_XENBUS_H
82464 +
82465 +#include <linux/device.h>
82466 +#include <linux/notifier.h>
82467 +#include <linux/mutex.h>
82468 +#include <xen/interface/xen.h>
82469 +#include <xen/interface/grant_table.h>
82470 +#include <xen/interface/io/xenbus.h>
82471 +#include <xen/interface/io/xs_wire.h>
82472 +
82473 +#define XBT_NULL 0
82474 +
82475 +/* Register callback to watch this node. */
82476 +struct xenbus_watch
82477 +{
82478 +       struct list_head list;
82479 +
82480 +       /* Path being watched. */
82481 +       const char *node;
82482 +
82483 +       /* Callback (executed in a process context with no locks held). */
82484 +       void (*callback)(struct xenbus_watch *,
82485 +                        const char **vec, unsigned int len);
82486 +
82487 +       /* See XBWF_ definitions below. */
82488 +       unsigned long flags;
82489 +};
82490 +
82491 +/*
82492 + * Execute callback in its own kthread. Useful if the callback is long
82493 + * running or heavily serialised, to avoid taking out the main xenwatch thread
82494 + * for a long period of time (or even unwittingly causing a deadlock).
82495 + */
82496 +#define XBWF_new_thread        1
82497 +
82498 +/* A xenbus device. */
82499 +struct xenbus_device {
82500 +       const char *devicetype;
82501 +       const char *nodename;
82502 +       const char *otherend;
82503 +       int otherend_id;
82504 +       struct xenbus_watch otherend_watch;
82505 +       struct device dev;
82506 +       XenbusState state;
82507 +       void *data;
82508 +};
82509 +
82510 +static inline struct xenbus_device *to_xenbus_device(struct device *dev)
82511 +{
82512 +       return container_of(dev, struct xenbus_device, dev);
82513 +}
82514 +
82515 +struct xenbus_device_id
82516 +{
82517 +       /* .../device/<device_type>/<identifier> */
82518 +       char devicetype[32];    /* General class of device. */
82519 +};
82520 +
82521 +/* A xenbus driver. */
82522 +struct xenbus_driver {
82523 +       char *name;
82524 +       struct module *owner;
82525 +       const struct xenbus_device_id *ids;
82526 +       int (*probe)(struct xenbus_device *dev,
82527 +                    const struct xenbus_device_id *id);
82528 +       void (*otherend_changed)(struct xenbus_device *dev,
82529 +                                XenbusState backend_state);
82530 +       int (*remove)(struct xenbus_device *dev);
82531 +       int (*suspend)(struct xenbus_device *dev);
82532 +       int (*resume)(struct xenbus_device *dev);
82533 +       int (*uevent)(struct xenbus_device *, char **, int, char *, int);
82534 +       struct device_driver driver;
82535 +       int (*read_otherend_details)(struct xenbus_device *dev);
82536 +};
82537 +
82538 +static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
82539 +{
82540 +       return container_of(drv, struct xenbus_driver, driver);
82541 +}
82542 +
82543 +int xenbus_register_frontend(struct xenbus_driver *drv);
82544 +int xenbus_register_backend(struct xenbus_driver *drv);
82545 +void xenbus_unregister_driver(struct xenbus_driver *drv);
82546 +
82547 +typedef u32 xenbus_transaction_t;
82548 +
82549 +char **xenbus_directory(xenbus_transaction_t t,
82550 +                       const char *dir, const char *node, unsigned int *num);
82551 +void *xenbus_read(xenbus_transaction_t t,
82552 +                 const char *dir, const char *node, unsigned int *len);
82553 +int xenbus_write(xenbus_transaction_t t,
82554 +                const char *dir, const char *node, const char *string);
82555 +int xenbus_mkdir(xenbus_transaction_t t,
82556 +                const char *dir, const char *node);
82557 +int xenbus_exists(xenbus_transaction_t t,
82558 +                 const char *dir, const char *node);
82559 +int xenbus_rm(xenbus_transaction_t t, const char *dir, const char *node);
82560 +int xenbus_transaction_start(xenbus_transaction_t *t);
82561 +int xenbus_transaction_end(xenbus_transaction_t t, int abort);
82562 +
82563 +/* Single read and scanf: returns -errno or num scanned if > 0. */
82564 +int xenbus_scanf(xenbus_transaction_t t,
82565 +                const char *dir, const char *node, const char *fmt, ...)
82566 +       __attribute__((format(scanf, 4, 5)));
82567 +
82568 +/* Single printf and write: returns -errno or 0. */
82569 +int xenbus_printf(xenbus_transaction_t t,
82570 +                 const char *dir, const char *node, const char *fmt, ...)
82571 +       __attribute__((format(printf, 4, 5)));
82572 +
82573 +/* Generic read function: NULL-terminated triples of name,
82574 + * sprintf-style type string, and pointer. Returns 0 or errno.*/
82575 +int xenbus_gather(xenbus_transaction_t t, const char *dir, ...);
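
Putting the transaction and gather primitives together, a hedged sketch
with hypothetical node names, retrying on -EAGAIN as transactions require.

    static int read_ring_details(struct xenbus_device *dev,
                                 unsigned long *ring_ref, unsigned int *evtchn)
    {
            xenbus_transaction_t xbt;
            int err;

    again:
            err = xenbus_transaction_start(&xbt);
            if (err)
                    return err;

            err = xenbus_gather(xbt, dev->otherend,
                                "ring-ref", "%lu", ring_ref,
                                "event-channel", "%u", evtchn,
                                NULL);
            if (err) {
                    xenbus_transaction_end(xbt, 1);         /* abort  */
                    return err;
            }

            err = xenbus_transaction_end(xbt, 0);           /* commit */
            if (err == -EAGAIN)
                    goto again;
            return err;
    }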
82576 +
82577 +/* notifier routines for when the xenstore comes up */
82578 +int register_xenstore_notifier(struct notifier_block *nb);
82579 +void unregister_xenstore_notifier(struct notifier_block *nb);
82580 +
82581 +int register_xenbus_watch(struct xenbus_watch *watch);
82582 +void unregister_xenbus_watch(struct xenbus_watch *watch);
82583 +void xs_suspend(void);
82584 +void xs_resume(void);
82585 +
82586 +/* Used by xenbus_dev to borrow kernel's store connection. */
82587 +void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
82588 +
82589 +/* Called from xen core code. */
82590 +void xenbus_suspend(void);
82591 +void xenbus_resume(void);
82592 +
82593 +#define XENBUS_IS_ERR_READ(str) ({                     \
82594 +       if (!IS_ERR(str) && strlen(str) == 0) {         \
82595 +               kfree(str);                             \
82596 +               str = ERR_PTR(-ERANGE);                 \
82597 +       }                                               \
82598 +       IS_ERR(str);                                    \
82599 +})
82600 +
82601 +#define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
82602 +
82603 +
82604 +/**
82605 + * Register a watch on the given path, using the given xenbus_watch structure
82606 + * for storage, and the given callback function as the callback.  Return 0 on
82607 + * success, or -errno on error.  On success, the given path will be saved as
82608 + * watch->node, and remains the caller's to free.  On error, watch->node will
82609 + * be NULL, the device will switch to XenbusStateClosing, and the error will
82610 + * be saved in the store.
82611 + */
82612 +int xenbus_watch_path(struct xenbus_device *dev, const char *path,
82613 +                     struct xenbus_watch *watch,
82614 +                     void (*callback)(struct xenbus_watch *,
82615 +                                      const char **, unsigned int));
82616 +
82617 +
82618 +/**
82619 + * Register a watch on the given path/path2, using the given xenbus_watch
82620 + * structure for storage, and the given callback function as the callback.
82621 + * Return 0 on success, or -errno on error.  On success, the watched path
82622 + * (path/path2) will be saved as watch->node, and becomes the caller's to
82623 + * kfree().  On error, watch->node will be NULL, so the caller has nothing to
82624 + * free, the device will switch to XenbusStateClosing, and the error will be
82625 + * saved in the store.
82626 + */
82627 +int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
82628 +                      const char *path2, struct xenbus_watch *watch,
82629 +                      void (*callback)(struct xenbus_watch *,
82630 +                                       const char **, unsigned int));
82631 +
82632 +
82633 +/**
82634 + * Advertise in the store a change of the given driver to the given new_state.
82635 + * Return 0 on success, or -errno on error.  On error, the device will switch
82636 + * to XenbusStateClosing, and the error will be saved in the store.
82637 + */
82638 +int xenbus_switch_state(struct xenbus_device *dev, XenbusState new_state);
82639 +
82640 +
82641 +/**
82642 + * Grant access to the given ring_mfn to the peer of the given device.  Return
82643 + * 0 on success, or -errno on error.  On error, the device will switch to
82644 + * XenbusStateClosing, and the error will be saved in the store.
82645 + */
82646 +int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
82647 +
82648 +
82649 +/**
82650 + * Map a page of memory into this domain from another domain's grant table.
82651 + * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
82652 + * page to that address, and sets *vaddr to that address.
82653 + * xenbus_map_ring does not allocate the virtual address space (you must do
82654 + * this yourself!). It only maps in the page to the specified address.
82655 + * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
82656 + * or -ENOMEM on error. If an error is returned, device will switch to
82657 + * XenbusStateClosing and the error message will be saved in XenStore.
82658 + */
82659 +int xenbus_map_ring_valloc(struct xenbus_device *dev,
82660 +                          int gnt_ref, void **vaddr);
82661 +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
82662 +                          grant_handle_t *handle, void *vaddr);
82663 +
82664 +
82665 +/**
82666 + * Unmap a page of memory in this domain that was imported from another domain.
82667 + * Use xenbus_unmap_ring_vfree if you mapped in your memory with
82668 + * xenbus_map_ring_valloc (it will free the virtual address space).
82669 + * Returns 0 on success and returns GNTST_* on error
82670 + * (see xen/include/interface/grant_table.h).
82671 + */
82672 +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
82673 +int xenbus_unmap_ring(struct xenbus_device *dev,
82674 +                     grant_handle_t handle, void *vaddr);
82675 +
82676 +
82677 +/**
82678 + * Allocate an event channel for the given xenbus_device, assigning the newly
82679 + * created local port to *port.  Return 0 on success, or -errno on error.  On
82680 + * error, the device will switch to XenbusStateClosing, and the error will be
82681 + * saved in the store.
82682 + */
82683 +int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
82684 +
82685 +
82686 +/**
82687 + * Bind to an existing interdomain event channel in another domain. Returns 0
82688 + * on success and stores the local port in *port. On error, returns -errno,
82689 + * switches the device to XenbusStateClosing, and saves the error in XenStore.
82690 + */
82691 +int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
82692 +
82693 +
82694 +/**
82695 + * Free an existing event channel. Returns 0 on success or -errno on error.
82696 + */
82697 +int xenbus_free_evtchn(struct xenbus_device *dev, int port);
82698 +
82699 +
82700 +/**
82701 + * Return the state of the driver rooted at the given store path, or
82702 + * XenbusStateClosed if no state can be read.
82703 + */
82704 +XenbusState xenbus_read_driver_state(const char *path);
82705 +
82706 +
82707 +/***
82708 + * Report the given negative errno into the store, along with the given
82709 + * formatted message.
82710 + */
82711 +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
82712 +                     ...);
82713 +
82714 +
82715 +/***
82716 + * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
82717 + * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
82718 + * closedown of this driver and its peer.
82719 + */
82720 +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
82721 +                     ...);
82722 +
82723 +
82724 +#endif /* _XEN_XENBUS_H */
82725 +
82726 +/*
82727 + * Local variables:
82728 + *  c-file-style: "linux"
82729 + *  indent-tabs-mode: t
82730 + *  c-indent-level: 8
82731 + *  c-basic-offset: 8
82732 + *  tab-width: 8
82733 + * End:
82734 + */
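
A minimal sketch of a frontend driver built on this API; every
examplefront_* name is hypothetical, and the probe body merely advertises
its state.

    static int examplefront_probe(struct xenbus_device *dev,
                                  const struct xenbus_device_id *id)
    {
            return xenbus_switch_state(dev, XenbusStateInitialised);
    }

    static struct xenbus_device_id examplefront_ids[] = {
            { "example" },
            { "" }
    };

    static struct xenbus_driver examplefront = {
            .name  = "examplefront",
            .owner = THIS_MODULE,
            .ids   = examplefront_ids,
            .probe = examplefront_probe,
    };

    static int __init examplefront_init(void)
    {
            return xenbus_register_frontend(&examplefront);
    }
    module_init(examplefront_init);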
82735 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/include/xen/xencons.h linux-2.6.16/include/xen/xencons.h
82736 --- linux-2.6.16.orig/include/xen/xencons.h     1970-01-01 01:00:00.000000000 +0100
82737 +++ linux-2.6.16/include/xen/xencons.h  2006-06-26 09:51:32.000000000 +0200
82738 @@ -0,0 +1,14 @@
82739 +#ifndef __ASM_XENCONS_H__
82740 +#define __ASM_XENCONS_H__
82741 +
82742 +void xencons_force_flush(void);
82743 +void xencons_resume(void);
82744 +
82745 +/* Interrupt work hooks. Receive data, or kick data out. */
82746 +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
82747 +void xencons_tx(void);
82748 +
82749 +int xencons_ring_init(void);
82750 +int xencons_ring_send(const char *data, unsigned len);
82751 +
82752 +#endif /* __ASM_XENCONS_H__ */
82753 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/kernel/Kconfig.preempt linux-2.6.16/kernel/Kconfig.preempt
82754 --- linux-2.6.16.orig/kernel/Kconfig.preempt    2006-03-20 06:53:29.000000000 +0100
82755 +++ linux-2.6.16/kernel/Kconfig.preempt 2006-06-26 09:51:32.000000000 +0200
82756 @@ -35,6 +35,7 @@
82757  
82758  config PREEMPT
82759         bool "Preemptible Kernel (Low-Latency Desktop)"
82760 +       depends on !XEN
82761         help
82762           This option reduces the latency of the kernel by making
82763           all kernel code (that is not executing in a critical section)
82764 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/lib/Makefile linux-2.6.16/lib/Makefile
82765 --- linux-2.6.16.orig/lib/Makefile      2006-06-26 09:49:45.000000000 +0200
82766 +++ linux-2.6.16/lib/Makefile   2006-06-26 09:51:32.000000000 +0200
82767 @@ -47,6 +47,9 @@
82768  obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
82769  
82770  obj-$(CONFIG_SWIOTLB) += swiotlb.o
82771 +ifneq ($(CONFIG_IA64),y)
82772 +swiotlb-$(CONFIG_XEN) := ../arch/i386/kernel/swiotlb.o
82773 +endif
82774  
82775  hostprogs-y    := gen_crc32table
82776  clean-files    := crc32table.h
82777 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/mm/Kconfig linux-2.6.16/mm/Kconfig
82778 --- linux-2.6.16.orig/mm/Kconfig        2006-03-20 06:53:29.000000000 +0100
82779 +++ linux-2.6.16/mm/Kconfig     2006-06-26 09:51:32.000000000 +0200
82780 @@ -126,11 +126,14 @@
82781  # Default to 4 for wider testing, though 8 might be more appropriate.
82782  # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
82783  # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
82784 +# XEN uses the mapping field on pagetable pages to store a pointer to
82785 +# the destructor.
82786  #
82787  config SPLIT_PTLOCK_CPUS
82788         int
82789         default "4096" if ARM && !CPU_CACHE_VIPT
82790         default "4096" if PARISC && !PA20
82791 +       default "4096" if XEN
82792         default "4"
82793  
82794  #
82795 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/mm/highmem.c linux-2.6.16/mm/highmem.c
82796 --- linux-2.6.16.orig/mm/highmem.c      2006-03-20 06:53:29.000000000 +0100
82797 +++ linux-2.6.16/mm/highmem.c   2006-06-26 09:51:32.000000000 +0200
82798 @@ -152,6 +152,17 @@
82799         return vaddr;
82800  }
82801  
82802 +#ifdef CONFIG_XEN
82803 +void kmap_flush_unused(void)
82804 +{
82805 +       spin_lock(&kmap_lock);
82806 +       flush_all_zero_pkmaps();
82807 +       spin_unlock(&kmap_lock);
82808 +}
82809 +
82810 +EXPORT_SYMBOL(kmap_flush_unused);
82811 +#endif
82812 +
82813  void fastcall *kmap_high(struct page *page)
82814  {
82815         unsigned long vaddr;
82816 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/mm/memory.c linux-2.6.16/mm/memory.c
82817 --- linux-2.6.16.orig/mm/memory.c       2006-06-26 09:49:47.000000000 +0200
82818 +++ linux-2.6.16/mm/memory.c    2006-06-26 09:51:32.000000000 +0200
82819 @@ -405,7 +405,8 @@
82820          * Remove this test eventually!
82821          */
82822         if (unlikely(!pfn_valid(pfn))) {
82823 -               print_bad_pte(vma, pte, addr);
82824 +               if (!(vma->vm_flags & VM_RESERVED))
82825 +                       print_bad_pte(vma, pte, addr);
82826                 return NULL;
82827         }
82828  
82829 @@ -1029,6 +1030,23 @@
82830                         continue;
82831                 }
82832  
82833 +#ifdef CONFIG_XEN
82834 +               if (vma && (vma->vm_flags & VM_FOREIGN)) {
82835 +                       struct page **map = vma->vm_private_data;
82836 +                       int offset = (start - vma->vm_start) >> PAGE_SHIFT;
82837 +
82838 +                       if (map[offset] != NULL) {
82839 +                               if (pages)
82840 +                                       pages[i] = map[offset];
82841 +                               if (vmas)
82842 +                                       vmas[i] = vma;
82843 +                               i++;
82844 +                               start += PAGE_SIZE;
82845 +                               len--;
82846 +                               continue;
82847 +                       }
82848 +               }
82849 +#endif
82850                 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
82851                                 || !(vm_flags & vma->vm_flags))
82852                         return i ? : -EFAULT;
82853 @@ -1369,6 +1387,102 @@
82854  }
82855  EXPORT_SYMBOL(remap_pfn_range);
82856  
82857 +#ifdef CONFIG_XEN
82858 +static inline int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
82859 +                                    unsigned long addr, unsigned long end,
82860 +                                    pte_fn_t fn, void *data)
82861 +{
82862 +       pte_t *pte;
82863 +       int err;
82864 +       struct page *pmd_page;
82865 +       spinlock_t *ptl;
82866 +
82867 +       pte = (mm == &init_mm) ?
82868 +               pte_alloc_kernel(pmd, addr) :
82869 +               pte_alloc_map_lock(mm, pmd, addr, &ptl);
82870 +       if (!pte)
82871 +               return -ENOMEM;
82872 +
82873 +       BUG_ON(pmd_huge(*pmd));
82874 +
82875 +       pmd_page = pmd_page(*pmd);
82876 +
82877 +       do {
82878 +               err = fn(pte, pmd_page, addr, data);
82879 +               if (err)
82880 +                       break;
82881 +       } while (pte++, addr += PAGE_SIZE, addr != end);
82882 +
82883 +       if (mm != &init_mm)
82884 +               pte_unmap_unlock(pte-1, ptl);
82885 +       return err;
82886 +}
82887 +
82888 +static inline int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
82889 +                                    unsigned long addr, unsigned long end,
82890 +                                    pte_fn_t fn, void *data)
82891 +{
82892 +       pmd_t *pmd;
82893 +       unsigned long next;
82894 +       int err;
82895 +
82896 +       pmd = pmd_alloc(mm, pud, addr);
82897 +       if (!pmd)
82898 +               return -ENOMEM;
82899 +       do {
82900 +               next = pmd_addr_end(addr, end);
82901 +               err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
82902 +               if (err)
82903 +                       break;
82904 +       } while (pmd++, addr = next, addr != end);
82905 +       return err;
82906 +}
82907 +
82908 +static inline int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
82909 +                                    unsigned long addr, unsigned long end,
82910 +                                    pte_fn_t fn, void *data)
82911 +{
82912 +       pud_t *pud;
82913 +       unsigned long next;
82914 +       int err;
82915 +
82916 +       pud = pud_alloc(mm, pgd, addr);
82917 +       if (!pud)
82918 +               return -ENOMEM;
82919 +       do {
82920 +               next = pud_addr_end(addr, end);
82921 +               err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
82922 +               if (err)
82923 +                       break;
82924 +       } while (pud++, addr = next, addr != end);
82925 +       return err;
82926 +}
82927 +
82928 +/*
82929 + * Scan a region of virtual memory, filling in page tables as necessary
82930 + * and calling a provided function on each leaf page table.
82931 + */
82932 +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
82933 +                       unsigned long size, pte_fn_t fn, void *data)
82934 +{
82935 +       pgd_t *pgd;
82936 +       unsigned long next;
82937 +       unsigned long end = addr + size;
82938 +       int err;
82939 +
82940 +       BUG_ON(addr >= end);
82941 +       pgd = pgd_offset(mm, addr);
82942 +       do {
82943 +               next = pgd_addr_end(addr, end);
82944 +               err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
82945 +               if (err)
82946 +                       break;
82947 +       } while (pgd++, addr = next, addr != end);
82948 +       return err;
82949 +}
82950 +EXPORT_SYMBOL_GPL(apply_to_page_range);
82951 +#endif
82952 +
82953  /*
82954   * handle_pte_fault chooses page fault handler according to an entry
82955   * which was read non-atomically.  Before making any commitment, on
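
A hedged usage sketch of apply_to_page_range(). The callback follows the
pte_fn_t convention visible in apply_to_pte_range() above;
zap_kernel_range() is a hypothetical caller.

    static int clear_one_pte(pte_t *pte, struct page *pmd_page,
                             unsigned long addr, void *data)
    {
            pte_clear(&init_mm, addr, pte);
            return 0;
    }

    static void zap_kernel_range(unsigned long vaddr, unsigned long nr_pages)
    {
            apply_to_page_range(&init_mm, vaddr, nr_pages * PAGE_SIZE,
                                clear_one_pte, NULL);
    }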
82956 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/mm/mmap.c linux-2.6.16/mm/mmap.c
82957 --- linux-2.6.16.orig/mm/mmap.c 2006-06-26 09:49:48.000000000 +0200
82958 +++ linux-2.6.16/mm/mmap.c      2006-06-26 09:51:32.000000000 +0200
82959 @@ -1940,6 +1940,10 @@
82960         unsigned long nr_accounted = 0;
82961         unsigned long end;
82962  
82963 +#ifdef arch_exit_mmap
82964 +       arch_exit_mmap(mm);
82965 +#endif
82966 +
82967         lru_add_drain();
82968         flush_cache_mm(mm);
82969         tlb = tlb_gather_mmu(mm, 1);
82970 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/mm/page_alloc.c linux-2.6.16/mm/page_alloc.c
82971 --- linux-2.6.16.orig/mm/page_alloc.c   2006-06-26 09:49:48.000000000 +0200
82972 +++ linux-2.6.16/mm/page_alloc.c        2006-06-26 09:51:32.000000000 +0200
82973 @@ -156,8 +156,7 @@
82974                         1 << PG_reclaim |
82975                         1 << PG_slab    |
82976                         1 << PG_swapcache |
82977 -                       1 << PG_writeback |
82978 -                       1 << PG_buddy );
82979 +                       1 << PG_writeback );
82980         set_page_count(page, 0);
82981         reset_page_mapcount(page);
82982         page->mapping = NULL;
82983 @@ -228,12 +227,12 @@
82984  
82985  static inline void set_page_order(struct page *page, int order) {
82986         set_page_private(page, order);
82987 -       __SetPageBuddy(page);
82988 +       __SetPagePrivate(page);
82989  }
82990  
82991  static inline void rmv_page_order(struct page *page)
82992  {
82993 -       __ClearPageBuddy(page);
82994 +       __ClearPagePrivate(page);
82995         set_page_private(page, 0);
82996  }
82997  
82998 @@ -272,13 +271,11 @@
82999   * This function checks whether a page is free && is the buddy
83000   * we can do coalesce a page and its buddy if
83001   * (a) the buddy is not in a hole &&
83002 - * (b) the buddy is in the buddy system &&
83003 - * (c) a page and its buddy have the same order.
83004 - *
83005 - * For recording whether a page is in the buddy system, we use PG_buddy.
83006 - * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
83007 + * (b) the buddy is free &&
83008 + * (c) the buddy is in the buddy system &&
83009 + * (d) a page and its buddy have the same order.
83010 + * For recording the page's order, we use page_private(page) and PG_private.
83011   *
83012 - * For recording page's order, we use page_private(page).
83013   */
83014  static inline int page_is_buddy(struct page *page, int order)
83015  {
83016 @@ -287,10 +284,10 @@
83017                 return 0;
83018  #endif
83019  
83020 -       if (PageBuddy(page) && page_order(page) == order) {
83021 -               BUG_ON(page_count(page) != 0);
83022 +       if (PagePrivate(page)           &&
83023 +           (page_order(page) == order) &&
83024 +            page_count(page) == 0)
83025                 return 1;
83026 -       }
83027         return 0;
83028  }
83029  
83030 @@ -307,7 +304,7 @@
83031   * as necessary, plus some accounting needed to play nicely with other
83032   * parts of the VM system.
83033   * At each level, we keep a list of pages, which are heads of continuous
83034 - * free pages of length of (1 << order) and marked with PG_buddy. Page's
83035 + * free pages of length of (1 << order) and marked with PG_private. Page's
83036   * order is recorded in page_private(page) field.
83037   * So when we are allocating or freeing one, we can derive the state of the
83038   * other.  That is, if we allocate a small block, and both were   
83039 @@ -370,8 +367,7 @@
83040                         1 << PG_slab    |
83041                         1 << PG_swapcache |
83042                         1 << PG_writeback |
83043 -                       1 << PG_reserved |
83044 -                       1 << PG_buddy ))))
83045 +                       1 << PG_reserved ))))
83046                 bad_page(page);
83047         if (PageDirty(page))
83048                 __ClearPageDirty(page);
83049 @@ -425,7 +421,8 @@
83050         int i;
83051         int reserved = 0;
83052  
83053 -       arch_free_page(page, order);
83054 +       if (arch_free_page(page, order))
83055 +               return;
83056         if (!PageHighMem(page))
83057                 mutex_debug_check_no_locks_freed(page_address(page),
83058                                                  PAGE_SIZE<<order);
83059 @@ -531,8 +528,7 @@
83060                         1 << PG_slab    |
83061                         1 << PG_swapcache |
83062                         1 << PG_writeback |
83063 -                       1 << PG_reserved |
83064 -                       1 << PG_buddy ))))
83065 +                       1 << PG_reserved ))))
83066                 bad_page(page);
83067  
83068         /*
83069 @@ -721,7 +717,8 @@
83070         struct per_cpu_pages *pcp;
83071         unsigned long flags;
83072  
83073 -       arch_free_page(page, 0);
83074 +       if (arch_free_page(page, 0))
83075 +               return;
83076  
83077         if (PageAnon(page))
83078                 page->mapping = NULL;
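
Two independent changes are folded into page_alloc.c. First, the PG_buddy flag this tree uses to mark pages sitting in the buddy allocator's free lists is reverted to the plain 2.6.16 scheme: a free buddy is recognized by PG_private being set, page_private(page) holding its order, and a zero page count, all serialized by zone->lock. The order check matters because buddies are paired purely by index arithmetic, as in 2.6.16's __page_find_buddy():

    /* page_idx is the page's index relative to the zone's mem_map */
    static inline struct page *
    __page_find_buddy(struct page *page, unsigned long page_idx,
                      unsigned int order)
    {
            unsigned long buddy_idx = page_idx ^ (1 << order);

            return page + (buddy_idx - page_idx);
    }

Second, arch_free_page() turns from a statement into a predicate: a nonzero return means the architecture consumed the page and it must not enter the free lists, which Xen uses for pages whose machine frames are owned by another domain (e.g. ballooned-out or granted pages). The stock kernel's stub in include/linux/gfp.h returns void, so the hook's definition has to change elsewhere in this patch; a sketch of the shape the callers above assume:

    #ifndef HAVE_ARCH_FREE_PAGE
    /* returning 0 means "not consumed, free the page normally" */
    static inline int arch_free_page(struct page *page, int order)
    {
            return 0;
    }
    #endif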
83079 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/net/core/dev.c linux-2.6.16/net/core/dev.c
83080 --- linux-2.6.16.orig/net/core/dev.c    2006-06-26 09:49:48.000000000 +0200
83081 +++ linux-2.6.16/net/core/dev.c 2006-06-26 09:51:32.000000000 +0200
83082 @@ -117,6 +117,12 @@
83083  #include <linux/vs_network.h>
83084  #include <asm/current.h>
83085  
83086 +#ifdef CONFIG_XEN
83087 +#include <net/ip.h>
83088 +#include <linux/tcp.h>
83089 +#include <linux/udp.h>
83090 +#endif
83091 +
83092  /*
83093   *     The list of packet types we will receive (as opposed to discard)
83094   *     and the routines to invoke.
83095 @@ -1261,6 +1267,37 @@
83096             __skb_linearize(skb, GFP_ATOMIC))
83097                 goto out_kfree_skb;
83098  
83099 +#ifdef CONFIG_XEN
83100 +       /* If a checksum-deferred packet is forwarded to a device that needs a
83101 +        * checksum, correct the pointers and force checksumming.
83102 +        */
83103 +       if (skb->proto_csum_blank) {
83104 +               if (skb->protocol != htons(ETH_P_IP))
83105 +                       goto out_kfree_skb;
83106 +               skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
83107 +               if (skb->h.raw >= skb->tail)
83108 +                       goto out_kfree_skb;
83109 +               switch (skb->nh.iph->protocol) {
83110 +               case IPPROTO_TCP:
83111 +                       skb->csum = offsetof(struct tcphdr, check);
83112 +                       break;
83113 +               case IPPROTO_UDP:
83114 +                       skb->csum = offsetof(struct udphdr, check);
83115 +                       break;
83116 +               default:
83117 +                       if (net_ratelimit())
83118 +                               printk(KERN_ERR "Attempting to checksum a non-"
83119 +                                      "TCP/UDP packet, dropping a packet of"
83120 +                                      " protocol %d\n", skb->nh.iph->protocol);
83121 +                       rc = -EPROTO;
83122 +                       goto out_kfree_skb;
83123 +               }
83124 +               if ((skb->h.raw + skb->csum + 2) > skb->tail)
83125 +                       goto out_kfree_skb;
83126 +               skb->ip_summed = CHECKSUM_HW;
83127 +       }
83128 +#endif
83129 +
83130         /* If packet is not checksummed and device does not support
83131          * checksumming for this protocol, complete checksumming here.
83132          */
83133 @@ -1610,6 +1647,19 @@
83134         }
83135  #endif
83136  
83137 +#ifdef CONFIG_XEN
83138 +       switch (skb->ip_summed) {
83139 +       case CHECKSUM_UNNECESSARY:
83140 +               skb->proto_data_valid = 1;
83141 +               break;
83142 +       case CHECKSUM_HW:
83143 +               /* XXX Implement me. */
83144 +       default:
83145 +               skb->proto_data_valid = 0;
83146 +               break;
83147 +       }
83148 +#endif
83149 +
83150         list_for_each_entry_rcu(ptype, &ptype_all, list) {
83151                 if (!ptype->dev || ptype->dev == skb->dev) {
83152                         if (pt_prev) 
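
Both dev.c hunks implement Xen's lazy checksumming. Packets that move only between domains on one physical host can skip checksum computation entirely; the sender marks them proto_csum_blank. If such a packet is later transmitted through a real device, the first hunk locates the transport header, stores the offset of the TCP/UDP checksum field in skb->csum, and sets CHECKSUM_HW so that the unmodified code below it (skb_checksum_help() or checksum-offloading hardware) fills the field in. On receive, the second hunk records in proto_data_valid whether the checksum was already verified, so a backend driver can pass that fact to the destination domain instead of forcing a recheck. A hedged sketch of how a netback-style consumer might map these bits onto its shared-ring flags (flag names as in the Xen interface headers added elsewhere in this patch):

    /* sketch: translating skb checksum state into Xen netif ring flags */
    u16 flags = 0;
    if (skb->ip_summed == CHECKSUM_HW)      /* checksum not filled in yet */
            flags |= NETRXF_csum_blank;
    if (skb->proto_data_valid)              /* payload already verified */
            flags |= NETRXF_data_validated;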
83153 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/net/core/skbuff.c linux-2.6.16/net/core/skbuff.c
83154 --- linux-2.6.16.orig/net/core/skbuff.c 2006-06-26 09:49:46.000000000 +0200
83155 +++ linux-2.6.16/net/core/skbuff.c      2006-06-26 09:51:32.000000000 +0200
83156 @@ -132,6 +132,7 @@
83157   *     Buffers may only be allocated from interrupts using a @gfp_mask of
83158   *     %GFP_ATOMIC.
83159   */
83160 +#ifndef CONFIG_HAVE_ARCH_ALLOC_SKB
83161  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
83162                             int fclone)
83163  {
83164 @@ -186,6 +187,7 @@
83165         skb = NULL;
83166         goto out;
83167  }
83168 +#endif /* !CONFIG_HAVE_ARCH_ALLOC_SKB */
83169  
83170  /**
83171   *     alloc_skb_from_cache    -       allocate a network buffer
83172 @@ -203,14 +205,18 @@
83173   */
83174  struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
83175                                      unsigned int size,
83176 -                                    gfp_t gfp_mask)
83177 +                                    gfp_t gfp_mask,
83178 +                                    int fclone)
83179  {
83180 +       kmem_cache_t *cache;
83181 +       struct skb_shared_info *shinfo;
83182         struct sk_buff *skb;
83183         u8 *data;
83184  
83185 +       cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
83186 +
83187         /* Get the HEAD */
83188 -       skb = kmem_cache_alloc(skbuff_head_cache,
83189 -                              gfp_mask & ~__GFP_DMA);
83190 +       skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
83191         if (!skb)
83192                 goto out;
83193  
83194 @@ -227,17 +233,29 @@
83195         skb->data = data;
83196         skb->tail = data;
83197         skb->end  = data + size;
83198 +       /* make sure we initialize shinfo sequentially */
83199 +       shinfo = skb_shinfo(skb);
83200 +       atomic_set(&shinfo->dataref, 1);
83201 +       shinfo->nr_frags  = 0;
83202 +       shinfo->tso_size = 0;
83203 +       shinfo->tso_segs = 0;
83204 +       shinfo->ufo_size = 0;
83205 +       shinfo->ip6_frag_id = 0;
83206 +       shinfo->frag_list = NULL;
83207  
83208 -       atomic_set(&(skb_shinfo(skb)->dataref), 1);
83209 -       skb_shinfo(skb)->nr_frags  = 0;
83210 -       skb_shinfo(skb)->tso_size = 0;
83211 -       skb_shinfo(skb)->tso_segs = 0;
83212 -       skb_shinfo(skb)->ufo_size = 0;
83213 -       skb_shinfo(skb)->frag_list = NULL;
83214 +       if (fclone) {
83215 +               struct sk_buff *child = skb + 1;
83216 +               atomic_t *fclone_ref = (atomic_t *) (child + 1);
83217 +
83218 +               skb->fclone = SKB_FCLONE_ORIG;
83219 +               atomic_set(fclone_ref, 1);
83220 +
83221 +               child->fclone = SKB_FCLONE_UNAVAILABLE;
83222 +       }
83223  out:
83224         return skb;
83225  nodata:
83226 -       kmem_cache_free(skbuff_head_cache, skb);
83227 +       kmem_cache_free(cache, skb);
83228         skb = NULL;
83229         goto out;
83230  }
83231 @@ -408,6 +427,10 @@
83232         C(local_df);
83233         n->cloned = 1;
83234         n->nohdr = 0;
83235 +#ifdef CONFIG_XEN
83236 +       C(proto_data_valid);
83237 +       C(proto_csum_blank);
83238 +#endif
83239         C(pkt_type);
83240         C(ip_summed);
83241         C(priority);
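
The skbuff.c changes serve three purposes. The #ifndef CONFIG_HAVE_ARCH_ALLOC_SKB wrapper lets an architecture supply its own __alloc_skb(), which the Xen code uses to place packet data in memory it can share with other domains. alloc_skb_from_cache() is then brought in line with __alloc_skb(): it initializes the shared info area field by field and gains fast-clone support via the new fclone argument. Finally, skb_clone() copies the two Xen checksum bits so they survive cloning. The fclone path relies on the layout of skbuff_fclone_cache objects, which 2.6.16 creates with size 2 * sizeof(struct sk_buff) + sizeof(atomic_t); a sketch of what the pointer arithmetic above addresses:

    /* sketch of one skbuff_fclone_cache object */
    struct sk_buff *parent = skb;                     /* SKB_FCLONE_ORIG */
    struct sk_buff *child  = skb + 1;                 /* SKB_FCLONE_UNAVAILABLE */
    atomic_t *fclone_ref   = (atomic_t *)(child + 1); /* shared refcount */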
83242 diff -durN -x '*~' -x '*.orig' -x '*.rej' linux-2.6.16.orig/scripts/Makefile.xen linux-2.6.16/scripts/Makefile.xen
83243 --- linux-2.6.16.orig/scripts/Makefile.xen      1970-01-01 01:00:00.000000000 +0100
83244 +++ linux-2.6.16/scripts/Makefile.xen   2006-06-26 09:51:32.000000000 +0200
83245 @@ -0,0 +1,14 @@
83246 +
83247 +# cherrypickxen($1 = allobj)
83248 +cherrypickxen = $(foreach var, $(1), \
83249 +               $(shell o=$(var); \
83250 +                       c=$${o/%.o/-xen.c}; \
83251 +                       s=$${o/%.o/-xen.S}; \
83252 +                       oxen=$${o/%.o/-xen.o}; \
83253 +                       [ -f $(srctree)/$(src)/$${c} ] || \
83254 +                          [ -f $(srctree)/$(src)/$${s} ] \
83255 +                               && echo $$oxen \
83256 +                               || echo $(var) ) \
83257 +         )
83258 +# filterxen($1 = allobj, $2 = noobjs)
83259 +filterxen = $(filter-out $(2), $(1))
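
scripts/Makefile.xen supplies the object-substitution helpers for the sparse-tree layout. cherrypickxen walks an object list and, for every foo.o that has a Xen-specific source beside it (foo-xen.c or foo-xen.S), substitutes foo-xen.o; objects without such a source pass through unchanged. filterxen simply removes objects that must not be built at all. A usage sketch with made-up object names (the arch Makefiles elsewhere in this patch use the same call pattern):

    # Makefile fragment
    obj-y := setup.o time.o traps.o
    obj-y := $(call cherrypickxen, $(obj-y))
    # if time-xen.c exists in $(src), obj-y is now: setup.o time-xen.o traps.o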